# MA23M002 - HW 3 - Gaussian discriminant analysis

# Q1

1)  Generate the data points for all classes c ∈ {0, …, C − 1}, and compute
the approximations $\hat{\mu}_c$ and $\hat{\Sigma}_c$ that are based on the
data points. Also, generate a second set of points, with known labels, that
will be used to test the classification function.

In [None]:
import numpy as np

def getArtificialData(mean, cov, nx, nt, rng=None):
    """
    Draw training and test samples for one Gaussian class.

    Parameters:
        mean: mean vector of the class distribution
        cov: covariance matrix of the class distribution
        nx: number of data (training) points to draw
        nt: number of test points to draw
        rng: optional random source exposing ``multivariate_normal``
             (e.g. ``np.random.default_rng(seed)``); pass one to make the
             draw reproducible. When omitted, the global ``np.random``
             state is used.
    Returns:
        x: x[n,d] data points
        t: t[n,d] test points
        mu: sample mean of x, taken over the points (axis 0)
        sigma: sample covariance of x (np.cov with its default normalisation)
    """
    sampler = np.random if rng is None else rng

    # Data points are drawn before test points so that a seeded source
    # always yields the same (x, t) pair.
    x = sampler.multivariate_normal(mean, cov, nx)
    mu = np.mean(x, axis=0)
    sigma = np.cov(x, rowvar=False)  # rows of x are observations

    t = sampler.multivariate_normal(mean, cov, nt)
    return x, t, mu, sigma

In [None]:
# Configuration of the synthetic three-class experiment.
mean = [
    [0, 0],
    [7, 5],
    [-5, 5],
]  # mean of each class, one row per class
cov = [
    [[2, 1], [1, 50]],
    [[3, 1], [1, 3]],
    [[5, 2], [2, 3]],
]  # covariance matrix of each class
nx = [40, 80, 20]  # data points per class
nt = [10, 20, 5]  # test points per class
c = 3  # total classes
D = 2  # total features

In [None]:
# Per-class store: key "C<i>" -> [data points, test points, sample mean, sample covariance]
data_dict = {}
for i in range(c):
    key = f"C{i}"
    x, t, mu, sigma = getArtificialData(mean[i], cov[i], nx[i], nt[i])
    data_dict[key] = [x, t, mu, sigma]

In [None]:
# Stack the test points of all classes (class 0 first, then 1, then 2) into one array
test_points = np.concatenate([data_dict[key][1] for key in ("C0", "C1", "C2")])
print(test_points.shape)

(35, 2)


# Q2

2)  Evaluate the multivariate Gaussian distribution on the test points of all
classes to obtain the probabilities p(X = t|Y = c), where t runs through
all test points and c runs through all classes.

In [None]:
import numpy as np
import scipy

def evaluateMultiVarGauss(t, mu, Sgm):
    """
    Evaluate a multivariate Gaussian density at a set of points.

    Parameters:
        t: t[n,d] is the d-th component of the n-th point
        mu: mean
        Sgm: covariance matrix
    Returns:
        p: list in which p[n] is the probability density function at the
           n-th point.
    """
    # Import the submodule explicitly: a bare `import scipy` does not make
    # `scipy.stats` available on every SciPy release.
    from scipy.stats import multivariate_normal

    distribution = multivariate_normal(mu, Sgm)

    t = np.asarray(t)
    if t.shape[0] == 0:
        return []

    # One vectorised call evaluates all points. pdf() collapses a single
    # point to a scalar, so restore the leading axis before listing.
    densities = np.atleast_1d(distribution.pdf(t))
    return list(densities)

In [None]:
# Density of every test point under the Gaussian fitted to class 0
mu, sigma = data_dict['C0'][2:4]  # fitted mean and covariance of class 0
p_list0 = evaluateMultiVarGauss(test_points, mu, sigma)

In [None]:
# Density of every test point under the Gaussian fitted to class 1
mu, sigma = data_dict['C1'][2:4]  # fitted mean and covariance of class 1
p_list1 = evaluateMultiVarGauss(test_points, mu, sigma)

In [None]:
# Density of every test point under the Gaussian fitted to class 2
mu, sigma = data_dict['C2'][2:4]  # fitted mean and covariance of class 2
p_list2 = evaluateMultiVarGauss(test_points, mu, sigma)

In [None]:
# Tabulate the class-conditional densities: one row per test point, one column per class
import pandas as pd
data = dict(zip(('Class 0', 'Class 1', 'Class 2'), (p_list0, p_list1, p_list2)))
df = pd.DataFrame(data)

In [None]:
# Notebook display of the table: rows are test points, columns are p(X = t | Y = c) per class
df

Unnamed: 0,Class 0,Class 1,Class 2
0,0.0005422797,7.919291e-07,0.0004505025
1,0.0005951074,0.006927263,9.499317e-15
2,0.008432732,3.240793e-06,0.0005064477
3,0.01496501,1.709936e-06,1.016669e-14
4,0.01999488,1.248193e-06,3.744651e-22
5,0.01534815,5.10815e-06,1.604745e-10
6,0.002930024,1.179468e-09,0.03497997
7,0.01450922,6.713828e-06,6.843184e-08
8,0.009199138,8.505535e-07,1.185295e-07
9,0.01661811,6.763396e-06,8.761069e-26


# Q3

3) Compute the probabilities p(Y = c|X = t) on the test points using Bayes’
formula and for every test point find the value of c with the maximal probability

In [None]:
def calculateTestSet(t, pXY, yEx, classProb=None):
    """
    Calculate the test set results and count mislabelled points.

    Parameters:
        t:  coordinates of the test points: t[n,d] (only its length is used)
        pXY: conditional probabilities p(X = t[n] | Y = c): pXY[n,c]
        yEx:  exact labels of the test points: y[n], class indices 0..C-1
        classProb: optional prior probability of each class P(Y = c), one
            entry per column of pXY. When omitted, the priors are the
            relative frequencies of the labels in yEx.
    Returns:
        predictedProb: predictedProb[n] is the list of posteriors
            p(Y = c | X = t[n]), one entry per class. A point whose
            likelihood is zero under every class has NaN posteriors.
        predictedClass: predictedClass[n] is the class with the maximal
            posterior for point t[n]; the lowest class index wins a tie.
        misClass: # of mislabelled points
    """
    nPoints = len(t)
    if nPoints == 0:
        return [], [], 0

    likelihood = np.asarray(pXY, dtype=float)[:nPoints]
    labels = np.asarray(yEx)
    nClasses = likelihood.shape[1]

    if classProb is None:
        # Count per class index (not via np.unique) so that a class absent
        # from yEx gets prior 0 instead of shifting the other priors.
        counts = np.array([np.count_nonzero(labels == k) for k in range(nClasses)])
        prior = counts / len(labels)
    else:
        prior = np.asarray(classProb, dtype=float)

    # Bayes' formula: posterior = likelihood * prior / evidence.
    joint = likelihood * prior
    evidence = joint.sum(axis=1, keepdims=True)
    with np.errstate(divide="ignore", invalid="ignore"):
        posterior = joint / evidence

    # The evidence is a common positive factor per row, so the argmax of the
    # joint equals the argmax of the posterior and stays defined when the
    # evidence is zero.
    predicted = np.argmax(joint, axis=1)
    misClass = int(np.count_nonzero(predicted != labels[:nPoints]))

    predictedProb = posterior.tolist()
    predictedClass = [int(k) for k in predicted]
    return predictedProb, predictedClass, misClass

In [None]:
# Class-conditional densities taken from the table: one row per test point, one column per class
pXY = df.values
# Exact labels, in the order the test points were stacked: 10 of class 0, 20 of class 1, 5 of class 2
yEx = np.repeat([0, 1, 2], [10, 20, 5])

In [None]:
# Classify the synthetic test points; the per-point posteriors (first return value) are discarded
_, predicted_cls, misClass = calculateTestSet(test_points, pXY, yEx)

In [None]:
# Report how many test points received a predicted class different from their label in yEx
print(f"\nNumber of misclassified points: {misClass}")


Number of misclassified points: 2


 Testing tied covariances (i.e., linear decision boundaries), calculated using the given equation $\Sigma = \frac{1}{N}\sum_{c} N_c \Sigma_c$

In [None]:
# Tied covariance: the class covariances averaged with weights N_c / N.
# The weights come from nx rather than the hard-coded 2/7, 4/7, 1/7, so they
# stay correct if the class sizes are changed.
tied_cov = sum(n_c * data_dict[f"C{k}"][3] for k, n_c in enumerate(nx)) / sum(nx)
print(tied_cov)

[[ 3.14352333  0.68247902]
 [ 0.68247902 11.06787416]]


In [None]:
# Density of every test point under class 0, using the class mean and the tied covariance
mu, sigma = data_dict['C0'][2], tied_cov
p_list0 = evaluateMultiVarGauss(test_points, mu, sigma)

In [None]:
# Density of every test point under class 1, using the class mean and the tied covariance
mu, sigma = data_dict['C1'][2], tied_cov
p_list1 = evaluateMultiVarGauss(test_points, mu, sigma)

In [None]:
# Density of every test point under class 2, using the class mean and the tied covariance
mu, sigma = data_dict['C2'][2], tied_cov
p_list2 = evaluateMultiVarGauss(test_points, mu, sigma)

In [None]:
# Arrange the three density lists as columns: one row per test point, one column per class
pXY = np.array([p_list0, p_list1, p_list2]).T
pXY.shape

(35, 3)

In [None]:
# Re-run the classification with the tied-covariance densities now held in pXY
_, predicted_cls_tied, misClass_tied = calculateTestSet(test_points, pXY, yEx)

In [None]:
# Report the misclassification count obtained with the tied covariance
print(f"\nNumber of misclassified points with tied covariance: {misClass_tied}")


Number of misclassified points with tied covariance: 3


(3) Test the methodology on the iris data set, that was mentioned in the lecture.


In [None]:
from sklearn.datasets import load_iris
# Load the iris data set through scikit-learn
iris = load_iris()
# Data array of the set; the cells below slice it into three consecutive 50-row blocks, one per class
iris_data = iris.data

In [None]:
def getIrisData(data, n_train=40, n_test=10):
    """
    Split the rows of one class into data and test points and fit a Gaussian.

    Parameters:
        data: data[n,d] rows belonging to a single class
        n_train: number of leading rows used as data points (default 40)
        n_test: number of rows directly after them used as test points
                (default 10)
    Returns:
        list [x, t, mu, sigma] with
        x: the first n_train rows
        t: the next n_test rows
        mu: sample mean of x, taken over the rows
        sigma: sample covariance of x (np.cov with its default normalisation)
    """
    x = data[:n_train, :]
    t = data[n_train:n_train + n_test, :]
    mu = np.mean(x, axis=0)
    sigma = np.cov(x, rowvar=False)  # rows of x are observations
    return [x, t, mu, sigma]

In [None]:
# Per-class store for iris: key "C<k>" -> [data points, test points, mean, cov];
# class k occupies rows 50*k up to (not including) 50*(k+1) of iris_data
iris_data_dict = {
    f"C{k}": getIrisData(iris_data[50 * k:50 * (k + 1), :])
    for k in range(3)
}

In [None]:
# Stack the iris test points of all classes (class 0 first, then 1, then 2) into one array
iris_test_points = np.concatenate([iris_data_dict[key][1] for key in ("C0", "C1", "C2")])
print(iris_test_points.shape)

(30, 4)


In [None]:
# Density of every iris test point under each class's fitted Gaussian (classes 0, 1, 2 in order)
iris_p_list0, iris_p_list1, iris_p_list2 = (
    evaluateMultiVarGauss(iris_test_points, iris_data_dict[key][2], iris_data_dict[key][3])
    for key in ("C0", "C1", "C2")
)

In [None]:
# Class-conditional densities as columns: one row per test point, one column per class
iris_pXY = np.column_stack((iris_p_list0, iris_p_list1, iris_p_list2))
print(iris_pXY.shape)

(30, 3)


In [None]:
# Exact labels of the iris test points: ten of each class, in stacking order
iris_yEx = np.repeat([0, 1, 2], 10)

In [None]:
# Classify the iris test points; the per-point posteriors (first return value) are discarded
_, iris_predicted_cls, iris_misClass = calculateTestSet(iris_test_points, iris_pXY, iris_yEx)

In [None]:
# Report how many iris test points received a predicted class different from their label
print(f"\nNumber of misclassified points: {iris_misClass}")


Number of misclassified points: 0


END OF HW 3

