# DISCRIMINANT ANALYSIS

In this coding assignment, you will implement a minimum-risk Bayes decision-theoretic classifier and use it to classify the test examples in the provided datasets.
Assume the following:
1. All conditional density functions are multivariate Gaussian
2. Each class has its own covariance matrix
3. Equally likely prior probabilities
4. 0-1 loss function


## Training Phase

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the two CSV files and keep their contents as plain NumPy arrays.
#   training data   - 135 observations, 4 features, 3 classes
#   validation data - 15 samples
def _csv_to_array(csv_path):
    """Read `csv_path` with pandas, print its first rows, return `.values`."""
    frame = pd.read_csv(csv_path)
    print(frame.head())
    return frame.values


tr_data = _csv_to_array("/iris_corrupted_training_data.csv")
val_data = _csv_to_array("/iris_validation_data.csv")

# `df` stays bound to the validation array for any later cell that reads it.
df = val_data

   sepal_length   sepal_width   petal_length   petal_width   class
0        5.7147        2.6743         3.2696       1.65440       2
1        5.1734        3.7374         5.9442       3.00050       3
2        7.3776        3.1505         3.3543       0.64839       2
3        6.4908        2.3983         3.3917       1.54950       2
4        6.8182        3.4016         4.7495       0.57970       3
   sepal_length   sepal_width   petal_length   petal_width   class
0           4.4           2.9            1.4           0.2       1
1           6.7           3.0            5.2           2.3       3
2           4.9           3.1            1.5           0.2       1
3           5.1           2.5            3.0           1.1       2
4           6.1           3.0            4.6           1.4       2


In [21]:
### Determine the various components of the discriminant functions

# Split the training set by class label (column 4) and keep only the four
# feature columns (0..3). Each tr_data* is an (n_k x 4) matrix holding all
# examples of class k.
tr_data1 = (tr_data[tr_data[:, 4] == 1])[:, 0:4]
tr_data2 = (tr_data[tr_data[:, 4] == 2])[:, 0:4]
tr_data3 = (tr_data[tr_data[:, 4] == 3])[:, 0:4]

# Number of training examples per class. Taken from the data instead of being
# hard-coded, so the cell keeps working when the classes are not all the same
# size.
n1 = len(tr_data1)
n2 = len(tr_data2)
n3 = len(tr_data3)

# Mean of each class: u1, u2, u3 are length-4 vectors (one entry per feature).
u1 = tr_data1.mean(axis=0)
u2 = tr_data2.mean(axis=0)
u3 = tr_data3.mean(axis=0)

# Class mean replicated once per training row (n_k x 4). The covariance below
# relies on broadcasting and does not need these; they are kept so that any
# other cell referring to mean_matrix* still finds them.
mean_matrix1 = np.tile(u1, (n1, 1))
mean_matrix2 = np.tile(u2, (n2, 1))
mean_matrix3 = np.tile(u3, (n3, 1))

# Covariance of each class (4x4).
# This is the maximum-likelihood estimate: the sum of outer products of the
# centred samples divided by n_k (not n_k - 1). Subtracting the mean vector
# broadcasts it across every row of tr_data*.
cov1 = np.matmul((tr_data1 - u1).transpose(), (tr_data1 - u1)) / n1
cov2 = np.matmul((tr_data2 - u2).transpose(), (tr_data2 - u2)) / n2
cov3 = np.matmul((tr_data3 - u3).transpose(), (tr_data3 - u3)) / n3

# Log of the determinant of each covariance matrix. These are scalars.
detlog_cv1 = np.log(np.linalg.det(cov1))
detlog_cv2 = np.log(np.linalg.det(cov2))
detlog_cv3 = np.log(np.linalg.det(cov3))

# Inverse of each covariance matrix (4x4).
cov_inv1 = np.linalg.inv(cov1)
cov_inv2 = np.linalg.inv(cov2)
cov_inv3 = np.linalg.inv(cov3)

# Equally likely prior probabilities: the same log-prior is used for all
# three classes.
log_prior = np.log(1/3)

In [22]:
# Show the class mean vectors, one per line, in class order 1, 2, 3.
for class_mean in (u1, u2, u3):
    print(class_mean)

[4.80081778 3.48799556 1.26920989 0.34787733]
[6.06588222 2.82287978 4.26241333 1.10785197]
[6.42966    2.95656956 5.55874667 1.92476547]


## Validation phase

In [23]:
# Evaluate the model accuracy with the validation dataset.
# Each row of val_data holds the four feature values in columns 0..3 and the
# class label in column 4.
#
# For every sample the quadratic discriminant of each class is evaluated:
#   g_k(x) = -1/2 (x - u_k)^T inv(cov_k) (x - u_k) - 1/2 log|cov_k| + log P(w_k)
# with equal priors P(w_k) = 1/3. The predicted label is the class with the
# largest g_k, and the number of correct predictions is counted.

correct_class = 0  # number of correctly predicted labels

for sample in val_data:
    test_data = sample[0:4]
    y = sample[4]

    # Offsets of the sample from each class mean (1-D vectors, so no
    # transpose is needed in the quadratic form below).
    d1 = test_data - u1
    d2 = test_data - u2
    d3 = test_data - u3

    # Discriminant value for each class.
    g1 = (-d1.dot(cov_inv1).dot(d1) / 2) - (detlog_cv1 / 2) + log_prior
    g2 = (-d2.dot(cov_inv2).dot(d2) / 2) - (detlog_cv2 / 2) + log_prior
    g3 = (-d3.dot(cov_inv3).dot(d3) / 2) - (detlog_cv3 / 2) + log_prior

    # Predicted class y_hat is a discrete value (1, 2 or 3). A class wins only
    # when it is strictly larger than both others; any tie falls through to
    # class 3.
    if g1 > g2 and g1 > g3:
        yhat = 1
    elif g2 > g1 and g2 > g3:
        yhat = 2
    else:
        yhat = 3

    if yhat == y:
        correct_class = correct_class + 1

# Accuracy is the fraction of validation rows classified correctly; the
# denominator is the actual number of rows rather than a fixed 15.
print('Classification accuracy = ', '{0:.4f}'. format(correct_class/len(val_data)))

Classification accuracy =  0.9333
