# DISCRIMINANT ANALYSIS

In this coding assignment, you will implement a minimum-risk Bayes decision-theoretic classifier and use it to classify the test examples in the provided datasets.
Assume the following:
1. All conditional density functions are multivariate Gaussian
2. Each class has its own covariance matrix
3. Equally likely prior probabilities
4. 0-1 loss function


## Training Phase

In [37]:
# Mount Google Drive at /content/drive so the CSV paths defined in the next
# cell resolve. This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
# Locations of the training and validation CSV files on the mounted Drive.
train_path, test_path = (
    '/content/drive/MyDrive/Data/bayes-classifier/iris_corrupted_training_data.csv',
    '/content/drive/MyDrive/Data/bayes-classifier/iris_validation_data.csv',
)

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Training set: 135 observations, 4 feature columns, 3 classes.
# The class label is stored in the last column.
train_frame = pd.read_csv(train_path)
print(train_frame.head())
tr_data = train_frame.values

# Validation set: 15 samples with the same column layout.
val_frame = pd.read_csv(test_path)
print(val_frame.head())
val_data = val_frame.values

# The scratch name `df` ends up bound to the validation array, exactly as in
# the original cell, in case anything else still refers to it.
df = val_data

   sepal_length   sepal_width   petal_length   petal_width   class
0        5.7147        2.6743         3.2696       1.65440       2
1        5.1734        3.7374         5.9442       3.00050       3
2        7.3776        3.1505         3.3543       0.64839       2
3        6.4908        2.3983         3.3917       1.54950       2
4        6.8182        3.4016         4.7495       0.57970       3
   sepal_length   sepal_width   petal_length   petal_width   class
0           4.4           2.9            1.4           0.2       1
1           6.7           3.0            5.2           2.3       3
2           4.9           3.1            1.5           0.2       1
3           5.1           2.5            3.0           1.1       2
4           6.1           3.0            4.6           1.4       2


In [43]:
# Compute the components of the discriminant functions.
#
# For class i with mean u_i and covariance S_i, the Gaussian discriminant is
#   g_i(x) = -0.5 (x-u_i)^T S_i^{-1} (x-u_i) - 0.5 log|S_i| + log P(w_i)
# This cell precomputes u_i, S_i, log|S_i|, S_i^{-1} and log P(w_i).

# Split the training set by class label (last column) and keep only the four
# feature columns. Boolean masks let every class keep its own row count, so
# the classes do not have to be the same size (the previous reshape reused the
# row count of class 1 for all three classes).
labels = tr_data[:, 4]
tr_data1 = tr_data[labels == 1][:, 0:4]
tr_data2 = tr_data[labels == 2][:, 0:4]
tr_data3 = tr_data[labels == 3][:, 0:4]

# Mean of each class.
#  u1, u2, u3 are the per-feature means of tr_data1, tr_data2, tr_data3,
#  stored as length-4 lists (one entry per feature column).
u1 = [np.mean(column) for column in tr_data1.T]
u2 = [np.mean(column) for column in tr_data2.T]
u3 = [np.mean(column) for column in tr_data3.T]

# Covariance of each class.
#  cov1, cov2, cov3 are 4x4 covariance matrices. rowvar=False tells np.cov
#  that rows are observations and columns are variables, which is equivalent
#  to passing the transposed data.
cov1 = np.cov(tr_data1, rowvar=False)
cov2 = np.cov(tr_data2, rowvar=False)
cov3 = np.cov(tr_data3, rowvar=False)

# Determinant of each covariance matrix and its natural log (scalars).
D1 = np.linalg.det(cov1)
D2 = np.linalg.det(cov2)
D3 = np.linalg.det(cov3)
l1 = np.log(D1)
l2 = np.log(D2)
l3 = np.log(D3)

# Inverse of each covariance matrix (4x4).
I1 = np.linalg.inv(cov1)
I2 = np.linalg.inv(cov2)
I3 = np.linalg.inv(cov3)

# Equally likely prior probabilities: P(w_i) = 1/3 for every class.
log_prior = np.log(1/3)

In [44]:
# Show the estimated parameters: the three class means first, then the three
# class covariance matrices, one print call per object.
for estimate in (u1, u2, u3, cov1, cov2, cov3):
    print(estimate)

[4.800817777777778, 3.4879955555555555, 1.2692098888888892, 0.34787733333333337]
[6.065882222222222, 2.8228797777777777, 4.262413333333333, 1.1078519666666666]
[6.42966, 2.956569555555556, 5.558746666666666, 1.9247654666666667]
[[ 0.73847372 -0.09788292  0.162097    0.09430334]
 [-0.09788292  1.04517177  0.08250472  0.06122466]
 [ 0.162097    0.08250472  0.75386746  0.07747734]
 [ 0.09430334  0.06122466  0.07747734  0.51347455]]
[[ 1.02666705  0.16051089  0.28736137 -0.10850815]
 [ 0.16051089  0.80414317  0.20221368 -0.07318826]
 [ 0.28736137  0.20221368  0.74048204 -0.04380217]
 [-0.10850815 -0.07318826 -0.04380217  0.69674064]]
[[1.36272732 0.26608677 0.44568822 0.30336696]
 [0.26608677 1.03934606 0.12853287 0.18437967]
 [0.44568822 0.12853287 0.69605886 0.23021863]
 [0.30336696 0.18437967 0.23021863 0.85756954]]


## Validation phase

In [45]:
# Evaluate model accuracy with the validation dataset.
# Each row of val_data holds four feature values followed by the class label.
#
# For each sample, compute the discriminant function (g1, g2, g3) of each
# class, assuming equal priors of 1/3:
#   g_i(x) = -0.5 (x-u_i)^T S_i^{-1} (x-u_i) - 0.5 log|S_i| + log P(w_i)
# The predicted label is the class with the largest discriminant value.
# Uses u*, I*, l* and log_prior computed in the training cell.

correct_class = 0  # number of correctly predicted labels

for sample in val_data:

    x = sample[0:4]  # feature vector of the validation sample
    y = sample[4]    # true label of the validation sample

    # Offsets of the sample from each class mean.
    d1 = x - u1
    d2 = x - u2
    d3 = x - u3

    # Discriminants. The inverse covariances are I1, I2, I3 as defined in the
    # training cell (the names inv1/inv2/inv3 used before were never defined).
    g1 = -0.5 * np.dot(np.dot(d1, I1), d1) - 0.5 * l1 + log_prior
    g2 = -0.5 * np.dot(np.dot(d2, I2), d2) - 0.5 * l2 + log_prior
    g3 = -0.5 * np.dot(np.dot(d3, I3), d3) - 0.5 * l3 + log_prior

    # Predicted class: index of the largest discriminant, shifted to the
    # 1-based labels (1, 2 or 3). np.argmax returns the first maximum, so a
    # tie resolves to the lowest class label, as the if/elif chain did.
    g = [g1, g2, g3]
    yhat = int(np.argmax(g)) + 1

    if yhat == y:
        correct_class += 1

# Divide by the actual number of validation samples instead of a fixed 15.
print('Classification accuracy = ', '{0:.4f}'.format(correct_class / len(val_data)))

Classification accuracy =  0.9333
