# DISCRIMINANT ANALYSIS

In this coding assignment you will implement a minimum-risk Bayes decision-theoretic classifier. Use the training set to fit the classifier and the validation set to evaluate its accuracy.

Assume the following:
1. All conditional density functions are multivariate Gaussian
2. Each class has its own covariance matrix
3. Equally likely prior probabilities
4. 0-1 loss function


## Load datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def _load_matrix(csv_path):
    """Read *csv_path* with pandas, print its first rows, and return the values as an array."""
    frame = pd.read_csv(csv_path)
    print(frame.head())
    return frame.values


# Training data (per the assignment: 135 observations, 4 features, 3 classes).
train_data = _load_matrix("iris_corrupted_training_dataset.csv")

# Validation data (per the assignment: 15 samples).
# `df` is left bound to the validation array, as it was before the rewrite.
df = val_data = _load_matrix("iris_validation_dataset.csv")

   sepal_length   sepal_width   petal_length   petal_width   class
0        5.7147        2.6743         3.2696       1.65440       2
1        5.1734        3.7374         5.9442       3.00050       3
2        7.3776        3.1505         3.3543       0.64839       2
3        6.4908        2.3983         3.3917       1.54950       2
4        6.8182        3.4016         4.7495       0.57970       3
   sepal_length   sepal_width   petal_length   petal_width   class
0           4.4           2.9            1.4           0.2       1
1           6.7           3.0            5.2           2.3       3
2           4.9           3.1            1.5           0.2       1
3           5.1           2.5            3.0           1.1       2
4           6.1           3.0            4.6           1.4       2


In [None]:
## Your code goes here ...

# Compute the per-class components of the discriminant functions:
# mean vector, covariance matrix, log-determinant and inverse covariance.


def _class_features(data, label):
    """Return the feature columns (0..3) of the rows of *data* labelled *label*.

    The label is read from column 4. Rows are selected with a boolean mask,
    so the result is always 2-D and the classes do not need to contain the
    same number of samples (the previous reshape-based code required that).
    """
    return data[data[:, 4] == label][:, 0:4]


# One (samples_in_class, 4) feature matrix per class.
train_data1 = _class_features(train_data, 1)
train_data2 = _class_features(train_data, 2)
train_data3 = _class_features(train_data, 3)

# Mean of each class, one entry per feature.
# Kept as plain Python lists so that printing them looks the same as before.
u1 = [np.mean(train_data1[:, col]) for col in range(train_data1.shape[1])]
u2 = [np.mean(train_data2[:, col]) for col in range(train_data2.shape[1])]
u3 = [np.mean(train_data3[:, col]) for col in range(train_data3.shape[1])]

# Covariance of each class; rowvar=False because the rows are observations
# and the columns are features.
cov1 = np.cov(train_data1, rowvar=False)
cov2 = np.cov(train_data2, rowvar=False)
cov3 = np.cov(train_data3, rowvar=False)

# Determinant of each covariance matrix and its natural log.
D1 = np.linalg.det(cov1)
D2 = np.linalg.det(cov2)
D3 = np.linalg.det(cov3)
l1 = np.log(D1)
l2 = np.log(D2)
l3 = np.log(D3)

# Inverse of each covariance matrix.
I1 = np.linalg.inv(cov1)
I2 = np.linalg.inv(cov2)
I3 = np.linalg.inv(cov3)

# Equally likely priors: the same log-prior is used for all three classes.
log_prior = np.log(1/3)

In [5]:
# Show the three class mean vectors first, then the three covariance matrices.
for mean_vec in (u1, u2, u3):
    print(mean_vec)
for cov_mat in (cov1, cov2, cov3):
    print(cov_mat)

[4.800817777777778, 3.4879955555555555, 1.2692098888888892, 0.34787733333333337]
[6.065882222222222, 2.8228797777777777, 4.262413333333333, 1.1078519666666666]
[6.42966, 2.956569555555556, 5.558746666666666, 1.9247654666666667]
[[ 0.73847372 -0.09788292  0.162097    0.09430334]
 [-0.09788292  1.04517177  0.08250472  0.06122466]
 [ 0.162097    0.08250472  0.75386746  0.07747734]
 [ 0.09430334  0.06122466  0.07747734  0.51347455]]
[[ 1.02666705  0.16051089  0.28736137 -0.10850815]
 [ 0.16051089  0.80414317  0.20221368 -0.07318826]
 [ 0.28736137  0.20221368  0.74048204 -0.04380217]
 [-0.10850815 -0.07318826 -0.04380217  0.69674064]]
[[1.36272732 0.26608677 0.44568822 0.30336696]
 [0.26608677 1.03934606 0.12853287 0.18437967]
 [0.44568822 0.12853287 0.69605886 0.23021863]
 [0.30336696 0.18437967 0.23021863 0.85756954]]


In [None]:
def _discriminant(x, mean, inv_cov, log_det, log_prior):
    """Return g(x) = -0.5 (x-mean)^T inv_cov (x-mean) - 0.5 log_det + log_prior."""
    diff = x - mean
    return -0.5 * np.dot(np.dot(diff, inv_cov), diff) - 0.5 * log_det + log_prior


# (mean, inverse covariance, log-determinant) for classes 1, 2 and 3, in order.
class_params = ((u1, I1, l1), (u2, I2, l2), (u3, I3, l3))

correct_class = 0

for sample in val_data:
    x = sample[0:4]  # feature vector
    y = sample[4]    # true class label

    g = [
        _discriminant(x, mean, inv_cov, log_det, log_prior)
        for mean, inv_cov, log_det in class_params
    ]

    # Pick the class with the largest discriminant. argmax returns the first
    # maximum, so ties go to the lower class label, as the if/elif chain did.
    yhat = int(np.argmax(g)) + 1

    if yhat == y:
        correct_class += 1

# Divide by the actual number of validation samples instead of a hard-coded 15;
# report NaN rather than dividing by zero when the validation set is empty.
n_val = len(val_data)
accuracy = correct_class / n_val if n_val else float("nan")

print('Classification accuracy = ', '{0:.4f}'.format(accuracy))

Classification accuracy =  0.9333
