**Name : Archana Kalburgi**

**CWID : 10469491**

**Solution to Question 3**

Steps involved:

1. Load and read the pima-indians-diabetes data 

2. Calculate the mean vectors for different classes 

3. Calculate Within-class scatter matrix s_w

4. Calculate Between-class scatter matrix s_b

5. Solve the generalized eigenvalue problem for the matrix $S_W^{-1} S_B$

6. Select linear discriminants for the new feature subspace

7. Sort the eigenvectors by decreasing eigenvalues

8. Choose k eigenvectors with the largest eigenvalues

9. Transform the samples onto the new subspace

10. Train a classifier using MLE after the data have been projected.

11. Report the classification accuracy over 10 runs, each using a fresh random 50/50 train/test split of the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the Pima Indians diabetes table and split its feature rows by class label.
data_raw = pd.read_csv("/content/drive/MyDrive/ML_Assignments/ML_Assign_test/pima-indians-diabetes.csv")

# Rows labelled 0; every column except the last is kept as a feature
# (assumes the label is the last column of the CSV).
c0_label = data_raw.loc[data_raw["Class variable"] == 0]
c0 = c0_label.iloc[:, :-1].to_numpy()

# Rows labelled 1, same feature columns.
c1_label = data_raw.loc[data_raw["Class variable"] == 1]
c1 = c1_label.iloc[:, :-1].to_numpy()

In [None]:
# Mean feature vector of each class and of the whole data set
# (averaging down the rows gives one value per feature column).
data = data_raw.iloc[:, :-1].to_numpy()
mean0 = np.mean(c0, axis=0)
mean1 = np.mean(c1, axis=0)
mean_overall = np.mean(data, axis=0)

In [None]:
# Within-class scatter matrix s_w: for each class, accumulate the outer
# products of the samples' deviations from that class's mean, then add
# the two per-class matrices.
features = data.shape[1]  # number of feature columns

s_w0 = np.dot((c0 - mean0).T, c0 - mean0)
s_w1 = np.dot((c1 - mean1).T, c1 - mean1)

s_w = np.add(s_w0, s_w1)
s_w

In [None]:
# Between-class scatter matrix s_b: for each class, the outer product of
# (class mean - overall mean) with itself, weighted by the class size.
mean_diff0 = (mean0 - mean_overall).reshape(features, 1)  # column vector
mean_diff1 = (mean1 - mean_overall).reshape(features, 1)  # column vector

nc0 = len(c0)  # number of class-0 samples
nc1 = len(c1)  # number of class-1 samples

s_b0 = nc0 * (mean_diff0 @ mean_diff0.T)
s_b1 = nc1 * (mean_diff1 @ mean_diff1.T)

s_b = np.add(s_b0, s_b1)
s_b

array([[4.28799019e+02, 8.55520056e+03, 7.22280214e+02, 6.83864094e+02,
        8.62805647e+03, 1.32341125e+03, 3.30326459e+01, 1.60755745e+03],
       [8.55520056e+03, 1.70689422e+05, 1.44106022e+04, 1.36441415e+04,
        1.72143009e+05, 2.64040919e+04, 6.59052140e+02, 3.20732460e+04],
       [7.22280214e+02, 1.44106022e+04, 1.21662757e+03, 1.15191846e+03,
        1.45333226e+04, 2.22918832e+03, 5.56410475e+01, 2.70781156e+03],
       [6.83864094e+02, 1.36441415e+04, 1.15191846e+03, 1.09065105e+03,
        1.37603347e+04, 2.11062385e+03, 5.26816515e+01, 2.56379043e+03],
       [8.62805647e+03, 1.72143009e+05, 1.45333226e+04, 1.37603347e+04,
        1.73608976e+05, 2.66289486e+04, 6.64664615e+02, 3.23463811e+04],
       [1.32341125e+03, 2.64040919e+04, 2.22918832e+03, 2.11062385e+03,
        2.66289486e+04, 4.08447144e+03, 1.01949336e+02, 4.96143772e+03],
       [3.30326459e+01, 6.59052140e+02, 5.56410475e+01, 5.26816515e+01,
        6.64664615e+02, 1.01949336e+02, 2.54467863e+00, 1.

In [None]:
# Solve the generalized eigenvalue problem for the matrix A = s_w^{-1} s_b.
# A is obtained by solving s_w @ A = s_b rather than forming the explicit
# inverse of s_w: the result is the same matrix, computed with less
# round-off error.
A = np.linalg.solve(s_w, s_b)

In [None]:
# Eigen-decomposition of A. np.linalg.eig returns the eigenvectors as the
# columns of its second result, so transpose to get one eigenvector per row
# (row i then belongs to evalues[i]).
evalues, evector = np.linalg.eig(A)
evector = np.transpose(evector)

In [None]:
# Sort the eigenvectors by decreasing eigenvalue magnitude
idx = np.argsort(np.abs(evalues))[::-1]
evalues = evalues[idx]
evector = evector[idx]

# Choose k eigenvectors with the largest eigenvalues.
# There are only 2 classes, so s_b has rank at most 1 and there is a single
# useful discriminant direction: k = 1.
# np.linalg.eig returns complex arrays as soon as any eigenvalue is complex
# (round-off among the near-zero eigenvalues can cause this), which would
# make every projected value complex with a zero imaginary part. Keep only
# the real part of the leading eigenvector so the projection is real-valued.
discriminants = np.real(evector[0:1])

# Transform the samples onto the new subspace -> project the data
x_projected = np.dot(data, discriminants.T)
len(x_projected)

768

In [None]:
# Pair every projected sample with its class label in one table:
# column "FDA" holds the projection, the second column holds the label.
target = data_raw.iloc[:, 8]  # label column, selected by position

ddf = pd.DataFrame(x_projected, columns=["FDA"])

df_with_class = pd.concat([ddf, target.to_frame()], axis=1)
df_with_class

# data_raw.head(5)

Unnamed: 0,FDA,Class variable
0,-10.076782+0.000000j,1
1,-5.753172+0.000000j,0
2,-10.643429+0.000000j,1
3,-5.568082+0.000000j,0
4,-11.291041+0.000000j,1
...,...,...
763,-8.216005+0.000000j,0
764,-8.115602+0.000000j,0
765,-7.338209+0.000000j,0
766,-8.030298+0.000000j,1


In [None]:
# Train a classifier using MLE after the data have been projected.

def noramal_eq(xi, mu, sigma_inv, scalar):
  """Evaluate a Gaussian density at one sample.

  Args:
    xi: 1-D feature vector of the sample.
    mu: 1-D mean vector of the distribution.
    sigma_inv: inverse of the covariance matrix.
    scalar: normalisation constant that multiplies the exponential.

  Returns:
    ``scalar * e ** (-0.5 * (xi - mu) . sigma_inv . (xi - mu))``.
  """
  centered = xi - mu
  # Quadratic form (xi - mu)^T sigma_inv (xi - mu).
  quad_form = np.dot(centered @ sigma_inv, centered)
  return scalar * np.e ** (-0.5 * quad_form)

# calculating mean, sigma, sigma_inverse, scalar
def components(x):
  mu = np.mean(x, axis=0)
  sigma = np.cov(x, rowvar=False)
  sigma_inv = np.linalg.inv(sigma)
  scalar = 1/np.sqrt(((2*np.pi)**x.shape[1])*np.linalg.det(sigma))
  return (mu, sigma_inv, scalar)

# computing likelihood
def likelihood(x, mu, sigma_inv, scalar):
  return [noramal_eq(x, mu, sigma_inv, scalar) for x in range(x.shape[0])]

# computing the accuracy 
def predit(train_x, train_y, test_x, test_y):
  # compute train accuracy(on train data)
  mu0, sigma_inv0, scalar0 = components(train_x)
  l0 = likelihood(train_x, mu0, sigma_inv0, scalar0 )
  mu1, sigma_inv1, scalar1 = components(train_x)
  l1 = likelihood(train_x, mu1, sigma_inv1, scalar1)

  predicted_y = np.array([1 if ll1 > ll0 else 0 for (ll0, ll1) in zip(l0, l1)])
  train_n_correct = sum([1 if predicted_y[i] == train_y[i] else 0 for i in range(train_y.shape[0])])
  
  # compute test accuracy (on test data)
  tl_0 = likelihood(test_x, mu0, sigma_inv0, scalar0 )
  tl_1 = likelihood(test_x, mu1, sigma_inv1, scalar1 )

  predited_test_y = np.array([1 if ll1 > ll0 else 0 for (ll0, ll1) in zip(tl_0, tl_1)])
  test_n_correct = sum([1 if predicted_y[i] == test_y[i] else 0 for i in range(test_y.shape[0])])

  return (train_n_correct / train_y.shape[0], test_n_correct / test_y.shape[0]) 

In [None]:
from sklearn.model_selection import train_test_split

train_10scores = []
test_10scores = []
for i in range(1,11):
    print("----------------------------------------")
    (x_train, x_test, y_train, y_test) = train_test_split(df_with_class, df_with_class.iloc[:,1], train_size=0.5)
    train_score, test_score = predit(x_train.iloc[:,0:3].to_numpy(), y_train.to_numpy(), x_test.iloc[:,0:3].to_numpy(), y_test.to_numpy())
    np.array(train_10scores.append(train_score))
    np.array(test_10scores.append(test_score))
    print(f"Train Score is {train_score}")
    print(f"Test Score is {test_score}") 

# printing out the mean and standard deviation of all the 10 accuracy scores of train and test data 
print("-------------------------------------------------")
print("\n")
# print(f"Mean of the accuracies for train data = {np.mean(train_10scores)}")
print(f"Average classification accuracy over 10 runs = {round(np.mean(test_10scores),4)}") 


----------------------------------------
Train Score is 0.6640625
Test Score is 0.6380208333333334
----------------------------------------
Train Score is 0.6588541666666666
Test Score is 0.6432291666666666
----------------------------------------
Train Score is 0.65625
Test Score is 0.6458333333333334
----------------------------------------
Train Score is 0.6302083333333334
Test Score is 0.671875
----------------------------------------
Train Score is 0.6510416666666666
Test Score is 0.6510416666666666
----------------------------------------
Train Score is 0.6354166666666666
Test Score is 0.6666666666666666
----------------------------------------
Train Score is 0.6588541666666666
Test Score is 0.6432291666666666
----------------------------------------
Train Score is 0.6484375
Test Score is 0.6536458333333334
----------------------------------------
Train Score is 0.6666666666666666
Test Score is 0.6354166666666666
----------------------------------------
Train Score is 0.653645833