In [0]:
# importing necessary libraries
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
 
# loading the iris dataset
iris = datasets.load_iris()
 
# X -> features, y -> label
X = iris.data
Y = iris.target
 
# dividing X, y into train and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0)
 
# training a KNN classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 1).fit(X_train, Y_train)
 
# accuracy on X_test
accuracy = knn.score(X_test, Y_test)
print (accuracy)
 
# creating a confusion matrix
knn_predictions = knn.predict(X_test) 
cm = confusion_matrix(Y_test, knn_predictions)

0.9736842105263158


In [41]:
!pip install natsort



In [0]:
# HTTP
from requests import get

# OS and IO
from io import BytesIO

# Unzipping
from zipfile import ZipFile

# Science and shi*
import scipy
import matplotlib.pyplot as plt
import numpy as np

# For sorting files
from natsort import natsorted

# Read files in order
request = get('http://www.cl.cam.ac.uk/Research/DTG/attarchive/pub/data/att_faces.zip')
zip_file = ZipFile(BytesIO(request.content))
files = zip_file.namelist()
files = natsorted(files)
zip_file.extractall()

# Images to training and test data matrices:
y = np.arange(1, 41, 1)
y = np.repeat(y, 10).reshape(400, 1)
ytest = y[1::2, ::]
ytraining = y[0::2, ::]

D = np.array(np.empty([0, 10304]))
for f in files:
    if f.endswith(".pgm"):
        arr = scipy.misc.imread(f)
        # The array is transposed because I checked its dimensions and they were 112X92
        arr = arr.T
        D = np.vstack((D, arr.flatten()))
Dtest = D[1::2, ::]
Dtraining = D[0::2, ::]


# 200 training examples with 10304 feature/pixel
assert Dtraining.shape == (200, 10304)

# 200 test examples with 10304 feature/pixel
assert Dtest.shape == (200, 10304)



**Sci-kit LDA learn on iris data**

In [0]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
Y = iris.target
clf = LDA()
clf.fit(X, Y)
# eigen vectors
clf.scalings_
#np.unique(iris.target)


array([[ 0.81926852,  0.03285975],
       [ 1.5478732 ,  2.15471106],
       [-2.18494056, -0.93024679],
       [-2.85385002,  2.8060046 ]])

**LDA from scratch on iris data**

In [44]:
Xtest = X[1::2, ::]
Xtraining = X[0::2, ::]
Ytest = Y[1::2]
Ytraining = Y[0::2]
iris_means = np.array(np.empty([0, 4]))
for i in range(0, 150, 50):
    user = X[i:i+50:, ::]
    iris_mean = np.mean(user, axis=0)
    iris_means = np.vstack((iris_means, iris_mean))
iris_mean_sample = np.mean(X, axis=0)

iris_n = iris_means.shape[0]
overall_mean = np.mean(X, axis=0)
iris_B = np.zeros((4,4))
for i in range(0, 3): 
    overall_mean = overall_mean.reshape(4,1) 
    mean_vec = iris_means[i].reshape(4,1)
    iris_B += 50 * (mean_vec - overall_mean).dot((mean_vec - overall_mean).T)
    i += 1
iris_Z = np.array(np.empty([0, 4]))
j = 0
for i in range(0, 150, 50):
    user = X[i:i+50:, ::]
    iris_Z = np.vstack((iris_Z, user-iris_means[j].T))
    j += 1
    
iris_S = iris_Z.T.dot(iris_Z)
iris_S_inverse = np.linalg.inv(iris_S)
iris_S_inverse_B = iris_S_inverse.dot(iris_B)
eigen_values, eigen_vectors = np.linalg.eig(iris_S_inverse_B)
print(eigen_vectors)


[[ 0.20490976 -0.00898234  0.68226001 -0.142006  ]
 [ 0.38714331 -0.58899857 -0.44093159  0.38002784]
 [-0.54648218  0.25428655 -0.46521433  0.44439255]
 [-0.71378517 -0.76703217  0.35167633 -0.7987042 ]]


**LDA**

In [45]:
means = np.array(np.empty([0, 10304]))
for i in range(0, 200, 5):
    user = Dtraining[i:i+5:, ::]
    mean = np.mean(user, axis=0)
    means = np.vstack((means, mean))
    
print(means.shape)
n = means.shape[0]
#Because python returns mean vector as row not column vector, therefore we reversed the equation
overall_mean = np.mean(Dtraining, axis=0)
B = np.zeros((10304, 10304))
for i in range(0, n): 
    overall_mean = overall_mean.reshape(10304,1) 
    mean_vec = means[i].reshape(10304,1)
    B += 5 * (mean_vec - overall_mean).dot((mean_vec - overall_mean).T)
    i += 1
Z = np.array(np.empty([0, 10304]))
j = 0
for i in range(0, 200, 5):
    user = Dtraining[i:i+5:, ::]
    Z = np.vstack((Z, user-means[j].T))
    j += 1

#This is S, we don't have to sum all Ss from 1 to 40, I tested that with an example here https://pastebin.com/TABBdB5C    
S = Z.T.dot(Z)
S_inverse = np.linalg.inv(S)
S_inverse_B = S_inverse.dot(B)
eigen_values, eigen_vectors = np.linalg.eig(S_inverse_B)


(40, 10304)


**Projection and classification and calculating accuracy**

In [51]:
U = eigen_vectors[:,range(0,39)]
assert U.shape == (10304, 39)
project_trainning = Dtraining.dot(U)
project_test = Dtest.dot(U)

# importing necessary libraries
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
 
# training a KNN classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 1).fit(Dtraining, np.ravel(ytraining))
 
# accuracy on Dtest
accuracy = knn.score(Dtest, ytest)
print (accuracy)

0.94
