In [1]:
import numpy as np
import pandas as pd

# Principal Component Analysis (PCA)

In [2]:
class LDA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.X_train = None
        self.y_train = None
        self.Sw = None
        self.Sb = None
        self.eig_vals = None
        self.eig_vecs = None
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.Sw = self._within_class_scatter_matrix()
        self.Sb = self._between_class_scatter_matrix()
        self.eig_vals, self.eig_vecs = self._eigen()
        
    def transform(self, X):
        return np.dot(X, self.eig_vecs.T)
    
    def _within_class_scatter_matrix(self):
        Sw = np.zeros((self.X_train.shape[1], self.X_train.shape[1]))
        for i in range(len(self.X_train)):
            x = self.X_train[i].reshape(self.X_train.shape[1], 1)
            m = self._mean(self.X_train[self.y_train == self.y_train[i]])
            Sw += np.dot((x - m), (x - m).T)
        return Sw
    
    def _between_class_scatter_matrix(self):
        Sb = np.zeros((self.X_train.shape[1], self.X_train.shape[1]))
        m = self._mean(self.X_train)
        for i in range(len(self.X_train)):
            x = self.X_train[i].reshape(self.X_train.shape[1], 1)
            m_i = self._mean(self.X_train[self.y_train == self.y_train[i]])
            Sb += len(self.X_train[self.y_train == self.y_train[i]]) * np.dot((m_i - m), (m_i - m).T)
        return Sb
    
    def _eigen(self):
        eig_vals, eig_vecs = np.linalg.eig(np.dot(np.linalg.inv(self.Sw), self.Sb))
        eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
        eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
        eig_vals = np.array([eig_pairs[i][0] for i in range(self.n_components)])
        eig_vecs = np.array([eig_pairs[i][1] for i in range(self.n_components)])
        return eig_vals, eig_vecs
    
    def _mean(self, X):
        return np.mean(X, axis=0)

# K-Nearest Neighbors (KNN)

In [3]:
class KNN:
    def __init__(self, k=5):
        self.k = k
        self.X_train = None
        self.y_train = None
        self.distances = None
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def predict(self, X_test):
        self.distances = self._euclidean_dist(self.X_train, X_test)
        pred = []
        for dist in self.distances:
            k_nearest = dist.argsort()[:self.k]
            k_nearest_labels = y_train.iloc[k_nearest]
            pred.append(k_nearest_labels.value_counts().index[0])
        return np.array(pred)
    
    def _euclidean_dist(self, X_train, X_test):
        dist = np.zeros((len(X_test), len(X_train)))
        for i in range(len(X_test)):
            for j in range(len(X_train)):
                dist[i, j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))
        return dist

In [4]:
def covariance(X):
    X = X - np.mean(X, axis=0)
    return np.dot(X.T, X) / (len(X) - 1)

# Split Train and Test Data

In [5]:
def split_train_test(df, y_col):
    train_data, test_data = pd.DataFrame(), pd.DataFrame()
    for y in df[y_col].unique():
        df_y = df[df[y_col] == y]
        test_data = pd.concat([test_data, df_y[:2]], ignore_index=True)
        train_data = pd.concat([train_data, df_y[2:]], ignore_index=True)
    X_train, y_train = train_data.drop(y_col, axis=1), train_data[y_col]
    X_test, y_test = test_data.drop(y_col, axis=1), test_data[y_col]
    return X_train, y_train, X_test, y_test

# Accuracy

In [6]:
def accuracy(y_pred, y_test):
    return np.sum(y_pred == y_test) / len(y_test)

# Model

In [7]:
face_df = pd.read_csv("face.csv")
print(face_df.head())
print(f"Shape: {face_df.shape}")

          0         1         2         3         4         5         6  \
0  0.309917  0.367769  0.417355  0.442149  0.528926  0.607438  0.657025   
1  0.454545  0.471074  0.512397  0.557851  0.595041  0.640496  0.681818   
2  0.318182  0.400826  0.491736  0.528926  0.586777  0.657025  0.681818   
3  0.198347  0.194215  0.194215  0.194215  0.190083  0.190083  0.243802   
4  0.500000  0.545455  0.582645  0.623967  0.648760  0.690083  0.694215   

          7         8         9  ...      4087      4088      4089      4090  \
0  0.677686  0.690083  0.685950  ...  0.669422  0.652893  0.661157  0.475207   
1  0.702479  0.710744  0.702479  ...  0.157025  0.136364  0.148760  0.152893   
2  0.685950  0.702479  0.698347  ...  0.132231  0.181818  0.136364  0.128099   
3  0.404959  0.483471  0.516529  ...  0.636364  0.657025  0.685950  0.727273   
4  0.714876  0.723140  0.731405  ...  0.161157  0.177686  0.173554  0.177686   

       4091      4092      4093      4094      4095  target  
0  0.1

In [8]:
face_df.dropna(inplace=True)

In [9]:
print(face_df["target"].value_counts())

target
0     10
1     10
22    10
23    10
24    10
25    10
26    10
27    10
28    10
29    10
30    10
31    10
32    10
33    10
34    10
35    10
36    10
37    10
38    10
21    10
20    10
19    10
9     10
2     10
3     10
4     10
5     10
6     10
7     10
8     10
10    10
18    10
11    10
12    10
13    10
14    10
15    10
16    10
17    10
39    10
Name: count, dtype: int64


In [10]:
X_train, y_train, X_test, y_test = split_train_test(face_df, "target")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")

X_train: (320, 4096)
X_test: (80, 4096)


In [11]:
lda = LDA(39)
lda.fit(X_train.values, y_train.values)

In [16]:
X_train_lda = lda.transform(X_train.values)
X_test_lda = lda.transform(X_test.values)
print(f"Reduced X_train: {X_train_lda.shape}")
print(f"Reduced X_test: {X_test_lda.shape}")

Reduced X_train: (320, 39)
Reduced X_test: (80, 39)


In [62]:
knn = KNN(4)
knn.fit(X_train_lda, y_train)

In [63]:
y_pred = knn.predict(X_test_lda)
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results)

  dist[i, j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))


    Actual  Predicted
0        0         12
1        0          7
2        1         28
3        1          6
4        2          6
..     ...        ...
75      37          2
76      38         31
77      38          7
78      39         24
79      39         26

[80 rows x 2 columns]


In [64]:
print(f"Accuracy: {accuracy(y_pred, y_test)}")

Accuracy: 0.025
