In [19]:
import numpy as np
import sympy as sp
import pandas as pd

# Linear Discriminant Analysis (LDA)

In [20]:
class LDA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.X_train = None
        self.y_train = None
        self.Sw = None
        self.Sb = None
        self.eig_vals = None
        self.eig_vecs = None
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.Sw = self._within_class_scatter_matrix()
        self.Sb = self._between_class_scatter_matrix()
        self.eig_vals, self.eig_vecs = self._eigen()
        
    def transform(self, X):
        return np.dot(X, self.eig_vecs.T)
    
    def _within_class_scatter_matrix(self):
        Sw = np.zeros((self.X_train.shape[1], self.X_train.shape[1]))
        for i in range(len(self.X_train)):
            x = self.X_train[i].reshape(self.X_train.shape[1], 1)
            m = self._mean(self.X_train[self.y_train == self.y_train[i]])
            Sw += np.dot((x - m), (x - m).T)
        return Sw
    
    def _between_class_scatter_matrix(self):
        Sb = np.zeros((self.X_train.shape[1], self.X_train.shape[1]))
        m = self._mean(self.X_train)
        for i in range(len(self.X_train)):
            x = self.X_train[i].reshape(self.X_train.shape[1], 1)
            m_i = self._mean(self.X_train[self.y_train == self.y_train[i]])
            Sb += len(self.X_train[self.y_train == self.y_train[i]]) * np.dot((m_i - m), (m_i - m).T)
        return Sb
    
    def _eigen(self):
        eig_vals, eig_vecs = np.linalg.eig(np.dot(np.linalg.inv(self.Sw), self.Sb))
        eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
        eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
        eig_vals = np.array([eig_pairs[i][0] for i in range(self.n_components)])
        eig_vecs = np.array([eig_pairs[i][1] for i in range(self.n_components)])
        return eig_vals, eig_vecs
    
    def _mean(self, X):
        return np.mean(X, axis=0)

# K-Nearest Neighbors (KNN)

In [21]:
class KNN:
    def __init__(self, k=5):
        self.k = k
        self.X_train = None
        self.y_train = None
        self.distances = None
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def predict(self, X_test):
        self.distances = self._euclidean_dist(self.X_train, X_test)
        pred = []
        for dist in self.distances:
            k_nearest = dist.argsort()[:self.k]
            k_nearest_labels = y_train.iloc[k_nearest]
            pred.append(k_nearest_labels.value_counts().index[0])
        return np.array(pred)
    
    def _euclidean_dist(self, X_train, X_test):
        dist = np.zeros((len(X_test), len(X_train)))
        for i in range(len(X_test)):
            for j in range(len(X_train)):
                dist[i, j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))
        return dist

In [22]:
def covariance(X):
    X = X - np.mean(X, axis=0)
    return np.dot(X.T, X) / (len(X) - 1)

# Split Train and Test Data

In [23]:
def split_train_test(df, y_col):
    train_data, test_data = pd.DataFrame(), pd.DataFrame()
    for y in df[y_col].unique():
        df_y = df[df[y_col] == y]
        test_data = pd.concat([test_data, df_y[:10]], ignore_index=True)
        train_data = pd.concat([train_data, df_y[10:]], ignore_index=True)
    X_train, y_train = train_data.drop(y_col, axis=1), train_data[y_col]
    X_test, y_test = test_data.drop(y_col, axis=1), test_data[y_col]
    return X_train, y_train, X_test, y_test

# Accuracy

In [24]:
def accuracy(y_pred, y_test):
    return np.sum(y_pred == y_test) / len(y_test)

# Model

In [25]:
gender_df = pd.read_csv("gender.csv").drop("Unnamed: 0", axis=1)
gender_df.rename(columns={'Unnamed: 1': 'Gender'}, inplace=True)
print(gender_df.head())

  Gender         0         1         2         3         4         5  \
0   male -0.066420  0.151611  0.027740  0.052771 -0.066105 -0.041232   
1   male -0.030614  0.049667  0.008084 -0.050324  0.007649 -0.063818   
2   male -0.096178  0.061127  0.035326 -0.035388 -0.090728 -0.018634   
3   male -0.103057  0.085044  0.078333 -0.035873 -0.028163  0.004924   
4   male -0.125815  0.120046  0.023131 -0.042901  0.038215 -0.049677   

          6         7         8  ...       118       119       120       121  \
0 -0.002637 -0.158467  0.130467  ...  0.025989 -0.001087  0.027260 -0.046754   
1 -0.019530 -0.119905  0.186553  ...  0.044229 -0.023900 -0.028108  0.040618   
2 -0.024315 -0.139786  0.052211  ...  0.111141  0.059436 -0.029222  0.042115   
3  0.007829 -0.017016  0.114907  ...  0.100793 -0.002644 -0.023388  0.029497   
4 -0.054258 -0.130758  0.173457  ...  0.090197  0.067527  0.039926  0.047469   

        122       123       124       125       126       127  
0 -0.118619 -0.163774 

In [26]:
X_train, y_train, X_test, y_test = split_train_test(gender_df, "Gender")
print(f"X_train: {len(X_train)}")
print(f"X_test: {len(X_test)}")

X_train: 780
X_test: 20


In [27]:
lda = LDA(1)
lda.fit(X_train.values, y_train.values)

In [28]:
X_train_lda = lda.transform(X_train.values)
X_test_lda = lda.transform(X_test.values)

In [29]:
knn = KNN(5)
knn.fit(X_train_lda, y_train)

In [30]:
y_pred = knn.predict(X_test_lda)
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results)

    Actual Predicted
0     male      male
1     male      male
2     male    female
3     male      male
4     male    female
5     male    female
6     male      male
7     male      male
8     male      male
9     male      male
10  female    female
11  female      male
12  female    female
13  female    female
14  female      male
15  female      male
16  female      male
17  female    female
18  female    female
19  female    female


  dist[i, j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))


In [31]:
print(f"Accuracy: {accuracy(knn.predict(X_test_lda), y_test)}")

  dist[i, j] = np.sqrt(np.sum((X_test[i] - X_train[j])**2))


Accuracy: 0.65
