In [121]:
import numpy as np
import pandas as pd

# Principal Component Analysis (PCA)

In [122]:
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    The number of retained components is the smallest count whose cumulative
    explained-variance ratio reaches ``cumvar_threshold``.

    Attributes (all ``None`` until ``fit`` has been called):
        mean: per-feature mean of the data passed to ``fit``.
        components: retained eigenvectors stored as columns,
            shape (n_features, n_components).
        explained_variance: all eigenvalues of the covariance matrix,
            sorted in descending order.
        explained_variance_ratio: ``explained_variance`` divided by its sum.
        cumulative_variance_ratio: running sum of ``explained_variance_ratio``.
        n_components: number of columns kept in ``components``.
    """

    def __init__(self, cumvar_threshold=0.95):
        self.cumvar_threshold = cumvar_threshold
        self.mean = None
        self.components = None
        self.explained_variance = None
        self.explained_variance_ratio = None
        self.cumulative_variance_ratio = None
        self.n_components = None

    def fit(self, X):
        """Learn the principal axes of ``X`` (rows are samples, columns are features)."""
        X = np.asarray(X, dtype=float)
        # Remember the training mean so transform() can centre new data the same way.
        self.mean = X.mean(axis=0)
        # covariance matrix
        cov = np.asarray(covariance(X), dtype=float)
        # A covariance matrix is symmetric, so use the symmetric solver: it
        # always returns real eigenpairs, whereas the general `eig` can leak
        # complex values with tiny imaginary parts.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        # sort eigenvalues and eigenvectors, largest variance first
        order = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order]
        # explained variance
        self.explained_variance = eigenvalues
        self.explained_variance_ratio = eigenvalues / eigenvalues.sum()
        self.cumulative_variance_ratio = np.cumsum(self.explained_variance_ratio)
        # Number of components: first position where the threshold is reached.
        # If it is never reached (threshold > 1, or the cumulative sum ends a
        # rounding error below 1.0) keep every component instead of just one.
        reached = np.flatnonzero(self.cumulative_variance_ratio >= self.cumvar_threshold)
        self.n_components = int(reached[0]) + 1 if reached.size else len(eigenvalues)
        # components (each column is one principal axis; its sign is arbitrary)
        self.components = eigenvectors[:, :self.n_components]

    def transform(self, X):
        """Project ``X`` onto the retained components.

        The data is centred with the mean learned in ``fit`` before projecting.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.components is None or self.mean is None:
            raise RuntimeError("PCA.transform called before PCA.fit")
        return np.dot(np.asarray(X, dtype=float) - self.mean, self.components)

# K-Nearest Neighbors (KNN)

In [123]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Attributes:
        k: number of neighbours that vote on each prediction.
        X_train: training samples as passed to ``fit``.
        y_train: training labels as passed to ``fit``.
        distances: distance matrix from the most recent ``predict`` call,
            shape (n_test, n_train); ``None`` before the first call.
    """

    def __init__(self, k=5):
        self.k = k
        self.X_train = None
        self.y_train = None
        self.distances = None

    def fit(self, X, y):
        """Store the training samples and their labels; no computation happens here."""
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        """Return the majority label among the ``k`` nearest training samples.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict called before KNN.fit")
        # Use the labels stored on the instance (not a notebook global) and
        # accept plain lists/arrays as well as a pandas Series.
        if isinstance(self.y_train, pd.Series):
            labels = self.y_train
        else:
            labels = pd.Series(np.asarray(self.y_train))
        self.distances = self._euclidean_dist(self.X_train, X_test)
        pred = []
        for dist in self.distances:
            k_nearest = dist.argsort()[:self.k]
            # iloc is positional, so the labels' own index does not matter.
            k_nearest_labels = labels.iloc[k_nearest]
            pred.append(k_nearest_labels.value_counts().index[0])
        return np.array(pred)

    def _euclidean_dist(self, X_train, X_test):
        """Pairwise Euclidean distances, shape (len(X_test), len(X_train))."""
        # Convert up front so DataFrames are indexed by row, not by column label.
        train = np.asarray(X_train, dtype=float)
        test = np.asarray(X_test, dtype=float)
        # A 1-D input is treated as one single-feature sample per element.
        if train.ndim == 1:
            train = train[:, None]
        if test.ndim == 1:
            test = test[:, None]
        dist = np.empty((len(test), len(train)))
        # One vectorised pass per test sample keeps memory at O(n_train * n_features).
        for i, sample in enumerate(test):
            dist[i] = np.sqrt(np.sum((train - sample) ** 2, axis=1))
        return dist

In [124]:
def covariance(X):
    """Sample covariance matrix of ``X`` (rows are observations, columns are features).

    Each column is centred on its mean, then the scatter matrix of the centred
    data is divided by ``n - 1``, where ``n`` is the number of rows.
    """
    n_samples = len(X)
    centered = X - np.mean(X, axis=0)
    scatter = np.dot(centered.T, centered)
    return scatter / (n_samples - 1)

# Split Train and Test Data

In [125]:
def split_train_test(df, y_col, n_test_per_class=10):
    """Split ``df`` into train/test sets, holding out the first rows of every class.

    For each distinct value of ``df[y_col]`` (in order of first appearance) the
    first ``n_test_per_class`` rows go to the test set and the remaining rows
    to the training set. The split is deterministic; no shuffling is done.

    Args:
        df: DataFrame containing the features and the label column.
        y_col: name of the label column.
        n_test_per_class: rows of each class reserved for testing (default 10).

    Returns:
        ``(X_train, y_train, X_test, y_test)`` with fresh 0..n-1 indexes;
        the ``X_*`` frames do not contain ``y_col``.

    Raises:
        ValueError: if ``n_test_per_class`` is negative.
    """
    if n_test_per_class < 0:
        raise ValueError("n_test_per_class must be non-negative")

    # Collect the per-class slices and concatenate once: avoids quadratic
    # re-copying and concatenating onto an empty DataFrame.
    train_parts, test_parts = [], []
    for label in df[y_col].unique():
        rows = df[df[y_col] == label]
        test_parts.append(rows.iloc[:n_test_per_class])
        train_parts.append(rows.iloc[n_test_per_class:])

    if test_parts:
        train_data = pd.concat(train_parts, ignore_index=True)
        test_data = pd.concat(test_parts, ignore_index=True)
    else:
        # No rows / no labels: return empty frames that keep the column layout.
        train_data = test_data = df.iloc[0:0]

    X_train, y_train = train_data.drop(y_col, axis=1), train_data[y_col]
    X_test, y_test = test_data.drop(y_col, axis=1), test_data[y_col]
    return X_train, y_train, X_test, y_test

# Accuracy

In [126]:
def accuracy(y_pred, y_test):
    """Fraction of positions where ``y_pred`` equals ``y_test``.

    Both inputs are converted to NumPy arrays first, so the comparison is
    always element-wise and positional: plain lists work (a bare
    ``list == list`` would collapse to a single bool) and pandas indexes are
    ignored.

    Raises:
        ValueError: if the inputs differ in shape or are empty.
    """
    pred = np.asarray(y_pred)
    true = np.asarray(y_test)
    if pred.shape != true.shape:
        raise ValueError(
            f"y_pred and y_test must have the same shape, got {pred.shape} and {true.shape}"
        )
    if true.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    return float(np.mean(pred == true))

# Model

In [127]:
# Load the dataset, drop the "Unnamed: 0" column (presumably a saved row
# index -- confirm against the CSV) and rename the label column to "Gender".
gender_df = (
    pd.read_csv("gender.csv")
    .drop("Unnamed: 0", axis=1)
    .rename(columns={'Unnamed: 1': 'Gender'})
)
print(gender_df.head())

Unnamed: 0,Gender,0,1,2,3,4,5,6,7,8,...,118,119,120,121,122,123,124,125,126,127
0,male,-0.066420,0.151611,0.027740,0.052771,-0.066105,-0.041232,-0.002637,-0.158467,0.130467,...,0.025989,-0.001087,0.027260,-0.046754,-0.118619,-0.163774,-0.000590,-0.076400,0.107497,0.001567
1,male,-0.030614,0.049667,0.008084,-0.050324,0.007649,-0.063818,-0.019530,-0.119905,0.186553,...,0.044229,-0.023900,-0.028108,0.040618,-0.146579,-0.141244,0.016162,0.017638,0.080610,-0.015930
2,male,-0.096178,0.061127,0.035326,-0.035388,-0.090728,-0.018634,-0.024315,-0.139786,0.052211,...,0.111141,0.059436,-0.029222,0.042115,-0.222173,-0.116908,0.093428,0.017391,0.057652,0.086116
3,male,-0.103057,0.085044,0.078333,-0.035873,-0.028163,0.004924,0.007829,-0.017016,0.114907,...,0.100793,-0.002644,-0.023388,0.029497,-0.139830,-0.119243,0.005306,-0.015100,0.161575,0.062462
4,male,-0.125815,0.120046,0.023131,-0.042901,0.038215,-0.049677,-0.054258,-0.130758,0.173457,...,0.090197,0.067527,0.039926,0.047469,-0.056852,-0.076700,0.004966,0.028171,0.026041,0.084135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,female,-0.164731,0.064301,0.058630,-0.017420,-0.157600,-0.022536,0.002864,-0.072739,0.030554,...,0.095115,0.007198,-0.004655,0.023957,-0.170753,-0.136630,0.041614,0.031600,0.019064,0.004384
796,female,-0.095308,0.051095,0.092913,-0.101745,-0.083153,-0.028159,0.009090,-0.114513,0.157421,...,0.056078,0.119846,0.087470,0.017481,-0.096594,-0.084553,0.037709,0.030732,-0.083713,0.064970
797,female,-0.202852,0.037039,0.079731,-0.047156,-0.140062,-0.080246,0.057668,-0.122083,0.165443,...,0.066954,0.035684,-0.023112,-0.030452,-0.154243,-0.188270,0.071086,0.037384,-0.006257,0.039977
798,female,-0.088300,0.063530,0.049627,-0.026011,-0.172773,0.086218,0.042710,-0.161852,0.185083,...,0.039460,0.067547,0.040426,0.028007,-0.154515,-0.127736,0.046967,0.009701,-0.016942,0.048071


In [128]:
# Per-class hold-out: split_train_test reserves the first rows of each
# "Gender" class for testing and keeps the remaining rows for training.
X_train, y_train, X_test, y_test = split_train_test(gender_df, "Gender")
print(f"X_train: {len(X_train)}")
print(f"X_test: {len(X_test)}")

X_train: 780
X_test: 20


In [129]:
# Fit PCA on the training features only, keeping the smallest number of
# components whose cumulative explained-variance ratio reaches 0.95.
pca = PCA(0.95)
pca.fit(X_train)
print(f"Number of components: {pca.n_components}")

Number of components: 57


In [130]:
# Project both splits onto the components learned from the training set.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [131]:
# 5-nearest-neighbour classifier; fit only stores the projected training
# samples and their labels.
knn = KNN(5)
knn.fit(X_train_pca, y_train)

In [132]:
# Classify the held-out samples and show the predictions next to the true labels.
y_pred = knn.predict(X_test_pca)
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results)

Unnamed: 0,Actual,Predicted
0,male,male
1,male,male
2,male,male
3,male,male
4,male,male
5,male,male
6,male,male
7,male,female
8,male,male
9,male,male


In [133]:
# Reuse y_pred from the prediction cell instead of running the full
# neighbour search over the test set a second time.
print(f"Accuracy: {accuracy(y_pred, y_test)}")

Accuracy: 0.85
