# Imports

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import time, datetime, itertools, pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate  
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

# Helpers

In [2]:
def get_and_scale_mnist_data(train_file_path, test_file_path, n_samples):

  # get train-val data
  train_val_df = pd.read_csv(train_file_path, header=0)
  X_train_val = train_val_df.iloc[:n_samples, 1:].to_numpy()
  y_train_val = train_val_df.iloc[:n_samples, 0].to_numpy()

  # get test data
  test_df = pd.read_csv(test_file_path, header=0)
  X_test = test_df.iloc[:, 1:].to_numpy()
  y_test = test_df.iloc[:, 0].to_numpy()

  # scale data
  scaler = MinMaxScaler()
  X_train_val = scaler.fit_transform(X_train_val)
  X_test = scaler.transform(X_test)

  return X_train_val, X_test, y_train_val, y_test

def find_n_components(X, information_percentage, kernel, n_components):
  kpca = KernelPCA(n_components=n_components, kernel=kernel)
  print(f"{kernel} kernel PCA fitting...")
  kpca.fit(X)
  kpca_eigvals = kpca.eigenvalues_
  kpca_eigvals_total_sum = kpca_eigvals.sum()
  kpca_eigvals_partial_sum = kpca_eigvals[0] 
  n_components = 1
  while kpca_eigvals_partial_sum / kpca_eigvals_total_sum < information_percentage:
    n_components += 1
    kpca_eigvals_partial_sum += kpca_eigvals[n_components-1] 
  information_percentage *= 100
  print(f"To keep {information_percentage}% of the initial information, {n_components} were kept.")
  return n_components

# turn total time from seconds to -> hours, minutes and seconds
def time_to_h_min_sec(t):
  total_time = str(datetime.timedelta(seconds=t))
  time_split = total_time.split(':')
  hours = int(time_split[0])
  minutes = int(time_split[1])
  seconds = round(float(time_split[2]))
  return str(hours) + 'h  ' + str(minutes) + 'min  ' + str(seconds) + 'sec'

# MAIN

In [3]:
TRAIN_FILE_PATH = '/content/mnist_train.csv'
TEST_FILE_PATH = '/content/mnist_test.csv'
N_SAMPLES = 50000
INFORMATION_PERCENTAGE = 0.9
TRAIN_VAL_SPLIT_PERCENTAGE = 0.4
KERNEL = "rbf"

# Get data
X_train_val, X_test, y_train_val, y_test = get_and_scale_mnist_data(TRAIN_FILE_PATH, TEST_FILE_PATH, N_SAMPLES)

# split data
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, shuffle=True, 
                                                  test_size=TRAIN_VAL_SPLIT_PERCENTAGE, random_state=42)

print(X_train.shape, X_val.shape, X_test.shape, y_train_val.shape, y_val.shape, y_test.shape)

(30000, 784) (20000, 784) (9999, 784) (50000,) (20000,) (9999,)


In [9]:
# find number of components so as to keep "information percentage" of the the initial information
n_components = find_n_components(X_train, INFORMATION_PERCENTAGE, KERNEL, X_train.shape[1])

# apply kernel PCA
kpca = KernelPCA(n_components=n_components, kernel="linear")
tic = time.time()
X_train = kpca.fit_transform(X_train)
toc = time.time()
Dt = time_to_h_min_sec(toc - tic)
print(f"kernel PCA - Time elapsed: {Dt}")

X_val = kpca.transform(X_val)
X_test = kpca.transform(X_test)
print(f"Shapes after kernel PCA: {X_train.shape}, {X_val.shape} and {X_test.shape}")

rbf kernel PCA fitting...
To keep 90.0% of the initial information, 112 were kept.
kernel PCA - Time elapsed: 0h  43min  21sec
Shapes after kernel PCA: (30000, 112), (20000, 112) and (9999, 112)


# classify AFTER applying LDA

In [10]:
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

X_train_lda = lda.transform(X_train)
X_val_lda = lda.transform(X_val)
X_test_lda = lda.transform(X_test)
print(f"Shapes after LDA: {X_train_lda.shape}, {X_val_lda.shape} and {X_test_lda.shape}")

Shapes after LDA: (30000, 9), (20000, 9) and (9999, 9)


In [11]:
# 1. KNN
knn = KNeighborsClassifier(n_neighbors=10, weights="distance")
knn.fit(X_train_lda, y_train)

y_test_preds = knn.predict(X_test_lda)
test_accuracy = float("{:.2f}".format(100*np.mean(y_test_preds == y_test))) 
print(f"Test accuracy = {test_accuracy}%")

Test accuracy = 91.67%


In [12]:
# 2. NCC
ncc = NearestCentroid()
ncc.fit(X_train_lda, y_train)
y_test_preds = ncc.predict(X_test_lda)
test_accuracy = float("{:.2f}".format(100*np.mean(y_test_preds == y_test))) 
print(f"Test accuracy = {test_accuracy}%")

Test accuracy = 87.93%


In [13]:
# 3. SVM

# a) linear kernel
kernel, C = "linear", 10
svm = SVC(C=C, kernel=kernel, decision_function_shape='ovo')
svm.fit(X_train_lda, y_train)
y_test_preds = svm.predict(X_test_lda)
test_accuracy = float("{:.2f}".format(100*np.mean(y_test_preds == y_test))) 
print(f"{kernel} kernel --> Test accuracy = {test_accuracy}%")

# b) RBF kernel
kernel, C, gamma = "rbf", 10, 1
svm = SVC(C=C, kernel=kernel, gamma=gamma, decision_function_shape='ovo')
svm.fit(X_train_lda, y_train)
y_test_preds = svm.predict(X_test_lda)
test_accuracy = float("{:.2f}".format(100*np.mean(y_test_preds == y_test))) 
print(f"{kernel} kernel --> Test accuracy = {test_accuracy}%")

# c) polynomial kernel
kernel, C, degree = "poly", 1e-3, 4
svm = SVC(C=C, kernel=kernel, degree=degree, decision_function_shape='ovo')
svm.fit(X_train_lda, y_train)
y_test_preds = svm.predict(X_test_lda)
test_accuracy = float("{:.2f}".format(100*np.mean(y_test_preds == y_test))) 
print(f"{kernel} kernel --> Test accuracy = {test_accuracy}%")

# d) sigmoid kernel
kernel = "sigmoid"
svm = SVC(kernel=kernel, decision_function_shape='ovo')
svm.fit(X_train_lda, y_train)
y_test_preds = svm.predict(X_test_lda)
test_accuracy = float("{:.2f}".format(100*np.mean(y_test_preds == y_test))) 
print(f"{kernel} kernel --> Test accuracy = {test_accuracy}%")

linear kernel --> Test accuracy = 89.44%
rbf kernel --> Test accuracy = 84.59%
poly kernel --> Test accuracy = 49.78%
sigmoid kernel --> Test accuracy = 80.15%
