# Imports

In [2]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import time, datetime, itertools, pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate  
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

# Helpers

In [3]:
def unpickle(file):
  """Load and return the object stored in the pickle file at path `file`.

  The file is opened in binary mode and decoded with encoding='bytes', so
  string keys written by Python 2 come back as bytes objects (which is why
  callers in this notebook index the result with keys such as b'data').
  """
  # NOTE: pickle.load can execute arbitrary code; only call this on trusted files.
  with open(file, 'rb') as handle:
    return pickle.load(handle, encoding='bytes')

def get_and_scale_cifar10_data(sort_flag, train_samples_per_class=5000, test_samples=10000):
  """Load the CIFAR-10 python batches, min-max scale them and subsample.

  Reads "data_batch_1" .. "data_batch_5", "test_batch" and "batches.meta"
  from the current working directory via `unpickle`.

  Parameters
  ----------
  sort_flag : int
      When equal to 1 the training samples are reordered by class label
      before the per-class subsampling.
  train_samples_per_class : int
      Maximum number of training samples kept for each class. If a class has
      fewer samples than requested, all of its samples are kept.
  test_samples : int
      Number of leading test samples kept.

  Returns
  -------
  tuple
      (X_train_scaled_sampled, X_test_scaled_sampled, y_train_sampled,
      y_test_sampled, label_names). The training arrays are grouped by class
      in ascending label order.
  """
  # Load every training batch first and stack once, instead of growing the
  # arrays with a concatenate per batch.
  train_batches = [unpickle(f"data_batch_{batch}") for batch in range(1, 6)]
  X_train = np.concatenate([batch_dict[b'data'] for batch_dict in train_batches])
  y_train = np.concatenate([np.asarray(batch_dict[b'labels']) for batch_dict in train_batches])

  test_dict = unpickle("test_batch")
  X_test = test_dict[b'data']
  y_test = np.array(test_dict[b'labels'])

  meta_dict = unpickle("batches.meta")
  # unpickle() decodes with encoding='bytes', so the names arrive as bytes.
  label_names = [name.decode('utf-8') if isinstance(name, bytes) else str(name)
                 for name in meta_dict[b'label_names']]

  # The scaler is fitted on the training images only and reused for the test set.
  scaler = MinMaxScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)

  if sort_flag == 1:
    sort_idx = np.argsort(y_train)
    X_train_scaled = X_train_scaled[sort_idx]
    y_train = y_train[sort_idx]

  # Pick the first `train_samples_per_class` rows of each class by looking at
  # the labels themselves. The previous fixed offset (5000 * class index) was
  # only correct for sorted data with exactly 5000 samples per class.
  picked_idx = np.concatenate([
    np.flatnonzero(y_train == label)[:train_samples_per_class]
    for label in np.unique(y_train)
  ])
  X_train_scaled_sampled = X_train_scaled[picked_idx, :]
  y_train_sampled = y_train[picked_idx]

  X_test_scaled_sampled = X_test_scaled[:test_samples, :]
  y_test_sampled = y_test[:test_samples]

  print(X_train_scaled_sampled.shape, y_train_sampled.shape, X_test_scaled_sampled.shape, y_test_sampled.shape)

  return X_train_scaled_sampled, X_test_scaled_sampled, y_train_sampled, y_test_sampled, label_names

def find_n_components(X, information_percentage, kernel, n_components):
  """Return how many leading kernel-PCA components reach a target eigenvalue share.

  A KernelPCA with `n_components` components and the given `kernel` is fitted
  on `X`. The result is the smallest k such that the sum of the first k
  entries of `eigenvalues_` divided by the sum of all of them is at least
  `information_percentage`.

  Parameters
  ----------
  X : array-like
      Data passed to KernelPCA.fit.
  information_percentage : float
      Target share of the eigenvalue sum, as a fraction (0.9 means 90%).
  kernel : str
      Kernel name forwarded to KernelPCA.
  n_components : int
      Number of components computed by the exploratory fit.

  Returns
  -------
  int
      Number of components to keep. If the target share is never reached
      (for example a target above 1, or floating-point rounding at exactly 1),
      all fitted components are returned instead of indexing past the end.
  """
  kpca = KernelPCA(n_components=n_components, kernel=kernel)
  print(f"{kernel} kernel PCA fitting...")
  kpca.fit(X)

  kpca_eigvals = np.asarray(kpca.eigenvalues_)
  # Share of the total eigenvalue sum captured by the first 1, 2, ... components.
  cumulative_share = np.cumsum(kpca_eigvals) / kpca_eigvals.sum()
  reached = np.flatnonzero(cumulative_share >= information_percentage)
  # First position that meets the target, or every component when none does.
  n_kept = int(reached[0]) + 1 if reached.size else len(kpca_eigvals)

  information_percentage = information_percentage * 100
  print(f"To keep {information_percentage}% of the initial information, {n_kept} were kept.")
  return n_kept

# turn total time from seconds to -> hours, minutes and seconds
def time_to_h_min_sec(t):
  """Format a duration given in seconds as '<H>h  <M>min  <S>sec'.

  The duration is rounded to the nearest whole second before it is split, so
  the seconds field is always in 0..59 (a carry goes into the minutes).
  Hours are not wrapped at 24, so durations of a day or more are reported as
  a plain hour count.

  Parameters
  ----------
  t : int or float
      Non-negative duration in seconds.

  Returns
  -------
  str
      For example '0h  19min  36sec'.

  Raises
  ------
  ValueError
      If `t` is negative.
  """
  if t < 0:
    raise ValueError(f"duration must be non-negative, got {t}")
  total_seconds = int(round(t))
  hours, remainder = divmod(total_seconds, 3600)
  minutes, seconds = divmod(remainder, 60)
  return str(hours) + 'h  ' + str(minutes) + 'min  ' + str(seconds) + 'sec'

# MAIN

In [4]:
# constants
# NOTE(review): `kernel` is a plain module-level name; any later cell that
# assigns to `kernel` will change what this cell's code sees if only part of it
# is re-run.
train_samples_per_class = 4000
test_samples = 10000
information_percentage = 0.9
kernel = 'rbf'
val_percentage_size = 0.4

# get and scale data
# sort_flag=1 orders the training samples by class before the per-class subsampling
X_train_val, X_test, y_train_val, y_test, label_names = get_and_scale_cifar10_data(sort_flag=1, test_samples=test_samples,
                                                                                   train_samples_per_class=train_samples_per_class)
# split train-validation set into train and validation sets
# (shuffled split with a fixed random_state so the partition is repeatable)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, shuffle=True, 
                                                  test_size=val_percentage_size, random_state=42)

# find number of components so as to keep "information percentage" of the initial information
# (this fits a KernelPCA with as many components as there are input features,
#  in addition to the fit performed below)
n_components = find_n_components(X_train, information_percentage, kernel=kernel, n_components=X_train.shape[1])

# apply kernel PCA
# the timer covers only fit_transform on the training split
kpca = KernelPCA(n_components=n_components, kernel=kernel)
tic = time.time()
X_train = kpca.fit_transform(X_train)
toc = time.time()
Dt = time_to_h_min_sec(toc - tic)
print(f"kernel PCA - Time elapsed: {Dt}")

# from here on X_train, X_val and X_test hold the kernel-PCA projections,
# not the scaled pixel values
X_val = kpca.transform(X_val)
X_test = kpca.transform(X_test)
print(f"Shapes after kernel PCA: {X_train.shape}, {X_val.shape} and {X_test.shape}")

(40000, 3072) (40000,) (10000, 3072) (10000,)
rbf kernel PCA fitting...
To keep 90.0% of the initial information, 167 were kept.
kernel PCA - Time elapsed: 0h  19min  36sec
Shapes after kernel PCA: (24000, 167), (16000, 167) and (10000, 167)


In [5]:
# Fit LDA on the training split. X_train is expected to already hold the
# kernel-PCA projection produced by the kernel PCA cell.
# The bare fit() call is the last expression, so the cell displays the estimator repr.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis()

# classify BEFORE applying LDA

In [43]:
# 1. use LDA as classifier
# the fitted LDA predicts labels straight from the kernel-PCA features
y_test_preds = lda.predict(X_test)
# percentage of correct predictions, kept to two decimals
test_accuracy = float(f"{100 * np.mean(y_test_preds == y_test):.2f}")
print(f"Test accuracy = {test_accuracy}%")

Test accuracy = 42.14%


# classify AFTER applying LDA

In [6]:
# fit LDA on the training split, then project every split with the same model
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

X_train_lda, X_val_lda, X_test_lda = (
  lda.transform(features) for features in (X_train, X_val, X_test)
)
print(f"Shapes after LDA: {X_train_lda.shape}, {X_val_lda.shape} and {X_test_lda.shape}")

Shapes after LDA: (24000, 9), (16000, 9) and (10000, 9)


In [7]:
# 1. KNN
# distance-weighted vote among the 10 nearest neighbours in the LDA space
knn = KNeighborsClassifier(weights="distance", n_neighbors=10).fit(X_train_lda, y_train)

y_test_preds = knn.predict(X_test_lda)
# percentage of correct predictions, kept to two decimals
test_accuracy = float(f"{100 * np.mean(y_test_preds == y_test):.2f}")
print(f"Test accuracy = {test_accuracy}%")

Test accuracy = 40.96%


In [8]:
# 2. NCC
# nearest-centroid classifier trained on the LDA-projected training split
ncc = NearestCentroid().fit(X_train_lda, y_train)
y_test_preds = ncc.predict(X_test_lda)
# percentage of correct predictions, kept to two decimals
test_accuracy = float(f"{100 * np.mean(y_test_preds == y_test):.2f}")
print(f"Test accuracy = {test_accuracy}%")

Test accuracy = 42.18%


In [9]:
# 3. SVM

# One entry per experiment: (kernel name, extra SVC keyword arguments).
# Any hyper-parameter not listed for a kernel is left at the SVC default.
svm_configs = [
  ("linear", {"C": 100}),           # a) linear kernel
  ("rbf", {"C": 10, "gamma": 1}),   # b) RBF kernel
  ("poly", {"C": 1, "degree": 2}),  # c) polynomial kernel
  ("sigmoid", {}),                  # d) sigmoid kernel
]

# The loop variable is deliberately not called `kernel`, so the kernel PCA
# setting of the same name is not overwritten by this cell.
for svm_kernel, svm_params in svm_configs:
  svm = SVC(kernel=svm_kernel, decision_function_shape='ovo', **svm_params)
  svm.fit(X_train_lda, y_train)
  y_test_preds = svm.predict(X_test_lda)
  # percentage of correct predictions, kept to two decimals
  test_accuracy = float("{:.2f}".format(100*np.mean(y_test_preds == y_test)))
  print(f"{svm_kernel} kernel --> Test accuracy = {test_accuracy}%")

linear kernel --> Test accuracy = 42.34%
rbf kernel --> Test accuracy = 36.77%
poly kernel --> Test accuracy = 36.99%
sigmoid kernel --> Test accuracy = 27.95%
