# Imports

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import time, datetime, itertools, pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

# Helpers

In [2]:
def unpickle(file):
  """Open `file` in binary mode and return the object pickled inside it.

  The payload is loaded with ``encoding='bytes'``, matching the byte-string
  keys (e.g. ``b'data'``) that callers in this notebook index with.
  """
  with open(file, 'rb') as handle:
    # returning from inside the `with` still closes the file handle
    return pickle.load(handle, encoding='bytes')

def get_and_preprocess_cifar10_data(pca_components, train_val_split_percentage,
                                    data_dir=".", random_state=None):
  """Load the CIFAR-10 batch files, split, then PCA-reduce and min-max scale.

  The five ``data_batch_N`` files form the train+validation pool, which is
  split into train and validation parts. PCA and the MinMaxScaler are fitted
  on the training part ONLY and then applied to validation and test, so no
  information from held-out data leaks into the preprocessing.

  Parameters
  ----------
  pca_components : passed to ``PCA(n_components=...)``.
  train_val_split_percentage : passed to ``train_test_split(test_size=...)``,
      i.e. the share of the train+validation pool that becomes validation.
  data_dir : directory holding ``data_batch_1..5``, ``test_batch`` and
      ``batches.meta``. Defaults to the current working directory.
  random_state : seed forwarded to ``train_test_split`` and ``PCA`` for
      reproducible runs. ``None`` keeps the unseeded behaviour.

  Returns
  -------
  (X_train_pca_scaled, X_val_pca_scaled, X_test_pca_scaled,
   y_train, y_val, y_test, label_names)
  """
  import os  # local import: `os` is not among the notebook's top-level imports

  # get train-val data: collect all batches, then concatenate once
  data_parts, label_parts = [], []
  for batch in range(1, 6):
    data_dict = unpickle(os.path.join(data_dir, f"data_batch_{batch}"))
    data_parts.append(data_dict[b'data'])
    label_parts.append(data_dict[b'labels'])
  X_train_val = np.concatenate(data_parts)
  y_train_val = np.concatenate(label_parts)

  # get test data
  data_dict = unpickle(os.path.join(data_dir, "test_batch"))
  X_test = data_dict[b'data']
  y_test = np.array(data_dict[b'labels'])

  # get label names (stored as bytes because the files are loaded with encoding='bytes')
  data_dict = unpickle(os.path.join(data_dir, "batches.meta"))
  label_names = [name.decode("utf-8") if isinstance(name, bytes) else str(name)
                 for name in data_dict[b'label_names']]

  # split train-val to train and validation sets BEFORE fitting any transformer
  X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val,
                                                    test_size=train_val_split_percentage,
                                                    shuffle=True,
                                                    random_state=random_state)

  # PCA: fit on the training set only, then project validation and test sets
  pca = PCA(n_components=pca_components, random_state=random_state)
  X_train_pca = pca.fit_transform(X_train)
  X_val_pca = pca.transform(X_val)
  X_test_pca = pca.transform(X_test)

  # scale: fit on the training set only; validation/test values may therefore
  # fall slightly outside [0, 1]
  scaler = MinMaxScaler()
  X_train_pca_scaled = scaler.fit_transform(X_train_pca)
  X_val_pca_scaled = scaler.transform(X_val_pca)
  X_test_pca_scaled = scaler.transform(X_test_pca)

  print(X_train_pca_scaled.shape, X_val_pca_scaled.shape, X_test_pca_scaled.shape, y_train.shape, y_val.shape, y_test.shape)

  return X_train_pca_scaled, X_val_pca_scaled, X_test_pca_scaled, y_train, y_val, y_test, label_names


def save_results_to_dict(weight, K, time_str, train_accuracy, validation_accuracy,
                         n_train=30000, n_validation=20000, n_test=10000, n_features=100):
  """Build the result record (one table row) for a single KNN run.

  Despite the name, nothing is written to disk: the function only returns a
  dict whose keys become the columns of the results table.

  Parameters
  ----------
  weight : value stored under "weights" (the KNN weighting scheme used).
  K : value stored under "K" (number of neighbours used).
  time_str : preformatted elapsed-time string.
  train_accuracy, validation_accuracy : accuracies in percent.
  n_train, n_validation, n_test : sample counts to record. The defaults are
      the values that used to be hard-coded; pass the real sizes whenever the
      split differs.
  n_features : dimensionality to record under "D" (default 100, the
      previously hard-coded value).

  Returns
  -------
  dict with the fixed key order used for the results table.
  """
  return {
      "dataset": "Cifar-10",
      "model": "KNN",
      "K": K,
      "weights": weight,
      "N_train": n_train,
      "N_validation": n_validation,
      "N_test": n_test,
      "D": n_features,
      "PCA_applied": "YES",
      "k-fold Cross-Validation applied": "NO",
      "Time elapsed": time_str,
      "Train accuracy (%)": train_accuracy,
      "Validation accuracy (%)": validation_accuracy,
  }

# MAIN

In [3]:
# Load CIFAR-10 and run the preprocessing pipeline with the settings below
pca_components = 100
train_val_split_percentage = 0.4

# the helper returns a 7-tuple: three feature matrices, three label arrays, label names
(
    X_train_pca_scaled, X_val_pca_scaled, X_test_pca_scaled,
    y_train, y_val, y_test, label_names,
) = get_and_preprocess_cifar10_data(pca_components, train_val_split_percentage)

(30000, 100) (20000, 100) (10000, 100) (30000,) (20000,) (10000,)


In [4]:
# Hyper-parameter grid: every (weights, K) combination is fitted and scored
K_list = [10]
weights = ["uniform"]
results = []

# Track the configuration with the best validation accuracy, so the test set is
# scored with the selected model rather than with whichever was fitted last.
best_knn = None
best_val_accuracy = float("-inf")

for weight in weights:
  for K in K_list:
    print(f"**************** weights: {weight}, K = {K} ****************")

    # start time
    tic = time.time()

    # create and fit our KNN model
    knn = KNeighborsClassifier(n_neighbors=K, weights=weight)
    knn.fit(X_train_pca_scaled, y_train)

    # Get predictions and check accuracy on the Training set
    y_train_preds = knn.predict(X_train_pca_scaled)
    train_accuracy = float("{:.2f}".format(100 * np.mean(y_train_preds == y_train)))
    print(f"train_accuracy = {train_accuracy} %")

    # Get predictions and check accuracy on the Validation set
    y_val_preds = knn.predict(X_val_pca_scaled)
    val_accuracy = float("{:.2f}".format(100 * np.mean(y_val_preds == y_val)))
    print(f"val_accuracy = {val_accuracy} %")

    # end time
    toc = time.time()

    # turn total time from seconds to -> hours, minutes and seconds.
    # divmod on the rounded total carries correctly (never prints "60sec") and
    # keeps working for runs of 24 hours or more.
    total_seconds = round(toc - tic)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    time_str = str(hours) + 'h  ' + str(minutes) + 'min  ' + str(seconds) + 'sec'

    results_dict = save_results_to_dict(weight, K, time_str, train_accuracy, val_accuracy)
    results.append(results_dict)
    print(f"Time elapsed = {hours}h {minutes}min {seconds}sec \n")

    # remember the best model so far (ties keep the earlier configuration)
    if val_accuracy > best_val_accuracy:
      best_val_accuracy = val_accuracy
      best_knn = knn

if best_knn is None:
  raise ValueError("No model was fitted: `weights` and `K_list` must both be non-empty.")

# Get predictions on the Test set, using the model selected on the validation set.
# `knn` is rebound so later cells that read it see the model behind `y_test_preds`.
knn = best_knn
y_test_preds = knn.predict(X_test_pca_scaled)
test_accuracy = float("{:.2f}".format(100 * np.mean(y_test_preds == y_test)))
print(f"test_accuracy = {test_accuracy} %")

**************** weights: uniform, K = 10 ****************
train_accuracy = 37.09 %
val_accuracy = 28.55 %
Time elapsed = 0h 0min 35sec 

test_accuracy = 28.19 %


In [None]:
# One row per evaluated (weights, K) configuration collected in `results`
results_df = pd.DataFrame(data=results)

# Persist the table as an Excel workbook in the working directory
excel_path = "CIFAR_10_KNN.xlsx"
results_df.to_excel(excel_path)

In [None]:
# Show the results table; as the cell's last expression it gets the notebook's rich DataFrame display
results_df

Unnamed: 0,dataset,model,K,weights,N_train,N_validation,N_test,D,PCA_applied,k-fold Cross-Validation applied,Time elapsed,Train accuracy (%),Validation accuracy (%)
0,Cifar-10,KNN,1,uniform,30000,20000,10000,100,YES,NO,0h 0min 26sec,100.00,32.06
1,Cifar-10,KNN,2,uniform,30000,20000,10000,100,YES,NO,0h 0min 26sec,60.28,27.51
2,Cifar-10,KNN,3,uniform,30000,20000,10000,100,YES,NO,0h 0min 29sec,53.61,28.82
3,Cifar-10,KNN,4,uniform,30000,20000,10000,100,YES,NO,0h 0min 41sec,49.38,29.35
4,Cifar-10,KNN,5,uniform,30000,20000,10000,100,YES,NO,0h 0min 41sec,45.50,28.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Cifar-10,KNN,46,distance,30000,20000,10000,100,YES,NO,0h 0min 40sec,100.00,24.82
96,Cifar-10,KNN,47,distance,30000,20000,10000,100,YES,NO,0h 0min 40sec,100.00,24.82
97,Cifar-10,KNN,48,distance,30000,20000,10000,100,YES,NO,0h 0min 39sec,100.00,24.68
98,Cifar-10,KNN,49,distance,30000,20000,10000,100,YES,NO,0h 0min 39sec,100.00,24.62
