### The object of this project is to make a comparison of the "classical" knn method and a new method proposed by [Dend et al., 2015] (Deng, Z., Zhu, X., Cheng, D., Zong, M., and Zhang, S. (2016). Efficient knn classification algorithm for big data. Neurocomputing, 195:143–148. Learning for Medical Imaging.)

### To make the comparison, we will use two datasets: GISETTE and OPTDIGITS.
+ GISETTE is is a handwritten digit recognition problem. The problem is to
separate the highly confusible digits ’4’ and ’9’. This dataset is one of five datasets of
the NIPS 2003 feature selection challenge
+ OPTDIGITS is a handwritten digit recognition problem. 

### We will use the "classical" KNN method from the library scikit-learn and we will develop the new method using the library pyspark.

In [1]:
from pyspark import SparkContext
import numpy as np
import pandas as pd

# Data retrieving and preprocessing

## Gisette dataset

In [2]:
# read the file gisette_db/gisette-train.data
# and convert it to a pandas dataframe
gisette_data = pd.read_csv('gisette-db/gisette_train.data', sep=' ', header=None)
gisette_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4991,4992,4993,4994,4995,4996,4997,4998,4999,5000
0,550,0,495,0,0,0,0,976,0,0,...,0,0,991,991,0,0,0,0,983,
1,0,0,0,0,0,0,0,976,0,0,...,475,0,991,0,0,991,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,0,0,742,0,0,0,0,684,0,956,...,0,0,0,0,0,674,0,0,838,
4,0,0,0,0,0,0,0,608,0,979,...,0,0,828,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,783,0,0,0,
5996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,921,0,886,0,
5997,0,0,0,0,0,758,0,0,0,522,...,901,0,0,0,0,980,0,0,0,
5998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,690,0,0,0,0,0,


In [3]:
gisette_label = pd.read_csv('gisette-db/gisette_train.labels', sep=' ', header=None)

In [4]:
gisette_label

Unnamed: 0,0
0,1
1,-1
2,1
3,1
4,1
...,...
5995,-1
5996,1
5997,-1
5998,-1


In [5]:
# replace Nan with 0
gisette_data = gisette_data.fillna(0)

## Optdigit dataset

In [6]:
# import optdigits-db/optdigits.tra
# and convert it to a pandas dataframe
optdigits_data = pd.read_csv('optdigits-db/optdigits.tra', sep=',', header=None)
optdigits_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,1,6,15,12,1,0,0,0,7,...,0,0,0,6,14,7,1,0,0,0
1,0,0,10,16,6,0,0,0,0,7,...,0,0,0,10,16,15,3,0,0,0
2,0,0,8,15,16,13,0,0,0,1,...,0,0,0,9,14,0,0,0,0,7
3,0,0,0,3,11,16,0,0,0,0,...,0,0,0,0,1,15,2,0,0,4
4,0,0,5,14,4,0,0,0,0,0,...,0,0,0,4,12,14,7,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3818,0,0,5,13,11,2,0,0,0,2,...,0,0,0,8,13,15,10,1,0,9
3819,0,0,0,1,12,1,0,0,0,0,...,0,0,0,0,4,9,0,0,0,4
3820,0,0,3,15,0,0,0,0,0,0,...,0,0,0,4,14,16,9,0,0,6
3821,0,0,6,16,2,0,0,0,0,0,...,0,0,0,5,16,16,16,5,0,6


In [7]:
optdigits_label = optdigits_data[64]
optdigits_data = optdigits_data.drop(64, axis=1)
optdigits_label

0       0
1       0
2       7
3       4
4       6
       ..
3818    9
3819    4
3820    6
3821    6
3822    7
Name: 64, Length: 3823, dtype: int64

# Making a "classical" KNN model for comparison

### We will use 4 metrics to compare the two methods: accuracy, precision, recall and F1-score. We will also measure the time

+ optdigits dataset

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [9]:
optdigit_train_data, optdigit_test_data, optdigit_train_label, optdigit_test_label = train_test_split(optdigits_data, optdigits_label, test_size=0.2, random_state=42)

In [10]:
knn = KNeighborsClassifier(n_neighbors=3)

In [11]:
knn.fit(optdigit_train_data, optdigit_train_label)

KNeighborsClassifier(n_neighbors=3)

In [12]:
# prediction
# record the time
import time
start_opt = time.time()
predict = knn.predict(optdigit_test_data)
end_opt = time.time()


### Accuracy, precision, recall and F1-score

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [14]:
acc_opt = accuracy_score(optdigit_test_label, predict)
prec_opt = precision_score(optdigit_test_label, predict, average='macro')
rec_opt = recall_score(optdigit_test_label, predict, average='macro')
f1_opt = f1_score(optdigit_test_label, predict, average='macro')


In [15]:
print('Accuracy: ', acc_opt, '\nPrecision', prec_opt,
      '\nRecall', rec_opt, '\nF1', f1_opt, '\nTime', end_opt-start_opt, 's')


Accuracy:  0.984313725490196 
Precision 0.9847688494865668 
Recall 0.9848506588806535 
F1 0.984685110583763 
Time 0.07819581031799316 s


+ GISETTE dataset

In [16]:
gisette_train_data, gisette_test_data, gisette_train_label, gisette_test_label = train_test_split(gisette_data, gisette_label, test_size=0.2, random_state=42)

In [17]:
knn = KNeighborsClassifier(n_neighbors=3)

In [18]:
knn.fit(gisette_train_data, gisette_train_label)

  return self._fit(X, y)


KNeighborsClassifier(n_neighbors=3)

In [19]:
start_gisette = time.time()
predict = knn.predict(gisette_test_data)
end_gisette = time.time()


In [20]:
len(predict)


1200

In [21]:
acc_gisette = accuracy_score(gisette_test_label, predict)
prec_gisette = precision_score(gisette_test_label, predict, average='macro')
rec_gisette = recall_score(gisette_test_label, predict, average='macro')
f1_gisette = f1_score(gisette_test_label, predict, average='macro')


In [22]:
print('Accuracy: ', acc_gisette, '\nPrecision', prec_gisette,
      '\nRecall', rec_gisette, '\nF1', f1_gisette, '\nTime', end_gisette-start_gisette, 's')


Accuracy:  0.9691666666666666 
Precision 0.9700124611978327 
Recall 0.9694444521615442 
F1 0.9691618482054487 
Time 0.7807705402374268 s


# Visualizing the results of the classical KNN models

In [26]:
# make a dataframe to store the results of gisette
gisette_result = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1', 'Time'])
gisette_result.append({'Accuracy': acc_gisette, 'Precision': prec_gisette, 'Recall': rec_gisette, 'F1': f1_gisette, 'Time': end_gisette-start_gisette}, ignore_index=True)

  gisette_result.append({'Accuracy': acc_gisette, 'Precision': prec_gisette, 'Recall': rec_gisette, 'F1': f1_gisette, 'Time': end_gisette-start_gisette}, ignore_index=True)


Unnamed: 0,Accuracy,Precision,Recall,F1,Time
0,0.969167,0.970012,0.969444,0.969162,0.780771


In [27]:
opt_result = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1', 'Time'])
opt_result.append({'Accuracy': acc_opt, 'Precision': prec_opt, 'Recall': rec_opt, 'F1': f1_opt, 'Time': end_opt-start_opt}, ignore_index=True)

  opt_result.append({'Accuracy': acc_opt, 'Precision': prec_opt, 'Recall': rec_opt, 'F1': f1_opt, 'Time': end_opt-start_opt}, ignore_index=True)


Unnamed: 0,Accuracy,Precision,Recall,F1,Time
0,0.984314,0.984769,0.984851,0.984685,0.078196


In [34]:
import matplotlib.pyplot as plt
import seaborn as sns

# KNN classification for Big Data with Spark