# K Nearest Neighbour Classifier

In [None]:
#Imports
import numpy as np
from pprint import pprint
from scipy import stats

np.random.seed(42)

from sklearn.datasets import fetch_openml

from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_validate, RandomizedSearchCV, GridSearchCV, cross_val_predict
from sklearn.model_selection import learning_curve

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, precision_score
from sklearn.metrics import make_scorer

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

# Notebook-wide matplotlib defaults: axis-label and tick-label font sizes,
# plus the default figure size.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (8, 6),
})

## Dataset

In [None]:
# Fetch 'mnist_784' (version 1) from OpenML as a (features, labels) pair and
# convert both parts to plain NumPy arrays in a single step.
X, y = (part.to_numpy() for part in fetch_openml('mnist_784', version=1, return_X_y=True))

# The first 60000 rows are used for training; everything after is the test split.
n_train = 60000
x_train, x_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [None]:
# Inspect the Python type of a single training label; the binary relabelling
# further down compares labels against the string '0'.
type(y_train[0])

## Binary classification

### Change labels to binary

In [None]:
# Binary targets for the "digit 0 vs. rest" task:
# +1 where the original label is the string '0', -1 everywhere else.
y_train_0 = np.where(y_train == '0', 1.0, -1.0)
y_test_0 = np.where(y_test == '0', 1.0, -1.0)

# Positions of the digit-0 images in the test split (index tuple from np.where).
indx_0 = np.where(y_test == '0')

### Data visualisation in lower dimension

In [None]:
# Min-max scale every feature, then keep the first two principal components.
pipe_pca_2d = make_pipeline(
    MinMaxScaler(),
    PCA(n_components=2),
)

# Fit on the full training split and project it to 2-D in one call.
x_train_pca_2d = pipe_pca_2d.fit_transform(x_train)

In [None]:
# Two-colour palette: red and blue for the two binary labels.
cmap = ListedColormap(['r', 'b'])

# Scatter the 2-D PCA projection of the training set, coloured by binary label.
fig_2d, ax_2d = plt.subplots(figsize=(8, 8))
sns.scatterplot(
    x=x_train_pca_2d[:, 0],
    y=x_train_pca_2d[:, 1],
    hue=y_train_0,
    palette=cmap,
    ax=ax_2d,
)

In [None]:
# Same preprocessing as the 2-D case, but keeping three principal components.
pipe_pca_3d = make_pipeline(
    MinMaxScaler(),
    PCA(n_components=3),
)

# Fit on the full training split and project it to 3-D in one call.
x_train_pca_3d = pipe_pca_3d.fit_transform(x_train)

In [None]:
import plotly.express as px

# Interactive 3-D scatter of the PCA projection, coloured by binary label.
#
# Plotly uses a continuous colour scale for numeric `color` values and expects
# `color_discrete_map` to be a dict keyed by category value (a matplotlib
# ListedColormap is not a valid mapping). Convert the -1/+1 labels to strings
# so they are treated as categories, and map them to the same red/blue pair
# used in the 2-D plots.
class_names = y_train_0.astype(int).astype(str)
fig = px.scatter_3d(x=x_train_pca_3d[:, 0],
                    y=x_train_pca_3d[:, 1],
                    z=x_train_pca_3d[:, 2],
                    color=class_names,
                    color_discrete_map={'-1': 'red', '1': 'blue'},
                    opacity=0.5)
fig.show()

### KNN classifier

In [None]:
# 3-nearest-neighbour classifier stacked on top of the 2-D PCA pipeline.
# NOTE(review): pipe_pca_2d is passed in as the same object, so fitting this
# classifier presumably re-fits pipe_pca_2d too - later cells call
# pipe_pca_2d.transform directly after fitting the classifier. Confirm intended.
knn_3 = KNeighborsClassifier(n_neighbors=3)
pipe_clf_pca_2d = make_pipeline(pipe_pca_2d, knn_3)

* Let's train a model with 10 samples from training set
* Then test the model with 10 datapoints from the test set

In [None]:
# Order sample indices by binary label so the -1 samples come first and the
# +1 samples come last, then take five from each end.
label_order = np.argsort(y_train_0)
index_neg = label_order[:5]
index_pos = label_order[-1:-6:-1]

Sanity check

In [None]:
# Confirm the selected indices really cover both classes:
# the first line should show only +1 labels, the second only -1 labels.
for heading, picked in (("Pos classes:", index_pos), ("Neg classes:", index_neg)):
    print(heading, y_train_0[picked])

In [None]:
# Build a 10-sample toy dataset: the five positive rows followed by the five
# negative rows, with labels in the same order.
small_idx = np.concatenate((index_pos, index_neg))

x = x_train[small_idx]
print(x.shape)
# Note: this rebinds `y`, which previously held the full MNIST label array.
y = y_train_0[small_idx]
print(y.shape)

In [None]:
# Fit the whole classifier pipeline on the toy dataset.
pipe_clf_pca_2d.fit(x, y)

# Step 0 of the pipeline is the scaling + PCA stage; use it to project the
# same toy points into 2-D for plotting.
reducer_2d = pipe_clf_pca_2d[0]
x_reduced = reducer_2d.transform(x)

In [None]:
# Plot the 10 projected toy samples, coloured by their binary label.
cmap = ListedColormap(['r', 'b'])
fig_small, ax_small = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    x=x_reduced[:, 0],
    y=x_reduced[:, 1],
    marker='o',
    hue=y,
    palette=cmap,
    ax=ax_small,
)

In [None]:
# Classify the first 10 test images with the toy model and compare the
# predictions against the true binary labels.
x_test_10, y_test_10 = x_test[:10, :], y_test_0[:10]
y_hat_0 = pipe_clf_pca_2d.predict(x_test_10)

ConfusionMatrixDisplay.from_predictions(y_test_10, y_hat_0)
print('Test labels:', y_test_10)
plt.show()

In [None]:
cmap = ListedColormap(['r', 'b'])
fig_overlay, ax_overlay = plt.subplots(figsize=(6, 4))

# Toy training points: circles coloured by their binary label.
sns.scatterplot(x=x_reduced[:, 0], y=x_reduced[:, 1],
                marker='o', hue=y, palette=cmap, ax=ax_overlay)

# First 10 test points: larger stars coloured by their true binary label.
x_test_reduced = pipe_clf_pca_2d[0].transform(x_test[:10, :])
sns.scatterplot(x=x_test_reduced[:, 0], y=x_test_reduced[:, 1],
                s=100, marker='*', hue=y_test_0[:10], palette=cmap,
                legend=None, ax=ax_overlay)

# Write each test point's predicted label next to its star; (dx, dy) offsets
# the text so it does not sit on top of the marker.
dx, dy = -0.1, 0.2
for predicted, point in zip(y_hat_0[:10], x_test_reduced):
    ax_overlay.annotate(str(predicted), xy=(point[0] + dx, point[1] + dy))

ax_overlay.grid(True)
plt.show()

* It would be better if we knew the distances to the 3 nearest neighbours of each test point
* Let us display the distance and connectivity (indices) of the neighbours of the test datapoints using the `NearestNeighbors` class

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index the toy training points in the 2-D PCA space so their nearest
# neighbours can be queried directly.
x_small_2d = pipe_pca_2d.transform(x)
neighbours = NearestNeighbors(n_neighbors=3)
neighbours.fit(x_small_2d)

In [None]:
# For each of the first 10 test points (projected to 2-D), look up the three
# closest toy training points: distances and their row indices.
x_test_10_2d = pipe_pca_2d.transform(x_test[:10])
dist_neighbours, idx_neighbours = neighbours.kneighbors(
    x_test_10_2d, n_neighbors=3, return_distance=True
)


In [None]:
# For the first three test points, show the distances to their three nearest
# toy training samples, those samples' indices and labels, and the resulting
# majority-vote prediction.
for i in range(3):
    neighbour_labels = y[idx_neighbours[i]]

    # Majority vote computed with np.unique rather than stats.mode(...).mode[0]:
    # recent SciPy returns a scalar `mode` for 1-D input, so indexing it with
    # [0] raises. argmax picks the smallest label on a tie, matching the
    # tie-breaking of scipy.stats.mode.
    values, counts = np.unique(neighbour_labels, return_counts=True)
    majority_label = values[np.argmax(counts)]

    print('Distance: {0}, \nIndex:{1}, \nLables:{2} \nPrediction: {3} '.format(dist_neighbours[i],
                                                                             idx_neighbours[i],
                                                                             neighbour_labels,
                                                                             majority_label))

In [None]:
# Re-fit the classifier pipeline on the first 10000 training samples.
first_10k = slice(None, 10000)
pipe_clf_pca_2d.fit(x_train[first_10k], y_train_0[first_10k])

In [None]:
# Predict on the entire test split and show the confusion matrix against the
# true binary labels.
y_hat_0 = pipe_clf_pca_2d.predict(x_test)
ConfusionMatrixDisplay.from_predictions(y_true=y_test_0, y_pred=y_hat_0)
plt.show()

In [None]:
# Per-class precision / recall / F1 summary for the full test split.
report_text = classification_report(y_test_0, y_hat_0)
print(report_text)

* Let's vary `n_neighbors` from k=1 to 19 (odd values only) and study the performance of the model
* We use the first 10k samples from the training set

In [None]:
# Test-set precision of the k-NN pipeline for every odd k from 1 to 19,
# trained each time on the first 10000 training samples.
precision = []
for k in range(1, 20, 2):
    # Set k on the final (k-NN) step of the pipeline. Assigning a new attribute
    # on the Pipeline object itself does not reach the estimator, so the model
    # would otherwise stay at its original n_neighbors for every iteration.
    pipe_clf_pca_2d[-1].set_params(n_neighbors=k)
    pipe_clf_pca_2d.fit(x_train[:10000], y_train_0[:10000])
    y_hat_0 = pipe_clf_pca_2d.predict(x_test)
    precision.append(precision_score(y_test_0, y_hat_0))

In [None]:
plt.figure(fig)