In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
# Install a blanket 'ignore' filter so warnings are not shown in cell output.
# NOTE(review): this presumably also hides deprecation warnings raised by the
# libraries used below -- consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Read both CSV files from the working directory: 'train.csv' (which carries
# a 'label' column used further down) and 'test.csv'.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# Preview the first three rows of the test set as 28x28 images, side by side.
# `test.iloc[i]` is a pandas Series, and Series has no `reshape` method in
# recent pandas versions, so reshape the underlying NumPy array instead.
f, ax = plt.subplots(1, 3, figsize=(15, 6))
for col, axis in enumerate(ax):
    axis.imshow(test.iloc[col].values.reshape(28, 28), cmap='binary')

In [None]:
# Display summary statistics for every numeric column of the training frame.
train.describe()

In [None]:
# Labels: the 'label' column of the training frame, copied into its own array.
targets = train['label'].values.copy()

# Features: every remaining training column, multiplied by 1/255.
dataset = train.drop('label', axis=1).values * (1. / 255.)

# The test frame has no label column to drop; it gets the same 1/255 scaling.
holdout = test.values * (1. / 255.)

In [None]:
# Sanity check: display the shapes of the feature, label and holdout arrays.
tuple(arr.shape for arr in (dataset, targets, holdout))

In [None]:
# Alias the prepared arrays under the conventional X / y names.
X, y = dataset, targets

# Reserve 20% of the labelled rows for evaluation. The split is stratified on
# the labels and seeded (random_state=42) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Candidate values of k (n_neighbors) to evaluate: the integers 1 through 7.
# This must be a flat 1-D array. Wrapping the range in a list would produce a
# single "candidate" that is itself a whole array, making len(neighbors) == 1
# and handing an array to KNeighborsClassifier as n_neighbors.
neighbors = np.arange(1, 8)

# One accuracy slot per candidate k; filled in by the search loop.
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

In [None]:
# Fit one classifier per candidate k, record its accuracy on both splits, and
# remember the k with the highest accuracy on the evaluation split.
# The comparison is inclusive, so on a tie the later candidate wins.
max_test_score, n = 0, 0
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)

    if max_test_score <= test_accuracy[i]:
        max_test_score, n = test_accuracy[i], k

In [None]:
# Plot accuracy against k for both splits (evaluation curve first, then the
# training curve), then label the chart and render it.
for curve, curve_label in (
    (test_accuracy, 'Testing Accuracy'),
    (train_accuracy, 'Training Accuracy'),
):
    plt.plot(neighbors, curve, label=curve_label)

plt.title('Optimizing n_neighbors on KNN')
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Rebuild the classifier with the selected k (`n`) and fit it on the
# training split; the loop above left `knn` bound to the last candidate.
knn = KNeighborsClassifier(
    n_neighbors=n,
)
knn.fit(
    X_train,
    y_train,
)

In [None]:
# Predict labels for the held-out 20% split using the refitted classifier.
# NOTE(review): n_neighbors was selected on this same split, so the scores
# computed from y_pred are likely optimistic -- confirm whether that is intended.
y_pred = knn.predict(X_test)

In [None]:
# Overall accuracy as a percentage rounded to two decimals, then a divider,
# then the per-class classification report.
overall_pct = round(100 * accuracy_score(y_test, y_pred), 2)
print(overall_pct, '%')
print(40 * '-')
print(classification_report(y_test, y_pred))

In [None]:
# Predict a label for every row of the scaled test-set array.
predictions = knn.predict(holdout)

In [None]:
# Assemble the submission table: a 1-based running id for each holdout row,
# paired with its predicted label.
submission = pd.DataFrame(
    {
        'ImageId': np.arange(len(holdout)) + 1,
        'Label': predictions,
    }
)

In [None]:
# Write the submission table to 'knn_submission.csv' in the working directory;
# index=False keeps the DataFrame index out of the file.
submission.to_csv('knn_submission.csv',index=False)