In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
from collections import Counter

In [2]:
train_df = pd.read_csv('./digit-recognizer/train.csv')

In [3]:
train_df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = train_df[['pixel' + str(x) for x in range(784)]]
Y = train_df['label']

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [6]:
X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

## Nearest Neighbor Classifier

In [7]:
class NearestNeighbour():
    accuracy = None
    Y_pred = None
    def __init__(self, X, Y):
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(X, Y)
    def predict(self, X, k= 1):
        """ X is N x D where each row is an example we wish to predict label for """
        num_test = X.shape[0]
        # lets make sure that the output type matches the input type
        self.Y_pred = np.zeros(num_test, dtype = self.Y_train.dtype)

        # loop over all test rows
        for i in range(num_test):
            # find the nearest training image to the i'th test image
            # using the L1 distance (sum of absolute value differences)
            distances = np.sum(np.abs(self.X_train - X[i,:]), axis= 1)
            # get the index with smallest distance
            min_index = np.argmin(distances) 
            # predict the label of the nearest example
            self.Y_pred[i] = self.Y_train[min_index] 

        return self.Y_pred
    
    def test_accuracy(self):
        if self.Y_pred:
            np.count_nonzero(self.Y_pred == self.Y_test) / X_test.shape[0]
        elif self.accuracy is None:
            Y_pred = self.predict(X)
            

In [8]:
nn = NearestNeighbour(X_train, Y_train)

In [9]:
Y_pred = nn.predict(nn.X_test[0:100, :])
np.count_nonzero(Y_pred == nn.Y_test[0:100])

95

In [10]:
Y_pred = nn.predict(nn.X_test[100:200, :], k= 1)
np.count_nonzero(Y_pred == nn.Y_test[100:200])

97

In [11]:
Y_pred = nn.predict(nn.X_test[200:300, :])
np.count_nonzero(Y_pred == nn.Y_test[200:300])

98

In [12]:
Y_pred = nn.predict(nn.X_test[300:400, :])
np.count_nonzero(Y_pred == nn.Y_test[300:400])

95

In [13]:
counts = Counter(X_train[0])
counts.most_common(1)

[(0, 598)]

### CAUTION
Due to limitation of computations the file is not executed for all *X_test*... This limitation is more of computation limitations of the particular computer and not for all... 

It can be said that the Nearest Neighbour Classifier has accuracy well above 90%

## K-Nearest Neighbor Classifier

In [19]:
class kNearestNeighbour():
    accuracy = None
    Y_pred = None
    k = 5
    def __init__(self, X, Y, k= 5):
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(X, Y)
        self.k = k
    def predict(self, X):
        """ X is N x D where each row is an example we wish to predict label for """
        num_test = X.shape[0]
        # lets make sure that the output type matches the input type
        self.Y_pred = np.zeros(num_test, dtype = self.Y_train.dtype)

        # loop over all test rows
        for i in range(num_test):
            # find the nearest training image to the i'th test image
            # using the L1 distance (sum of absolute value differences)
            distances = np.sum(np.abs(self.X_train - X[i,:]), axis= 1)
            # get the index with smallest distance
            min_index = distances.argsort()[:self.k] 
            # predict the label of the nearest example
            self.Y_pred[i] = Counter(self.Y_train[min_index]).most_common()
        return self.Y_pred
    
    def test_accuracy(self):
        if self.Y_pred:
            np.count_nonzero(self.Y_pred == self.Y_test) / X_test.shape[0]
        elif self.accuracy is None:
            Y_pred = self.predict(X)
            

In [20]:
knn = NearestNeighbour(X_train, Y_train)

In [21]:
Y_pred = nn.predict(nn.X_test[0:100, :])
np.count_nonzero(Y_pred == nn.Y_test[0:100])

95

In [22]:
Y_pred = nn.predict(nn.X_test[100:200, :], k= 1)
np.count_nonzero(Y_pred == nn.Y_test[100:200])

97

In [23]:
Y_pred = nn.predict(nn.X_test[200:300, :])
np.count_nonzero(Y_pred == nn.Y_test[200:300])

98

In [None]:
Y_pred = nn.predict(nn.X_test[300:400, :])
np.count_nonzero(Y_pred == nn.Y_test[300:400])