In [3]:
%%time
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
import time, sys
import numpy as np
import time
import torch
import torchvision
from torchvision import datasets, transforms
from sklearn.metrics import mean_absolute_error
from scipy.spatial import distance
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

Wall time: 0 ns


#### Loading MNIST using PyTorch and converting the images to normalized tensors

In [3]:
# Preprocessing pipeline applied to every image: convert it to a tensor, then
# normalise with mean 0.5 and std 0.5, i.e. (x - 0.5) / 0.5.
_preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(_preprocessing_steps)

In [4]:
# Training split of MNIST, stored under ./data (downloaded when requested via
# download=True), with `transform` applied to each image on access.
trainset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

In [5]:
# Test split of MNIST (train=False), stored under ./data and passed through
# the same `transform` as the training split.
testset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

In [6]:
# A loader whose batch size equals the dataset size, so a single fetch yields
# the entire training set at once.
train_batch_size = len(trainset)
trainloader = torch.utils.data.DataLoader(dataset=trainset, batch_size=train_batch_size)

In [7]:
# A loader whose batch size equals the dataset size, so a single fetch yields
# the entire test set at once.
test_batch_size = len(testset)
testloader = torch.utils.data.DataLoader(dataset=testset, batch_size=test_batch_size)

In [8]:
# Pull the single full-size batch from the loader and keep only its first
# element (the image tensor), converted to a NumPy array.
train_images, _train_targets_unused = next(iter(trainloader))
trainset_array = train_images.numpy()


In [9]:
# Pull the single full-size batch from the loader and keep only its first
# element (the image tensor), converted to a NumPy array.
test_images, _test_targets_unused = next(iter(testloader))
testset_array = test_images.numpy()

In [10]:
# Read the labels straight from the datasets instead of re-iterating the
# loaders: every `next(iter(loader))` call decodes and transforms the whole
# dataset again only to throw the images away. The loaders are created without
# shuffling, so dataset order matches the order of the image arrays.
# `.copy()` keeps these arrays independent of the datasets' own label storage.
trainset_labels_array = trainset.targets.numpy().copy()
testset_labels_array = testset.targets.numpy().copy()

#### Squashing image pixel values to 1D arrays

In [11]:
# Flatten each image into a 1-D pixel vector. The sample count is read from
# the array itself and -1 lets NumPy infer the pixel count, so the cell no
# longer depends on hard-coded dataset sizes.
trainset_array = trainset_array.reshape(len(trainset_array), -1)
testset_array = testset_array.reshape(len(testset_array), -1)
print(trainset_array.shape)
print(testset_array.shape)

(60000, 784)
(10000, 784)


#### Finding the training-set image closest to a selected test-set image by comparing corresponding pixel values
(using Euclidean distance as the metric)

In [12]:
# Select one test image as the query, then show its shape and its true label.
query_index = 1506
a = testset_array[query_index]
print(a.shape)
print(testset_labels_array[query_index])

(784,)
2


In [13]:
# Empty container that will collect one distance per training image.
diff = list()

In [14]:
# Brute-force scan: Euclidean distance from the query image `a` to every
# training image, timed with wall-clock timestamps (elapsed = end - start).
# The list is (re)built inside this cell so that re-running it does not append
# to results from a previous run, and does not fail after `diff` has been
# converted to an ndarray by a later cell.
start = time.time()
diff = [distance.euclidean(x, a) for x in trainset_array]
end = time.time()

In [15]:
# Convert the collected distances to an array, show it, and report the
# position of the smallest distance (the closest training image).
diff = np.asarray(diff)
print(diff)
print(diff.argmin())

[20.36130714 21.95580292 21.38853836 ... 23.11495972 20.42109489
 20.20189476]
9092


In [16]:
# Show the smallest distance and the label of the matching training image.
# The index is computed here rather than pasted from an earlier cell's output
# (it was hard-coded as 9092), so the cell stays correct if the query image or
# the data changes.
nearest_index = int(np.argmin(diff))
print(diff[nearest_index])
trainset_labels_array[nearest_index]

10.263904571533203


2

#### Finding closest image in training set for each of the images in test set:

In [17]:
# Short aliases for the distance-matrix computation: A holds the test images,
# B the training images.
A, B = testset_array, trainset_array


In [18]:
# Show the index of the active CUDA device, or None when no GPU is usable.
# Unguarded, `torch.cuda.current_device()` raises on a CPU-only machine and
# would stop a Restart-and-Run-All.
torch.cuda.current_device() if torch.cuda.is_available() else None

0

In [19]:
# Pairwise Euclidean distances between every row of A and every row of B via
# the expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, which needs only one
# matrix product instead of an explicit loop over pairs.
a_sq_norms = np.sum(np.square(A), axis=1, keepdims=True)  # column vector, one entry per row of A
b_sq_norms = np.sum(np.square(B), axis=1)                 # one entry per row of B
threeSums = a_sq_norms - 2 * A.dot(B.T) + b_sq_norms
# Floating-point cancellation can leave tiny negative values where the true
# squared distance is ~0; np.sqrt would turn those into NaN, so clamp at zero.
np.maximum(threeSums, 0, out=threeSums)
dist = np.sqrt(threeSums)
dist

array([[18.790552, 20.789257, 18.880846, ..., 18.934917, 19.067772,
        18.446215],
       [22.365992, 22.902077, 23.313847, ..., 22.520222, 20.928165,
        21.187037],
       [18.44753 , 20.152567, 18.306366, ..., 18.512846, 16.885553,
        16.782421],
       ...,
       [20.60834 , 22.969301, 21.010323, ..., 19.973719, 20.622482,
        19.396578],
       [21.125296, 20.870401, 22.024317, ..., 20.812914, 18.173307,
        18.48875 ],
       [23.091295, 19.707027, 25.114649, ..., 24.243755, 16.179907,
        23.289103]], dtype=float32)

In [20]:
# Shape of the distance matrix: one row per row of A, one column per row of B.
np.shape(dist)

(10000, 60000)

In [21]:
# Shape of a single row of the distance matrix (distances from one row of A
# to every row of B).
np.shape(dist[1])

(60000,)

In [22]:
# Worked example for test image 486: smallest distance to any training image,
# the index of that training image, and the test image's true label.
dist_eg1 = dist[486]
print(dist_eg1.min())
print(dist_eg1.argmin())
print(testset_labels_array[486])

12.449959
37608
8


In [23]:
# Label of the training image closest to test image 486. The index is derived
# from the distance matrix rather than pasted from a previous cell's output
# (it was hard-coded as 37608).
print(trainset_labels_array[np.argmin(dist[486])])

8


In [24]:
# Worked example for test image 9643: smallest distance to any training image,
# the index of that training image, and the test image's true label.
dist_eg1 = dist[9643]
print(dist_eg1.min())
print(dist_eg1.argmin())
print(testset_labels_array[9643])

6.073906
3487
1


In [25]:
# Label of the training image closest to test image 9643. The index is derived
# from the distance matrix rather than pasted from a previous cell's output
# (it was hard-coded as 3487).
print(trainset_labels_array[np.argmin(dist[9643])])

1


In [40]:
# For every row of `dist` (one per test image): the distance to its nearest
# training image and that training image's index. A single vectorised
# reduction along axis 1 replaces the Python loop over rows; wrapping in
# list() keeps the original container type (lists of NumPy scalars).
losses_min = list(dist.min(axis=1))
losses_min_index = list(dist.argmin(axis=1))

#### Assigning random labels to training set and evaluating

In [26]:
# Draw one random digit label (0-9) per training image. A seeded generator
# makes the draw repeatable across kernel restarts; the unseeded global
# generator produced different labels on every run. 60000 matches the
# training-set size printed earlier in the notebook.
array_random = [0,1,2,3,4,5,6,7,8,9]
labels_random = np.random.RandomState(0).choice(array_random, 60000)

In [27]:
# Shape of the distance matrix: one row per row of A, one column per row of B.
np.shape(dist)

(10000, 60000)

#### Applying KNN 

In [28]:
# 3-nearest-neighbour classifier trained on the flattened training images.
# fit() returns the estimator itself, so ending the cell with `knn` displays
# the same repr as ending it with the fit() call.
knn = KNeighborsClassifier(n_neighbors=3).fit(trainset_array, trainset_labels_array)
knn


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [29]:
# Predicted digit for every test image using the 3-NN model.
ypred = knn.predict(X=testset_array)

In [30]:
# Fraction of test images whose predicted label matches the true label.
print(metrics.accuracy_score(y_true=testset_labels_array, y_pred=ypred))

0.9705


In [31]:
# Spot check on test image 5034: predicted label first, true label second,
# one per line.
print(ypred[5034], testset_labels_array[5034], sep="\n")

6
6


# Sweep k = 3..7: fit a k-NN classifier for each k and record its test
# accuracy both keyed by k (`scores`) and in sweep order (`scores_list`).
k_range = range(3,8)
scores = {}
scores_list = []
for k in tqdm(k_range):
    knn_range = KNeighborsClassifier(n_neighbors=k)
    knn_range.fit(trainset_array, trainset_labels_array)
    y_pred = knn_range.predict(testset_array)
    # Score once and reuse the value for both containers.
    accuracy = metrics.accuracy_score(testset_labels_array, y_pred)
    scores[k] = accuracy
    scores_list.append(accuracy)

# The trailing `update_progress(1)` call was removed: no such function is
# defined or imported in the visible notebook, so it raised NameError, and
# tqdm already reports the loop's progress.


In [32]:
# 5-nearest-neighbour classifier trained on the flattened training images.
# fit() returns the estimator itself, so ending the cell with `knn1` displays
# the same repr as ending it with the fit() call.
knn1 = KNeighborsClassifier(n_neighbors=5).fit(trainset_array, trainset_labels_array)
knn1

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [33]:
# Predicted digit for every test image using the 5-NN model.
ypred1 = knn1.predict(X=testset_array)

In [34]:
# Fraction of test images whose 5-NN prediction matches the true label.
print(metrics.accuracy_score(y_true=testset_labels_array, y_pred=ypred1))

0.9688
