In [1]:
# credits  http://cs231n.github.io/classification/
# note: this file is a python3 notebook

In [2]:
import pickle
import numpy as np

In [3]:
# download dataset from http://www.cs.toronto.edu/~kriz/cifar.html
def unpickle(file):
    """Load and return the pickled object stored at path `file`.

    The file is opened in binary mode and unpickled with encoding='bytes',
    which is why the batch dictionaries are indexed with bytes keys such as
    b'data' and b'labels' elsewhere in this notebook.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files obtained from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

In [4]:
# process the data
# create Xtr, Ytr (training images / labels) and Xte, Yte (test images / labels)

# Collect the five training batches first and join them once at the end.
# This avoids seeding Xtr with an uninitialized placeholder row (np.ndarray
# does not zero its memory) that has to be deleted afterwards, and avoids
# re-copying the ever-growing array on every batch.
Ytr = []
train_batches = []

for i in range(1, 6):
    file_path = 'cifar-10-batches-py/data_batch_' + str(i)
    data_batch = unpickle(file_path)
    Ytr += data_batch[b'labels']
    train_batches.append(data_batch[b'data'])

# Store the pixels as int16 (same dtype as the test set below) so that the
# pixel differences computed by the classifier are taken in a signed type.
# NOTE(review): the raw batches are presumably uint8 -- confirm against the
# dataset description; subtracting unsigned arrays would wrap around.
Xtr = np.concatenate(train_batches, axis=0).astype(np.int16)

# creating the test data
file_path = 'cifar-10-batches-py/test_batch'
test_batch = unpickle(file_path)
Yte = test_batch[b'labels']
Xte = test_batch[b'data'].astype(np.int16)

In [5]:
# nearest neighbor classifier with L1 distance

class NearestNeighbor(object):
    """k-nearest-neighbor classifier using the L1 (sum of absolute differences) distance.

    `train` only memorizes the data; all the work happens in `predict`, which
    compares each query row against every stored training row.
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Remember the training set.

        X is N x D where each row is an example. y is 1-dimensional of size N
        (any sequence accepted by np.asarray).

        Raises ValueError if X and y do not describe the same number of examples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                'X has %d examples but y has %d labels' % (X.shape[0], y.shape[0]))

        # Unsigned data (e.g. raw uint8 pixels) would wrap around when two rows
        # are subtracted in predict(), giving wrong distances. Widen it to the
        # smallest signed type that can hold every value (uint8 -> int16, ...).
        if X.dtype.kind == 'u':
            X = X.astype(np.promote_types(X.dtype, np.int8))

        # the nearest neighbor classifier simply remembers all the training data
        self.Xtr = X
        self.ytr = y

    def predict(self, X, k=1):
        """Predict a label for every row of X.

        X is M x D where each row is an example we wish to predict a label for.
        k is the number of nearest training examples that vote on each label
        (default 1, the plain nearest-neighbor rule). If k exceeds the number
        of training examples, all of them vote.

        Returns a 1-dimensional array of M labels with the same dtype as the
        training labels. Voting ties are resolved in favor of the smallest
        label value; distance ties are resolved in favor of the training
        example with the lowest index.

        Raises ValueError if k < 1.
        """
        if k < 1:
            raise ValueError('k must be at least 1, got %r' % (k,))

        X = np.asarray(X)
        num_test = X.shape[0]
        # make sure that the output type matches the type of the training labels
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)

        # loop over all test rows
        for i in range(num_test):
            # L1 distance from the i'th test row to every training row
            distances = np.sum(np.abs(self.Xtr - X[i, :]), axis=1)

            if k == 1:
                # fast path: the single closest training example decides
                Ypred[i] = self.ytr[np.argmin(distances)]
            else:
                # a stable sort keeps equal distances in training order, so the
                # result is deterministic and consistent with the k == 1 path
                nearest = np.argsort(distances, kind='stable')[:k]
                labels, votes = np.unique(self.ytr[nearest], return_counts=True)
                # np.unique returns sorted labels and argmax picks the first
                # maximum, hence ties go to the smallest label
                Ypred[i] = labels[np.argmax(votes)]

        return Ypred

In [6]:
# evaluate the classifier on the test set

# instantiate the nearest neighbor classifier
nn = NearestNeighbor()

# "training" just hands the classifier the training images and their labels
nn.train(Xtr, Ytr)

# label every test image
Yte_predict = nn.predict(Xte)

# report the classification accuracy: the fraction of test examples whose
# predicted label equals the true label
print('accuracy: ', np.equal(Yte_predict, Yte).mean())

accuracy:  0.3859


In [None]:
# NOTE: this cell needs a NearestNeighbor.predict that accepts a `k` keyword
# argument (the number of neighbors that vote); with a predict() that only
# takes X, the call in the loop below raises TypeError.

# assume we have Xtr, Ytr, Xte, Yte as before
# recall Xtr is 50,000 x 3072 matrix

# take first 1000 for validation
Xval = Xtr[:1000, :]
Yval = Ytr[:1000]

# keep last 49,000 for train
# NOTE: Xtr / Ytr are overwritten here, so this cell is not idempotent --
# re-running it without re-running the data-loading cell first drops
# another 1000 training rows.
Xtr = Xtr[1000:, :]
Ytr = Ytr[1000:]

# the training data is the same for every candidate k, so build and train
# the classifier once instead of once per loop iteration
nn = NearestNeighbor()
nn.train(Xtr, Ytr)

# find hyperparameters that work best on the validation set
validation_accuracies = []
for k in [1, 3, 5, 10, 20, 50, 100]:

    # use a particular value of k and evaluate on the validation data
    Yval_predict = nn.predict(Xval, k = k)
    acc = np.mean(Yval_predict == Yval)
    # print is a function in Python 3 (this is a python3 notebook)
    print('accuracy: %f' % (acc,))

    # keep track of what works on the validation set
    validation_accuracies.append((k, acc))