In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def knn(k, training_data, test_data):
    conf_m = np.zeros((10,10))
    negatives = []
    # For each sample from test data
    for row in test_data:    
        # Compute the distance between the sample and training data
        distances = np.sqrt(np.sum(np.square(training_data[0:,1:] - row[1:]), axis=1))
        # Find k nearest neighbours
        ind = np.argpartition(distances, k)[:k]
        nearest_labels = training_data[ind][:,0].flatten()
        y_pred = np.argmax(np.bincount(labels.astype(np.int8)))
        # Store false predictions
        if label != row[0]:
            true_neg = training_data[ind[0]]
            negatives.append([true_neg, row])
        # Update confusion matrix
        conf_m[int(row[0]), label] +=1
            
        
    #measurements[k] = compute_error(conf_m,k)
    # Choose a random true negative and show
    display_rand_true_neg(negatives)

    return y

def print_measurements(k, num_correct, num_total, accuracy, error_rate, conf_m):
    df = pd.DataFrame(conf_m, index=[x for x in range(0,10)], columns=[x for x in range(0,10)], dtype=np.uint16)
    print('%-20s %5d' % ('k value:', k))
    print('%-20s %5d' % ('correct predictions:', num_correct))
    print('%-20s %5d' % ('total predictions:', num_total))
    print('%-20s %02.3f' %  ('Accuracy: ', accuracy))
    print('%-20s %02.3f\n' %  ('Error rate:', error_rate))
    print('Confusion matrix: \n\n',df)
    

def display_rand_true_neg(negatives):
    rand = np.random.randint(0, len(negatives))
    plt.imshow(negatives[rand][0][1:].reshape(16,16), cmap='gray')
    plt.suptitle('Predicted', fontsize=20)
    plt.show()
    plt.imshow(negatives[rand][1].flatten()[1:].reshape(16,16), cmap='gray')
    plt.suptitle('Actual', fontsize=20)
    plt.show()
        
        
def compute_error(conf_m, k):
    # Sum across the diagonal 
    num_correct = np.sum(np.diag(conf_m))
    # Sum columns
    num_total = np.sum(np.sum(conf_m, axis=0))
    accuracy = num_correct / num_total
    error_rate =  round((1 - accuracy),3)
    print_measurements(k, num_correct, num_total, accuracy, error_rate, conf_m)
    
    return error_rate
          
   
if __name__ == '__main__':
    # Load the training and test dataset
    df = pd.read_csv('zip.train', sep=' ', header=None, usecols=list(range(0, 257)))
    training_data = np.array(df, dtype=np.float32) 
    df = pd.read_csv('zip.test', sep=' ', header=None, usecols=list(range(0, 257)))
    test_data = np.array(df, dtype=np.float32)[0:100] #reduce amount of test data
    measurements = np.zeros(10)
   
    # Specify k
    for k in range(1,10):
        _y_pred = knn(k, training_data, test_data)
        _accuracy = np.sum(np.equal(_y_pred, _test_y[:_n_test_samples])) / len(_y_pred)
        
    print(measurements)
    plt.plot(measurements)
    plt.xlabel('k value')
    plt.ylabel('Error rate')
    
  

100.0
k value:                 1
correct predictions:    94
total predictions:     100
Accuracy:            0.940
Error rate:          0.060

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  15  1  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   0  0  1  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  0  0  0  5   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 2
correct predictions:    94
total predictions:     100
Accuracy:            0.940
Error rate:          0.060

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   2  0  15  0  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   0  0  1  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  0  0  0  5   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 3
correct predictions:    96
total predictions:     100
Accuracy:            0.960
Error rate:          0.040

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  16  0  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   0  0  1  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  0  0  0  0  0  6   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 4
correct predictions:    93
total predictions:     100
Accuracy:            0.930
Error rate:          0.070

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  16  0  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  1  0  3  0  0  0   0
6   1  0   1  0  0  0  8  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  0  0  0  5   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 5
correct predictions:    94
total predictions:     100
Accuracy:            0.940
Error rate:          0.060

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  16  0  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   1  0  0  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  1  0  0  4   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 6
correct predictions:    95
total predictions:     100
Accuracy:            0.950
Error rate:          0.050

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  16  0  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   1  0  0  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  0  0  0  5   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 7
correct predictions:    95
total predictions:     100
Accuracy:            0.950
Error rate:          0.050

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  16  0  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   1  0  0  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  0  0  0  5   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 8
correct predictions:    95
total predictions:     100
Accuracy:            0.950
Error rate:          0.050

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  16  0  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   1  0  0  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  0  0  0  5   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

100.0
k value:                 9
correct predictions:    93
total predictions:     100
Accuracy:            0.930
Error rate:          0.070

Confusion matrix: 

     0  1   2  3  4  5  6  7  8   9
0  26  0   0  0  0  0  0  0  0   0
1   0  8   0  0  0  0  0  0  0   0
2   1  0  15  1  0  0  0  0  0   0
3   0  0   0  4  0  1  0  0  0   0
4   0  1   0  0  4  0  0  0  0   0
5   0  0   0  0  0  4  0  0  0   0
6   0  0   1  0  0  0  9  0  0   0
7   0  0   0  0  0  0  0  8  0   0
8   0  0   0  1  0  1  0  0  4   0
9   0  0   0  0  0  0  0  0  0  11


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

[0.   0.06 0.06 0.04 0.07 0.06 0.05 0.05 0.05 0.07]


In [None]:
%%latex
knn 
cons:
    - classifying one sample requires going throgh whole test data set
    - potentially huge data set needs to be stored
    - long testing phase
    - degraded accuracy for higher dimensions since there's little differene betwenn nearest and farthest neighbour
pros:
    - easy to implement
    - no training required
    