In [0]:
#####################################
######## K nearest neighbour ########

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
X = (X - X.mean(axis=0))/X.std(axis=0)
Y = iris.target

K = 5


def one_hot_encode(Y):
  '''
  Implements one hot encoding. Given a categorical variable Y of length
  n and with p unique labels, returns a (n,p) shape boolean array 
  where (i,j)th entry is True iff Y[i]==j
  '''
  labels = np.unique(Y)
  n,p = len(Y),len(labels)
  encoded = np.zeros((n,p), dtype = bool)
  for c,value in enumerate(labels):
    label_index = (Y == value) 
    encoded[label_index,c] = True 
  return labels, encoded  
labels,Y = one_hot_encode(Y)


#######################
##### My Solution #####

#Create training and testing sets
#np.random.seed(42) # so we can reproduce the same results every time
n, p = X.shape
test_prop = 0.1 #proportion of data to reserve for testing
test_size = int(n*test_prop)

prm  = np.random.permutation(n)
X = X[prm]
Y = Y[prm]
X_test = X[:test_size] #reserve top for testing 
X_train = X[test_size:] #reserve remainder for training
Y_test = Y[:test_size] #reserve top for testing
Y_train = Y[test_size:] #use remainder for training


def distance(X_train,X_test):
  '''
  Use the L2 norm as a distance metric 
  '''
  dX = X_train[:,np.newaxis,:] - X_test[np.newaxis,:,:] #add axes to make broadcasting work 
  return np.linalg.norm(dX,axis=-1)

def weights(d):
  '''
  Can use weights as a function of distance. Default is equals weights to each
  KNN
  '''
  W = np.ones(np.shape(d))
  return  W/np.sum(W, axis=0)

d = distance(X_train,X_test)          
srt = np.argpartition(d,K,axis=0)[:K]   #bring k smallest distances from d to top and retursns their index
KNN = np.take_along_axis(d,srt,axis=0)  #retreive the distances of K nearest neighbors
W = np.expand_dims(weights(KNN),2)      #apply weights based on distances, expand dimensions to enable broadcasting later

srt = srt[:,:,np.newaxis]               #add new axis at the end of sort to include the label values, k 
Z = np.expand_dims(Y_train, 1)          #add new axis in the middle of Y_train to incorporate text index, j 
Z = np.take_along_axis(Z,srt,axis=0)    #sort Z such that (i,j,k) entry is True if the ith KNN to jth test example has the label k 

Y_predicted = np.sum(W*Z, axis=0)
print(Y_predicted)

def check_accuracy(Y_test,Y_predicted): 
    '''
    Determines the accuracy of the model by comparing the predicted target 
    variables with the actual test data
    '''
    idx_predicted = np.argmax(Y_predicted,axis=1) #extract the predicted indices in the model
    idx_test = np.argmax(Y_test,axis=1)           #extract the indices of the test data 
    corr = (idx_predicted == idx_test)            #compare them to see how many were correct
    print('Model accuracy in percentage: {0:5.3f}'.format(np.sum(corr)/len(corr)*100))
    return 
  
check_accuracy(Y_test,Y_predicted)
check_accuracy.__doc__

[[0.  0.  1. ]
 [1.  0.  0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]
 [0.  0.  1. ]
 [1.  0.  0. ]
 [1.  0.  0. ]
 [0.  0.  1. ]
 [0.  0.8 0.2]
 [0.  1.  0. ]
 [0.  0.8 0.2]
 [0.  0.  1. ]
 [1.  0.  0. ]
 [0.  0.  1. ]
 [0.  1.  0. ]]
Model accuracy in percentage: 93.333


'\n    Determines the accuracy of the model by comparing the predicted target \n    variables with the actual test data\n    '

expand_dims

arg_partition

take_along_axis

linalg.norm

One hot encoding: When we have many levels in your categorical variable, it's easier to find the unique labels and construct one column for each label with boolean (True or False) values. 

In [0]:
Z.shape

(5, 15, 3)

In [0]:
#KNN from scikit learn
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, Y_train)
Y_model = model.predict(X_test)
print(Y_model)
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_model)


[[False  True False]
 [False False  True]
 [ True False False]
 [False False  True]
 [False  True False]
 [False False  True]
 [ True False False]
 [ True False False]
 [False False  True]
 [False  True False]
 [False False  True]
 [ True False False]
 [ True False False]
 [ True False False]
 [False  True False]]


1.0