# K-Nearest Neighbor
The k-nearest neighbors classifier assigns the class of an example by majority vote among the k most similar (nearest) examples in the training data. In this example, we apply k-nearest neighbors to the well-known Iris dataset.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets, neighbors
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
from matplotlib.colors import ListedColormap
%matplotlib inline

In [None]:
# Load the Iris dataset and print a short summary of it.
iris = datasets.load_iris()
target = np.array(iris.target)

# iris.data is a 2-D array: axis 0 indexes examples, axis 1 indexes variables.
n_examples, n_variables = iris.data.shape

# Count how many examples fall in each class in a single vectorized pass.
class_values, class_counts = np.unique(target, return_counts=True)

print("Number of examples: ", n_examples)
print("Number of variables:", n_variables)
print("Variable names:     ", iris.feature_names)
print("Target values:      ", iris.target_names)
# .tolist() yields plain Python ints so the pairs print as (class, count).
print("Class Distribution  ", list(zip(class_values.tolist(), class_counts.tolist())))

We will use only the first two variables (sepal length and sepal width)

In [None]:
# Feature matrix restricted to the first two columns; labels are taken unchanged.
x, y = iris.data[:, :2], iris.target

First we just evaluate the performance for k=15

In [None]:
# Score a single k-NN model (k=15) with shuffled, stratified 10-fold cross-validation.
k = 15
knn = neighbors.KNeighborsClassifier(n_neighbors=k)
knn_eval = model_selection.cross_val_score(
    knn,
    x,
    y,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1234),
)

# Report the mean and standard deviation of the per-fold accuracies.
print("%d-nearest-neighbor   Accuracy=%.3f Std=%.3f" % (k, knn_eval.mean(), knn_eval.std()))

Next, we perform an experiment to select the best k. For this purpose, we use the typical train-validation-test setup, in which the train-validation part is performed using cross-validation on the training split and the final evaluation is done on a test set that is never used for selecting k.

In [None]:
# Hold out a stratified 33% test split. Cross-validation below only ever sees
# the training portion.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.33, random_state=1234, stratify=y)

# Per-k cross-validation mean / std, keyed by k.
knn_accuracy = {}
knn_std = {}

# Best k according to cross-validation on the training split.
best_k = -1
best_accuracy = 0.0

# Best k according to the test split. Tracked for comparison only.
best_test_k = -1
best_test_accuracy = 0.0

# Series consumed by the plotting cells.
plt_x_label = []
plt_y_train = []
plt_y_bar_train = []
plt_y_test = []

# The splitter does not depend on k, so build it once. With an integer
# random_state it yields the same folds for every candidate k.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

for k in np.arange(1,30,1):
    # Cross-validated accuracy of k-NN on the training split.
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    knn_eval = model_selection.cross_val_score(knn, x_train, y_train, cv=cv)
    cv_mean = np.average(knn_eval)
    cv_std = np.std(knn_eval)
    knn_accuracy[k] = cv_mean
    knn_std[k] = cv_std

    # Refit on the whole training split and score on the held-out test split.
    knn_model = neighbors.KNeighborsClassifier(n_neighbors=k)
    knn_model = knn_model.fit(x_train, y_train)
    knn_model_eval = knn_model.score(x_test, y_test)

    print("k-nn k=%3d Train %.3f +/- %.3f\tTest %.3f"%(k,cv_mean,cv_std,knn_model_eval))

    # Strict '>' keeps the smallest k when several values tie.
    if cv_mean > best_accuracy:
        best_accuracy = cv_mean
        best_k = k

    if knn_model_eval > best_test_accuracy:
        best_test_accuracy = knn_model_eval
        best_test_k = k

    plt_x_label.append(k)
    plt_y_train.append(cv_mean)
    plt_y_test.append(knn_model_eval)
    plt_y_bar_train.append(cv_std)

print("\n\nBest k=%d Accuracy on Train %.3f"%(best_k,best_accuracy))
print("Best k=%d Accuracy on Test  %.3f"%(best_test_k,best_test_accuracy))

In [None]:
# Plot cross-validated accuracy on the training split against test accuracy for each k.
plt.figure(1, figsize=(8, 6))
font = {'family' : 'sans', 'size'   : 20}
plt.rc('font', **font)
plt.title('Train vs Test Accuracy')
# Label both curves so the legend can tell them apart.
plt.plot(plt_x_label, plt_y_train, label='Train (cross-validation)')
plt.plot(plt_x_label, plt_y_test, label='Test')
plt.ylim([0.5,1.0])

plt.xlabel('k')
# The plotted values are fractions in [0, 1], not percentages.
plt.ylabel('Accuracy')
plt.legend();

In [None]:
# Same comparison as a line plot, with the per-k cross-validation standard
# deviation drawn as error bars on the training curve.
plt.figure(1, figsize=(8, 6))
font = {'family' : 'sans', 'size'   : 20}
plt.rc('font', **font)
plt.ylim([0.5,1.0])
plt.title('Train vs Test Accuracy (with std bars)')
# Label every series so the legend can tell them apart.
plt.plot(plt_x_label, plt_y_train, label='Train (cross-validation)')
plt.plot(plt_x_label, plt_y_test, label='Test')
plt.errorbar(plt_x_label, plt_y_train, yerr=plt_y_bar_train, fmt='o', label='Train +/- 1 std')

plt.xlabel('k')
# The plotted values are fractions in [0, 1], not percentages.
plt.ylabel('Accuracy')
plt.legend();

We repeat the same procedure using distance as the weight function for prediction, so that the classes of closer (more similar) examples carry more weight in the vote.

In [None]:
# Repeat the k-selection experiment with distance-weighted voting.
# Uses x_train / x_test / y_train / y_test and the knn_accuracy / knn_std
# dicts created by the earlier k-selection cell; their entries are overwritten here.

# Best k according to cross-validation on the training split.
best_k = -1
best_accuracy = 0.0

# Best k according to the test split. Tracked for comparison only.
best_test_k = -1
best_test_accuracy = 0.0

# Series consumed by the plotting cell that follows.
plt_x_label = []
plt_y_train = []
plt_y_bar_train = []
plt_y_test = []

# The splitter does not depend on k, so build it once. With an integer
# random_state it yields the same folds for every candidate k.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

for k in np.arange(1,30,1):
    # Cross-validated accuracy of distance-weighted k-NN on the training split.
    knn = neighbors.KNeighborsClassifier(n_neighbors=k,weights='distance')
    knn_eval = model_selection.cross_val_score(knn, x_train, y_train, cv=cv)
    cv_mean = np.average(knn_eval)
    cv_std = np.std(knn_eval)
    knn_accuracy[k] = cv_mean
    knn_std[k] = cv_std

    # Refit on the whole training split and score on the held-out test split.
    knn_model = neighbors.KNeighborsClassifier(n_neighbors=k,weights='distance')
    knn_model = knn_model.fit(x_train, y_train)
    knn_model_eval = knn_model.score(x_test, y_test)

    print("k-nn k=%3d Train %.3f +/- %.3f\tTest %.3f"%(k,cv_mean,cv_std,knn_model_eval))

    # Strict '>' keeps the smallest k when several values tie.
    if cv_mean > best_accuracy:
        best_accuracy = cv_mean
        best_k = k

    if knn_model_eval > best_test_accuracy:
        best_test_accuracy = knn_model_eval
        best_test_k = k

    plt_x_label.append(k)
    plt_y_train.append(cv_mean)
    plt_y_test.append(knn_model_eval)
    plt_y_bar_train.append(cv_std)

print("\n\nBest k=%d Accuracy on Train %.3f"%(best_k,best_accuracy))
print("Best k=%d Accuracy on Test  %.3f"%(best_test_k,best_test_accuracy))

In [None]:
# Plot cross-validated training accuracy (with std error bars) against test
# accuracy, using whatever series the preceding experiment cell produced.
plt.figure(1, figsize=(8, 6))
font = {'family' : 'sans', 'size'   : 20}
plt.rc('font', **font)
plt.ylim([0.5,1.0])
plt.title('Train vs Test Accuracy (with std bars)')
# Label every series so the legend can tell them apart.
plt.plot(plt_x_label, plt_y_train, label='Train (cross-validation)');
plt.plot(plt_x_label, plt_y_test, label='Test');
plt.errorbar(plt_x_label, plt_y_train, yerr=plt_y_bar_train, fmt='o', label='Train +/- 1 std');

plt.xlabel('k');
# The plotted values are fractions in [0, 1], not percentages.
plt.ylabel('Accuracy');
plt.legend();