# kNN classifier

This is a short example of how we can create a [k-Nearest neighbors](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) classifier, plot its decision boundary, display the
confusion matrix, and tune the number of neighbors with cross-validation.

In [None]:
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
import pandas as pd
from sklearn.datasets import load_breast_cancer
import numpy as np
plt.style.use('seaborn-notebook')
%matplotlib notebook

In [None]:
# Load the data set and collect the features together with the target in a
# single DataFrame (one column per feature, plus a final 'target' column).
data_set = load_breast_cancer()
data = pd.DataFrame(
    data=data_set['data'],
    columns=data_set['feature_names'],
).assign(target=data_set['target'])
data

In [None]:
# Display name for each integer value of the 'target' column.
class_names = {
    0: 'Malignant',
    1: 'Benign',
}

In [None]:
# Restrict the features to two columns so that the samples and the decision
# regions can be drawn in a plane:
variables = ['worst radius', 'mean texture']
X = data.loc[:, variables].to_numpy()
y = data.loc[:, 'target'].to_numpy()

In [None]:
# Fit a nearest-neighbour classifier (a single neighbour) on all samples:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1).fit(X, y)
# fit() returns the estimator itself, so this shows the same cell output.
clf

In [None]:
fig1, ax1 = plt.subplots(constrained_layout=True)

# Predict on a dense grid (0.05 spacing, padded by 1 unit around the data) so
# that the decision regions can be drawn as filled contours:
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.05),
                     np.arange(y_min, y_max, 0.05))
# Flatten the grid into (n_points, 2) rows for predict(), then restore the
# grid shape so Z lines up with xx and yy.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# One RGB colour per class, indexed by the class label (0 or 1). The same
# colours are used for the regions and for the data points.
colors = [
    [1., 0.49803922, 0.05490196],
    [0.12156863, 0.46666667, 0.70588235],
]
cmap = ListedColormap(colors)
ax1.contourf(xx, yy, Z, alpha=0.5, vmin=0, vmax=1, cmap=cmap)

ax1.scatter(X[y==1, 0], X[y==1, 1], label=class_names[1], color=colors[1])
ax1.scatter(X[y==0, 0], X[y==0, 1], label=class_names[0], color=colors[0])

ax1.set_title(f'kNN with n_neighbors = {clf.n_neighbors}')
ax1.legend()
# Column 0 of X is on the horizontal axis and column 1 on the vertical axis,
# so each axis is labelled with the matching entry of `variables`.
ax1.set_xlabel(variables[0])
ax1.set_ylabel(variables[1]);

In [None]:
# Plot the confusion matrix:
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay

# Hold out a third of the samples for testing, keeping the class proportions
# (stratify=y). The fixed random_state makes the split, and therefore the
# confusion matrix below, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

# Build the matrix from the predictions computed above instead of predicting a
# second time. The tick labels are looked up from the classifier's own class
# order so that a name can never be attached to the wrong class.
# Note: `fig` is the display object returned by scikit-learn; its axes are
# available as `fig.ax_`.
fig = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_hat,
    labels=clf.classes_,
    display_labels=[class_names[label] for label in clf.classes_],
)
fig.ax_.set_title(f'kNN with n_neighbors = {clf.n_neighbors}');

In [None]:
# Let us use cross-validation to find the best parameters:
from sklearn.model_selection import (
    GridSearchCV,
)

X = data[variables].to_numpy()

# Fixed random_state: without it every run draws a different train/test split
# and the reported best n_neighbors can change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

clf = KNeighborsClassifier()
# Candidate values n_neighbors = 1, 2, ..., 10.
parameters = [{'n_neighbors': range(1, 11)}]
# NOTE(review): 'precision' is computed for the positive class, i.e. label 1
# ('Benign' in class_names) -- confirm that this is the intended metric.
grid = GridSearchCV(
    clf,
    parameters,
    cv=5,
    scoring='precision',
    return_train_score=True,
)
# Only the training part is used for the search; the test part is kept aside.
grid.fit(X_train, y_train)
print('Best parameters for knn:', grid.best_params_)

In [None]:
# Mean cross-validated test score for every candidate n_neighbors, with the
# standard deviation over the folds drawn as error bars.
n_neighbors_values = parameters[0]['n_neighbors']
mean_scores = grid.cv_results_['mean_test_score']
std_scores = grid.cv_results_['std_test_score']

fig1, ax1 = plt.subplots(constrained_layout=True)
ax1.errorbar(
    n_neighbors_values,
    mean_scores,
    yerr=std_scores,
    marker='o',
    markersize=14,
)
ax1.set(
    xlabel='n_neighbors',
    ylabel=grid.scoring,
    title='Optimizing n_neighbors for a k-nearest neighbors classifier',
);

In [None]:
# Confusion matrix on the held-out test data for the estimator selected by
# the grid search.
best_knn = grid.best_estimator_
# Look the tick labels up from the estimator's own class order so that a name
# can never be attached to the wrong class.
fig = ConfusionMatrixDisplay.from_estimator(
    best_knn,
    X_test,
    y_test,
    display_labels=[class_names[label] for label in best_knn.classes_],
)
fig.ax_.set_title(f'kNN with n_neighbors = {best_knn.n_neighbors}');