# kNN classifier

A short example of how to create a [k-nearest neighbors](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm) (kNN) classifier, visualize its
decision boundaries, display the confusion matrix, and tune `n_neighbors` using cross-validation.

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer

# Apply one seaborn theme (tick style, notebook-sized text, "muted" colours)
# to every figure created below.
sns.set_theme(style="ticks", context="notebook", palette="muted")
# Select the interactive notebook backend for matplotlib figures.
# NOTE(review): `%matplotlib notebook` is, to my knowledge, only supported by
# the classic Notebook (< 7) front end; JupyterLab / Notebook 7 need
# `%matplotlib widget` (ipympl) or `%matplotlib inline` -- confirm against the
# environment this notebook is meant to run in.
%matplotlib notebook

In [None]:
# Load the scikit-learn breast cancer data set and gather the feature columns
# and the class labels in a single data frame.
data_set = load_breast_cancer()
data = pd.DataFrame(
    data=data_set["data"],
    columns=data_set["feature_names"],
).assign(target=data_set["target"])
# Show the first rows as a quick sanity check.
data.head()

In [None]:
# Readable label for each integer class code stored in the "target" column.
class_names = dict(enumerate(["Malignant", "Benign"]))

In [None]:
# Restrict the model to two features so that both the observations and the
# decision boundaries can be drawn in a plane.
variables = ["worst radius", "mean texture"]
X = data.loc[:, variables].to_numpy()  # feature matrix, one column per variable
y = data.loc[:, "target"].to_numpy()  # class code for each row

In [None]:
# Try a kNN classifier:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 1-nearest-neighbour model on the complete data set; it is used below
# to draw the decision boundaries.
clf = KNeighborsClassifier(n_neighbors=1).fit(X, y)
# Display the fitted estimator as the cell output.
clf

In [None]:
# Plot decision boundaries
from sklearn.inspection import DecisionBoundaryDisplay

# The first two colours of the active seaborn palette are shared by the
# shaded regions and the scatter points.
colors = sns.color_palette()[:2]
cmap = ListedColormap(colors)

fig, ax = plt.subplots(constrained_layout=True)

# Shade the plane by the class the fitted model predicts on a grid.
DecisionBoundaryDisplay.from_estimator(
    clf,
    X,
    response_method="predict",
    grid_resolution=100,
    cmap=cmap,
    alpha=0.4,
    ax=ax,
)

# Overlay the observations: class 1 is drawn first, class 0 on top of it.
for label in (1, 0):
    in_class = y == label
    ax.scatter(
        X[in_class, 0],
        X[in_class, 1],
        label=class_names[label],
        color=colors[label],
    )

ax.set(xlabel=variables[0], ylabel=variables[1])
ax.set_title(f"kNN with n_neighbors = {clf.n_neighbors}", loc="left")
ax.legend()
sns.despine(fig=fig)

In [None]:
# Plot the confusion matrix:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Hold out a third of the data for evaluation. The split is stratified on the
# class labels and seeded, so the confusion matrix below is identical on every
# "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

# Build the matrix from the predictions computed above rather than letting
# the display predict on X_test a second time. The tick labels come from
# `class_names`, in the order of the classes the model was fitted on.
fig = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_hat,
    display_labels=[class_names[label] for label in clf.classes_],
    cmap="Blues",
)
# The trailing semicolon keeps the Text repr out of the cell output.
fig.ax_.set_title(f"kNN with n_neighbors = {clf.n_neighbors}", loc="left");

In [None]:
# Let us use cross-validation to find the best parameters:
from sklearn.model_selection import (
    GridSearchCV,
)

X = data[variables].to_numpy()

# Stratified, seeded train/test split: without a fixed random_state the
# selected n_neighbors and every plot derived from `grid` would change
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

# Candidate values n_neighbors = 1, ..., 10, each scored with 5-fold
# cross-validation on the training part only.
# NOTE(review): scikit-learn's "precision" scorer treats label 1 as the
# positive class, which is "Benign" according to `class_names` -- confirm
# that this is the class the tuning should focus on.
parameters = [{"n_neighbors": range(1, 11)}]
grid = GridSearchCV(
    KNeighborsClassifier(),
    parameters,
    cv=5,
    scoring="precision",
    return_train_score=True,
)
grid.fit(X_train, y_train)
print("Best parameters for knn:", grid.best_params_)

In [None]:
# Mean cross-validated score for every candidate n_neighbors, with the
# standard deviation of the score as error bars.
k_values = parameters[0]["n_neighbors"]
cv_results = grid.cv_results_

fig1, ax1 = plt.subplots(constrained_layout=True)
ax1.errorbar(
    k_values,
    cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
    markersize=14,
    marker="o",
)
ax1.set_title(
    "Optimizing n_neighbors for a k-nearest neighbors classifier", loc="left"
)
ax1.set(ylabel=grid.scoring, xlabel="n_neighbors")
sns.despine(fig=fig1)

In [None]:
# Confusion matrix on the held-out test set for the estimator selected by
# the grid search. The tick labels come from `class_names`, in the order of
# the classes the estimator was fitted on, so they stay consistent with the
# other figures.
fig = ConfusionMatrixDisplay.from_estimator(
    grid.best_estimator_,
    X_test,
    y_test,
    display_labels=[
        class_names[label] for label in grid.best_estimator_.classes_
    ],
    cmap="Blues",
)
# The trailing semicolon keeps the Text repr out of the cell output.
fig.ax_.set_title(
    f"kNN with n_neighbors = {grid.best_estimator_.n_neighbors}", loc="left"
);