# Setup

In [1]:
from collections import Counter

import numpy as np
import time

In [2]:
def euclidean_distance(x, y):
    """Return the matrix of Euclidean distances between the rows of ``x`` and ``y``.

    Relies on the expansion ``|a - b|^2 = |a|^2 + |b|^2 - 2 * a.b`` so that all
    pairs are handled by a single matrix product plus broadcasting, without
    any Python-level loop.

    Args:
        x: 2-D array with one point per row.
        y: 2-D array with one point per row and as many columns as ``x``.

    Returns:
        Array of shape ``(len(x), len(y))``; entry ``[i, j]`` is the distance
        between ``x[i]`` and ``y[j]``.

    References:
        https://www.pythonlikeyoumeanit.com/Module3_IntroducingNumpy/Broadcasting.html
        https://stackoverflow.com/questions/27948363/numpy-broadcast-to-perform-euclidean-distance-vectorized
    """
    # Squared norms of the x rows as a column, so they broadcast across y's rows
    x_sq_norms = np.sum(x**2, axis=1)[:, np.newaxis]
    y_sq_norms = np.sum(y**2, axis=1)
    cross_terms = 2 * np.dot(x, y.T)
    sq_dists = x_sq_norms + y_sq_norms - cross_terms
    # Floating-point round-off can leave tiny negative values where the true
    # squared distance is 0; clamp them so the square root stays real.
    non_negative = np.clip(sq_dists, a_min=0, a_max=None)
    return np.sqrt(non_negative)

In [3]:
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training data; all of the work happens in
    ``predict``, which compares every query sample with every training sample.
    """

    def __init__(self, k=3):
        """Create a classifier that lets the ``k`` nearest neighbours vote.

        Args:
            k: Number of neighbours consulted for each prediction.

        Raises:
            ValueError: If ``k`` is smaller than 1 (no neighbour could vote).
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training samples and their labels.

        Args:
            X: 2-D array-like of training samples, one per row.
            y: 1-D array-like of labels, aligned with the rows of ``X``.
        """
        # np.asarray leaves ndarrays untouched but lets plain lists support the
        # array arithmetic and fancy indexing used in predict().
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X, verbose=True):
        """Predict a label for every row of ``X`` by majority vote.

        Args:
            X: 2-D array-like of query samples, one per row.
            verbose: When true (the default), print the shapes of the
                intermediate arrays and the neighbour labels of the first
                query sample.

        Returns:
            1-D array with one predicted label per row of ``X``. When several
            labels are tied, the one that appears first among the neighbours
            (ordered from nearest to farthest) wins.
        """
        X = np.asarray(X)
        # Distance from every query sample (rows) to every training sample (columns)
        distances = euclidean_distance(X, self.X_train)
        # Column indices of the k closest training samples for each query.
        # Uses the instance's own k rather than any module-level variable `k`.
        k_idxs = np.argsort(distances, axis=-1)[:, :self.k]
        # Labels of those neighbours, still ordered from nearest to farthest
        k_neighbor_labels = self.y_train[k_idxs]
        # Most frequent label in each row
        most_commons = np.array(
            [Counter(x).most_common(1)[0][0] for x in k_neighbor_labels]
        )
        if verbose:
            print(f"{distances.shape = }")
            print(f"{k_idxs.shape = }")
            print(f"{k_neighbor_labels.shape = }")
            print(f"{k_neighbor_labels[0] = }")
            print(f"{most_commons.shape = }")
        return most_commons

In [4]:
# Imports
from matplotlib.colors import ListedColormap
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Three-colour map: pure red, green and blue.
# NOTE(review): presumably one colour per iris class for plotting; it is not
# used by any of the cells shown here - confirm before removing.
cmap = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

def accuracy(y_true, y_pred):
    """Return the percentage (0-100) of predictions that equal the true labels.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same shape as ``y_true``.

    Returns:
        Share of matching positions, expressed as a percentage.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with == yields a single bool
    # instead of an element-wise result, which would give a wrong score.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    n_correct = np.sum(y_true == y_pred)
    return n_correct / len(y_true) * 100

# Load the iris dataset: X holds the feature rows, y the matching class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Sanity check of the resulting split sizes
print(f"{X_train.shape = }")
print(f"{y_train.shape = }")
print(f"{X_test.shape = }")

X_train.shape = (120, 4)
y_train.shape = (120,)
X_test.shape = (30, 4)


In [5]:
# Number of nearest neighbours that vote on each prediction
k = 3
clf = KNN(k=k)
# fit() only stores the training set; no computation happens until predict()
clf.fit(X_train, y_train)

In [16]:
# Time a single predict() call over the whole test set. The measurement also
# includes whatever predict() prints while it runs.
start_time = time.perf_counter()
predictions = clf.predict(X_test)
total_inf_time = time.perf_counter() - start_time
print(f"Time elapsed for inference: {total_inf_time:.4f} seconds")
# Percentage of test samples whose predicted label equals the true label
print(f"KNN classification accuracy: {accuracy(y_test, predictions):.1f}%")

distances.shape = (30, 120)
k_idxs.shape = (30, 3)
k_neighbor_labels.shape = (30, 3)
k_neighbor_labels[0] = array([1, 1, 1])
most_commons.shape = (30,)
Time elapsed for inference: 0.0015 seconds
KNN classification accuracy: 100.0%
