k-Nearest Neighbours algorithm:
- find the k elements nearest to the new element to classify
  - define "nearest":
    - local distance (one feature): Minkowski of order p, overlap, Hamming
    - global distance (how to merge heterogeneous distances): summation (pay attention to ranges and scales, normalisation)
- find the most frequent label among these k elements
  - in case of a tie: choose the first, choose randomly, or choose the label with the smallest sum of neighbour distances
  - weighted voting: give more importance to nearest elements
- apply this label to the new element

In [176]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from typing import Union, Callable

In [178]:
class MyKNN(BaseEstimator, ClassifierMixin):
    """Brute-force k-nearest-neighbours classifier with unweighted voting.

    Each prediction compares the query point with every stored training
    point using the squared Euclidean distance (which orders neighbours the
    same way as the Euclidean distance) and returns the most frequent label
    among the ``n_neighbors`` closest training points.

    Tie handling:
      * equidistant neighbours are taken in training order (stable sort);
      * when several labels get the same number of votes, the label that
        sorts first wins, because ``np.unique`` returns sorted labels and
        ``argmax`` picks the first maximum.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbours that vote. When it exceeds the number of
        stored training points, every training point votes.
    """

    def __init__(self, n_neighbors: int = 5):
        # Stored untouched; the value is validated when it is used in
        # ``predict`` so that later changes to the attribute are checked too.
        self.n_neighbors = n_neighbors

    def fit(self, x_train, y_train):
        """Memorise the training samples and their labels.

        Parameters
        ----------
        x_train : array-like, one row per training sample.
        y_train : array-like, one label per training sample.

        Returns
        -------
        self : MyKNN
            The estimator itself, so calls can be chained.

        Raises
        ------
        ValueError
            If ``x_train`` and ``y_train`` do not hold the same number of
            samples.
        """
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        # A length mismatch would otherwise surface later as an IndexError
        # in predict, or go unnoticed when there are extra labels.
        if x_train.shape[:1] != y_train.shape[:1]:
            raise ValueError(
                "x_train and y_train must contain the same number of samples, "
                f"got {x_train.shape[:1]} and {y_train.shape[:1]}"
            )

        self.x_train = x_train
        self.y_train = y_train
        return self

    def predict(self, x_tests):
        """Predict a label for every row of ``x_tests``.

        Parameters
        ----------
        x_tests : array-like, one row per sample to classify, with the same
            features as the training data.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``x_tests``.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is smaller than 1.
        """
        x_tests = np.array(x_tests)

        n_neighbors = self.n_neighbors
        # A negative value would silently slice "all but the last k"
        # neighbours and zero would leave nobody to vote.
        if n_neighbors < 1:
            raise ValueError(
                f"n_neighbors must be at least 1, got {n_neighbors}"
            )

        y_pred = []
        for x_test in x_tests:
            # Broadcasting subtracts the query from every training row
            # without materialising a repeated copy of the query.
            diff = x_test - self.x_train
            sq_dist = np.power(diff, 2).sum(axis=1)

            # Stable sort: equidistant neighbours keep their training order,
            # so the selected neighbours are deterministic.
            nn_indices = sq_dist.argsort(kind="stable")[:n_neighbors]
            nn_labels = self.y_train.take(nn_indices)

            # Majority vote among the selected neighbours.
            labels, counts = np.unique(nn_labels, return_counts=True)
            y_pred.append(labels[counts.argmax()])

        return np.array(y_pred)

In [None]:
# Labelled athletes, indexed by the 'Athlete' column.
athletes = pd.read_csv('AthleteSelection.csv', index_col='Athlete', header=0)

# Target: pop() removes the 'Selected' column from the frame and returns it.
y_train = athletes.pop('Selected').values
# Features: the two numeric columns used to measure distance.
x_train = athletes[['Speed', 'Agility']].values

y_train

array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes'],
      dtype=object)

In [None]:
# Athletes to classify, indexed by the 'Athlete' column; only the two
# feature columns are extracted from this file.
athletes = pd.read_csv('AthleteTest.csv', index_col='Athlete', header=0)
x_test = athletes.loc[:, ['Speed', 'Agility']].values

In [None]:
# Learn each feature's min/max from the training data only, then apply that
# same mapping to both sets so they share one scale.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
# transform, not fit_transform: refitting on the test set would rescale it
# with its own min/max, so test points would no longer be comparable with
# the training points they are measured against.
x_test = scaler.transform(x_test)

In [None]:
# Hand-written classifier: 3 neighbours, unweighted majority vote.
# fit() returns the estimator itself, so construction and fitting chain.
knn = MyKNN(n_neighbors=3).fit(x_train, y_train)
knn.predict(x_test)

array(['No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No'],
      dtype='<U3')

In [None]:
# scikit-learn reference classifier on the same data.
# NOTE: weights='distance' weights each neighbour's vote by the inverse of
# its distance, whereas MyKNN counts every neighbour equally, so the two
# classifiers are not guaranteed to agree on every input.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(x_train, y_train)
knn.predict(x_test)

array(['No', 'No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No'],
      dtype=object)

In [None]:
# Penguin measurements, indexed by the 'id' column.
penguins = pd.read_csv('penguins_af.csv', index_col='id', header=0)

# Features: the four numeric measurements.
x = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']].values
# Target: pop() removes the 'species' column from the frame and returns it.
y = penguins.pop('species').values

In [None]:
# Hold out one fifth of the samples for evaluation. A fixed random_state
# makes the shuffle, and therefore the confusion matrices computed from this
# split, reproducible from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/5, random_state=0)

In [None]:
# Standardise the features, then classify with the hand-written k-NN
# (default n_neighbors). Pipeline.fit() returns the pipeline itself, so the
# fitted pipeline can be bound in one expression.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', MyKNN()),
    ]
).fit(x_train, y_train)

# Rows of the confusion matrix are true classes, columns are predictions.
y_pred = pipe.predict(x_test)
confusion_matrix(y_test, y_pred)

array([[31,  0,  0],
       [ 0, 14,  0],
       [ 0,  0, 22]], dtype=int64)

In [None]:
# Same pipeline with scikit-learn's own k-NN (default settings) as the
# reference. Pipeline.fit() returns the pipeline itself.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', KNeighborsClassifier()),
    ]
).fit(x_train, y_train)

# Rows of the confusion matrix are true classes, columns are predictions.
y_pred = pipe.predict(x_test)
confusion_matrix(y_test, y_pred)

array([[31,  0,  0],
       [ 0, 14,  0],
       [ 0,  0, 22]], dtype=int64)