# $k$-Nearest-Neighbors

> 
>
> **Author:** Bjarne C. Hiller

## Distance Metric

## Vector Norms

The $p$-norm of a vector $\mathbf{x} \in \mathbb{R}^d$ (for $p \geq 1$):

$$
    \lVert \mathbf{x} \rVert_p = \left( \sum_{i=1}^d \lvert x_i \rvert^p \right)^{\frac{1}{p}}
$$

Taxicab norm or Manhattan norm ($p = 1$):

$$
    \lVert \mathbf{x} \rVert_1 = \sum_{i=1}^d \lvert x_i \rvert
$$

Euclidean norm ($p = 2$):

$$
    \lVert \mathbf{x} \rVert_2 = \sqrt{\sum_{i=1}^d x_i^2}
$$

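Euclidean distance (the Euclidean norm of the difference vector):

$$
    D(\mathbf{x}, \mathbf{y}) = \lVert \mathbf{x} - \mathbf{y} \rVert_2 = \sqrt{\sum_{i=1}^d (x_i - y_i)^2}
$$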

In [1]:
from sklearn.datasets import make_classification



In [5]:
import numpy as np

# two random 2-D point sets: 6 points in A, 4 points in B
A = np.random.randn(6,2)
B = np.random.randn(4,2)

# the new axis turns A into (6,1,2); subtracting B (4,2) then broadcasts to
# one difference vector per (point of A, point of B) pair
(A[:, np.newaxis, :] - B).shape

(6, 4, 2)

In [12]:
def distance_matrix(A, B):
    """
    Computes pairwise Euclidean distances between two sets of points.

    Accepts any array-like input (nested lists, tuples, numpy arrays).
    Integer and boolean inputs are promoted to float before subtracting, so
    small or unsigned dtypes (e.g. uint8) cannot wrap around or overflow.

    :param A: point array (n, d)
    :param B: point array (m, d)
    :returns: pairwise distance matrix (n,m), entry [i, j] is the distance
        between A[i] and B[j]
    """
    A = np.asarray(A)
    B = np.asarray(B)

    # unsigned/small integer arithmetic would silently wrap in A - B and in
    # the squaring step; floating point inputs keep their own precision
    if A.dtype.kind in "biu":
        A = A.astype(float)
    if B.dtype.kind in "biu":
        B = B.astype(float)

    # insert an axis so A is (n,1,d); broadcasting against B (m,d) yields
    # the (n,m,d) array of coordinate-wise differences
    diff = A[:, np.newaxis, :] - B

    # sum squared differences over the coordinate axis, then take the root
    squared = np.sum(np.power(diff, 2), axis=-1)
    return np.sqrt(squared)
    
# pairwise distances between the 6 points in A (rows) and the 4 points in B (columns)
D = distance_matrix(A, B)

In [None]:
# NOTE(review): unfinished scratch cell — np.histogram needs at least the
# input array as an argument, so this call raises TypeError as written
np.histogram()

In [23]:
# for every row of D (a point of A): column index of the closest point in B;
# slicing with :1 instead of indexing with 0 keeps a trailing axis of length 1
rank = np.argsort(D, axis=1)[:, :1]

# toy labels; rank holds column indices of D, so only the first D.shape[1]
# entries of y can ever be selected here
y = np.array(["A", "B", "C", "D", "E", "F"])
y[rank]

array([['C'],
       ['B'],
       ['B'],
       ['B'],
       ['C'],
       ['B']], dtype='<U1')

In [None]:
# distinct labels of y in sorted order
np.unique(y)
# NOTE(review): unfinished — np.bincount requires an array of non-negative
# integers as its first argument, so this call raises TypeError as written
np.bincount()

array(['A', 'B', 'C', 'D', 'E', 'F'], dtype='<U1')

In [25]:
class KNN:
    """
    Brute-force k-nearest-neighbors classifier based on Euclidean distance.

    :param k: number of closest training points consulted per query point
    """

    def __init__(self, k=1):
        self.k = k
        # sklearn convention: fields ending on underscores (_) are computed during fit
        self.X_train_ = None
        self.y_train_ = None
        self.classes_ = None

    def fit(self, X, y):
        """
        Stores the training data; k-NN does all its work at prediction time.

        :param X: training points (n_train, d), any array-like
        :param y: training labels (n_train,), any array-like
        :raises ValueError: if k is smaller than 1 or X and y differ in length
        """
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k}")

        # convert up front so that fancy indexing in predict also works
        # when the caller passes plain Python lists
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same length, got {X.shape[0]} and {y.shape[0]}"
            )

        # store training data
        self.X_train_ = X
        self.y_train_ = y
        # np.unique returns the distinct labels in sorted order
        self.classes_ = np.unique(y)

    def predict(self, X, vote=False):
        """
        Looks up the k closest training points for every query point.

        :param X: query points (n, d)
        :param vote: if False (default), return the raw neighbor labels so
            existing callers that index the (n, k) result keep working;
            if True, return one majority-voted label per query point
        :returns: label array (n, k) if vote is False, else (n,)
        :raises RuntimeError: if called before fit
        """
        if self.X_train_ is None:
            raise RuntimeError("KNN.predict() called before fit()")

        # compute pairwise distances between test and train points (n, n_train)
        D = distance_matrix(np.asarray(X), self.X_train_)

        # indices of the train points sorted by distance, closest k only
        rank = np.argsort(D, axis=1)[:, :self.k]

        # get labels (n,k) associated with k closest train points
        neighbor_labels = self.y_train_[rank]

        if not vote:
            return neighbor_labels
        return self._majority_vote(neighbor_labels)

    def _majority_vote(self, neighbor_labels):
        """
        Reduces neighbor labels (n, k) to the most frequent label per row.

        Ties are resolved in favor of the label that comes first in
        ``classes_`` (i.e. the smallest label in sorted order).

        :param neighbor_labels: label array (n, k) with values from classes_
        :returns: label array (n,)
        """
        neighbor_labels = np.asarray(neighbor_labels)
        n = neighbor_labels.shape[0]

        # classes_ is sorted, so searchsorted maps each label to its class index
        class_idx = np.searchsorted(self.classes_, neighbor_labels)

        # count label occurrences per row; np.add.at accumulates correctly
        # even when the same (row, class) pair appears several times
        votes = np.zeros((n, len(self.classes_)), dtype=int)
        np.add.at(votes, (np.arange(n)[:, np.newaxis], class_idx), 1)

        # argmax returns the first maximum, which gives the tie-break above
        return self.classes_[np.argmax(votes, axis=1)]

In [None]:
from sklearn.datasets import load_iris

# load the Iris dataset that ships with scikit-learn
iris_ds = load_iris()

In [28]:
from sklearn.model_selection import train_test_split

# feature matrix and target labels of the dataset
X = iris_ds["data"]
y = iris_ds["target"]

# hold out a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=19)

# 1-nearest-neighbor model; fit only stores the training data
model = KNN(k=1)
model.fit(X_train, y_train)

In [34]:
# predict returns the labels of the k closest training points, shape (n, k)
y_hat = model.predict(X_test)
# with k=1 the single neighbor's label is the prediction, so take column 0
y_hat = y_hat[:, 0]

In [36]:
# number of test samples whose predicted label matches the true label
(y_hat == y_test).sum()

np.int64(35)

In [37]:
# one predicted label per test sample
y_hat.shape

(38,)