# KNN

In [1]:
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
def normalize(X, axis=-1, order=2):
    """Scale X so every vector taken along `axis` has unit `order`-norm.

    Vectors whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing a division by zero.
    """
    norms = np.atleast_1d(np.linalg.norm(X, order, axis))
    # Swap zero norms for 1 so all-zero vectors divide cleanly.
    safe_norms = np.where(norms == 0, 1, norms)
    # Re-insert the reduced axis so the divisor broadcasts against X.
    return X / np.expand_dims(safe_norms, axis)

In [3]:
def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets.

    Parameters
    ----------
    X : sliceable sequence of samples (sliced along its first axis)
    y : sliceable sequence of labels; its length defines the sample count
    test_size : float
        Fraction of the samples placed in the test set, rounded down.
        Values in [0, 1] are expected; 0 yields an empty test set.
    shuffle : bool
        When true, X and y are shuffled together via shuffle_data first.
    seed : forwarded to shuffle_data

    Returns
    -------
    X_train, X_test, y_train, y_test
        The leading samples form the train set, the trailing ones the test set.
    """
    if shuffle:
        X, y = shuffle_data(X, y, seed)
    n_samples = len(y)
    # Multiply directly instead of floor-dividing by (1 / test_size): the
    # reciprocal raised ZeroDivisionError for test_size == 0 and, because
    # 1 / test_size is rounded in floating point, could drop a sample
    # (e.g. 10 samples with test_size=0.3 gave 2 test rows instead of 3).
    # The tiny tolerance absorbs products that land just below an integer.
    n_test = int(n_samples * test_size + 1e-9)
    split_i = n_samples - n_test
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

In [5]:
def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y.

    The same permutation is applied to both arrays, so X[i] and y[i] stay
    paired in the result.

    Parameters
    ----------
    X : array with samples along the first axis (must expose ``.shape``)
    y : array indexable by an integer index array, one entry per sample
    seed : int or None
        When not None, NumPy's global random generator is re-seeded with
        this value before shuffling, making the permutation repeatable.

    Returns
    -------
    (X_shuffled, y_shuffled)
    """
    # Compare against None explicitly: a plain truthiness test would silently
    # ignore seed=0 and produce a non-reproducible shuffle.
    if seed is not None:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

In [18]:
import math
def euclidean_distance(x1, x2):
    """ Return the Euclidean (l2) distance between x1 and x2.

    Coordinates are paired by position over the length of x1.
    """
    squared_sum = 0
    # Accumulate the squared per-coordinate differences in order.
    for pos, coord in enumerate(x1):
        diff = coord - x2[pos]
        squared_sum += diff ** 2
    return math.sqrt(squared_sum)

In [73]:
# Load the iris dataset, scale every sample (row) to unit L2 norm, and hold
# out roughly a third of the samples as a test set.
# NOTE: no seed is passed, so the shuffle inside train_test_split differs on
# every run; the predictions recorded further down are not reproducible as-is.
data = datasets.load_iris()
X = normalize(data.data)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [74]:
# Sanity check: dimensions of the normalized feature matrix.
X.shape

(150, 4)

In [75]:
# Sanity check: one label per row of X.
y.shape

(150,)

In [76]:
# Sanity check: train and test sizes should add up to the full sample count.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((101, 4), (49, 4), (101,), (49,))

In [77]:
# Number of nearest training samples that vote on each prediction.
k= 5

In [78]:
# Uninitialised float buffers, one slot per test sample; the prediction
# loops below overwrite every slot.
y_pred = np.empty(len(X_test))
y_pred1 = np.empty(len(X_test))

In [79]:
# Sanity check: one prediction slot per test sample.
y_pred.shape

(49,)

In [80]:
# k-NN prediction: each test sample receives the most common label among its
# k closest training samples (Euclidean distance).
for i, test_sample in enumerate(X_test):
    # Indices of the k training samples nearest to this test sample.
    idx = np.argsort([euclidean_distance(test_sample, x) for x in X_train])[:k]
    # Index y_train with the index array directly. The previous comprehension
    # reused the name `i`, shadowing the loop index; on Python 2 that name
    # leaks out of the comprehension and y_pred[i] is written to the wrong slot.
    k_nearest_neighbors = y_train[idx]
    # Majority vote; argmax resolves ties in favour of the smallest label.
    counts = np.bincount(k_nearest_neighbors.astype('int'))
    y_pred[i] = counts.argmax()

In [82]:
# Second run of the same k-NN majority vote, stored in y_pred1 so it can be
# compared against y_pred.
for i, test_sample in enumerate(X_test):
    # Indices of the k training samples nearest to this test sample.
    idx = np.argsort([euclidean_distance(test_sample, x) for x in X_train])[:k]
    # Index y_train with the index array directly. The previous comprehension
    # reused the name `i`, shadowing the loop index; on Python 2 that name
    # leaks out of the comprehension and y_pred1[i] is written to the wrong slot.
    knn = y_train[idx]
    # Majority vote; argmax resolves ties in favour of the smallest label.
    counts = np.bincount(knn.astype('int'))
    y_pred1[i] = counts.argmax()

In [81]:
# Predicted class labels for the test samples (stored as floats).
y_pred

array([ 1.,  2.,  2.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  2.,
        2.,  0.,  1.,  2.,  1.,  2.,  2.,  2.,  0.,  2.,  2.,  2.,  1.,
        0.,  0.,  1.,  2.,  2.,  1.,  1.,  2.,  0.,  2.,  1.,  0.,  1.,
        0.,  2.,  0.,  2.,  0.,  1.,  1.,  2.,  2.,  2.])

In [83]:
# Predicted class labels from the second run of the loop.
y_pred1

array([ 1.,  2.,  2.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  2.,
        2.,  0.,  1.,  2.,  1.,  2.,  2.,  2.,  0.,  2.,  2.,  2.,  1.,
        0.,  0.,  1.,  2.,  2.,  1.,  1.,  2.,  0.,  2.,  1.,  0.,  1.,
        0.,  2.,  0.,  2.,  0.,  1.,  1.,  2.,  2.,  2.])

In [84]:
# Element-wise difference between the two runs; all zeros means they agree.
y_pred - y_pred1

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])