In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
def flatten(container):
    """Lazily yield the leaf items of nested lists/tuples, depth-first.

    Only ``list`` and ``tuple`` members are descended into; every other
    value (including strings, sets, dicts and arrays) is yielded as-is.

    Parameters
    ----------
    container : iterable
        The (possibly nested) collection to walk.

    Yields
    ------
    object
        Each non-list, non-tuple item, in left-to-right order.
    """
    for element in container:
        if not isinstance(element, (list, tuple)):
            yield element
            continue
        # Nested sequence: delegate to a recursive generator for its leaves.
        yield from flatten(element)

In [3]:
def format_input(X):
    """Return ``X`` as a pandas object, wrapping raw data in a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, dict, or array-like
        * DataFrame / Series are returned unchanged (same object).
        * A dict is passed straight to ``pd.DataFrame`` so its keys become
          the column names.
        * Anything else (NumPy array, list of rows, flat sequence, ...) is
          wrapped in a DataFrame whose columns are named ``x0, x1, ...``.

    Returns
    -------
    pandas.DataFrame or pandas.Series
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        return X

    if isinstance(X, dict):
        return pd.DataFrame(X)

    # Build the frame first and name the columns from its actual width.
    # Reading ``X.shape[1]`` up front fails for plain lists (no ``.shape``)
    # and for 1-D arrays (no second axis).
    frame = pd.DataFrame(X)
    frame.columns = ['x' + str(i) for i in range(frame.shape[1])]
    return frame

In [4]:
def calculate_euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Points of matching (or broadcastable) shape. Plain lists and tuples
        are accepted as well as NumPy arrays.

    Returns
    -------
    float
        ``np.linalg.norm`` of the element-wise difference.
    """
    # np.subtract converts sequences to arrays, whereas ``a - b`` raises
    # TypeError for list/tuple operands.
    return np.linalg.norm(np.subtract(a, b))

In [5]:
def get_nearest_neighbors(X_train, y_train, new_row):
    """Pair every training label with its distance to ``new_row``.

    Despite the name, nothing is sorted or truncated here: the result
    has one entry per training sample, in training order.

    Parameters
    ----------
    X_train : iterable of array-like
        Training feature rows.
    y_train : iterable
        Labels, paired positionally with ``X_train`` via ``zip`` (so the
        longer of the two is silently truncated).
    new_row : array-like
        The query point.

    Returns
    -------
    list of tuple
        ``(distance, label)`` pairs.
    """
    return [
        (calculate_euclidean_distance(train_row, new_row), train_label)
        for train_row, train_label in zip(X_train, y_train)
    ]

In [6]:
def get_votes(X_train, y_train, new_row, k=5):
    """Majority-vote the class of ``new_row`` among its ``k`` nearest samples.

    Parameters
    ----------
    X_train : iterable of array-like
        Training feature rows.
    y_train : iterable
        Training labels, aligned with ``X_train``.
    new_row : array-like
        The query point.
    k : int, default 5
        Number of nearest neighbours that vote. If ``k`` exceeds the number
        of training samples, all of them vote. ``None`` is passed through to
        the slice, which also means "all samples".

    Returns
    -------
    tuple
        ``(majority_class, class_count)``: the winning label and how many of
        the voting neighbours carried it. When classes tie, the one whose
        neighbour appears first in the distance-sorted list wins
        (``Counter.most_common`` keeps first-encountered order for ties).

    Raises
    ------
    ValueError
        If ``k`` is less than 1, or if there are no training samples.
    """
    # A non-positive k would otherwise slice an empty or wrong neighbourhood:
    # ``[:0]`` is empty and ``[:-n]`` means "all but the farthest n".
    if k is not None and k < 1:
        raise ValueError(f'k must be a positive integer, got {k!r}')

    neighbors = get_nearest_neighbors(X_train, y_train, new_row)
    if not neighbors:
        raise ValueError('cannot vote without any training samples')

    # Sort on distance only, so labels never need to be comparable; the sort
    # is stable, so equidistant samples keep their training order.
    nearest = sorted(neighbors, key=lambda pair: pair[0])[:k]

    majority_class, class_count = Counter(
        label for _, label in nearest
    ).most_common(1)[0]

    return majority_class, class_count

In [7]:
def predict(X_train, y_train, X_test, k=5):
    """Predict a class label for every row of ``X_test`` with k-NN voting.

    Parameters
    ----------
    X_train : iterable of array-like
        Training feature rows.
    y_train : iterable
        Training labels, aligned with ``X_train``.
    X_test : iterable of array-like
        Rows to classify.
    k : int, default 5
        Number of neighbours consulted per row (forwarded to ``get_votes``).

    Returns
    -------
    numpy.ndarray
        One predicted label per test row, in input order.
    """
    # get_votes returns (label, count); only the label is kept.
    predicted_labels = [
        get_votes(X_train, y_train, test_row, k)[0] for test_row in X_test
    ]
    return np.array(predicted_labels)

In [8]:
def score(y_test, y_pred):
    """Return classification accuracy as a percentage in ``[0, 100]``.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels; must have the same length as ``y_test``.

    Returns
    -------
    float
        Share of positions where the two label sequences agree, times 100.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(y_test) == len(y_pred)

    # Coerce to arrays so the comparison is element-wise. For plain lists
    # ``==`` yields a single bool, which has no ``.mean()``.
    true_labels = np.asarray(y_test)
    predicted_labels = np.asarray(y_pred)

    return (true_labels == predicted_labels).mean() * 100

In [9]:
# Load iris as a (features, labels) pair instead of a Bunch object.
X, y = load_iris(return_X_y=True)

In [10]:
# Sanity-check dimensions: feature matrix first, label vector second.
print(X.shape, y.shape, sep='\n')

(150, 4)
(150,)


In [11]:
# Hold out 10% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [12]:
# Confirm the split sizes: training features first, test features second.
print(X_train.shape, X_test.shape, sep='\n')

(135, 4)
(15, 4)


In [13]:
# Preview the first ten training samples as "features -> label".
for row_idx in range(10):
    features, label = X_train[row_idx], y_train[row_idx]
    print(f'{features} -> {label}')

[4.9 3.6 1.4 0.1] -> 0
[6.  2.9 4.5 1.5] -> 1
[5.5 2.6 4.4 1.2] -> 1
[4.8 3.  1.4 0.3] -> 0
[5.4 3.9 1.3 0.4] -> 0
[5.6 2.8 4.9 2. ] -> 2
[5.6 3.  4.5 1.5] -> 1
[4.8 3.4 1.9 0.2] -> 0
[4.4 2.9 1.4 0.2] -> 0
[6.2 2.8 4.8 1.8] -> 2


In [14]:
# Predict with the default k=5. When the training set is scored against
# itself, every query row is also one of its own neighbours (distance 0).
train_pred = predict(X_train, y_train, X_train)
# Held-out rows are classified using the training set only.
test_pred = predict(X_train, y_train, X_test)

In [15]:
# Accuracy (in percent) of the predictions on each split.
train_acc = score(y_train, train_pred)
test_acc = score(y_test, test_pred)

In [16]:
# Report both accuracies rounded to two decimal places.
print(f'Train Accuracy: {np.round(train_acc, 2)}')
print(f'Test Accuracy: {np.round(test_acc, 2)}')

Train Accuracy: 95.56
Test Accuracy: 100.0
