In [2]:
import numpy as np
import pandas as pd
# Location of the raw Iris data file on the UCI repository.
IRIS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# header=None: do not take column names from the first row; pandas assigns
# integer column labels instead (0..4 in the output shown further down).
df = pd.read_csv(IRIS_URL, header=None)

In [35]:
# sample of 20%
# df1 is a random 20% of the rows (used as the test set below); df2 is every
# remaining row (used as the training set).  The fixed random_state makes the
# split -- and every number derived from it -- reproducible after a kernel
# restart.
df1 = df.sample(frac=0.2, random_state=42)
df2 = df.drop(df1.index)

# The two parts must partition the original frame.
assert len(df1) + len(df2) == len(df)

# Only the last expression of a cell is displayed, so show just df2.
df2

Unnamed: 0,0,1,2,3,4
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [36]:
# The first four columns are used as features and the last column as the label.
# Slicing the frame directly (instead of iterating over the mixed-type
# df.values rows) gives float feature arrays rather than dtype=object ones,
# so the vectorised distance computations run on native floats.
x_test = df1.iloc[:, 0:4].to_numpy(dtype=float)
y_test = df1.iloc[:, -1].to_numpy(dtype=str)

x_train = df2.iloc[:, 0:4].to_numpy(dtype=float)
y_train = df2.iloc[:, -1].to_numpy(dtype=str)

In [46]:
def euclidean(X_train, X_test):
    """Return the matrix of Euclidean distances between two sets of rows.

    Entry ``[i, j]`` of the result is the distance between ``X_train[i]``
    and ``X_test[j]``, so for 2-D inputs the result has shape
    ``(len(X_train), len(X_test))``.
    """
    # Insert a length-1 axis after the first one so that the subtraction
    # broadcasts every training row against every test row.
    deltas = np.expand_dims(X_train, axis=1) - X_test
    squared_sums = (deltas ** 2).sum(axis=2)
    return squared_sums ** .5

# Benchmark of naive_distance(..., euclidean_numpy) against the broadcast-based
# euclidean() defined above, on random data.
# NOTE(review): `timeit`, `naive_distance` and `euclidean_numpy` are not
# imported or defined in the visible cells -- presumably they come from cells
# that are not shown; confirm this cell still runs after Restart & Run All.
# NOTE(review): the random inputs are unseeded, so the timings vary per run.
X_tr = np.random.random((120, 4))
X_te = np.random.random((30, 4))

# number=100: each callable is executed 100 times (assuming `timeit` is
# timeit.timeit, the returned value is the total time for those 100 calls).
time_naive = timeit(lambda: naive_distance(X_tr, X_te, euclidean_numpy), number=100)
time_numpy = timeit(lambda: euclidean(X_tr, X_te), number=100)
print("Naive", time_naive)
print("Numpy", time_numpy)
# How many times slower the naive version is than the broadcast one.
print("Ratio", time_naive / time_numpy)

Naive 1.168634375000238
Numpy 0.011109749999832275
Ratio 105.18997952410099


In [64]:
from collections import Counter

def majority_voting(votes):
    """Return the value that occurs most often in ``votes``.

    ``votes`` is any iterable of hashable items.  An empty iterable raises
    ``IndexError`` because there is no top entry to pick.
    """
    tally = Counter(votes)
    # most_common(1) yields a one-element list holding the (value, count)
    # pair of the top vote; only the value is of interest.
    winner, _ = tally.most_common(1)[0]
    return winner

class KNearestNeighbors:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.
    distance_metric : str, default "euclidean"
        One of "euclidean", "cosine" or "manhattan".  The value is only
        checked when predict() is called.
    """

    def __init__(self, k, distance_metric="euclidean"):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X_train, y_train):
        """Store the training data and precompute what the metrics reuse.

        X_train is expected to be 2-D (one row per sample) and y_train to
        hold one label per row of X_train.
        """
        # Store the arguments themselves rather than any same-named notebook
        # global, so the model is trained on the data it was actually given.
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)

        # Extra axis so that `X_train_reshaped - X_test` broadcasts every
        # training row against every test row.
        self.X_train_reshaped = np.expand_dims(self.X_train, 1)
        # Column vector of per-row L2 norms, used by the cosine metric.
        self.X_train_norm = ((self.X_train ** 2).sum(axis=1) ** .5).reshape(-1, 1)

    def _euclidean(self, X_test):
        """Return the (n_train, n_test) matrix of Euclidean distances."""
        X_diff = self.X_train_reshaped - X_test
        dist_matrix = ((X_diff ** 2).sum(axis=2)) ** .5
        return dist_matrix

    def _cosine(self, X_test):
        """Return the (n_train, n_test) matrix of ``1 - |cosine similarity|``.

        Because of the absolute value, vectors pointing in exactly opposite
        directions get distance 0, just like vectors pointing the same way.
        Rows with zero norm lead to a division by zero.
        """
        X_test_norm = (X_test ** 2).sum(axis=1) ** .5
        # Dot products between the fitted training rows and the test rows
        # passed to this call.
        dot_prods = self.X_train @ X_test.T
        # (n_train, n_test) / (n_train, 1) / (n_test,) broadcasts so that each
        # dot product is divided by both of its vectors' norms.
        dist_matrix = 1 - abs(dot_prods / self.X_train_norm / X_test_norm)
        return dist_matrix

    def _manhattan(self, X_test):
        """Return the (n_train, n_test) matrix of Manhattan (L1) distances."""
        X_diff = self.X_train_reshaped - X_test
        dist_matrix = abs(X_diff).sum(axis=2)
        return dist_matrix

    def predict(self, X_test):
        """Predict a label for every row of X_test.

        Returns a 1-D array with one predicted label per test row.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not one of the supported names.
        """
        X_test = np.asarray(X_test)
        if self.distance_metric == "euclidean":
            dist_matrix = self._euclidean(X_test)
        elif self.distance_metric == "cosine":
            dist_matrix = self._cosine(X_test)
        elif self.distance_metric == "manhattan":
            dist_matrix = self._manhattan(X_test)
        else:
            raise ValueError("Unknown distance metric")
        # Sort training indices by distance within each test column, keep the
        # k closest, and transpose to get one row of indices per test sample.
        knn = dist_matrix.argsort(axis=0)[:self.k, :].T
        # Look the neighbour labels up once instead of once per test sample.
        neighbor_labels = self.y_train[knn]
        y_pred = np.array([majority_voting(labels) for labels in neighbor_labels])
        return y_pred

In [65]:
# Build a 3-neighbour classifier that uses the cosine metric, fit it on the
# training arrays and predict a label for every row of x_test.
knn_model = KNearestNeighbors(k=3, distance_metric="cosine")
knn_model.fit(x_train, y_train)
y_pred = knn_model.predict(x_test)
y_pred

array(['Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
       'Iris-setosa', 'Iris-virginica'], dtype='<U15')

In [66]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of entries of ``y_pred`` that equal ``y_true``.

    Both arguments may be numpy arrays or plain sequences; they are converted
    with ``np.asarray`` so the comparison is always element-wise.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape (otherwise numpy could
        silently broadcast them and report a meaningless score).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return (y_true == y_pred).sum() / len(y_true)
# Share of the test labels in y_test that y_pred reproduces exactly.
accuracy_score(y_test, y_pred)


1.0