# Implement K-nearest neighbors (KNN) from scratch

In [1]:
import numpy as np

In [2]:
class KNN:
    """K-nearest-neighbours classifier based on squared Euclidean distance.

    Each test sample is assigned the most frequent label among its ``k``
    closest training samples. Labels are cast to ``int`` and counted with
    ``np.bincount``, so they must be non-negative integers (or values that
    cast cleanly to them).
    """

    def __init__(self, k):
        """Store the neighbourhood size.

        Args:
            k: Number of nearest training samples that vote on a prediction.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # k == 0 leaves no voters (argmax over an empty bincount fails much
        # later), and a negative k would silently slice "all but the last
        # |k|" neighbours, so reject both up front.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def train(self, X, y):
        """Memorise the training set; KNN has no fitting step beyond that.

        Args:
            X: Training features, one row per sample.
            y: Training labels, one entry per row of ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` hold a different number of samples.
        """
        # Coerce to arrays so plain Python lists work with the broadcasting
        # and fancy indexing used at prediction time.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        """Return the predicted label for every row of ``X_test``.

        Args:
            X_test: Test features, one row per sample.

        Returns:
            A 1-D float array with one predicted label per test sample.
        """
        X_test = np.asarray(X_test)
        distances = self.compute_distance(X_test)
        return self.predict_labels(X_test, distances)

    def compute_distance(self, X_test):
        """Compute squared Euclidean distances between test and training rows.

        Args:
            X_test: Test features, one row per sample.

        Returns:
            Array of shape ``(num_test, num_train)`` whose entry ``[i, j]`` is
            the squared distance from test sample ``i`` to training sample
            ``j``.
        """
        X_test = np.asarray(X_test)
        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        distances = np.zeros((num_test, num_train))

        for i, sample in enumerate(X_test):
            # The square root is skipped: sqrt is monotonic, so it would not
            # change the ordering of the neighbours.
            distances[i, :] = np.sum((self.X_train - sample) ** 2, axis=1)

        return distances

    def predict_labels(self, X_test, distances):
        """Vote among the ``k`` nearest training labels for each test sample.

        Args:
            X_test: Test features; only the number of rows is used.
            distances: Array of shape ``(num_test, num_train)`` as returned by
                ``compute_distance``.

        Returns:
            A 1-D float array of predicted labels. When several labels tie for
            the most votes, the smallest label wins (first maximum of the
            bincount).
        """
        num_test = np.asarray(X_test).shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # If k exceeds the number of training samples the slice simply
            # keeps all of them.
            nearest = np.argsort(distances[i, :])[: self.k]
            votes = self.y_train[nearest].astype(int)
            y_pred[i] = np.argmax(np.bincount(votes))

        return y_pred

## Predict on Iris data

In [3]:
#import iris dataset
from sklearn import datasets

# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Keep every feature column of iris.data together with the class labels.
X, y = iris.data, iris.target

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. stratify=y is passed so the split
# is stratified on the labels, and random_state=1 fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Report how many samples of each class are in the full set and in each split.
for caption, labels in (
    ('Num of types in y:', y),
    ('Num of types in y_test:', y_test),
    ('Num of types in y_train:', y_train),
):
    print(caption, np.bincount(labels))

Num of types in y: [50 50 50]
Num of types in y_test: [15 15 15]
Num of types in y_train: [35 35 35]


In [9]:
# Fit the from-scratch classifier with 3 neighbours and predict the held-out set.
knn_cf = KNN(k=3)
knn_cf.train(X_train, y_train)
y_pred = knn_cf.predict(X_test)
# The mean of the boolean match mask is the fraction of correct predictions;
# np.mean stays vectorized, unlike the builtin sum, which walks the array
# element by element in Python.
print(f"Accuracy: {np.mean(y_pred == y_test)}")

Accuracy: 0.9777777777777777


## Predict using sklearn and compare

In [11]:
# Let's apply scikit-learn's KNN
from sklearn.neighbors import KNeighborsClassifier

# Fit scikit-learn's classifier with the same k=3 for a side-by-side comparison.
knn_cf_sk = KNeighborsClassifier(n_neighbors=3)
knn_cf_sk.fit(X_train, y_train)
y_pred_sk = knn_cf_sk.predict(X_test)
# The mean of the boolean match mask is the fraction of correct predictions;
# np.mean stays vectorized, unlike the builtin sum, which walks the array
# element by element in Python.
print(f"Accuracy: {np.mean(y_pred_sk == y_test)}")

Accuracy: 0.9777777777777777
