In [60]:
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [61]:
# Load the Iris dataset bundled with scikit-learn and split it into the
# feature matrix (X) and the class labels (y).
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Last expression of the cell: display the shape of the feature matrix.
X.shape

(150, 4)

In [62]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [63]:
class my_KNN:
  """k-nearest-neighbours classifier using Euclidean distance and a majority vote.

  KNN is a lazy learner: ``fit`` only stores the training data, and every
  distance computation happens at prediction time.

  Attributes:
    k: Number of nearest training samples that take part in the vote.
    X: Training samples stored by ``fit`` (one sample per row).
    y: Training labels stored by ``fit`` (one label per sample).
  """

  def __init__(self,k=3):
    """Create a classifier that votes among the ``k`` closest training samples."""
    self.k=k

  def distance(self,x1,x2):
    """Return the Euclidean (L2) distance between two feature vectors."""
    # np.asarray lets callers pass plain Python sequences; subtracting two
    # lists directly would raise a TypeError.
    diff = np.asarray(x1) - np.asarray(x2)
    return np.sqrt(np.sum(diff**2))

  # since knn is a lazy learner it only takes in the data during training
  def fit(self,X ,y):
    """Store the training samples ``X`` and their labels ``y``.

    Both inputs are converted with ``np.asarray`` so that nested lists (and
    other array-likes) are iterated row by row and indexed by position.
    """
    self.X = np.asarray(X)
    self.y = np.asarray(y)

  def predict(self,X):
    """Predict a label for every sample in ``X``.

    Returns:
      A numpy array holding one predicted label per input sample.
    """
    y_predict = [self.predict_new(sample) for sample in np.asarray(X)]
    return np.array(y_predict)

  def predict_new(self,x):
    """Predict the label of the single sample ``x`` by majority vote."""
    # Compute distances between x and all examples in the training set
    distances = [self.distance(x, x_train) for x_train in self.X]

    # Sort by distance and keep the indices of the first k neighbours.
    # A stable sort makes equal distances keep their training-set order,
    # so the chosen neighbours do not depend on the sort implementation.
    k_nearest = np.argsort(distances, kind="stable")[:self.k]

    # Extract the labels of the k nearest neighbor training samples
    k_nearest_labels = [self.y[i] for i in k_nearest]

    # most_common(1) yields [(label, count)]; labels with equal counts are
    # ordered by first appearance, i.e. the nearer neighbour's label wins.
    majority_element = Counter(k_nearest_labels).most_common(1)
    return majority_element[0][0]


In [64]:
class KNN:
    """k-nearest-neighbours classifier over rows shaped ``[features..., label]``.

    Both the training and the test data are 2-D, with the label stored in the
    last column of every row; the test rows' last column is ignored when
    predicting.

    Attributes:
        k: Number of nearest training rows that take part in the vote.
        distance_metric: ``"Euclidean"`` (L2 norm) or ``"Manhattan"`` (L1 norm);
            the name is matched case-insensitively.
    """

    # Lower-cased metric name -> norm order handed to np.linalg.norm.
    _NORM_ORDERS = {"euclidean": 2, "manhattan": 1}

    def __init__(self, k=3, distance_metric="Euclidean"):
        """Configure the classifier.

        Raises:
            ValueError: If ``k`` is smaller than 1 or ``distance_metric`` is
                not a supported metric name.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k
        # Keep the caller's choice; it used to be overwritten with "Euclidean".
        self.distance_metric = distance_metric
        # Fail fast on an unsupported metric instead of at prediction time.
        self.__norm_order()

    def __norm_order(self):
        """Return the norm order for the configured metric, or raise ValueError."""
        order = self._NORM_ORDERS.get(str(self.distance_metric).lower())
        if order is None:
            raise ValueError("Please enter a valid distance metric")
        return order

    def __distance(self, x, y):
        """Return the distance between two feature vectors under the configured metric."""
        # Casting to float keeps the norm usable when the rows come from an
        # object/string array (e.g. numeric features next to text labels).
        diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
        return np.linalg.norm(diff, self.__norm_order())

    def __calc_all_dist(self, train_data, point):
        """Return ``[distance, label]`` for every training row relative to ``point``."""
        # Distance of a given test_point from all the training points/samples
        return [[self.__distance(row[:-1], point), row[-1]] for row in train_data]

    # The idea is to get the first k distances from the sorted distance list and find the
    # maximum occurring category from this list
    def __calc_votes(self, train_data, point):
        """Return the most common label among the ``k`` nearest training rows."""
        distances = self.__calc_all_dist(train_data, point)
        # Sort on the distance only: the sort is stable, so equal distances
        # keep their training order and labels are never compared.
        nearest = sorted(distances, key=lambda pair: pair[0])[:self.k]
        votes = [label for _, label in nearest]
        # We want the (1st) most common category among the k nearest neighbors
        return Counter(votes).most_common(1)[0][0]

    # Calculate votes for all the test points with respect to the training data
    def predict(self, train_data, test_data, test_size=0.2):
        """Predict a label for every row of ``test_data``.

        Args:
            train_data: 2-D array-like; the last column holds the labels.
            test_data: 2-D array-like with the same layout; its last column
                is dropped before distances are computed.
            test_size: Unused; kept so existing callers keep working.

        Returns:
            A list with one predicted label per test row.
        """
        train_data = np.asarray(train_data)
        test_data = np.asarray(test_data)
        if len(np.unique(train_data[:, -1])) > self.k:
            warnings.warn("Hey! The number of categories is greater than k. Please increase the value of k")
        return [self.__calc_votes(train_data, point) for point in test_data[:, :-1]]


In [65]:
def accuracy(y_true, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y_true: Array-like of ground-truth labels.
    y_pred: Array-like of predicted labels, same shape as ``y_true``.

  Returns:
    Number of matching positions divided by ``len(y_true)``.

  Raises:
    ValueError: If the two inputs do not have the same shape.
  """
  # Convert first: comparing two plain lists with == yields a single bool,
  # which would make the result 0 or 1/len instead of the true fraction.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  if y_true.shape != y_pred.shape:
    raise ValueError("y_true and y_pred must have the same shape")
  return np.sum(y_true == y_pred) / len(y_true)

In [66]:
# Fit the hand-written classifier on the training split and score it on the
# held-out test split.
k = 4
classifier = my_KNN(k=k)
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)
knn_accuracy = accuracy(y_test, predictions)
print("KNN classification accuracy", knn_accuracy)

KNN classification accuracy 1.0
