<a href="https://colab.research.google.com/github/aipankajmishra/ML-from-scratch/blob/master/KNN_ipnb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [None]:
import sys
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris,load_boston
from sklearn.model_selection import train_test_split

# **Utility functions**

In [None]:
def most_common_element(lst):
	"""Return the value that occurs most often in ``lst``.

	When several values share the highest count, the one that appears first
	in ``lst`` wins, so the result does not depend on hash/set ordering.

	Args:
		lst: Non-empty iterable of hashable values.

	Returns:
		The most frequent element of ``lst``.

	Raises:
		ValueError: If ``lst`` is empty.
	"""
	counts = Counter(lst)
	if not counts:
		raise ValueError("most_common_element() requires a non-empty sequence")
	# most_common() keeps equal counts in first-encountered order, which gives
	# the deterministic tie-break described above.
	return counts.most_common(1)[0][0]

def euclidean_distance(test_instance, train_instance):
	"""Return the Euclidean (L2) distance between two feature vectors.

	Args:
		test_instance: Array-like of numeric feature values.
		train_instance: Array-like of numeric feature values with the same
			shape as ``test_instance``.

	Returns:
		The square root of the summed squared differences.

	Raises:
		ValueError: If the two inputs do not have the same shape.
	"""
	a = np.asarray(test_instance, dtype=float)
	b = np.asarray(train_instance, dtype=float)
	# Reject mismatched inputs explicitly: an element-by-element loop over
	# test_instance would silently ignore extra features in train_instance.
	if a.shape != b.shape:
		raise ValueError(
			"shape mismatch: {} vs {}".format(a.shape, b.shape)
		)
	return np.sqrt(np.sum((a - b) ** 2))

def accuracy_score(ypred, y_test):
	"""Return the fraction of predictions that equal the true labels.

	Args:
		ypred: Array-like of predicted labels.
		y_test: Array-like of true labels, same shape as ``ypred``.

	Returns:
		Number of matching positions divided by ``len(y_test)``.

	Raises:
		ValueError: If ``ypred`` and ``y_test`` have different shapes.
	"""
	# Coerce to arrays so that plain lists are compared element-wise; `==` on
	# two lists yields a single bool for the whole list instead.
	predicted = np.asarray(ypred)
	actual = np.asarray(y_test)
	if predicted.shape != actual.shape:
		raise ValueError(
			"shape mismatch: {} predictions vs {} labels".format(
				predicted.shape, actual.shape
			)
		)
	return np.sum(predicted == actual) / len(actual)

In [None]:
class KNN:
	"""k-nearest-neighbours classifier.

	Each query sample is labelled by a vote among the ``k`` training samples
	with the smallest distance to it, as measured by ``euclidean_distance``.
	The vote itself is delegated to ``most_common_element``.
	"""

	def __init__(self,k = 3):
		"""Create a classifier that votes among ``k`` neighbours.

		Args:
			k: Number of nearest neighbours to consult; a positive integer.

		Raises:
			ValueError: If ``k`` is not a positive integer.
		"""
		# bool is a subclass of int, so exclude it explicitly.
		if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
			raise ValueError("k must be a positive integer, got {!r}".format(k))
		self.k = k

	def fit(self, X, y):
		"""Store the training data; no model parameters are computed.

		Args:
			X: Array-like of training samples, one sample per row.
			y: Array-like of labels, one per row of ``X``.

		Raises:
			ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
		"""
		# Convert to arrays so that label_assign can index the labels with an
		# array of neighbour positions even when plain lists were passed in.
		X = np.asarray(X)
		y = np.asarray(y)
		if len(X) != len(y):
			raise ValueError(
				"X and y must have the same length, got {} and {}".format(len(X), len(y))
			)
		if len(X) == 0:
			raise ValueError("cannot fit on an empty training set")
		self.X_train = X
		self.y_train = y


	def label_assign(self, closest_k):
		"""Return the winning label among the given training samples.

		Args:
			closest_k: Integer indices into the training set.

		Returns:
			The label chosen by ``most_common_element`` from the labels at
			those indices.
		"""
		neighbour_labels = self.y_train[closest_k].tolist()
		return most_common_element(neighbour_labels)


	def _predict(self, x):
		"""Predict the label of a single instance.

		The distance from ``x`` to every training sample is computed, the
		``k`` closest samples are selected, and their labels are put to a
		vote. If fewer than ``k`` training samples exist, all of them vote.

		Args:
			x: One sample with the same number of features as the training rows.

		Returns:
			The label assigned to ``x``.
		"""
		distances = [
			euclidean_distance(x, train_record) for train_record in self.X_train
		]

		# A stable sort ranks equidistant neighbours by their position in the
		# training set, so the selected k are reproducible from run to run.
		closest_k = np.argsort(distances, kind="stable")[:self.k]

		return self.label_assign(closest_k)


	def predict(self, X_test):
		"""Predict a label for every sample in ``X_test``.

		Args:
			X_test: Iterable of samples, one per row.

		Returns:
			NumPy array with one predicted label per sample, in input order.
		"""
		return np.asarray([self._predict(x) for x in X_test])

In [None]:
def main():
  """Fit the KNN classifier on a train split of Iris and print test predictions and accuracy."""
  print("Running the KNN algorithm implementation from scratch")

  # Iris gives us a classification problem: a feature matrix and class labels.
  features, labels = load_iris(return_X_y = True)

  # Hold out 33% of the samples for evaluation; the fixed seed keeps the split repeatable.
  X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size = 0.33, random_state = 42
  )

  # With the split in hand, fit the KNN implementation on the training part
  # and predict labels for the held-out part.
  model = KNN(k = 10)
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)

  print("The predictions are - ")
  print(predictions)

  print("The accuracy is - ")
  print(accuracy_score(predictions, y_test))


# Run the demo only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
	main()

Running the KNN algorithm implementation from scratch
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]]
The predictions are - 
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0 1 1 2 1 2]
The accuracy is - 
0.98
