<a href="https://colab.research.google.com/github/akansha-mehrotra/ML/blob/main/KNN_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets
from sklearn.model_selection import train_test_split , KFold
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from collections import Counter

In [8]:
# Load the Iris dataset and place the feature columns and the class label
# side by side in a single DataFrame (the label is stored in 'target').
iris = datasets.load_iris()
iris_df = pd.DataFrame(
    data=np.c_[iris.data, iris.target],
    columns=[*iris.feature_names, 'target'],
)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [9]:
# Features are every column except the last; the last column is the label.
x = iris_df[iris_df.columns[:-1]]
y = iris_df[iris_df.columns[-1]]

In [10]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility) and
# convert every split to a NumPy array so later code can index by position.
x_train, x_test, y_train, y_test = (
    np.asarray(split)
    for split in train_test_split(x, y, test_size=0.2, shuffle=True, random_state=0)
)


In [11]:
def distance_ecu(x_train, x_test_point):
    """Compute the Euclidean distance from one query point to every training row.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Training feature matrix.
    x_test_point : array-like of shape (n_features,)
        The single point to measure distances from.

    Returns
    -------
    pandas.DataFrame
        A frame with one column, ``'dist'``, and a default integer index;
        row ``i`` holds the distance between ``x_train[i]`` and
        ``x_test_point``.

    Raises
    ------
    ValueError
        If ``x_train`` is not two-dimensional, or if ``x_test_point`` does not
        have exactly one value per training feature.
    """
    train = np.asarray(x_train, dtype=float)
    point = np.asarray(x_test_point, dtype=float)

    # An empty training set (e.g. ``[]``) yields an empty distance table that
    # still exposes the 'dist' column expected by downstream code.
    if train.ndim == 1 and train.size == 0:
        return pd.DataFrame(data=np.empty(0, dtype=float), columns=['dist'])

    if train.ndim != 2:
        raise ValueError(
            "x_train must be 2-D (n_samples, n_features), got shape %s" % (train.shape,)
        )
    # Explicit check: broadcasting would otherwise accept a length-1 point, and
    # a point with surplus features must not be silently truncated.
    if point.ndim != 1 or point.shape[0] != train.shape[1]:
        raise ValueError(
            "x_test_point must have shape (%d,), got %s" % (train.shape[1], point.shape)
        )

    # Vectorized form of sqrt(sum_j (train[i, j] - point[j]) ** 2) for all i.
    squared_diffs = (train - point) ** 2
    distances = np.sqrt(squared_diffs.sum(axis=1))

    # Store distances in a dataframe
    return pd.DataFrame(data=distances, columns=['dist'])

In [23]:
def nearest_neighbors(distance_point, K):
    """Select the K rows with the smallest value in the ``'dist'`` column.

    Parameters
    ----------
    distance_point : pandas.DataFrame
        Frame with a ``'dist'`` column.
    K : int
        Number of neighbours to keep; must be at least 1. If K exceeds the
        number of rows, every row is returned.

    Returns
    -------
    pandas.DataFrame
        The K closest rows, ordered by increasing distance, with their
        original index labels preserved.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1.
    """
    if K < 1:
        raise ValueError("K must be a positive integer, got %r" % (K,))

    # A stable sort keeps equal distances in their original row order, so the
    # neighbour that makes the cut among ties is deterministic.
    ranked = distance_point.sort_values(by='dist', axis=0, kind='mergesort')

    ## Take only the first K neighbors
    return ranked.iloc[:K]

In [22]:
def voting(df_nearest, y_train):
    """Return the most frequent training label among the selected neighbours.

    Parameters
    ----------
    df_nearest : pandas.DataFrame
        Nearest-neighbour table whose index values are used as row positions
        into ``y_train``.
    y_train : array-like of shape (n_samples,)
        Training labels. Lists, NumPy arrays and pandas Series are accepted;
        lookup is always by position, never by Series index label.

    Returns
    -------
    The label with the highest count. On a tie, the label that appears first
    in ``df_nearest`` (i.e. in the order of its rows) wins.

    Raises
    ------
    ValueError
        If ``df_nearest`` contains no rows.
    """
    if len(df_nearest) == 0:
        raise ValueError("voting requires at least one neighbour")

    # Coerce to an array so indexing is positional; a Series with a shuffled
    # index would otherwise be looked up by label.
    labels = np.asarray(y_train)
    neighbor_positions = df_nearest.index.to_numpy()

    ## Use the Counter Object to get the labels with K nearest neighbors.
    counter_vote = Counter(labels[neighbor_positions])

    # Majority Voting: only the single most common label is needed.
    return counter_vote.most_common(1)[0][0]

In [21]:
def KNN_from_scratch(x_train, y_train, x_test, K):
    """Classify every row of ``x_test`` with a K-nearest-neighbours vote.

    For each test point the function computes its distance to all training
    rows (``distance_ecu``), keeps the K closest (``nearest_neighbors``) and
    returns the majority label among them (``voting``).

    Parameters
    ----------
    x_train : array-like of shape (n_train, n_features)
        Training feature matrix.
    y_train : array-like of shape (n_train,)
        Training labels, aligned row-for-row with ``x_train``.
    x_test : array-like of shape (n_test, n_features)
        Points to classify.
    K : int
        Number of neighbours that vote; must be at least 1.

    Returns
    -------
    list
        Predicted label for each row of ``x_test``, in order.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1 or ``x_train`` and ``y_train`` differ in
        length.
    """
    # Coerce to arrays: iterating a DataFrame yields column names, not rows,
    # and a Series would be indexed by label instead of position.
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    if K < 1:
        raise ValueError("K must be a positive integer, got %r" % (K,))
    if len(x_train) != len(y_train):
        raise ValueError(
            "x_train and y_train must have the same length, got %d and %d"
            % (len(x_train), len(y_train))
        )

    y_pred = []

    ## Loop over all the test set and perform the three steps
    for x_test_point in x_test:
        distance_point = distance_ecu(x_train, x_test_point)    ## Step 1
        df_nearest_point = nearest_neighbors(distance_point, K)  ## Step 2
        y_pred_point = voting(df_nearest_point, y_train)         ## Step 3
        y_pred.append(y_pred_point)

    return y_pred

In [14]:
# Normalizer rescales each sample (row) to unit norm independently; it does
# not learn per-feature statistics, so fitting on the training set is a
# formality that keeps the usual fit/transform workflow.
scaler = Normalizer()
normalized_x_train = scaler.fit_transform(x_train)  # fit, then rescale the training rows
normalized_x_test = scaler.transform(x_test)        # rescale the test rows the same way

In [17]:
K = 3  # number of neighbours that vote on each prediction
y_pred_scratch = KNN_from_scratch(
    x_train=normalized_x_train,
    y_train=y_train,
    x_test=normalized_x_test,
    K=K,
)

In [18]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred_scratch)

0.9666666666666667

In [19]:
# Print a three-column table comparing each true label with its prediction.
# Iterating both sequences with zip replaces the manual index counter.
row_format = '%-25s %-25s %-25s'
print(row_format % ('Original Label', 'Predicted Label', 'Correct/Wrong'))
for label, predicted in zip(y_test, y_pred_scratch):
    verdict = 'Correct' if label == predicted else 'Wrong'
    print(row_format % (label, predicted, verdict))

Original Label            Predicted Label           Correct/Wrong            
2.0                       2.0                       Correct                  
1.0                       1.0                       Correct                  
0.0                       0.0                       Correct                  
2.0                       2.0                       Correct                  
0.0                       0.0                       Correct                  
2.0                       2.0                       Correct                  
0.0                       0.0                       Correct                  
1.0                       1.0                       Correct                  
1.0                       1.0                       Correct                  
1.0                       1.0                       Correct                  
2.0                       2.0                       Correct                  
1.0                       1.0                       Correct     

In [20]:
# Rows are true classes, columns are predicted classes.
# `metrics` is already imported in the notebook's import cell.
metrics.confusion_matrix(y_test, y_pred_scratch)

array([[11,  0,  0],
       [ 0, 12,  1],
       [ 0,  0,  6]])