<a href="https://colab.research.google.com/github/arjun1131/Building-ML-models-from-Stratch/blob/main/KNN_Classifier_from_Stratch_using_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#K-Nearest Neighbor(KNN) Algorithm

K-NN algorithm stores all the available data and classifies a new data point based on the similarity. This means when new data appears then it can be easily classified into a well suite category by using K- NN algorithm. <br>
K-NN is a non-parametric algorithm, which means it does not make any assumption on underlying data.<br>
It is also called a lazy learner algorithm because it does not learn from the training set immediately instead it stores the dataset and at the time of classification, it performs an action on the dataset.

In [6]:
#Importing Libraries

import numpy as np
import statistics

In [45]:
#Creating class for K-NN Classifier
class knn():

  #Hyper parameters initilization function
  def __init__(self , distance_metric):

    self.distance_metric = distance_metric

  #Distance calculation function
  def distance(self , train_data_point , test_data_point):

    #Calculating Euclidean Distance
    if (self.distance_metric == 'Euclidean'):

      dist = 0

      for i in range(len(train_data_point) - 1):            
        dist = dist + (train_data_point[i] - test_data_point[i]) ** 2   #Formula for calculating Euclidean distance
      euclidean_distance = np.sqrt(dist)

      return euclidean_distance

    #Calculating Manhattan Distance
    elif (self.distance_metric == 'Manhattan'):
      
      dist = 0

      for i in range(len(train_data_point) - 1):
        dist = dist + abs(train_data_point[i] - test_data_point[i])    #Formula for calculating Manhattan distance
      manhattan_distance = dist

      return manhattan_distance

  #Function for finding nearest neighbor 
  def nearest_neighbor(self , x_train , test_data , k):

    dist_list = []
    #Calling distance function to find Distance between given test point and train points 
    for training_data in x_train:

      distance = self.distance(training_data , test_data)
      dist_list.append((training_data , distance))

    #Sorting neighbor data points with respect to distance  
    dist_list.sort (key = lambda x: x[1])

    neighbors_list = []

    #Taking K nearest data points 
    for i in range(k):

      neighbors_list.append(dist_list[i][0])

    return neighbors_list

  #Prediction function
  def predict(self , x_train , test_data , k):

    neighbors = self.nearest_neighbor(x_train , test_data , k)

    for i in neighbors:
      label = []
      label.append(i[-1])

    class_pred = statistics.mode(label)

    return class_pred





#Diabetes Prediction using K-NN Classifier

In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [28]:
#Loading data from CSV into Pandas Dataframe
diabetes_data = pd.read_csv('/content/drive/MyDrive/ML Datasets/diabetes.csv')

In [29]:
#Validating dataframe by using head function of Pandas
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [30]:
#Finding rows , columns of dataframe
diabetes_data.shape

(768, 9)

In [31]:
#Splitting features & labels
x = diabetes_data.drop(columns = 'Outcome', axis = 1)
y = diabetes_data['Outcome']

In [32]:
#Converting into numpy array
x = x.to_numpy()
y = y.to_numpy()

In [33]:
#Splitting train & test data 
x_train , x_test , y_train , y_test = train_test_split(x, y, test_size = 0.2, stratify = y ,random_state = 2) 

In [34]:
print(x_train.shape,x_test.shape)

(614, 8) (154, 8)


In [35]:
print(x_train)

[[0.00e+00 1.19e+02 0.00e+00 ... 3.24e+01 1.41e-01 2.40e+01]
 [6.00e+00 1.05e+02 7.00e+01 ... 3.08e+01 1.22e-01 3.70e+01]
 [1.00e+00 1.89e+02 6.00e+01 ... 3.01e+01 3.98e-01 5.90e+01]
 ...
 [1.10e+01 8.50e+01 7.40e+01 ... 3.01e+01 3.00e-01 3.50e+01]
 [4.00e+00 1.12e+02 7.80e+01 ... 3.94e+01 2.36e-01 3.80e+01]
 [0.00e+00 8.60e+01 6.80e+01 ... 3.58e+01 2.38e-01 2.50e+01]]


In [36]:
x_train = np.insert(x_train, 8, y_train, axis=1)

In [37]:
print(x_train)

[[0.00e+00 1.19e+02 0.00e+00 ... 1.41e-01 2.40e+01 1.00e+00]
 [6.00e+00 1.05e+02 7.00e+01 ... 1.22e-01 3.70e+01 0.00e+00]
 [1.00e+00 1.89e+02 6.00e+01 ... 3.98e-01 5.90e+01 1.00e+00]
 ...
 [1.10e+01 8.50e+01 7.40e+01 ... 3.00e-01 3.50e+01 0.00e+00]
 [4.00e+00 1.12e+02 7.80e+01 ... 2.36e-01 3.80e+01 0.00e+00]
 [0.00e+00 8.60e+01 6.80e+01 ... 2.38e-01 2.50e+01 0.00e+00]]


In [38]:
x_train.shape

(614, 9)

In [39]:
print(x_train[:,-1])

[1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1.
 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1.
 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0.
 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0.
 0. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0.
 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0.

#Model Training

In [46]:
#Creating instance of model
model = knn(distance_metric = 'Manhattan')

In [47]:
prediction = model.predict(x_train, x_test[2], k=5)

In [49]:
print(x_test[2])

[  5.    147.     78.      0.      0.     33.7     0.218  65.   ]


In [50]:
print(y_test[2])

0


In [51]:
print(prediction)

0.0


In [52]:
x_test.shape

(154, 8)

In [53]:
x_test_size = x_test.shape[0]
print(x_test_size)

154


In [59]:
#Predicting labels for Test data
y_pred = []

for i in range(x_test_size):
  prediction = model.predict(x_train, x_test[i], k=5)
  y_pred.append(prediction)

In [60]:
#Printing predicted labels
print(y_pred)

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]


#Model Evaluation

In [58]:
#Calculating accuracy in % for test data
accuracy = accuracy_score(y_test, y_pred)
print(accuracy*100)

74.02597402597402
