Importing required libraries

In [109]:
import numpy as np
import statistics

K-Nearest-Neighbours

In [110]:
class KNN_Classifier():
    def __init__(self, distance_metric):
        self.distance_metric = distance_metric

    # getting the distance metric
    def get_distance_metric(self,training_data_point, test_data_point):
        if (self.distance_metric == 'euclidean'):
            dist = 0
            for i in range(len(training_data_point) - 1):
                dist = dist + (training_data_point[i] - test_data_point[i])**2
            euclidean_dist = np.sqrt(dist)
            return euclidean_dist

        elif (self.distance_metric == 'manhattan'):
            dist = 0
            for i in range(len(training_data_point) - 1):
                dist = dist + abs(training_data_point[i] - test_data_point[i])

            manhattan_dist = dist

            return manhattan_dist

    # getting the nearest neighbors
    def nearest_neighbors(self,X_train, test_data, k):
        distance_list = []

        for training_data in X_train:
            distance = self.get_distance_metric(training_data, test_data)
            distance_list.append((training_data, distance))

        distance_list.sort(key=lambda x: x[1])

        neighbors_list = []

        for j in range(k):
            neighbors_list.append(distance_list[j][0])

        return neighbors_list


      # predict the class of the new data point:
    def predict(self,X_train, test_data, k):
        neighbors = self.nearest_neighbors(X_train, test_data, k)

        for data in neighbors:
            label = []
            label.append(data[-1])

        predicted_class = statistics.mode(label)

        return predicted_class

In [111]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [112]:
#reading data
df=pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [113]:
#sepearate features and target
x=df.drop(columns='Outcome',axis=1)
y=df['Outcome']

In [114]:
#converting data to numpy
x=x.to_numpy()
y=y.to_numpy()

In [115]:
#train test and split-split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [116]:
# xtrain contains features with target
#xtest without target and our function will predict that outcome
x_train=np.insert(x_train,8,y_train,axis=1) 

In [117]:
#model training

In [150]:
classifier=KNN_Classifier(distance_metric='manhattan')

In [151]:
#it will predict only one point at the time
prediction=classifier.predict(x_train,x_test[0],k=5)

In [152]:
print(x_test[0])

[  3.    106.     72.      0.      0.     25.8     0.207  27.   ]


In [153]:
print(y_test[0]) #true value

0


In [154]:
print(prediction) #predicted value

0.0


In [155]:
x_test.shape

(154, 8)

In [156]:
x_test_size = x_test.shape[0]
print(x_test_size)

154


In [157]:
#for predicting multiple data points

y_pred=[]

for i in range(x_test_size):
    prediction=classifier.predict(x_train,x_test[i],k=5)
    y_pred.append(prediction)

In [158]:
print(y_pred)

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]


In [159]:
y_true=y_test

Model evaluation

In [160]:
accuracy=accuracy_score(y_true,y_pred)

In [161]:
print(accuracy*100)

74.02597402597402
