In [1]:
import numpy as np
import statistics

In [2]:
class KNN_Classifier():
  """Brute-force k-nearest-neighbours classifier.

  Data layout expected by every method:

  * a training row holds the feature values followed by its class label
    as the LAST element;
  * a query (test) point holds feature values only. If it is longer than
    the number of training features, the extra trailing values are ignored.
  """

  # metric names understood by get_distance_metric
  _SUPPORTED_METRICS = ('euclidean', 'manhattan')

  # initiating the parameters
  def __init__(self, distance_metric):
    """Remember which metric to use: 'euclidean' or 'manhattan'.

    The name is validated when a distance is computed, not here.
    """
    self.distance_metric = distance_metric

  # getting the distance metric
  def get_distance_metric(self, training_data_point, test_data_point):
    """Return the distance between one training row and one query point.

    Args:
      training_data_point: feature values followed by the label (the label
        is excluded from the distance).
      test_data_point: feature values of the query point.

    Returns:
      The euclidean or manhattan distance as a NumPy float.

    Raises:
      ValueError: if the configured metric is not supported, or if the query
        point has fewer values than the training row has features.
    """
    if self.distance_metric not in self._SUPPORTED_METRICS:
      # Fail loudly instead of returning None, which would only surface
      # later as an unrelated error while sorting the distances.
      raise ValueError(
          "Unsupported distance_metric {!r}; expected one of {}".format(
              self.distance_metric, self._SUPPORTED_METRICS))

    # The last element of the training row is its label, not a feature.
    n_features = max(len(training_data_point) - 1, 0)
    train_features = np.asarray(training_data_point[:n_features], dtype=float)
    test_features = np.asarray(test_data_point[:n_features], dtype=float)

    if test_features.shape != train_features.shape:
      raise ValueError(
          "test_data_point must provide at least {} feature values".format(
              n_features))

    difference = train_features - test_features

    if self.distance_metric == 'euclidean':
      return np.sqrt(np.sum(difference ** 2))

    # manhattan
    return np.sum(np.abs(difference))

  # getting the nearest neighbors
  def nearest_neighbors(self, X_train, test_data, k):
    """Return the k training rows closest to test_data, nearest first.

    Args:
      X_train: iterable of training rows (features followed by the label).
      test_data: feature values of the query point.
      k: number of neighbours to return; must satisfy 1 <= k <= len(X_train).

    Returns:
      A list with k rows taken from X_train. Rows at equal distance keep
      their original relative order.

    Raises:
      ValueError: if k is outside the valid range.
    """
    rows = list(X_train)

    if k < 1 or k > len(rows):
      raise ValueError(
          "k must be between 1 and the number of training rows ({}); got {}".format(
              len(rows), k))

    scored = [(row, self.get_distance_metric(row, test_data)) for row in rows]
    # list.sort is stable, so ties are resolved by position in X_train.
    scored.sort(key=lambda pair: pair[1])

    return [row for row, _ in scored[:k]]

  # predict the class of the new data point:
  def predict(self, X_train, test_data, k):
    """Predict the label of test_data by majority vote of its k neighbours.

    Args:
      X_train: iterable of training rows (features followed by the label).
      test_data: feature values of the query point.
      k: number of neighbours that vote.

    Returns:
      The most common label among the k nearest rows, exactly as it is
      stored in the training rows. Ties are decided by statistics.mode
      (on Python 3.8+ it returns the first mode encountered, i.e. the one
      whose nearest representative is closest).

    Raises:
      ValueError: propagated from nearest_neighbors / get_distance_metric.
    """
    neighbors = self.nearest_neighbors(X_train, test_data, k)
    labels = [neighbor[-1] for neighbor in neighbors]

    return statistics.mode(labels)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Load the diabetes table from a CSV file located next to the notebook (path is relative to the working directory).
diabetes_dataset = pd.read_csv('./diabetes.csv') 

In [5]:
# Preview the first rows to check the columns; 'Outcome' is the class label used below.
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Separate the features (every column except 'Outcome') from the class label.
# `columns=` already selects the axis, so the redundant `axis=1` is not passed.
X = diabetes_dataset.drop(columns='Outcome')
Y = diabetes_dataset['Outcome']

In [7]:
# Convert to plain NumPy arrays for the hand-written classifier.
# np.asarray accepts a DataFrame/Series as well as an existing ndarray, so
# re-running this cell does not fail the way a second `.to_numpy()` call would.
X = np.asarray(X)
Y = np.asarray(Y)

In [8]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class proportions
# the same in both splits and random_state=42 makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

In [9]:
# Inspect the training features (the labels have not been attached at this point).
print(X_train)

[[1.00e+00 9.00e+01 6.20e+01 ... 2.72e+01 5.80e-01 2.40e+01]
 [5.00e+00 1.26e+02 7.80e+01 ... 2.96e+01 4.39e-01 4.00e+01]
 [2.00e+00 1.05e+02 5.80e+01 ... 3.49e+01 2.25e-01 2.50e+01]
 ...
 [1.00e+00 9.70e+01 7.00e+01 ... 3.81e+01 2.18e-01 3.00e+01]
 [1.00e+01 1.11e+02 7.00e+01 ... 2.75e+01 1.41e-01 4.00e+01]
 [4.00e+00 1.44e+02 5.80e+01 ... 2.95e+01 2.87e-01 3.70e+01]]


In [10]:
# KNN_Classifier reads the label from the last element of each training row,
# so append Y_train as an extra column after the features.
# The width check keeps this cell idempotent: once the label column is present
# (training rows are wider than test rows) re-running it changes nothing.
if X_train.shape[1] == X_test.shape[1]:
  X_train = np.column_stack((X_train, Y_train))

In [11]:
# Instantiate the classifier with Manhattan distance; 'euclidean' is the other metric the class implements.
classifier = KNN_Classifier(distance_metric='manhattan')

In [12]:
# Smoke test: classify one held-out row (index 2) from its 5 nearest training rows.
# The result is stored in `prediction`; this cell does not display it.
prediction = classifier.predict(X_train, X_test[2], k=5)

In [13]:
# Number of held-out rows that need a prediction.
X_test_size = len(X_test)
print(X_test_size)

154


In [14]:
# Classify every held-out row from its 5 nearest training rows.
# Iterating over the rows directly removes the index bookkeeping and the
# dependency on X_test_size computed in a separate cell.
y_pred = []

for test_point in X_test:
  prediction = classifier.predict(X_train, test_point, k=5)
  y_pred.append(prediction)

In [15]:
# Ground-truth labels of the held-out rows, aliased to pair with y_pred for scoring.
y_true = Y_test

In [16]:
# Score the predictions against the true labels of the held-out rows.
accuracy = accuracy_score(y_true, y_pred)

In [17]:
# Scale the score by 100 so it is reported as a percentage.
print(accuracy*100)  # accuracy score in %

72.72727272727273
