In [57]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [58]:
df = pd.read_csv(r'C:\Users\User\Desktop\Data Science\Machine Learning\Data\diabetes_prediction_dataset.csv')

In [59]:
print(df.head())


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [60]:
print(len(df))

100000


In [61]:
#Replacing any zero with mean that might affect our data
zero_not_accepted = ['age', 'HbA1c_level', 'bmi', 'blood_glucose_level']
for column in zero_not_accepted:
    df[column] = df[column].replace(0, np.NaN)
    mean = int(df[column].mean(skipna =True))
    df[column] = df[column].replace(np.NaN, mean)

In [62]:
#split dataset
X = df.iloc[:, 0:8]
y = df.iloc[:,8]

In [63]:
#Turn categories into numbers using onehotencoder and then split the data into train and test

categorical_features = ['gender', 'smoking_history']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                 one_hot, 
                                 categorical_features)], 
                                remainder = 'passthrough')
transformed_X = transformer.fit_transform(X)
transformed_X
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, random_state = 0, test_size = 0.2)

In [64]:
#Freature scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [65]:
import math
print(math.sqrt(len(y_test)))

141.4213562373095


In [66]:
# As we get the value of square root of y_test as 141.42, 
# we round it up to closest odd number which is 141. so n_neighbors = 141
# P = 2 coz we are looking if patient is diabetic or not
# to measure a distance we are using euclidean

In [67]:
#train the data
classifier = KNeighborsClassifier(n_neighbors=141, p=2, metric = 'euclidean')

In [68]:
classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=141)

In [69]:
y_pred = classifier.predict(X_test)
y_pred

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [71]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[18295     2]
 [ 1063   640]]


In [72]:
print(f1_score(y_test, y_pred))

0.5458422174840085


In [73]:
print(accuracy_score(y_test, y_pred))

0.94675
