In [86]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [87]:
warnings.filterwarnings("ignore", category=FutureWarning)

In [59]:
df = pd.read_csv('diabetes_prediction_dataset.csv')

In [60]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [61]:
pd.set_option('display.max_rows',150)

In [62]:
df.head(10)


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [63]:
df['gender'].value_counts(dropna=False)

Female    58552
Male      41430
Other        18
Name: gender, dtype: int64

In [64]:
##data enconding on 
df['gender'].replace({'Female':'1','Male':'2','Other':'3'},inplace = True)

In [65]:
df['smoking_history'].value_counts(dropna=False)

No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64

In [66]:
df['smoking_history'].replace({'No Info':'1','never':'2','former':'3','current':'4','not current':'5','ever':'6'},inplace = True)

In [67]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1,80.0,0,1,2,25.19,6.6,140,0
1,1,54.0,0,0,1,27.32,6.6,80,0
2,2,28.0,0,0,2,27.32,5.7,158,0
3,1,36.0,0,0,4,23.45,5.0,155,0
4,2,76.0,1,1,4,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,1,80.0,0,0,1,27.32,6.2,90,0
99996,1,2.0,0,0,1,17.37,6.5,100,0
99997,2,66.0,0,0,3,27.83,5.7,155,0
99998,1,24.0,0,0,2,35.42,4.0,100,0


In [68]:
#split dataset
X = df.iloc[0:,0:8]
y = df.iloc[0:,8]
X_train, X_test,y_train,y_test = train_test_split(X,y,random_state=1,test_size=0.2,stratify=y)

In [69]:
classifier = KNeighborsClassifier(n_neighbors=5,p=2,metric='euclidean')

In [70]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [71]:
import math
math.sqrt(len(y_test))

141.4213562373095

In [72]:
classifier.fit(X_train,y_train)

KNeighborsClassifier(metric='euclidean')

In [117]:
y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [74]:
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[18183   117]
 [  659  1041]]


In [75]:
print(f1_score(y_test,y_pred,average='macro'))

0.8537943328648596


In [76]:
print(accuracy_score(y_test,y_pred))

0.9612


In [77]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18300
           1       0.90      0.61      0.73      1700

    accuracy                           0.96     20000
   macro avg       0.93      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000



## Hyperparameter tuning

In [78]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [112]:
k = np.random.randint(1,30,60)

In [113]:
params = { 'n_neighbors': k ,
           'weights': ['uniform', 'distance'],
           'metric': ['euclidean', 'manhattan']}

In [114]:
random_search = RandomizedSearchCV(classifier,params,n_iter=5,cv=5,n_jobs=-1,verbose=0)
random_search.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(metric='euclidean'),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'metric': ['euclidean', 'manhattan'],
                                        'n_neighbors': array([23, 23, 14, 29,  4, 22,  1,  2,  9, 13, 11,  2,  4, 24, 11,  1, 28,
        9, 21, 13, 12, 10, 10, 19, 16, 10, 14, 16, 27,  8, 24, 11, 22, 21,
       17, 18, 27,  1, 24, 20,  1,  6, 13, 19, 26,  8, 23,  7, 26, 13,  1,
       27, 24, 18,  9, 19, 12, 20, 22,  6]),
                                        'weights': ['uniform', 'distance']})

In [115]:
print("train score - " + str(random_search.score(X_train,y_train)))
print("test score - " + str(random_search.score(X_test,y_test)))

train score - 0.9640375
test score - 0.9622


In [116]:
print(random_search.best_params_)

{'weights': 'uniform', 'n_neighbors': 19, 'metric': 'manhattan'}
