In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [3]:
df=pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [4]:
df["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [5]:
500/(500+268)

0.6510416666666666

In [6]:
268/(500+268)

0.3489583333333333

In [7]:
x=df.drop("Outcome",axis=1)
y=df["Outcome"]

In [8]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)

In [9]:
y_train.value_counts()

Outcome
0    375
1    201
Name: count, dtype: int64

In [10]:
375/(375+201)

0.6510416666666666

Model Training

In [11]:
knn_clf = KNeighborsClassifier() # by default k=5,p=2(Eucledian)
knn_clf.fit(x_train, y_train)

Evaluation

In [12]:
y_pred_testing=knn_clf.predict(x_test)
y_pred_testing[:5]

array([0, 0, 0, 0, 1], dtype=int64)

In [13]:
y_test[:5]

385    0
397    1
537    0
268    0
244    0
Name: Outcome, dtype: int64

Testing Data Evaluation 

In [14]:
cnf_matrix = confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)

accuracy = accuracy_score(y_test,y_pred_testing)
print("Accuracy score :",accuracy)
print("*"*80)

clf_report = classification_report(y_test,y_pred_testing)
print("Classification report :\n",clf_report)

Confusion Matrix :
 [[96 29]
 [32 35]]
********************************************************************************
Accuracy score : 0.6822916666666666
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.75      0.77      0.76       125
           1       0.55      0.52      0.53        67

    accuracy                           0.68       192
   macro avg       0.65      0.65      0.65       192
weighted avg       0.68      0.68      0.68       192



Training Data Evaluation 

In [15]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy score: ",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("Classification report :\n",clf_report)

Confusion Matrix:
 [[334  41]
 [ 67 134]]
********************************************************************************
Accuracy score:  0.8125
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.77      0.67      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.78      0.79       576
weighted avg       0.81      0.81      0.81       576



Hyperparameter tuning

In [16]:
knn_clf=KNeighborsClassifier()
hyperparameter = {"n_neighbors" : np.arange(3,30),
                  "p" : [1,2]}

gscv_knn_clf = GridSearchCV(knn_clf,hyperparameter,cv=5)
gscv_knn_clf.fit(x_train,y_train)

In [17]:
gscv_knn_clf.best_estimator_

In [18]:
gscv_knn_clf.best_params_

{'n_neighbors': 7, 'p': 2}

In [19]:
knn_clf=KNeighborsClassifier(n_neighbors=7,p=2)
knn_clf.fit(x_train,y_train)
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy Score : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report :\n",clf_report)

Confusion Matrix:
 [[96 29]
 [33 34]]
********************************************************************************
Accuracy Score :  0.6770833333333334
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.74      0.77      0.76       125
           1       0.54      0.51      0.52        67

    accuracy                           0.68       192
   macro avg       0.64      0.64      0.64       192
weighted avg       0.67      0.68      0.67       192



Training Data evaluation

In [20]:
y_pred_training=knn_clf.predict(x_train)

cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_train,y_pred_training)
print("Acuuracy Score : ",accuracy)
print("*"*80)

clf_report=classification_report(y_train,y_pred_training)
print("Classification Report :\n",clf_report)

Confusion Matrix:
 [[333  42]
 [ 68 133]]
********************************************************************************
Acuuracy Score :  0.8090277777777778
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.76      0.66      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.77      0.78       576
weighted avg       0.81      0.81      0.81       576



preprocessing

1.Normalization

In [21]:
x_df=df.drop("Outcome",axis=1)
x_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63
764,122,70,27,0,36.8,0.340,27
765,121,72,23,112,26.2,0.245,30
766,126,60,0,0,30.1,0.349,47


In [22]:
df.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,121.117188,69.076823,20.536458,79.799479,31.992578,0.471876,33.24349,0.348958
std,31.805091,19.367794,15.952218,115.244002,7.88416,0.331329,11.758182,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,142.0,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [23]:
normal_scalar=MinMaxScaler()
array = normal_scalar.fit_transform(x_df)

x_normal_df=pd.DataFrame(array,columns=x_df.columns)
x_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.516667
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


Train Test Split 

In [24]:
df_normal = x_normal_df.copy()
df_normal["Outcome"] = df["Outcome"]
df_normal

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.483333,1
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667,0
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.516667,1
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000,0
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000,1
...,...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000,0
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000,0
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000,0
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333,1


In [25]:
x=df_normal.drop("Outcome",axis=1)
y=df_normal["Outcome"]

In [26]:
x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)


Model Training

In [27]:
knn_clf=KNeighborsClassifier()
knn_clf.fit(x_train,y_train)

Testing Data Evaluation 

In [28]:
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[100  25]
 [ 28  39]]
********************************************************************************
Accuracy :  0.7239583333333334
********************************************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.80      0.79       125
           1       0.61      0.58      0.60        67

    accuracy                           0.72       192
   macro avg       0.70      0.69      0.69       192
weighted avg       0.72      0.72      0.72       192



Training Data Evaluation 

In [29]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy Score :",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("CLF report :\n",clf_report)

Confusion Matrix :
 [[332  43]
 [ 59 142]]
********************************************************************************
Accuracy Score : 0.8229166666666666
********************************************************************************
CLF report :
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       375
           1       0.77      0.71      0.74       201

    accuracy                           0.82       576
   macro avg       0.81      0.80      0.80       576
weighted avg       0.82      0.82      0.82       576



Hyperparameters Tuning on Normalized Data

In [30]:
knn_clf = KNeighborsClassifier()
hyperparameter = {"n_neighbors":np.arange(3,30),
                  "p" : [1,2]}
gscv_knn_clf=GridSearchCV(knn_clf,hyperparameter,cv=5)
gscv_knn_clf.fit(x_train,y_train)

In [31]:
gscv_knn_clf.best_estimator_

In [32]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score

from sklearn.preprocessing import MinMaxScaler,StandardScaler


In [33]:
df=pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [34]:
df["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [35]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)


Model Training

In [36]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train,y_train)


Evalaution

In [37]:
y_pred_testing=knn_clf.predict(x_test)
y_pred_testing[:5]

array([0, 0, 0, 0, 1], dtype=int64)

In [38]:
y_test[:5]

385    0
397    1
537    0
268    0
244    0
Name: Outcome, dtype: int64

Testing Data Evaluation

In [39]:
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[100  25]
 [ 28  39]]
********************************************************************************
Accuracy :  0.7239583333333334
********************************************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.80      0.79       125
           1       0.61      0.58      0.60        67

    accuracy                           0.72       192
   macro avg       0.70      0.69      0.69       192
weighted avg       0.72      0.72      0.72       192



Training Data Evaluation

In [40]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy Score :",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("CLF report :\n",clf_report)

Confusion Matrix :
 [[332  43]
 [ 59 142]]
********************************************************************************
Accuracy Score : 0.8229166666666666
********************************************************************************
CLF report :
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       375
           1       0.77      0.71      0.74       201

    accuracy                           0.82       576
   macro avg       0.81      0.80      0.80       576
weighted avg       0.82      0.82      0.82       576



Hyperparameter Tuning Parameters

In [41]:
knn_clf = KNeighborsClassifier()
hyperparameter = {"n_neighbors":np.arange(3,30),
                  "p":[1,2]}
gscv_knn_clf = GridSearchCV(knn_clf,hyperparameter,cv=5)
gscv_knn_clf.fit(x_train,y_train)


In [42]:
gscv_knn_clf.best_estimator_

In [43]:
gscv_knn_clf.best_params_

{'n_neighbors': 19, 'p': 1}

In [44]:
knn_clf=KNeighborsClassifier(n_neighbors=19,p=1)
knn_clf.fit(x_train,y_train)


Testing Data Evaluation 

In [45]:
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[108  17]
 [ 33  34]]
********************************************************************************
Accuracy :  0.7395833333333334
********************************************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.86      0.81       125
           1       0.67      0.51      0.58        67

    accuracy                           0.74       192
   macro avg       0.72      0.69      0.69       192
weighted avg       0.73      0.74      0.73       192



Training Data Evaluation

In [46]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy Score :",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("CLF report :\n",clf_report)

Confusion Matrix :
 [[341  34]
 [ 86 115]]
********************************************************************************
Accuracy Score : 0.7916666666666666
********************************************************************************
CLF report :
               precision    recall  f1-score   support

           0       0.80      0.91      0.85       375
           1       0.77      0.57      0.66       201

    accuracy                           0.79       576
   macro avg       0.79      0.74      0.75       576
weighted avg       0.79      0.79      0.78       576



Preprocessing


Normalization

In [47]:
x_df=df.drop("Outcome",axis=1)
x_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63
764,122,70,27,0,36.8,0.340,27
765,121,72,23,112,26.2,0.245,30
766,126,60,0,0,30.1,0.349,47


In [48]:
df.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,121.117188,69.076823,20.536458,79.799479,31.992578,0.471876,33.24349,0.348958
std,31.805091,19.367794,15.952218,115.244002,7.88416,0.331329,11.758182,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,142.0,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [49]:
normal_scalar=MinMaxScaler()
array = normal_scalar.fit_transform(x_df)
x_normal_df=pd.DataFrame(array,columns=x_df.columns)
x_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.516667
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


Train Test Split

In [50]:
df_normal=x_normal_df.copy()
df_normal["Outcome"]=df["Outcome"]
df_normal

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.483333,1
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667,0
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.516667,1
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000,0
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000,1
...,...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000,0
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000,0
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000,0
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333,1


In [52]:
x=df_normal.drop("Outcome",axis=1)
y=df_normal["Outcome"]
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=24,stratify=y)


Model Training

In [53]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_train)


Testing Data Evaluation 

In [54]:
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[100  25]
 [ 28  39]]
********************************************************************************
Accuracy :  0.7239583333333334
********************************************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.80      0.79       125
           1       0.61      0.58      0.60        67

    accuracy                           0.72       192
   macro avg       0.70      0.69      0.69       192
weighted avg       0.72      0.72      0.72       192



Training Data Evaluation

In [55]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy Score :",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("CLF report :\n",clf_report)

Confusion Matrix :
 [[332  43]
 [ 59 142]]
********************************************************************************
Accuracy Score : 0.8229166666666666
********************************************************************************
CLF report :
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       375
           1       0.77      0.71      0.74       201

    accuracy                           0.82       576
   macro avg       0.81      0.80      0.80       576
weighted avg       0.82      0.82      0.82       576



Hyperparameter tuning on normalized data

In [56]:
knn_clf = KNeighborsClassifier()
hyperparameter = {"n_neighbors":np.arange(3,30),
                  "p":[1,2]}
gscv_knn_clf=GridSearchCV(knn_clf,hyperparameter,cv=5)
gscv_knn_clf.fit(x_train,y_train)

In [57]:
gscv_knn_clf.best_estimator_

In [58]:
knn_clf=gscv_knn_clf.best_estimator_
knn_clf.fit(x_train,y_train)


Testing Data Evaluation

In [59]:
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[108  17]
 [ 33  34]]
********************************************************************************
Accuracy :  0.7395833333333334
********************************************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.86      0.81       125
           1       0.67      0.51      0.58        67

    accuracy                           0.74       192
   macro avg       0.72      0.69      0.69       192
weighted avg       0.73      0.74      0.73       192



Training Data Evaluation

In [60]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy Score :",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("CLF report :\n",clf_report)

Confusion Matrix :
 [[341  34]
 [ 86 115]]
********************************************************************************
Accuracy Score : 0.7916666666666666
********************************************************************************
CLF report :
               precision    recall  f1-score   support

           0       0.80      0.91      0.85       375
           1       0.77      0.57      0.66       201

    accuracy                           0.79       576
   macro avg       0.79      0.74      0.75       576
weighted avg       0.79      0.79      0.78       576



Standardization

In [61]:
x_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63
764,122,70,27,0,36.8,0.340,27
765,121,72,23,112,26.2,0.245,30
766,126,60,0,0,30.1,0.349,47


In [62]:
std_scalar=StandardScaler()
array=std_scalar.fit_transform(x_df)
x_std_df=pd.DataFrame(array,columns=x_df.columns)
x_std_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.907270,-0.692891,0.204013,0.468492,1.426022
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953
4,0.908711,-1.502276,0.907270,0.765836,1.409746,5.484909,-0.020722
...,...,...,...,...,...,...,...
763,-0.632927,0.357691,1.722735,0.870031,0.115169,-0.908682,2.532356
764,0.027775,0.047697,0.405445,-0.692891,0.610154,-0.398282,-0.531337
765,-0.003687,0.151028,0.154533,0.279594,-0.735190,-0.685193,-0.276029
766,0.153623,-0.468961,-1.288212,-0.692891,-0.240205,-0.371101,1.170715


In [63]:
df_std = x_std_df.copy()
df_std["Outcome"] = df["Outcome"]
df_std

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.845787,-0.985618,0.907270,-0.692891,0.204013,0.468492,1.426022,1
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927,0
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227,1
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953,0
4,0.908711,-1.502276,0.907270,0.765836,1.409746,5.484909,-0.020722,1
...,...,...,...,...,...,...,...,...
763,-0.632927,0.357691,1.722735,0.870031,0.115169,-0.908682,2.532356,0
764,0.027775,0.047697,0.405445,-0.692891,0.610154,-0.398282,-0.531337,0
765,-0.003687,0.151028,0.154533,0.279594,-0.735190,-0.685193,-0.276029,0
766,0.153623,-0.468961,-1.288212,-0.692891,-0.240205,-0.371101,1.170715,1


In [64]:
x=df_std.drop("Outcome",axis=1)
y=df_std["Outcome"]

In [65]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42,stratify=y)


Model Training

In [66]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_train)


Testing Data Evaluation

In [67]:
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[108  17]
 [ 33  34]]
********************************************************************************
Accuracy :  0.7395833333333334
********************************************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.86      0.81       125
           1       0.67      0.51      0.58        67

    accuracy                           0.74       192
   macro avg       0.72      0.69      0.69       192
weighted avg       0.73      0.74      0.73       192



Training Data Evaluation

In [68]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy Score :",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("CLF report :\n",clf_report)

Confusion Matrix :
 [[328  47]
 [ 60 141]]
********************************************************************************
Accuracy Score : 0.8142361111111112
********************************************************************************
CLF report :
               precision    recall  f1-score   support

           0       0.85      0.87      0.86       375
           1       0.75      0.70      0.72       201

    accuracy                           0.81       576
   macro avg       0.80      0.79      0.79       576
weighted avg       0.81      0.81      0.81       576



With Hyperparameters tuning

In [69]:
knn_clf = KNeighborsClassifier()
hyperparameter = {"n_neighbors":np.arange(3,30),
                  "p":[1,2]}
gscv_knn_clf=GridSearchCV(knn_clf,hyperparameter,cv=5)
gscv_knn_clf.fit(x_train,y_train)

In [70]:
gscv_knn_clf.best_estimator_

In [71]:
gscv_knn_clf.best_params_

{'n_neighbors': 17, 'p': 2}

In [72]:
knn_clf=gscv_knn_clf.best_estimator_
knn_clf.fit(x_train,y_train)

Testing Data Evaluation

In [73]:
y_pred_testing=knn_clf.predict(x_test)

cnf_matrix=confusion_matrix(y_test,y_pred_testing)
print("Confusion Matrix:\n",cnf_matrix)
print("*"*80)

accuracy=accuracy_score(y_test,y_pred_testing)
print("Accuracy : ",accuracy)
print("*"*80)

clf_report=classification_report(y_test,y_pred_testing)
print("Classification Report:\n",clf_report)

Confusion Matrix:
 [[110  15]
 [ 34  33]]
********************************************************************************
Accuracy :  0.7447916666666666
********************************************************************************
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.88      0.82       125
           1       0.69      0.49      0.57        67

    accuracy                           0.74       192
   macro avg       0.73      0.69      0.70       192
weighted avg       0.74      0.74      0.73       192



Training Data Evaluation

In [74]:
y_pred_training=knn_clf.predict(x_train)
cnf_matrix=confusion_matrix(y_train,y_pred_training)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy=accuracy_score(y_train,y_pred_training)
print("Accuracy Score :",accuracy)
print("*"*80)
clf_report=classification_report(y_train,y_pred_training)
print("CLF report :\n",clf_report)

Confusion Matrix :
 [[336  39]
 [ 86 115]]
********************************************************************************
Accuracy Score : 0.7829861111111112
********************************************************************************
CLF report :
               precision    recall  f1-score   support

           0       0.80      0.90      0.84       375
           1       0.75      0.57      0.65       201

    accuracy                           0.78       576
   macro avg       0.77      0.73      0.75       576
weighted avg       0.78      0.78      0.78       576

