# <a href = "https://www.youtube.com/watch?v=4HKqjENq9OU&t=8s"> KNN - Predict whether a person will have diabetes or not </a>

Dataset - Diabetes Data

In [1]:
# importing library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
data = pd.read_csv('./Data/diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# check for null values
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
# Checking Columns with "0" values
cols_names = data.columns

for i in cols_names:
    # Column Name
    print(i)
    col_data = data[data[i]==0]
    print(col_data,"\n")
    print("Total number of rows with 0 value in", i ,"is", col_data.shape[0],"\n")
    print("#"*100)

Pregnancies
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
4              0      137             40             35      168  43.1   
16             0      118             84             47      230  45.8   
45             0      180             66             39        0  42.0   
57             0      100             88             60      110  46.8   
58             0      146             82              0        0  40.5   
..           ...      ...            ...            ...      ...   ...   
713            0      134             58             20      291  26.4   
727            0      141             84             26        0  32.4   
736            0      126             86             27      120  27.4   
753            0      181             88             44      510  43.3   
757            0      123             72              0        0  36.3   

     DiabetesPedigreeFunction  Age  Outcome  
4                       2.288   33        1  
16     

In [5]:
# Replace zeroes with NAN values
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

In [6]:
# Imputing mean values in Columns which have 0 Value
for i in zero_not_accepted:
    data[i] = data[i].replace(0, np.NaN)
    mean = int(data[i].mean(skipna=True))
    data[i] = data[i].fillna(mean)

print(data.isna().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [7]:
# split dataset
X = data.iloc[:, 0:8]
y = data.iloc[:, 8]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [8]:
# Shape of the dataset
print("X_train:", X_train.shape, "\t y_train:", y_train.shape, "\t X_test:", X_test.shape, "\t y_test:", y_test.shape)

X_train: (614, 8) 	 y_train: (614,) 	 X_test: (154, 8) 	 y_test: (154,)


In [9]:
# Feature scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [10]:
# Define the model: Init K-NN
classifier = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean')

In [11]:
# Fit Model
classifier.fit(X_train, y_train)

In [12]:
# Predict the test set results
y_pred = classifier.predict(X_test)
print(y_pred)
print(y_pred.shape)

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0
 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0]
(154,)


In [13]:
# Evaluating the Model
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:",cm)
# F1 Score
print("F1 Score for the testing data and prediction dataset:",round(f1_score(y_test, y_pred)*100,3),"\n")
# Accuracy
print("Accuracy of the model is",round(accuracy_score(y_test, y_pred)*100, 2),"\n")
# Classification report
print("Classification Report", classification_report(y_pred, y_test))


Confusion Matrix: [[94 13]
 [15 32]]
F1 Score for the testing data and prediction dataset: 69.565 

Accuracy of the model is 81.82 

Classification Report               precision    recall  f1-score   support

           0       0.88      0.86      0.87       109
           1       0.68      0.71      0.70        45

    accuracy                           0.82       154
   macro avg       0.78      0.79      0.78       154
weighted avg       0.82      0.82      0.82       154



In [14]:
y_test.T

661    1
122    0
113    0
14     1
529    0
      ..
476    1
482    0
230    1
527    0
380    0
Name: Outcome, Length: 154, dtype: int64

In [15]:
dict = {"y_test":y_test, "y_pred":y_pred}

In [16]:
submission = pd.DataFrame(dict)
submission.sample(10)

Unnamed: 0,y_test,y_pred
588,1,1
568,0,1
526,0,0
258,0,0
48,1,0
355,1,1
68,0,0
623,0,0
698,0,0
605,0,0


In [17]:
# Printing rows which do not have same y_test and y_pred data
submission[submission['y_test']!= submission['y_pred']]

Unnamed: 0,y_test,y_pred
577,1,0
744,0,1
285,0,1
374,0,1
648,1,0
40,0,1
419,1,0
198,1,0
683,1,0
578,0,1
