# KNN - Predict whether a person will have diabetes or not

In [18]:
import pandas as pd
import numpy as np

#Algorithms and models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

#Testing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [4]:
# Load the dataset from the CSV file that sits next to the notebook
dataset = pd.read_csv("diabetes.csv")

# Number of rows, followed by a blank separator line
n_rows = len(dataset)
print(n_rows)
print("")

# Preview of the first few rows
preview = dataset.head()
print(preview)

768

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


## Data Pre-processing

In [6]:
#Replace zeros

# Columns in which a 0 is treated as a missing reading rather than a real measurement.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# NOTE: the mean below is computed over the whole dataset, i.e. before the
# train/test split further down, so test rows contribute to the imputed value.
for column in zero_not_accepted:  # Handle each column in turn
    # Mark zeros as missing. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)

    # Mean of the known values only (NaN skipped), truncated to an integer as in the original analysis.
    mean = int(dataset[column].mean(skipna=True))

    # Fill the missing entries with that mean.
    dataset[column] = dataset[column].fillna(mean)


In [7]:
# Spot-check one of the imputed columns
print(dataset.loc[:, 'Glucose'])

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64


In [9]:
#Split Dataset into train and test data

# By convention the feature matrix is capital X and the target vector is lowercase y.
# Features: every row, the first 8 columns (Pregnancies through Age); the label column is left out.
X = dataset.iloc[:, :8]

# Target: every row, only the column at position 8 ('Outcome').
y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


## Feature scaling

In [10]:
#Feature Scaling
# StandardScaler standardizes each feature to zero mean and unit variance
# (the scaled values are NOT bounded to the range [-1, 1]).
sc_X = StandardScaler()
sc_X.fit(X_train)  # learn mean / standard deviation from the training rows only
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)  # reuse the training statistics on the test rows

## Training

In [14]:
import math

# Heuristic used here for choosing k: the square root of the number of test samples,
# which comes out at roughly 12.4.
# In KNN every neighbour casts a vote, so an even k such as 12 could end in a 6-6 tie.
# An odd k avoids that, hence neighbours = 11 is used for the model.
#
# p=2 is the power parameter of the Minkowski distance; p=2 corresponds to the
# Euclidean distance. It is unrelated to the number of outcome classes.
n_test_samples = len(y_test)
math.sqrt(n_test_samples)

12.409673645990857

In [15]:
#Define the model: Initialize KNN
# 11 neighbours vote on each prediction. Closeness is measured with the Euclidean
# distance; for two points (x1, y1) and (x2, y2) that is sqrt((x1 - x2)^2 + (y1 - y2)^2).
classifier = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
)

In [16]:
#Fit Model
# Train on the scaled training features and their labels.
classifier.fit(X=X_train, y=y_train)

## Prediction

In [20]:
## Predict the test set results
# One predicted label (0 or 1) per row of the scaled test features.
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [21]:
#Evaluate Model
# Confusion matrix: rows are the true labels (y_test), columns the predicted labels (y_pred):
#   [[true negatives   false positives]
#    [false negatives  true positives ]]
# For the run recorded in this notebook that is 94 true negatives, 13 false positives,
# 15 false negatives and 32 true positives.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [23]:
# F1 score: harmonic mean of precision and recall for the positive class.
f1 = f1_score(y_test, y_pred)
print(f1)

0.6956521739130436


In [24]:
# Accuracy: fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.8181818181818182
