# Early stage diabetes risk prediction

In [1]:
import pandas as pd
import numpy as np

### Importing Dataset files and Analyzing

In [2]:
# Locations of the two Excel workbooks.
# NOTE(review): these are absolute paths on the author's machine; point them
# at your own copies of the files before running.
TRAIN_PATH = r"D:\data science\Python - Anaconda\K-Nearest Neighbor\Early Diabetes Prediction\Early diabetes_train.xlsx"
TEST_PATH = r"D:\data science\Python - Anaconda\K-Nearest Neighbor\Early Diabetes Prediction\Early diabetes_test.xlsx"

# Both workbooks are read with identical options: first row as header,
# no column used as the index.
training_data, testing_data = (
    pd.read_excel(workbook, header = 0, index_col = False)
    for workbook in (TRAIN_PATH, TEST_PATH)
)

# Report (rows, columns) of each frame.
for frame in (training_data, testing_data):
    print(frame.shape)

(500, 17)
(20, 17)


In [3]:
# Preview the first rows of the training data.
training_data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40.0,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58.0,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41.0,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60.0,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [4]:
# Preview the first rows of the testing data.
testing_data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,60.0,Male,No,No,,No,No,No,No,No,No,No,No,No,No,Yes,Negative
1,58.0,Male,No,No,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,No,Negative
2,39.0,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
3,54.0,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative
4,67.0,Male,No,No,No,Yes,No,No,No,Yes,No,Yes,No,No,Yes,No,Negative


In [5]:
# Preview the last rows of the training data.
training_data.tail()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
495,70.0,Male,Yes,No,No,No,Yes,No,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Negative
496,44.0,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative
497,38.0,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative
498,35.0,Male,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Negative
499,61.0,Male,No,No,No,Yes,No,Yes,No,Yes,No,Yes,No,No,Yes,No,Negative


In [6]:
# Preview the last rows of the testing data.
testing_data.tail()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
15,40.0,Male,No,No,Yes,No,No,No,No,No,No,No,No,No,No,Yes,Negative
16,48.0,Female,No,No,Yes,Yes,No,No,Yes,Yes,No,Yes,Yes,,No,No,Positive
17,60.0,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Positive
18,,Male,No,No,No,No,Yes,Yes,No,No,No,No,No,No,No,No,Negative
19,60.0,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,No,No,Yes,Yes,No,No,Positive


In [7]:
# From the first and last rows shown above, there is no obvious overlap between the training and testing data.
# (Only a few rows were inspected, so this is not a full duplicate check.)

### Feature Selection

In [8]:
# Domain Knowledge:
# From Domain knowledge we can say that all the factors considered in the data play important role in Diabetes Detection.
# Thus we will not remove any of the variables from data.
# So all the variables from data are considered.

### Missing Values

In [9]:
# Count the missing (null) values in each column of the training data.
print(training_data.isnull().sum())

Age                   22
Gender                 0
Polyuria               0
Polydipsia             0
sudden weight loss     9
weakness               0
Polyphagia             0
Genital thrush         0
visual blurring        0
Itching                0
Irritability           0
delayed healing        0
partial paresis        0
muscle stiffness       9
Alopecia               0
Obesity                0
class                  0
dtype: int64


In [10]:
# Count the missing (null) values in each column of the testing data.
print(testing_data.isnull().sum())

Age                   2
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    3
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      2
Alopecia              0
Obesity               0
class                 0
dtype: int64


In [11]:
# Thus from above we can see that some variables have missing values.
# We have to fill in (impute) those missing values using a measure of central tendency (mean or mode).
# Missing values are present in both the data training as well as testing data.

In [12]:
# Impute the missing values in the training data.
#   - Age (numeric): filled with the column mean truncated to an integer.
#   - "sudden weight loss" / "muscle stiffness" (categorical): filled with
#     the most frequent value (mode) of the column.
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on a column selection: that form is chained
# assignment, which raises a FutureWarning in recent pandas 2.x releases and
# does not modify the frame once Copy-on-Write is enabled.

age_fill = int(training_data["Age"].mean())
training_data["Age"] = training_data["Age"].fillna(age_fill)

for col in ("sudden weight loss", "muscle stiffness"):
    training_data[col] = training_data[col].fillna(training_data[col].mode()[0])

# Confirm that no nulls remain.
print(training_data.isnull().sum())

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64


In [13]:
# Thus, from the output above, all the missing values in the training data have been filled.

In [14]:
# Impute the missing values in the testing data.
# The fill values (mean Age, mode of each categorical column) are computed
# from the TRAINING data, not from the test set itself, so that no test-set
# statistics leak into preprocessing; this mirrors how the scaler is fitted
# on the training data only.
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on a column selection, which is chained
# assignment and does not modify the frame under pandas Copy-on-Write.

age_fill = int(training_data["Age"].mean())
testing_data["Age"] = testing_data["Age"].fillna(age_fill)

for col in ("sudden weight loss", "muscle stiffness"):
    testing_data[col] = testing_data[col].fillna(training_data[col].mode()[0])

# Confirm that no nulls remain.
print(testing_data.isnull().sum())

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64


In [15]:
# Thus, from the output above, all the missing values in the testing data have been filled.

In [16]:
# We do not remove outliers from the data before fitting KNN.
# KNN predicts from the nearest neighbours, so an outlying training point lies far from most
# query points and is seldom among their k nearest neighbours; note that it is not actually removed from the data.

### Encoding

In [17]:
# Now in Encoding we have to convert categorical variables into numerical variables.

In [18]:
# Show the dtype of every column; object-dtype columns are the ones to encode.
training_data.dtypes

Age                   float64
Gender                 object
Polyuria               object
Polydipsia             object
sudden weight loss     object
weakness               object
Polyphagia             object
Genital thrush         object
visual blurring        object
Itching                object
Irritability           object
delayed healing        object
partial paresis        object
muscle stiffness       object
Alopecia               object
Obesity                object
class                  object
dtype: object

In [19]:
# Collect the names of all categorical (object-dtype) columns.

colname = [
    column
    for column in training_data.columns
    if training_data[column].dtypes == "object"
]

colname

['Gender',
 'Polyuria',
 'Polydipsia',
 'sudden weight loss',
 'weakness',
 'Polyphagia',
 'Genital thrush',
 'visual blurring',
 'Itching',
 'Irritability',
 'delayed healing',
 'partial paresis',
 'muscle stiffness',
 'Alopecia',
 'Obesity',
 'class']

In [20]:
# Transform the categorical variables into numerical ones using LabelEncoder.
# For each column the encoder is fitted on the TRAINING data only and then
# reused to transform the testing data, so a given category always maps to
# the same integer in both sets. Re-fitting on the test column could assign
# different codes whenever the test column does not contain exactly the same
# set of categories as the training column.
# Note: le.transform raises ValueError if the testing data contains a
# category that was never seen in the training data.

from sklearn import preprocessing

le = preprocessing.LabelEncoder()

for i in colname:
    training_data[i] = le.fit_transform(training_data[i])
    testing_data[i] = le.transform(testing_data[i])

In [21]:
# Here the labels are as follows:
# Male = 1
# Female = 0
# Yes = 1
# No = 0
# Positive = 1
# Negative = 0

In [22]:
# Preview the training data after label encoding.
training_data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40.0,1,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58.0,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41.0,1,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,48.0,1,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60.0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [23]:
# Preview the testing data after label encoding.
testing_data.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,60.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
1,58.0,1,0,0,0,1,0,0,0,1,0,1,0,1,1,0,0
2,39.0,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
3,54.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,67.0,1,0,0,0,1,0,0,0,1,0,1,0,0,1,0,0


### Splitting The Data

In [24]:
# Now here we will split the data into x_train, y_train, x_test, y_test

In [26]:
# Convert the training frame to an array once, then slice it:
# every column except the last is a feature, the last column is the target.
train_matrix = training_data.values
x_train = train_matrix[:, :-1]
y_train = train_matrix[:, -1].astype(int)   # keep the dependent variable as integers

In [27]:
# Convert the testing frame to an array once, then slice it:
# every column except the last is a feature, the last column is the target.
test_matrix = testing_data.values
x_test = test_matrix[:, :-1]
y_test = test_matrix[:, -1].astype(int)   # keep the dependent variable as integers

### Scaling the data

In [28]:
from sklearn.preprocessing import StandardScaler

# With separate train and test sets, the scaler is fitted on the training
# features only; that same fitted scaler then transforms both sets.
scaler = StandardScaler().fit(x_train)

x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

### Building The Model

In [29]:
# To build the KNN model we first must decide the value of K
# There are two methods of finding the value of K:
# Method 1 : square root of no. of observations.
# Method 2 : Trial and error.

#### Method 1 : Building Model with Square Root Method.

In [31]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# K is the square root of the number of training observations, truncated
# to an integer.
k_sqrt = int(np.sqrt(len(x_train)))

# Build and fit the Euclidean-distance KNN classifier.
model_KNN = KNeighborsClassifier(n_neighbors = k_sqrt, metric = "euclidean")
model_KNN.fit(x_train, y_train)

# Predict the test set and show (actual, predicted) pairs.
y_pred = model_KNN.predict(x_test)
print(list(zip(y_test, y_pred)))

[(0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (1, 1), (0, 0), (1, 1)]


In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Compute the metrics first, then print them in a fixed order:
# confusion matrix, per-class report, overall accuracy.
cfm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(cfm, "\n")
print("Classification Report : ")
print(classification_report(y_test, y_pred))
print("Accuracy Score = ", acc)

[[10  0]
 [ 0 10]] 

Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Accuracy Score =  1.0


#### Method 2 : Building The Model Using Trial & Error.

In [36]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Maps each tried k to the accuracy obtained with it.
my_dict = {}

# Try every k from 1 up to (but not including) int(sqrt(n_train) + 1).
# NOTE(review): accuracy is measured on x_test / y_test, so k is being
# selected on the test set.
k_stop = int(np.sqrt(len(x_train)) + 1)

for k in range(1, k_stop):
    model_KNN = KNeighborsClassifier(n_neighbors = k, metric = "euclidean")
    model_KNN.fit(x_train, y_train)
    y_pred = model_KNN.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    my_dict[k] = acc
    print("Accuracy is ", acc, "For the value of k = ", k)

Accuracy is  1.0 For the value of k =  1
Accuracy is  1.0 For the value of k =  2
Accuracy is  1.0 For the value of k =  3
Accuracy is  1.0 For the value of k =  4
Accuracy is  1.0 For the value of k =  5
Accuracy is  1.0 For the value of k =  6
Accuracy is  1.0 For the value of k =  7
Accuracy is  1.0 For the value of k =  8
Accuracy is  1.0 For the value of k =  9
Accuracy is  1.0 For the value of k =  10
Accuracy is  1.0 For the value of k =  11
Accuracy is  1.0 For the value of k =  12
Accuracy is  1.0 For the value of k =  13
Accuracy is  1.0 For the value of k =  14
Accuracy is  1.0 For the value of k =  15
Accuracy is  1.0 For the value of k =  16
Accuracy is  1.0 For the value of k =  17
Accuracy is  1.0 For the value of k =  18
Accuracy is  1.0 For the value of k =  19
Accuracy is  1.0 For the value of k =  20
Accuracy is  1.0 For the value of k =  21
Accuracy is  1.0 For the value of k =  22


In [37]:
# From above we can see that the accuracy is always equal to 1, irrespective of the k value.
# But in case we get different accuracies and have to find the k with the maximum accuracy,
# we use the following code.

In [38]:
# Print every k whose accuracy equals the best accuracy found.
# The maximum is computed once before the loop instead of on every iteration.
# default=None keeps an empty my_dict from raising: the loop then prints nothing.
best_acc = max(my_dict.values(), default = None)

for k, k_acc in my_dict.items():
    if k_acc == best_acc:
        print(k, " : ", k_acc)

1  :  1.0
2  :  1.0
3  :  1.0
4  :  1.0
5  :  1.0
6  :  1.0
7  :  1.0
8  :  1.0
9  :  1.0
10  :  1.0
11  :  1.0
12  :  1.0
13  :  1.0
14  :  1.0
15  :  1.0
16  :  1.0
17  :  1.0
18  :  1.0
19  :  1.0
20  :  1.0
21  :  1.0
22  :  1.0


In [39]:
# thus we get accuracy for all k values equal to 1
# thus we stick to k = sqrt(no. of observations)
# k = 22

In [40]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Final KNN model with the k chosen above (k = 22), Euclidean distance.
model_KNN = KNeighborsClassifier(metric = "euclidean", n_neighbors = 22)
model_KNN.fit(x_train, y_train)

# Predict the test set and show (actual, predicted) pairs.
y_pred = model_KNN.predict(x_test)
print(list(zip(y_test, y_pred)))

[(0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (1, 1), (0, 0), (1, 1)]


In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Compute the metrics for the k = 22 model, then print them:
# confusion matrix, per-class report, overall accuracy.
cfm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(cfm, "\n")
print("Classification Report : ")
print(classification_report(y_test, y_pred))
print("Accuracy Score = ", acc)

[[10  0]
 [ 0 10]] 

Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Accuracy Score =  1.0


### By using Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the same scaled
# training features as the KNN models.
lr = LogisticRegression()
lr.fit(x_train, y_train)

# Predict the test set and show (actual, predicted) pairs.
y_pred = lr.predict(x_test)
print(list(zip(y_test, y_pred)))

[(0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (1, 1), (0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (0, 0), (1, 1), (1, 1), (0, 0), (1, 1)]


In [43]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Compute the metrics for the logistic regression predictions, then print
# them: confusion matrix, per-class report, overall accuracy.
cfm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print(cfm, "\n")
print("Classification Report : ")
print(classification_report(y_test, y_pred))
print("Accuracy Score is = ", acc)

[[10  0]
 [ 0 10]] 

Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        10

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Accuracy Score is =  1.0


In [44]:
# Thus from above result we can say that logistic regression gives accuracy = 1

In [45]:
# Thus, from all the observations above, every method used gives accuracy = 1 on the 20-row test set.