# Model Building - Diabetes Risk Prediction

In [1]:
import pandas as pd
import numpy as np  

In [2]:
# Read the CSV file under ./data into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/diabetes_data_upload.csv')

In [3]:
# Map the textual answers to integers everywhere they occur in df
# (Yes/Positive -> 1, No/Negative -> 0); df is modified in place.
df.replace(
    {'Yes': 1, 'No': 0, 'Positive': 1, 'Negative': 0},
    inplace=True,
)
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,Male,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,Male,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,Male,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,Male,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [4]:
# Pull out the target ('class') and the 'Gender' column, then remove
# both from df so that only the remaining columns are used as features.
y, gender = df['class'], df['Gender']
df.drop(columns=['class', 'Gender'], inplace=True)

# Decision Tree model:

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.values,
    y,
    test_size=0.2,
    random_state=1,
)

# Baseline decision tree on every column left in df, capped at depth 5.
tree = DecisionTreeClassifier(random_state=1, max_depth=5)
tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=5, random_state=1)

In [6]:
from sklearn.metrics import accuracy_score

# Predict on the held-out rows and report the fraction classified correctly.
tree_pred = tree.predict(X_test)
tree_accuracy = accuracy_score(y_true=y_test, y_pred=tree_pred)
print("Accuracy = " + str(tree_accuracy))

Accuracy = 0.9038461538461539


Train and test the Decision Tree with the features selected in the EDA phase:

In [7]:
# Restrict the feature table to the columns selected in the EDA phase;
# the reduced table is used to train and test a second Decision Tree.
features = [
    'Polyuria',
    'Irritability',
    'sudden weight loss',
    'partial paresis',
    'Polyphagia',
    'visual blurring',
    'Polydipsia',
    'weakness',
    'Alopecia',
]
df_red = df.loc[:, features]
df_red.head()

Unnamed: 0,Polyuria,Irritability,sudden weight loss,partial paresis,Polyphagia,visual blurring,Polydipsia,weakness,Alopecia
0,0,0,0,0,0,0,1,1,1
1,0,0,0,1,0,1,0,1,1
2,1,0,0,0,1,0,0,1,1
3,0,0,1,0,1,0,0,1,0
4,1,1,1,1,1,1,1,1,1


In [8]:
# 80/20 split (seed 1) of the reduced feature table against the same target.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df_red.values,
    y,
    test_size=0.2,
    random_state=1,
)

# Depth-5 tree trained on the reduced feature set only.
tree_red = DecisionTreeClassifier(random_state=1, max_depth=5)
tree_red.fit(X_train2, y_train2)

DecisionTreeClassifier(max_depth=5, random_state=1)

In [9]:
# Accuracy of the reduced-feature tree on its own held-out rows.
tree_red_pred = tree_red.predict(X_test2)
tree_accuracy_red = accuracy_score(y_true=y_test2, y_pred=tree_red_pred)
print("Accuracy (red) = " + str(tree_accuracy_red))

Accuracy (red) = 0.875


Tune the decision tree's hyperparameters (`max_depth` and `max_features`) with a cross-validated grid search:

In [10]:
from sklearn.model_selection import cross_val_score, GridSearchCV

# Search grid: tree depths 1..10 combined with max_features values 4..13.
tree_params = {
    'max_depth': range(1, 11),
    'max_features': range(4, 14),
}

# n_jobs=-1 runs the candidate fits in parallel; verbose=True logs progress.
tree_grid = GridSearchCV(
    estimator=tree,
    param_grid=tree_params,
    n_jobs=-1,
    verbose=True,
)
tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 440 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    3.5s finished


GridSearchCV(estimator=DecisionTreeClassifier(max_depth=5, random_state=1),
             n_jobs=-1,
             param_grid={'max_depth': range(1, 11),
                         'max_features': range(4, 14)},
             verbose=True)

In [11]:
# Show the best parameter combination found by the grid search and its score;
# each label is printed on its own line, followed by the value.
print("Best params: ", tree_grid.best_params_, sep="\n")
print("Best score: ", tree_grid.best_score_, sep="\n")

Best params: 
{'max_depth': 10, 'max_features': 5}
Best score: 
0.9495123350545038


In [12]:

import sys

import pydotplus #pip install pydotplus
from sklearn.tree import export_graphviz

def tree_graph_to_png(tree, feature_names, png_file_to_save):
    """Render a fitted decision tree to a PNG file.

    The tree is exported to Graphviz DOT text with ``export_graphviz``
    (filled nodes), parsed with ``pydotplus`` and written out as a PNG.

    Parameters
    ----------
    tree
        Fitted tree estimator, passed straight to ``export_graphviz``.
    feature_names
        Iterable of column names used to label the split nodes, or ``None``
        to let ``export_graphviz`` use its default labels. A pandas Index is
        accepted; it is converted to a plain list before being passed on.
    png_file_to_save
        Destination path of the PNG. Missing parent folders are created.

    Raises
    ------
    InvocationException
        Raised by pydotplus when the Graphviz executables cannot be found
        (Graphviz must be installed separately and be on the PATH).
    """
    import os

    # Normalise to a plain list so the caller may hand over df.columns directly.
    names = None if feature_names is None else list(feature_names)

    tree_str = export_graphviz(tree, feature_names=names,
                               filled=True, out_file=None)
    graph = pydotplus.graph_from_dot_data(tree_str)

    # Make sure the target folder exists; otherwise the write below fails
    # with a file-not-found error on a fresh checkout.
    out_dir = os.path.dirname(png_file_to_save)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    graph.write_png(png_file_to_save)

In [13]:
# Draw the best estimator found by the tree grid search, labelling the
# nodes with df's column names, and save the picture under ./images.
tree_graph_to_png(
    tree=tree_grid.best_estimator_,
    feature_names=df.columns,
    png_file_to_save='./images/decision_tree.png',
)

InvocationException: GraphViz's executables not found

<img src="images/decision_tree.png">

# Random Forest model:

In [29]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the cross-validation score (and any later search that
# reuses `forest`) is reproducible on re-run; random_state=1 matches the
# seed used for the splits and decision trees elsewhere in this notebook.
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=1)

# Mean accuracy over the default cross-validation folds on the training set.
print(np.mean(cross_val_score(forest, X_train, y_train)))

0.9711417096959266


In [26]:
# Search grid for the forest: depths 6..11 combined with max_features 4..13.
forest_params = {
    'max_depth': range(6, 12),
    'max_features': range(4, 14),
}

# Cross-validated grid search over the forest, fitted on the training split.
forest_grid = GridSearchCV(
    estimator=forest,
    param_grid=forest_params,
    n_jobs=-1,
    verbose=True,
)
forest_grid.fit(X_train, y_train)

# Report the best combination and its score, label and value on separate lines.
print("Best params: ", forest_grid.best_params_, sep="\n")
print("Best score: ", forest_grid.best_score_, sep="\n")

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   11.2s finished
Best params: 
{'max_depth': 9, 'max_features': 4}
Best score: 
0.9639414802065405
