In [61]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score,classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV 

In [62]:
# Load the diabetes dataset from a CSV file located in the working directory.
diabetes = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [63]:
# Peek at the first ten rows.
diabetes.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [64]:
# Peek at the last five rows.
diabetes.tail(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [65]:
# Column names, non-null counts and dtypes of the loaded frame.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [66]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
# NOTE(review): the displayed minima of Glucose, BloodPressure, SkinThickness,
# Insulin and BMI are 0, which presumably encodes missing measurements rather
# than real values — confirm before relying on these columns.
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [67]:
# Histogram with a KDE overlay for each feature, i.e. every column except the
# last one (the `Outcome` target).
import seaborn as sns

for feature in diabetes.columns[:-1]:
    sns.displot(data=diabetes, x=feature, kde=True)

In [68]:
# Class balance of the target: one bar per Outcome value, bar height = row count.
# `x` and `y` are passed by keyword because recent seaborn releases accept only
# `data` positionally; the counts are computed once and reused for both axes.
outcome_counts = diabetes['Outcome'].value_counts()
sns.barplot(x=outcome_counts.index, y=outcome_counts.to_numpy()).set_title('Outcome')



Text(0.5, 1.0, 'Outcome')

In [69]:
# Distribution of patient age on its own (histogram plus KDE curve).
sns.displot(x="Age", data=diabetes, kde=True)

<seaborn.axisgrid.FacetGrid at 0x1d7a83e4df0>

In [70]:
# Scatter-plot matrix of all column pairs, coloured by the Outcome class.
# The trailing semicolon suppresses the PairGrid repr in the cell output.
sns.pairplot(data=diabetes, hue="Outcome");

In [71]:
# Box plots of every column of the raw frame, used to spot outliers visually
# before any rows are removed.
sns.set(style="whitegrid")
diabetes.boxplot(figsize=(15,6))

<AxesSubplot:xlabel='Age', ylabel='Density'>

In [72]:
# Remove outliers with the 1.5 * IQR rule, applied column by column.
# Every column of `diabetes` takes part in the check, including `Outcome`.
Q1 = diabetes.quantile(0.25)
Q3 = diabetes.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Flag each value lying outside its column's fences, then keep only the rows
# in which no value is flagged.
outlier_mask = (diabetes < lower_fence) | (diabetes > upper_fence)
df_out = diabetes[~outlier_mask.any(axis=1)]

# Shapes before and after filtering.
diabetes.shape, df_out.shape

((768, 9), (639, 9))

In [73]:
# Box plots of the outlier-filtered frame `df_out`, to check what remains
# after the IQR-based row removal.
sns.set(style="whitegrid")
df_out.boxplot(figsize=(15,6))

<AxesSubplot:xlabel='Age', ylabel='Density'>

In [74]:
# Distributions of every feature after deleting the outliers.
# The filtered frame `df_out` is plotted here; plotting `diabetes` would simply
# reproduce the pre-filter histograms and hide the effect of the row removal.
import seaborn as sns

for feature in df_out.columns[:-1]:
    sns.displot(data=df_out, x=feature, kde=True)

In [75]:
import matplotlib.pyplot as plt

# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
# NOTE(review): this uses the unfiltered `diabetes` frame, whereas the models
# further down are trained on `df_out` — confirm that is intended.
matrice_corr = diabetes.corr()

f, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(data=matrice_corr, annot=True, fmt=".1f", linewidths=0.5, ax=ax)

<AxesSubplot:>

In [76]:
# Correlation of every column with the target variable.
cor_target = matrice_corr.loc[:, "Outcome"]

# Rank the columns from highest to lowest correlation with Outcome.
# Nothing is filtered out here; the full ranking is kept and printed.
relevant_features = cor_target.sort_values(ascending=False)

print(relevant_features)

Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64


In [77]:
# Split the outlier-filtered frame into the feature matrix and the target vector.
X = df_out.drop(columns=["Outcome"])
Y = df_out["Outcome"]

In [78]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [79]:
# Display the training feature matrix (pandas truncates the rendering of long frames).
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
722,1,149,68,29,127,29.3,0.349,42
213,0,140,65,26,130,42.6,0.431,24
343,5,122,86,0,0,34.7,0.290,33
306,10,161,68,23,132,25.5,0.326,47
304,3,150,76,0,0,21.0,0.207,37
...,...,...,...,...,...,...,...,...
91,4,123,80,15,176,32.0,0.443,34
132,3,170,64,37,225,34.5,0.356,30
328,2,102,86,36,120,45.5,0.127,23
527,3,116,74,15,105,26.3,0.107,24


In [80]:
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a decision tree with default (unconstrained) hyper-parameters.
# The seed is fixed so that re-running the notebook reproduces the same tree and
# the same scores; 0 matches the seed used for the grid-searched tree `dt`.
clf = DecisionTreeClassifier(random_state=0)
clf_model = clf.fit(X_train, y_train)

# Predictions on the held-out split and on the training split.
y_pred = clf_model.predict(X_test)
x_pred = clf_model.predict(X_train)  # training-set predictions, despite the name

In [81]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Baseline tree evaluated on the data it was fitted on.
print("Train Accuracy :", accuracy_score(y_train, x_pred))
print("Train Confusion Matrix:")
print(confusion_matrix(y_train, x_pred))
report = classification_report(y_train, x_pred)
print(report)
print('------------------------------------------------------')
# Same metrics on the held-out split. The confusion matrix is labelled so the
# test section mirrors the train section and the tuned-model report.
print('Test Accuracy :', accuracy_score(y_test, y_pred))
print("Test Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
report = classification_report(y_test, y_pred)
print(report)

Train Accuracy : 1.0
Train Confusion Matrix:
[[347   0]
 [  0 164]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       347
           1       1.00      1.00      1.00       164

    accuracy                           1.00       511
   macro avg       1.00      1.00      1.00       511
weighted avg       1.00      1.00      1.00       511

------------------------------------------------------
Test Accuracy : 0.75
[[73 19]
 [13 23]]
              precision    recall  f1-score   support

           0       0.85      0.79      0.82        92
           1       0.55      0.64      0.59        36

    accuracy                           0.75       128
   macro avg       0.70      0.72      0.70       128
weighted avg       0.76      0.75      0.76       128



In [82]:
# Unfitted, seeded decision tree used as the base estimator for the grid search.
dt = DecisionTreeClassifier(random_state=0)

In [83]:
# Hyper-parameter grid: 5 depths x 5 leaf sizes x 1 criterion = 25 candidates.
params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100],
    criterion=["entropy"],
)

In [84]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params` with 4-fold cross-validation on the training
# split, ranking candidates by accuracy and running the fits in parallel.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 25 candidates, totalling 100 fits


In [85]:
# Show the estimator that the grid search selected as best.
grid_search.best_estimator_

In [86]:
# Keep a handle on the tuned tree for evaluation.
dt_best = grid_search.best_estimator_

In [87]:
# Score the tuned tree on both splits. Each split is predicted once and the
# predictions are reused for accuracy, confusion matrix and report.
train_pred = dt_best.predict(X_train)
test_pred = dt_best.predict(X_test)

print("Train Accuracy :", accuracy_score(y_train, train_pred))
print("Train Confusion Matrix:")
print(confusion_matrix(y_train, train_pred))
report = classification_report(y_train, train_pred)
print(report)
print('-------------------------------------------------------')
print("Test Accuracy :", accuracy_score(y_test, test_pred))
print("Test Confusion Matrix:")
print(confusion_matrix(y_test, test_pred))
report = classification_report(y_test, test_pred)
print(report)

Train Accuracy : 0.7808219178082192
Train Confusion Matrix:
[[332  15]
 [ 97  67]]
              precision    recall  f1-score   support

           0       0.77      0.96      0.86       347
           1       0.82      0.41      0.54       164

    accuracy                           0.78       511
   macro avg       0.80      0.68      0.70       511
weighted avg       0.79      0.78      0.76       511

-------------------------------------------------------
Test Accuracy : 0.78125
Test Confusion Matrix:
[[88  4]
 [24 12]]
              precision    recall  f1-score   support

           0       0.79      0.96      0.86        92
           1       0.75      0.33      0.46        36

    accuracy                           0.78       128
   macro avg       0.77      0.64      0.66       128
weighted avg       0.78      0.78      0.75       128



In [88]:
import pickle

Pkl_Filename = "model_tree.pkl"

# Persist the grid-searched tree `dt_best`, which is the model this notebook
# selected and evaluated last. Dumping `clf` would instead save the untuned
# baseline tree that scored 1.0 on its own training data.
with open(Pkl_Filename, 'wb') as model_file:
    pickle.dump(dt_best, model_file)

In [89]:
# Load the pickled tree model back (only unpickle files from a trusted source):
#with open(Pkl_Filename, 'rb') as file:
    #Pickled_Tree_Model = pickle.load(file)
#Pickled_Tree_Model