In [1]:
import pandas as pd
import numpy as np
import pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn import metrics

In [2]:
import os
os.chdir(r'C:\Users\Vish\Documents\Data\Dataset')
df = pd.read_csv("diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
#Check class imbalance
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [5]:
#Splitting the data into independent and dependent variables
X = df.iloc[:,:8]
y = df['Outcome'] 

In [6]:
#Evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [7]:
#Model building
model = DecisionTreeClassifier(criterion='entropy',splitter='best',max_depth=10,min_samples_split=2,min_samples_leaf=1,max_features='auto',max_leaf_nodes=2,random_state=1)
#criterion:{“gini”, “entropy”}, default=“gini”--The function to measure the quality of a split
#Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain
#splitter:{“best”, “random”}, default=“best”--The strategy used to choose the split at each node
#max_depth:{int, None}, default=None--The maximum depth of the tree
#min_samples_split:{int, float}, default=2--The minimum number of samples required to split an internal node
#min_samples_leaf:{int, float}, default=1--The minimum number of samples required to be at a leaf node
#max_features:{int, float, None, “auto”, “sqrt”, “log2”}, default=None--The number of features to consider when looking for the best split
#max_leaf_nodes:{int, None}, default=None--The maximum number of leaf nodes to consider
#min_impurity_decrease:float, default=0.0--Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.
#random_state:{int, RandomState}, default=None--Can be used for generating reproducible results
model = model.fit(X_train, y_train)

In [8]:
#Model building
#Fit the classifier
#model = DecisionTreeRegressor(criterion='entropy',splitter='best',max_depth=10,min_samples_split=2,min_samples_leaf=1,max_features='auto',max_leaf_nodes=2,random_state=1)
#criterion:{“mse”, “friedman_mse”, “mae”}, default=”mse”--The function to measure the quality of a split
#splitter:{“best”, “random”}, default=“best”--The strategy used to choose the split at each node
#max_depth:{int, None}, default=None--The maximum depth of the tree
#min_samples_split:{int, float}, default=2--The minimum number of samples required to split an internal node
#min_samples_leaf:{int, float}, default=1--The minimum number of samples required to be at a leaf node
#max_features:{int, float, None, “auto”, “sqrt”, “log2”}, default=None--The number of features to consider when looking for the best split
#max_leaf_nodes:{int, None}, default=None--The maximum number of leaf nodes to consider
#min_impurity_decrease:float, default=0.0--Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.
#random_state:{int, RandomState}, default=None--Can be used for generating reproducible results
#model = model.fit(X_train, y_train)

In [9]:
#Predictions
#The predict fuction converts probability values > .5 to 1 else 0
y_pred = model.predict(X_test)

In [10]:
#Evaluating model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Recall: ", metrics.precision_score(y_test,y_pred))
print("F1 score: ", metrics.f1_score(y_test,y_pred))

Accuracy:  0.6190476190476191
Recall:  0.4375
F1 score:  0.5268817204301075


In [11]:
# Using our own threshold
# decisions = (model.predict_proba(X_test) >= 0.6).astype(int)
# y_pred=decisions[:,1]
# print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
# print("Recall: ", metrics.precision_score(y_test,y_pred))
# print("F1 score: ", metrics.f1_score(y_test,y_pred))

In [12]:
#AUC ROC
# Auc roc works on class probabilities. Although it can work on predictions of 0,1 as well but it works well with probabilities
# class probabilities are indepedent of threshold because first class probabilites are calculated and then threshold is used
#to predict 0 and 1. So auc will remain same for all threshold values
# Using probs to generat class probabilities
# probs array return two array- probability for negative class and probability for positive class
probs = model.predict_proba(X_test)
probs

array([[0.81451613, 0.18548387],
       [0.81451613, 0.18548387],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.48788927, 0.51211073],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.48788927, 0.51211073],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.81451613, 0.18548387],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.81451613, 0.18548387],
       [0.48788927, 0.51211073],
       [0.

In [13]:
#Probability for positive class
y_pred_prob = probs[:, 1]
y_pred_prob

array([0.18548387, 0.18548387, 0.18548387, 0.51211073, 0.51211073,
       0.18548387, 0.51211073, 0.51211073, 0.18548387, 0.51211073,
       0.18548387, 0.51211073, 0.51211073, 0.18548387, 0.51211073,
       0.51211073, 0.51211073, 0.18548387, 0.51211073, 0.51211073,
       0.51211073, 0.18548387, 0.18548387, 0.51211073, 0.18548387,
       0.18548387, 0.18548387, 0.51211073, 0.18548387, 0.51211073,
       0.51211073, 0.18548387, 0.18548387, 0.18548387, 0.18548387,
       0.51211073, 0.51211073, 0.18548387, 0.18548387, 0.18548387,
       0.18548387, 0.18548387, 0.51211073, 0.18548387, 0.51211073,
       0.18548387, 0.18548387, 0.18548387, 0.18548387, 0.18548387,
       0.51211073, 0.18548387, 0.51211073, 0.51211073, 0.18548387,
       0.18548387, 0.51211073, 0.18548387, 0.18548387, 0.51211073,
       0.51211073, 0.51211073, 0.18548387, 0.18548387, 0.51211073,
       0.51211073, 0.51211073, 0.51211073, 0.51211073, 0.51211073,
       0.18548387, 0.51211073, 0.18548387, 0.51211073, 0.18548

In [14]:
#AUC ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
roc_auc = metrics.auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
# roc_auc = metrics.auc(fpr, tpr)
# print("Area under the ROC curve : %f" % roc_auc)

Area under the ROC curve : 0.630444


In [15]:
thresholds

array([1.51211073, 0.51211073, 0.18548387])

In [16]:
#Which Error is Costly??
i = np.arange(len(tpr)) # index for df
roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),'tpr' : pd.Series(tpr,
index = i),'1-fpr' : pd.Series(1-fpr, index = i)})
print(roc)

        fpr       tpr     1-fpr
0  0.000000  0.000000  1.000000
1  0.401274  0.662162  0.598726
2  1.000000  1.000000  0.000000


In [17]:
#Which Error is Costly??
#i = np.arange(len(tpr)) # index for df
roc_with_threshold = pd.DataFrame({'fpr' : pd.Series(fpr, index=thresholds),'tpr' : pd.Series(tpr,
index = thresholds),'1-fpr' : pd.Series(1-fpr, index = thresholds)})
print(roc_with_threshold)

               fpr       tpr     1-fpr
1.512111  0.000000  0.000000  1.000000
0.512111  0.401274  0.662162  0.598726
0.185484  1.000000  1.000000  0.000000
