<a href="https://colab.research.google.com/github/athresh/ml-for-engineers/blob/master/Evaluation_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Diabetes CSV hosted in the course repository; read straight from GitHub.
url = 'https://raw.githubusercontent.com/athresh/ml-for-engineers/master/datasets/diabetes.csv'
data = pd.read_csv(filepath_or_buffer=url)

# Summary statistics, transposed so each dataset column becomes one row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [None]:
# One histogram per column of the raw frame, drawn on a 30x30 inch canvas.
hist = data.hist(figsize=(30, 30))

In [None]:
# Work on a copy so the raw `data` frame stays available for comparison.
data_cleaned = data.copy(deep=True)

# In these columns a value of 0 is treated as "missing" rather than as a real
# measurement, so it is converted to NaN before imputing.
# (`np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x.)
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data_cleaned[zero_as_missing] = data_cleaned[zero_as_missing].replace(0, np.nan)

# Impute each column with its own statistic (mean/median skip NaN by default).
# A single fillna with a per-column mapping, assigned back to the frame, avoids
# `data_cleaned[col].fillna(..., inplace=True)`, which operates on a temporary
# selection and does not update the frame under pandas Copy-on-Write.
# NOTE(review): the statistics are computed on the full dataset before the
# train/test split, so test rows influence the imputed values - confirm this
# is acceptable for the evaluation being demonstrated.
data_cleaned = data_cleaned.fillna({
    'Glucose': data_cleaned['Glucose'].mean(),
    'BloodPressure': data_cleaned['BloodPressure'].mean(),
    'SkinThickness': data_cleaned['SkinThickness'].median(),
    'Insulin': data_cleaned['Insulin'].median(),
    'BMI': data_cleaned['BMI'].median(),
})

data_cleaned.describe().T

In [None]:
# Same histogram grid as for the raw frame, now drawn from the imputed data.
hist = data_cleaned.hist(figsize=(30, 30))

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Feature columns and the single target column.
cols_X = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
cols_y = ['Outcome']
X, y = data_cleaned[cols_X], data_cleaned[cols_y[0]]

# 80/20 split, stratified on the target so both partitions keep the same
# class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# random_state is fixed because the tree learner permutes candidate features
# at each split; without a seed, equally good splits can be resolved
# differently between runs and the fitted tree is not reproducible.
model = tree.DecisionTreeClassifier(max_depth=3, random_state=42)
model = model.fit(X_train, y_train)
print("Training complete")

In [None]:
import graphviz

# class_names must be a sequence with one label per class, in ascending class
# order. Passing the bare string 'Outcome' would be indexed character by
# character (classes shown as "O" and "u") or rejected outright by parameter
# validation, so build one readable label per fitted class instead.
class_labels = ['{}={}'.format(cols_y[0], label) for label in model.classes_]
dot_data = tree.export_graphviz(
    model,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=cols_X,
    class_names=class_labels,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Predict on both partitions with the fitted tree.
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

# Fraction of correctly classified rows in each partition.
acc_train = metrics.accuracy_score(y_true=y_train, y_pred=pred_train)
acc_test = metrics.accuracy_score(y_true=y_test, y_pred=pred_test)

print("Accuracy on test set={}".format(acc_test))
print("Accuracy on train set={}".format(acc_train))

In [None]:
# Text report of per-class metrics for the held-out test predictions.
report_test = metrics.classification_report(y_true=y_test, y_pred=pred_test)
print(report_test)

In [None]:
# Same report for the training predictions, for comparison with the test report.
report_train = metrics.classification_report(y_true=y_train, y_pred=pred_train)
print(report_train)

In [None]:
import seaborn as sns
# Confusion matrix for the test predictions, wrapped in a DataFrame for plotting.
confusion_matrix_test = metrics.confusion_matrix(y_true=y_test, y_pred=pred_test)
confusion_frame = pd.DataFrame(confusion_matrix_test)

# Annotated heatmap; fmt="g" prints the cell counts in general number format.
p = sns.heatmap(confusion_frame, annot=True, fmt="g", cmap="YlGnBu")
plt.title('Confusion matrix')
plt.ylabel('Actual value')
plt.xlabel('Predicted value')

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, auc
# Second column of predict_proba: the predicted probability for the class
# listed second in the model's classes.
y_pred_prob = model.predict_proba(X_test)[:, 1]

# ROC points over all thresholds, and the area under that curve.
fpr, tpr, thr = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

lw = 2
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=lw, label='ROC curve(area={})'.format(roc_auc))
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=lw)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold

# Choose max_depth by 10-fold stratified cross-validation on the training
# split only; the test split is scored once, after the depth has been chosen.
kf = StratifiedKFold(n_splits=10)
cv_accuracy = {}  # max_depth -> mean validation accuracy across the folds

for max_depth in range(1, 20):
    acc = []
    for train_index, val_index in kf.split(X_train, y_train):
        # kf.split yields positional indices, hence .iloc.
        X_train_cv, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_cv, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        # Fold models get their own name so the notebook-level `model` is not
        # overwritten by a partially trained candidate; the fixed seed makes
        # the depth comparison repeatable from run to run.
        fold_model = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        fold_model.fit(X_train_cv, y_train_cv)
        pred_val = fold_model.predict(X_val)
        acc.append(metrics.accuracy_score(y_val, pred_val))
    cv_accuracy[max_depth] = np.mean(acc)
    print("Accuracy with max tree depth of {} = {}".format(max_depth, cv_accuracy[max_depth]))

# max() returns the first key with the highest score, so ties are resolved in
# favour of the shallower tree and max_depth_best is always defined.
max_depth_best = max(cv_accuracy, key=cv_accuracy.get)
acc_max = cv_accuracy[max_depth_best]
print("Best max tree depth value based on cross validation = {}".format(max_depth_best))

# Refit on the whole training split with the selected depth, then evaluate
# once on the held-out test split.
model = tree.DecisionTreeClassifier(max_depth=max_depth_best, random_state=42)
model = model.fit(X_train, y_train)
pred_test = model.predict(X_test)
acc_test = metrics.accuracy_score(y_test, pred_test)
print("Accuracy on test set = {}".format(acc_test))

In [None]:
# Turn predicted probabilities into hard labels with an explicit, tunable
# decision threshold (0.5 is the value this cell has always used).
threshold = 0.5
# Cast to int so the predictions use the same 0/1 label type as y_test;
# boolean predictions would leave the report relying on mixed label dtypes
# being coerced correctly.
pred_test = (model.predict_proba(X_test)[:, 1] >= threshold).astype(int)
report_test = metrics.classification_report(y_test, pred_test)
print(report_test)