In [3]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

- load diabetes data set

In [4]:
df = pd.read_csv('diabetes_data_train_710samples.csv')
df.head()
df.columns

Index(['Row', 'preg_count', 'glucose_concentration', 'diastolic_bp',
       'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
       'diabetes_pedi', 'age', 'diabetes_class'],
      dtype='object')

- select features and target

In [5]:
X = df[['preg_count', 'glucose_concentration', 'diastolic_bp',
        'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
        'diabetes_pedi', 'age']]
y = df['diabetes_class']

- split the data set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

- Use DecisionTree (Supervised learning)

In [8]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [9]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
y_pred = dtc.predict(X_test)

In [11]:
from sklearn.metrics import confusion_matrix

In [12]:
dtc.score(X_test, y_test)

0.7323943661971831

- Use Random Forest (Supervised learning)

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
tf = RandomForestClassifier()

In [15]:
tf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
y_pred = dtc.predict(X_test)

In [19]:
from sklearn.metrics import confusion_matrix
tf.score(X_test, y_test)

0.7230046948356808

- Using Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

- predict the value

In [14]:
y_pred = lr.predict(X_test)

In [15]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1])

- metrics for LR  - lr.score(X_test, y_test)

In [33]:
# Lets cacluate the Random Guess Probability
# And we will compare with the prediction percentage returned by model
len(df[df['diabetes_class'] == 0]), len(df), len(df[df['diabetes_class'] == 0])/len(df)

(464, 710, 0.6535211267605634)

In [20]:
check = pd.DataFrame({'actual':y_test, 'predicted':y_pred})

In [21]:
len(check[check['actual']==check['predicted']]), len(check)

(161, 213)

In [22]:
lr.score(X_test, y_test)

0.755868544600939

- another metric confusion matrix 
- import confustion_matrix from sklearn.metrics
- tn, fp, fn, tp

In [24]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred, y_test)

array([[119,  34],
       [ 18,  42]])

In [50]:
#True Negatives, False Positives, False Negatives, True Positives
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp, (tn + tp)/(tn + tp + fn + fp)

(119, 18, 34, 42, 0.755868544600939)

- change threshold by running predict_proba instead of predict

In [36]:
y_prob_pred = lr.predict_proba(X_test)

In [37]:
y_prob_pred

array([[0.94570823, 0.05429177],
       [0.75411808, 0.24588192],
       [0.65761011, 0.34238989],
       [0.90761056, 0.09238944],
       [0.84579983, 0.15420017],
       [0.74861263, 0.25138737],
       [0.79381964, 0.20618036],
       [0.5536263 , 0.4463737 ],
       [0.54445347, 0.45554653],
       [0.39282426, 0.60717574],
       [0.80562053, 0.19437947],
       [0.82433531, 0.17566469],
       [0.38413812, 0.61586188],
       [0.46088696, 0.53911304],
       [0.80631931, 0.19368069],
       [0.25600877, 0.74399123],
       [0.92965161, 0.07034839],
       [0.60542468, 0.39457532],
       [0.90150747, 0.09849253],
       [0.60760943, 0.39239057],
       [0.93181777, 0.06818223],
       [0.07900526, 0.92099474],
       [0.8849568 , 0.1150432 ],
       [0.3182982 , 0.6817018 ],
       [0.31561487, 0.68438513],
       [0.61286625, 0.38713375],
       [0.44983518, 0.55016482],
       [0.18678482, 0.81321518],
       [0.28406564, 0.71593436],
       [0.82180096, 0.17819904],
       [0.

In [39]:
print(y_prob_pred[:5])
print(y_test[:5])

[[0.94570823 0.05429177]
 [0.75411808 0.24588192]
 [0.65761011 0.34238989]
 [0.90761056 0.09238944]
 [0.84579983 0.15420017]]
81     0
77     0
216    1
225    0
654    0
Name: diabetes_class, dtype: int64


In [42]:
y_my_pred = [ 0 if p < 0.3 else 1 for n,p in y_prob_pred ]

In [44]:
tn, fp, fn, tp = confusion_matrix(y_my_pred, y_test).ravel()
tn, fp, fn, tp, (tn + tp)/(tn + tp + fn + fp)

(87, 10, 50, 66, 0.7183098591549296)

In [63]:
# Target reducing False Negatives... Make it 0
df2 = pd.DataFrame([],  columns =  ["Probability", "True Negatives", "False Positives", "False Negatives", "True Positives"])
prob_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.65, 0.8]
for prob_value in prob_values:
    y_my_pred = [ 0 if p < prob_value else 1 for n,p in y_prob_pred ]
    tn, fp, fn, tp = confusion_matrix(y_test, y_my_pred).ravel()
    df2.loc[-1] = [prob_value, tn, fp, fn, tp]
    df2.index = df2.index + 1
    df2 = df2.sort_index()
    print(prob_value, tn, fp, fn, tp, (tn + tp)/(tn + tp + fn + fp))

0.1 20 117 0 76 0.4507042253521127
0.2 60 77 6 70 0.6103286384976526
0.3 87 50 10 66 0.7183098591549296
0.4 106 31 27 49 0.7276995305164319
0.5 119 18 34 42 0.755868544600939
0.6 125 12 40 36 0.755868544600939
0.65 129 8 46 30 0.7464788732394366
0.8 135 2 75 1 0.6384976525821596


In [64]:
df2

Unnamed: 0,Probability,True Negatives,False Positives,False Negatives,True Positives
0,0.8,135.0,2.0,75.0,1.0
1,0.65,129.0,8.0,46.0,30.0
2,0.6,125.0,12.0,40.0,36.0
3,0.5,119.0,18.0,34.0,42.0
4,0.4,106.0,31.0,27.0,49.0
5,0.3,87.0,50.0,10.0,66.0
6,0.2,60.0,77.0,6.0,70.0
7,0.1,20.0,117.0,0.0,76.0
