# General Overview - Machine Learning

For this binary classification problem, we are using Logistic Regression, Decision Tree Classifier, Random Forest Classifier, and Gradient Boosting Classifier, Gaussian Naive Bayes. To assess the accuracy, we look at the accuracy scores and classification reports.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
import seaborn as sns

from sklearn.model_selection import (cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split,  
                                     KFold)
from sklearn.metrics import (classification_report,
                             confusion_matrix, 
                             accuracy_score)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.naive_bayes import GaussianNB

  import pandas.util.testing as tm


In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('diabetes_ml_scale.csv', index_col=0) # import data
diabetes = data.copy() # save a copy of data as diabetes

In [4]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64360 entries, 0 to 64359
Data columns (total 64 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gender                64360 non-null  int64  
 1   diabetesMed           64360 non-null  int64  
 2   time_in_hospital      64360 non-null  float64
 3   num_lab_procedures    64360 non-null  float64
 4   num_procedures        64360 non-null  float64
 5   num_medications       64360 non-null  float64
 6   num_outpatient        64360 non-null  float64
 7   num_inpatient         64360 non-null  float64
 8   num_diagnoses         64360 non-null  float64
 9   change                64360 non-null  int64  
 10  AfricanAmerican       64360 non-null  int64  
 11  Asian                 64360 non-null  int64  
 12  Caucasian             64360 non-null  int64  
 13  Hispanic              64360 non-null  int64  
 14  Other_race            64360 non-null  int64  
 15  [0-10)             

In [5]:
diabetes.head()

Unnamed: 0,gender,diabetesMed,time_in_hospital,num_lab_procedures,num_procedures,num_medications,num_outpatient,num_inpatient,num_diagnoses,change,...,Steady_pioglitazone,Up_pioglitazone,Down_rosiglitazone,Steady_rosiglitazone,Up_rosiglitazone,Down_insulin,No_insulin,Steady_insulin,Up_insulin,readmitted
0,1,1,0.166667,0.568627,0.0,0.435897,0.0,0.0,0.636364,1,...,0,0,0,0,0,0,0,0,1,YES
1,1,1,0.083333,0.098039,0.833333,0.307692,0.666667,0.5,0.363636,0,...,0,0,0,0,0,0,1,0,0,NO
2,0,1,0.083333,0.421569,0.166667,0.384615,0.0,0.0,0.454545,1,...,0,0,0,0,0,0,0,0,1,NO
3,0,1,0.0,0.490196,0.0,0.179487,0.0,0.0,0.272727,1,...,0,0,0,0,0,0,0,1,0,NO
4,0,1,0.166667,0.294118,1.0,0.384615,0.0,0.0,0.636364,0,...,0,0,0,0,0,0,0,1,0,YES


In [6]:
# separate variables
y = diabetes['readmitted'].values # target variable
X = diabetes.drop('readmitted', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(45052, 63) (45052,)
(19308, 63) (19308,)


# Logistic Regression

In [7]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.622458492408772
Accuracy Score, Test Set:  0.6203128237000207
Classification Report 

              precision    recall  f1-score   support

          NO       0.63      0.92      0.74     11655
         YES       0.57      0.17      0.26      7653

    accuracy                           0.62     19308
   macro avg       0.60      0.54      0.50     19308
weighted avg       0.60      0.62      0.55     19308



In [8]:
# confusion matrix
cm = confusion_matrix(y_test, logreg_pred)
print('Confusion Matrix \n', cm)

Confusion Matrix 
 [[10670   985]
 [ 6346  1307]]


## parameter tuning

In [9]:
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [10]:
# parameter tuning - C
parameters = {'C': [0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(logreg, parameters, cv=5, verbose=1, n_jobs=-1)
clf.fit(X, y).best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   10.4s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'C': 0.1}

In [11]:
# using new parameters
logreg = LogisticRegression(random_state=42, C=0.1)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: {}'.format(logreg.score(X_train, y_train)))
print('Accuracy Score, Test Set: {}'.format(logreg.score(X_test, y_test)))

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set: 0.6223475095445263
Accuracy Score, Test Set: 0.6206753677232235
Classification Report 

              precision    recall  f1-score   support

          NO       0.63      0.92      0.74     11655
         YES       0.57      0.17      0.26      7653

    accuracy                           0.62     19308
   macro avg       0.60      0.54      0.50     19308
weighted avg       0.61      0.62      0.55     19308



In [12]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, logreg_pred))

Confusion Matrix 
 [[10697   958]
 [ 6366  1287]]


# Decision Tree Classifier

In [13]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: {}'.format(decision_tree.score(X_train, y_train)))
print('Accuracy Score, Test Set: {}'.format(decision_tree.score(X_test, y_test)))

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred)) # very overfitted

Accuracy Score, Training Set: 0.9999334102814526
Accuracy Score, Test Set: 0.5475968510461985
Classification Report 

              precision    recall  f1-score   support

          NO       0.63      0.62      0.62     11655
         YES       0.43      0.44      0.43      7653

    accuracy                           0.55     19308
   macro avg       0.53      0.53      0.53     19308
weighted avg       0.55      0.55      0.55     19308



In [14]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, decision_tree_pred))

Confusion Matrix 
 [[7226 4429]
 [4306 3347]]


The Decision Tree Classifier training set is very over fitted compared to the test set, which means that we have to tune the model's parameters so that it is learning the data instead of memorizing it.

## parameter tuning

We are focused on the criterion, max_depth, and min_samples_split parameters.

In [15]:
decision_tree.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 42,
 'splitter': 'best'}

In [16]:
# hyperparameter tuning
criterion = ['gini', 'entropy']
max_depth = [2, 4, 5, 8, 10]
min_samples_split = [2, 8, 14, 16]
min_samples_leaf = [1, 2, 4, 6]

parameters = dict(max_depth=max_depth, criterion=criterion, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)

dt_gs = GridSearchCV(decision_tree, parameters, cv=5, verbose=1, n_jobs=-1)
dt_gs.fit(X_train, y_train).best_params_

Fitting 5 folds for each of 160 candidates, totalling 800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 800 out of 800 | elapsed:  1.4min finished


{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 4,
 'min_samples_split': 2}

In [17]:
# using new parameters - overfitting problem reduced
decision_tree = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=5, min_samples_leaf=4, min_samples_split=2)
decision_tree.fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred))

Accuracy Score, Training Set: 0.6267646275415076
Accuracy Score, Test Set: 0.6206753677232235
Classification Report 

              precision    recall  f1-score   support

          NO       0.63      0.90      0.74     11655
         YES       0.56      0.19      0.28      7653

    accuracy                           0.62     19308
   macro avg       0.60      0.55      0.51     19308
weighted avg       0.60      0.62      0.56     19308



In [18]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, decision_tree_pred))

Confusion Matrix 
 [[10540  1115]
 [ 6209  1444]]


## cross validation - k folds

In [19]:
kf = KFold(n_splits=5) # split into 5 folds 
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):
    print('Iteration: ')
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    decision_tree = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=5, min_samples_leaf=4, min_samples_split=2)
    decision_tree.fit(X_train, y_train)
    decision_tree_pred = decision_tree.predict(X_test)
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, decision_tree_pred))

Iteration: 
Accuracy Score, Training Set: 0.6304187383467993
Accuracy Score, Test Set: 0.598430702299565
Classification Report 

              precision    recall  f1-score   support

          NO       0.59      0.94      0.73      7305
         YES       0.66      0.15      0.24      5567

    accuracy                           0.60     12872
   macro avg       0.63      0.54      0.48     12872
weighted avg       0.62      0.60      0.52     12872

Iteration: 
Accuracy Score, Training Set: 0.6132885332504662
Accuracy Score, Test Set: 0.6629894344313239
Classification Report 

              precision    recall  f1-score   support

          NO       0.67      0.92      0.78      8192
         YES       0.60      0.22      0.32      4680

    accuracy                           0.66     12872
   macro avg       0.64      0.57      0.55     12872
weighted avg       0.65      0.66      0.61     12872

Iteration: 
Accuracy Score, Training Set: 0.6459174953387197
Accuracy Score, Test Set: 

# Gradient Boosting Classifier

In [20]:
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', gbc.score(X_train, y_train))
print('Accuracy Score, Test Set:', gbc.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, gbc_pred))

Accuracy Score, Training Set: 0.6192316656308267
Accuracy Score, Test Set: 0.6674953387197017
Classification Report 

              precision    recall  f1-score   support

          NO       0.75      0.80      0.78      9267
         YES       0.39      0.33      0.35      3605

    accuracy                           0.67     12872
   macro avg       0.57      0.56      0.57     12872
weighted avg       0.65      0.67      0.66     12872



In [21]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, gbc_pred))

Confusion Matrix 
 [[7420 1847]
 [2433 1172]]


## parameter tuning

In [22]:
gbc.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': 42,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [23]:
# abbreviated parameter tuning
n_estimators = [20, 50, 80, 100, 150]
max_depth = [4, 6, 8, 10, 14]

parameters = dict(n_estimators=n_estimators, max_depth=max_depth)

gbc_gs = GridSearchCV(gbc, parameters, cv=5, verbose=1, n_jobs=-1)
gbc_gs.fit(X_train, y_train).best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed: 25.1min finished


{'max_depth': 4, 'n_estimators': 150}

In [24]:
# using new parameters
gbc = GradientBoostingClassifier(random_state=42, max_depth=4, n_estimators=150)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', gbc.score(X_train, y_train))
print('Accuracy Score, Test Set:', gbc.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, gbc_pred))

Accuracy Score, Training Set: 0.6361482287134866
Accuracy Score, Test Set: 0.6523461777501554
Classification Report 

              precision    recall  f1-score   support

          NO       0.76      0.76      0.76      9267
         YES       0.38      0.38      0.38      3605

    accuracy                           0.65     12872
   macro avg       0.57      0.57      0.57     12872
weighted avg       0.65      0.65      0.65     12872



In [25]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, gbc_pred))

Confusion Matrix 
 [[7028 2239]
 [2236 1369]]


# Random Forest Classifier

In [26]:
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', forest.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, forest_pred)) # very overfitted

Accuracy Score, Training Set: 0.999941733996271
Accuracy Score, Test Set: 0.6208048477315102
Classification Report 

              precision    recall  f1-score   support

          NO       0.75      0.70      0.73      9267
         YES       0.35      0.41      0.38      3605

    accuracy                           0.62     12872
   macro avg       0.55      0.56      0.55     12872
weighted avg       0.64      0.62      0.63     12872



In [27]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, forest_pred))

Confusion Matrix 
 [[6510 2757]
 [2124 1481]]


The original Random Forest Classifier has an over fitting problem with the training set. This can happen and therefore, we have to tune the model's parameters since it looks like the model just memorized sample structures rather than learning them.

## parameter tuning

The parameters we are tuning to optimize our model are criterion, max_depth and n_estimators.

In [28]:
forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [29]:
# abbreviated parameter tuning
criterion = ['gini', 'entropy']
n_estimators = [50, 80, 100, 140, 200]
max_depth = [10, 15, 18, 20, 30]

parameters = dict(criterion=criterion, n_estimators=n_estimators, max_depth=max_depth)

forest_gs = GridSearchCV(forest, parameters, cv=5, verbose=1, n_jobs=-1)
forest_gs.fit(X_train, y_train).best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  8.3min finished


{'criterion': 'entropy', 'max_depth': 15, 'n_estimators': 100}

In [30]:
# using new parameters
forest = RandomForestClassifier(random_state=42, criterion='entropy', max_depth=15, n_estimators=100)
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', forest.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, forest_pred))

Accuracy Score, Training Set: 0.7289077066500932
Accuracy Score, Test Set: 0.6745649471721566
Classification Report 

              precision    recall  f1-score   support

          NO       0.75      0.83      0.79      9267
         YES       0.39      0.28      0.33      3605

    accuracy                           0.67     12872
   macro avg       0.57      0.55      0.56     12872
weighted avg       0.65      0.67      0.66     12872



In [31]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, forest_pred))

Confusion Matrix 
 [[7671 1596]
 [2593 1012]]


## cross validation - k folds

In [32]:
kf = KFold(n_splits=5) # split into 5 folds 
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):
    print('Iteration: ')
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    forest = RandomForestClassifier(random_state=42, criterion='entropy', max_depth=15, n_estimators=100)
    forest.fit(X_train, y_train)
    forest_pred = forest.predict(X_test)
    
    # accuracy scores
    print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
    print('Accuracy Score, Test Set:', forest.score(X_test, y_test))
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, forest_pred))

Iteration: 
Accuracy Score, Training Set: 0.7126320696084525
Accuracy Score, Test Set: 0.5918272218769423
Classification Report 

              precision    recall  f1-score   support

          NO       0.59      0.96      0.73      7305
         YES       0.68      0.11      0.18      5567

    accuracy                           0.59     12872
   macro avg       0.63      0.53      0.46     12872
weighted avg       0.63      0.59      0.49     12872

Iteration: 
Accuracy Score, Training Set: 0.7108452454940957
Accuracy Score, Test Set: 0.6518023617153511
Classification Report 

              precision    recall  f1-score   support

          NO       0.66      0.91      0.77      8192
         YES       0.56      0.19      0.29      4680

    accuracy                           0.65     12872
   macro avg       0.61      0.55      0.53     12872
weighted avg       0.63      0.65      0.59     12872

Iteration: 
Accuracy Score, Training Set: 0.7194103480422622
Accuracy Score, Test Set:

# Gaussian Naive Bayes

In [33]:
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
gaussian_pred = gaussian.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

# classification report
print('Classification Report \n')
print(classification_report(y_test, gaussian_pred))

Accuracy Score, Training Set: 0.5899432877563704
Accuracy Score, Test Set: 0.5886420136730889
Classification Report 

              precision    recall  f1-score   support

          NO       0.77      0.61      0.68      9267
         YES       0.35      0.54      0.42      3605

    accuracy                           0.59     12872
   macro avg       0.56      0.57      0.55     12872
weighted avg       0.65      0.59      0.61     12872



In [34]:
# confusion matrix
print('Confusion Matrix \n', confusion_matrix(y_test, gaussian_pred))

Confusion Matrix 
 [[5642 3625]
 [1670 1935]]
