# General Overview - Machine Learning

For this binary classification problem, we are using logistic regression, decision tree classifier, random forest classifier, and naive bayes. To assess the accuracy of our models, we are looking at the accuracy scores and classification reports.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (classification_report,
                             confusion_matrix)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('diabetes_ml.csv', index_col=0) # import data
diabetes = data.copy() # save a copy of data as diabetes

In [4]:
diabetes.head()

Unnamed: 0,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,num_outpatient,num_emergency,num_inpatient,num_diagnoses,...,No_insulin,Steady_insulin,Up_insulin,Elective,Emergency,Newborn,Trauma Center,Unknown_admission_type,Urgent,readmitted
0,1,5,1,41,0,1,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,1,15,3,59,0,18,0,0,0,9,...,0,0,1,0,1,0,0,0,0,1
2,1,25,2,11,5,13,2,0,1,6,...,1,0,0,0,1,0,0,0,0,0
3,0,35,2,44,1,16,0,0,0,7,...,0,0,1,0,1,0,0,0,0,0
4,0,45,1,51,0,8,0,0,0,5,...,0,1,0,0,1,0,0,0,0,0


In [5]:
# add the categorical column
diabetes['readmitted'] = diabetes['readmitted'].replace({0: 'No', 1: 'Yes'})

In [6]:
# independent and target variables
y = diabetes['readmitted'].values # target variable
X = diabetes.drop('readmitted', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(50763, 87) (50763,)
(16921, 87) (16921,)


# Baseline - DummyClassifier

In [7]:
# using the stratified strategy
stratified = DummyClassifier(strategy='stratified', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(stratified.score(X_test, y_test)))

Accuracy Score: 0.5177589976951716


In [8]:
# using the most frequent strategy
frequent = DummyClassifier(strategy='most_frequent', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(frequent.score(X_test, y_test)))

Accuracy Score: 0.5894450682583772


In [9]:
# using the uniform strategy
uniform = DummyClassifier(strategy='uniform', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(uniform.score(X_test, y_test)))

Accuracy Score: 0.49341055493174163


# Logistic Regression

In [19]:
logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, logreg_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Score, Training Set:  0.6203731063963911
Accuracy Score, Test Set:  0.6238992967318716
Confusion Matrix 
 [[8797 1177]
 [5187 1760]]
Classification Report 

              precision    recall  f1-score   support

          No       0.63      0.88      0.73      9974
         Yes       0.60      0.25      0.36      6947

    accuracy                           0.62     16921
   macro avg       0.61      0.57      0.55     16921
weighted avg       0.62      0.62      0.58     16921



In [11]:
# parameter tuning

# Decision Tree Classifier

In [12]:
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

# confusion matrix

cm = confusion_matrix(y_test, decision_tree_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred))

Accuracy Score, Training Set: 0.999960601225302
Accuracy Score, Test Set: 0.5482536493115064
Confusion Matrix 
 [[6042 3932]
 [3712 3235]]
Classification Report 

              precision    recall  f1-score   support

          No       0.62      0.61      0.61      9974
         Yes       0.45      0.47      0.46      6947

    accuracy                           0.55     16921
   macro avg       0.54      0.54      0.54     16921
weighted avg       0.55      0.55      0.55     16921



In [13]:
# parameter tuning

# Random Forest Classifier

In [14]:
forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = forest.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', forest.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, y_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, y_pred))

Accuracy Score, Training Set: 0.999960601225302
Accuracy Score, Test Set: 0.6208261923054194
Confusion Matrix 
 [[8276 1698]
 [4718 2229]]
Classification Report 

              precision    recall  f1-score   support

          No       0.64      0.83      0.72      9974
         Yes       0.57      0.32      0.41      6947

    accuracy                           0.62     16921
   macro avg       0.60      0.58      0.57     16921
weighted avg       0.61      0.62      0.59     16921



In [15]:
# parameter tuning

# Gaussian Naive Bayes

In [16]:
gaussian = GaussianNB().fit(X_train, y_train)
gaussian_pred = gaussian.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, gaussian_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, gaussian_pred))

Accuracy Score, Training Set: 0.48675216200776156
Accuracy Score, Test Set: 0.4904556468293836
Confusion Matrix 
 [[2403 7571]
 [1051 5896]]
Classification Report 

              precision    recall  f1-score   support

          No       0.70      0.24      0.36      9974
         Yes       0.44      0.85      0.58      6947

    accuracy                           0.49     16921
   macro avg       0.57      0.54      0.47     16921
weighted avg       0.59      0.49      0.45     16921



In [17]:
# parameter tuning