# General Overview - Machine Learning

For this binary classification problem, we are using logistic regression, decision tree classifier, random forest classifier, Gaussian Naive Bayes, and Extra Trees Classifier. To assess the accuracy of our models, we look at the accuracy scores and classification reports.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split, 
                                     validation_curve, 
                                     RandomizedSearchCV, 
                                     KFold)
from sklearn.metrics import (classification_report,
                             confusion_matrix, 
                             accuracy_score, 
                             roc_auc_score, roc_curve, auc)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('diabetes_ml.csv', index_col=0) # import data
diabetes = data.copy() # save a copy of data as diabetes

In [4]:
# set 0 for No and 1 for Yes
diabetes = diabetes.replace({'NO': 0, 'YES': 1})
diabetes.head()

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,num_outpatient,num_inpatient,num_diagnoses,change,diabetesMed,...,No_insulin,Steady_insulin,Up_insulin,Elective,Emergency,Newborn,Trauma Center,Unknown_admission_type,Urgent,readmitted
0,1,3,59,0,18,0,0,9,1,1,...,0,0,1,0,1,0,0,0,0,1
1,1,2,11,5,13,2,1,6,0,1,...,1,0,0,0,1,0,0,0,0,0
2,0,2,44,1,16,0,0,7,1,1,...,0,0,1,0,1,0,0,0,0,0
3,0,1,51,0,8,0,0,5,1,1,...,0,1,0,0,1,0,0,0,0,0
4,0,3,31,6,16,0,0,9,0,1,...,0,1,0,0,0,0,0,0,1,1


In [5]:
diabetes.shape

(61678, 66)

In [6]:
diabetes.columns

Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'num_outpatient', 'num_inpatient', 'num_diagnoses',
       'change', 'diabetesMed', 'AfricanAmerican', 'Asian', 'Hispanic',
       'Other_race', '[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)',
       '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)', 'Circulatory_1',
       'Diabetes_1', 'Digestive_1', 'Genitourinary_1', 'Injury_1',
       'Musculoskeletal_1', 'Neoplasms_1', 'Other_1', 'Respiratory_1', '>200',
       '>300', 'Norm_glu', '>7', '>8', 'None_a1c', 'Norm_a1c',
       'Down_metformin', 'Steady_metformin', 'Up_metformin',
       'Down_repaglinide', 'Steady_repaglinide', 'Up_repaglinide',
       'Down_glipizide', 'Steady_glipizide', 'Up_glipizide',
       'Down_pioglitazone', 'Steady_pioglitazone', 'Up_pioglitazone',
       'Down_rosiglitazone', 'Steady_rosiglitazone', 'Up_rosiglitazone',
       'Down_insulin', 'No_insulin', 'Steady_insulin', 'Up_insulin',
       '

In [7]:
# independent and target variables
y = diabetes['readmitted'].values # target variable
X = diabetes.drop('readmitted', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(46258, 65) (46258,)
(15420, 65) (15420,)


# Baseline - DummyClassifier

In [11]:
# using the stratified strategy
stratified = DummyClassifier(strategy='stratified', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(stratified.score(X_train, y_train)))

Accuracy Score: 0.5253361580699555


In [9]:
# using the most frequent strategy
frequent = DummyClassifier(strategy='most_frequent', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(frequent.score(X_test, y_test)))

Accuracy Score: 0.6040207522697795


In [10]:
# using the uniform strategy
uniform = DummyClassifier(strategy='uniform', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(uniform.score(X_test, y_test)))

Accuracy Score: 0.5022049286640726


# Logistic Regression

In [11]:
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, logreg_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

Accuracy Score, Training Set:  0.620735872713909
Accuracy Score, Test Set:  0.6126459143968872
Confusion Matrix 
 [[8316  921]
 [5052 1131]]
Classification Report 

              precision    recall  f1-score   support

           0       0.62      0.90      0.74      9237
           1       0.55      0.18      0.27      6183

    accuracy                           0.61     15420
   macro avg       0.59      0.54      0.51     15420
weighted avg       0.59      0.61      0.55     15420



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Parameter Tuning

In [65]:
# parameter tuning - best params
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(logreg, parameters, cv=3)
clf.fit(X, y).best_params_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

{'C': 100, 'multi_class': 'auto'}

In [12]:
# using parameters
logreg = LogisticRegression(random_state=42, C=100)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, logreg_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, logreg_pred))

Accuracy Score, Training Set:  0.6194604176574863
Accuracy Score, Test Set:  0.6121271076523995
Confusion Matrix 
 [[8272  965]
 [5016 1167]]
Classification Report 

              precision    recall  f1-score   support

           0       0.62      0.90      0.73      9237
           1       0.55      0.19      0.28      6183

    accuracy                           0.61     15420
   macro avg       0.58      0.54      0.51     15420
weighted avg       0.59      0.61      0.55     15420



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Decision Tree Classifier

In [39]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

# confusion matrix

cm = confusion_matrix(y_test, decision_tree_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred)) # very overfitted

Accuracy Score, Training Set: 0.9999783821176877
Accuracy Score, Test Set: 0.538586251621271
Confusion Matrix 
 [[5626 3611]
 [3504 2679]]
Classification Report 

              precision    recall  f1-score   support

           0       0.62      0.61      0.61      9237
           1       0.43      0.43      0.43      6183

    accuracy                           0.54     15420
   macro avg       0.52      0.52      0.52     15420
weighted avg       0.54      0.54      0.54     15420



## parameter tuning

In [48]:
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [58]:
# GridSearchCV - takes a while!
# [Parallel(n_jobs=-1)]: Done 307200 out of 307200 | elapsed: 101.7min finished

# {'max_depth': 1.0,
#  'max_features': 1,
#  'min_samples_leaf': 0.1,
#  'min_samples_split': 0.1}
max_depth = np.linspace(1, 32, 32, endpoint=True)
min_samples_split = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
min_samples_leaf = [0.1, 0.2, 0.3, 0.4, 0.5]
max_features = list(range(1, X_train.shape[1]))

parameters = dict(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, max_features=max_features)

dt_gs = GridSearchCV(decision_tree, parameters, cv=3, verbose=1, n_jobs=-1)
dt_gs.fit(X_train, y_train).best_params_

Fitting 3 folds for each of 102400 candidates, totalling 307200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 820 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 1520 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 2420 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 4820 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 6320 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 8020 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 9920 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 12020 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 14320 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 16820 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 19520 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 22420 tasks  

{'max_depth': 1.0,
 'max_features': 1,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.1}

In [14]:
# using new parameters - not good
decision_tree = DecisionTreeClassifier(random_state=42, max_depth=1.0, max_features=1, min_samples_leaf=0.1, min_samples_split=0.1)
decision_tree.fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, decision_tree_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, decision_tree_pred))

Accuracy Score, Training Set: 0.6057114445068961
Accuracy Score, Test Set: 0.5990272373540856
Confusion Matrix 
 [[9237    0]
 [6183    0]]
Classification Report 

              precision    recall  f1-score   support

           0       0.60      1.00      0.75      9237
           1       0.00      0.00      0.00      6183

    accuracy                           0.60     15420
   macro avg       0.30      0.50      0.37     15420
weighted avg       0.36      0.60      0.45     15420



  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
# cross validation - 5-fold
cv_scores = cross_val_score(decision_tree, X, y, cv=5, scoring='accuracy')
    
print('CV Scores: {}'.format(cv_scores))
print('Average 5-Fold CV Score: {}'.format(np.mean(cv_scores)))

CV Scores: [0.55131323 0.54961089 0.54077497 0.52979327 0.53368464]
Average 5-Fold CV Score: 0.5410354000957911


## Bagging Classifier

In [35]:
# initialize
dt = DecisionTreeClassifier(random_state=42)
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=42)

# fit, predict
bc.fit(X_train, y_train)
bc_pred = bc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', bc.score(X_train, y_train))
print('Accuracy Score, Test Set:', bc.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, bc_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, bc_pred))

# Gradient Boosting Classifier

In [12]:
gbc = GradientBoostingClassifier(n_estimators=200, random_state=42)
gbc.fit(X_train, y_train)
gbc_pred = gbc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', gbc.score(X_train, y_train))
print('Accuracy Score, Test Set:', gbc.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, gbc_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, gbc_pred))

Accuracy Score, Training Set: 0.6359332439794197
Accuracy Score, Test Set: 0.6160830090791181
Confusion Matrix 
 [[8201 1113]
 [4807 1299]]
Classification Report 

              precision    recall  f1-score   support

           0       0.63      0.88      0.73      9314
           1       0.54      0.21      0.31      6106

    accuracy                           0.62     15420
   macro avg       0.58      0.55      0.52     15420
weighted avg       0.59      0.62      0.56     15420



# Random Forest Classifier

In [15]:
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', forest.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, y_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, y_pred))

Accuracy Score, Training Set: 0.9999783821176877
Accuracy Score, Test Set: 0.5942282749675746
Confusion Matrix 
 [[7568 1669]
 [4588 1595]]
Classification Report 

              precision    recall  f1-score   support

           0       0.62      0.82      0.71      9237
           1       0.49      0.26      0.34      6183

    accuracy                           0.59     15420
   macro avg       0.56      0.54      0.52     15420
weighted avg       0.57      0.59      0.56     15420



In [42]:
# cross validation - 5-fold
cv_scores = cross_val_score(forest, X, y, cv=5, scoring='accuracy')
    
print('CV Scores: {}'.format(cv_scores))
print('Average 5-Fold CV Score: {}'.format(np.mean(cv_scores)))

CV Scores: [0.62216278 0.62702659 0.5936284  0.61094447 0.60835022]
Average 5-Fold CV Score: 0.6124224918075536


In [43]:
cv_scores = cross_val_score(forest, X, y, cv=5, scoring='precision')
    
print('CV Scores: {}'.format(cv_scores))
print('Average 5-Fold CV Score: {}'.format(np.mean(cv_scores)))

CV Scores: [0.59061489 0.59244792 0.46614173 0.52134606 0.51458448]
Average 5-Fold CV Score: 0.537027014570225


## Parameter Tuning

The most important parameters for random forest are the number of trees (n_estimators) and the number of features considered for splitting (max_features).

In [16]:
# GridSearchCV

# takes a long time!
# Fitting 3 folds for each of 500 candidates, totalling 1500 fits

# [Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
# [Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 11.0min
# [Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 28.5min
# [Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 59.3min
# [Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 106.6min
# [Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 134.7min finished

# {'max_depth': 30,
#  'min_samples_leaf': 5,
#  'min_samples_split': 15,
#  'n_estimators': 400}
n_estimators = [100, 200, 400, 600, 800]
max_depth = [5, 10, 15, 20, 30]
min_samples_split = [2, 5, 10, 15, 30]
min_samples_leaf = [1, 2, 5, 10] 

parameters = dict(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, 
                  min_samples_leaf=min_samples_leaf)

forest_gs = GridSearchCV(forest, parameters, cv=3, verbose=1, n_jobs=-1)
forest_gs.fit(X_train, y_train).best_params_

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 28.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 59.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 106.6min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 134.7min finished


{'max_depth': 30,
 'min_samples_leaf': 5,
 'min_samples_split': 15,
 'n_estimators': 400}

In [16]:
# using new parameters
forest = RandomForestClassifier(random_state=42, max_depth=30, min_samples_leaf=5, min_samples_split=15, n_estimators=400)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', forest.score(X_train, y_train))
print('Accuracy Score, Test Set:', forest.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, y_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, y_pred))

Accuracy Score, Training Set: 0.7640408145618055
Accuracy Score, Test Set: 0.6134241245136187
Confusion Matrix 
 [[8282  955]
 [5006 1177]]
Classification Report 

              precision    recall  f1-score   support

           0       0.62      0.90      0.74      9237
           1       0.55      0.19      0.28      6183

    accuracy                           0.61     15420
   macro avg       0.59      0.54      0.51     15420
weighted avg       0.59      0.61      0.55     15420



## Bagging Classifier

In [37]:
# initialize
rf = RandomForestClassifier(random_state=42)
bc = BaggingClassifier(base_estimator=rf, n_estimators=50, random_state=42)

# fit, predict
bc.fit(X_train, y_train)
bc_pred = bc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', bc.score(X_train, y_train))
print('Accuracy Score, Test Set:', bc.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, bc_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, bc_pred))

Accuracy Score, Training Set: 0.9946387651865624
Accuracy Score, Test Set: 0.6067444876783398
Confusion Matrix 
 [[7973 1264]
 [4800 1383]]
Classification Report 

              precision    recall  f1-score   support

           0       0.62      0.86      0.72      9237
           1       0.52      0.22      0.31      6183

    accuracy                           0.61     15420
   macro avg       0.57      0.54      0.52     15420
weighted avg       0.58      0.61      0.56     15420



# Gaussian Naive Bayes

In [17]:
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
gaussian_pred = gaussian.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, gaussian_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, gaussian_pred))

Accuracy Score, Training Set: 0.4610013403087034
Accuracy Score, Test Set: 0.46335927367055774
Confusion Matrix 
 [[1730 7507]
 [ 768 5415]]
Classification Report 

              precision    recall  f1-score   support

           0       0.69      0.19      0.29      9237
           1       0.42      0.88      0.57      6183

    accuracy                           0.46     15420
   macro avg       0.56      0.53      0.43     15420
weighted avg       0.58      0.46      0.40     15420



## parameter tuning

In [62]:
gaussian.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

# Support Vector Classification

In [18]:
svc = SVC(random_state=42, gamma='auto')
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', svc.score(X_train, y_train))
print('Accuracy Score, Test Set:', svc.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, svc_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, svc_pred))

Accuracy Score, Training Set: 0.6317609926931558
Accuracy Score, Test Set: 0.6103761348897536
Confusion Matrix 
 [[8727  510]
 [5498  685]]
Classification Report 

              precision    recall  f1-score   support

           0       0.61      0.94      0.74      9237
           1       0.57      0.11      0.19      6183

    accuracy                           0.61     15420
   macro avg       0.59      0.53      0.46     15420
weighted avg       0.60      0.61      0.52     15420



## parameter tuning

In [49]:
# takes a long time!
# svc.fit(X_train, y_train)

In [50]:
# takes a long time!
# Cs = [0.01, 0.1, 1, 10]
# parameters = dict(C=Cs)

# gsearch = GridSearchCV(svc, parameters, cv=3)
# gsearch.fit(X_train, y_train)
# gsearch.best_params_

# Extra Trees Classifier

In [19]:
etc = ExtraTreesClassifier(random_state=42)
etc.fit(X_train, y_train)
etc_pred = etc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', etc.score(X_train, y_train))
print('Accuracy Score, Test Set:', etc.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, etc_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, etc_pred))

Accuracy Score, Training Set: 0.9999783821176877
Accuracy Score, Test Set: 0.5906614785992218
Confusion Matrix 
 [[7143 2094]
 [4218 1965]]
Classification Report 

              precision    recall  f1-score   support

           0       0.63      0.77      0.69      9237
           1       0.48      0.32      0.38      6183

    accuracy                           0.59     15420
   macro avg       0.56      0.55      0.54     15420
weighted avg       0.57      0.59      0.57     15420



In [44]:
# cross validation - 5-fold
cv_scores = cross_val_score(extra_trees, X, y, cv=5, scoring='accuracy')
    
print('CV Scores: {}'.format(cv_scores))
print('Average 5-Fold CV Score: {}'.format(np.mean(cv_scores)))

CV Scores: [0.59800584 0.59792477 0.56930934 0.5883259  0.58005675]
Average 5-Fold CV Score: 0.586724519822487


In [45]:
# cross validation - 5-fold
cv_scores = cross_val_score(extra_trees, X, y, cv=5, scoring='precision')
    
print('CV Scores: {}'.format(cv_scores))
print('Average 5-Fold CV Score: {}'.format(np.mean(cv_scores)))

CV Scores: [0.48671931 0.48780488 0.43486183 0.47223812 0.45692666]
Average 5-Fold CV Score: 0.46771015968207214


## Bagging Classifier

In [38]:
# initialize
extra_trees = ExtraTreesClassifier(random_state=42)
bc = BaggingClassifier(base_estimator=extra_trees, n_estimators=50, random_state=42)

# fit, predict
bc.fit(X_train, y_train)
bc_pred = bc.predict(X_test)

# accuracy scores
print('Accuracy Score, Training Set:', bc.score(X_train, y_train))
print('Accuracy Score, Test Set:', bc.score(X_test, y_test))

# confusion matrix
cm = confusion_matrix(y_test, bc_pred)
print ('Confusion Matrix \n', cm)

# classification report
print('Classification Report \n')
print(classification_report(y_test, bc_pred))

Accuracy Score, Training Set: 0.9999351463530632
Accuracy Score, Test Set: 0.5997405966277561
Confusion Matrix 
 [[7485 1752]
 [4420 1763]]
Classification Report 

              precision    recall  f1-score   support

           0       0.63      0.81      0.71      9237
           1       0.50      0.29      0.36      6183

    accuracy                           0.60     15420
   macro avg       0.57      0.55      0.54     15420
weighted avg       0.58      0.60      0.57     15420



# K Neighbors Classifier

In [76]:
# GridSearch
knn = KNeighborsClassifier()
parameters = {'n_neighbors':[3, 25]}
gridsearch = GridSearchCV(knn, parameters)

# fit to data
gridsearch.fit(X, y)
gridsearch.best_params_

{'n_neighbors': 25}

In [20]:
knn = KNeighborsClassifier(n_neighbors=25)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
    
print('KNN Classifier \n')
    
# accuracy scores
print('Accuracy Score, Training Set: ', knn.score(X_train, y_train))
print('Accuracy Score, Test Set: ', knn.score(X_test, y_test))
    
# confusion matrix
cm = confusion_matrix(y_test, knn_pred)
    
print('Confusion Matrix: \n', cm)
    
# classificatin report
print('Classification Report \n')
print(classification_report(y_test, knn_pred))

KNN Classifier 

Accuracy Score, Training Set:  0.6610964589908772
Accuracy Score, Test Set:  0.5850194552529183
Confusion Matrix: 
 [[7810 1427]
 [4972 1211]]
Classification Report 

              precision    recall  f1-score   support

           0       0.61      0.85      0.71      9237
           1       0.46      0.20      0.27      6183

    accuracy                           0.59     15420
   macro avg       0.54      0.52      0.49     15420
weighted avg       0.55      0.59      0.54     15420

