# General Overview - Under and Over Sampling

Since there are more patients who are not readmitted than those who are readmitted, we are going to implement random under and over sampling methods in order to increase the precision, recall, and f1 scores of our models. Random under sampling attempts to balance the two classes by removing random samples from the majority class. Random over sampling has the purpose but it does so by duplicating samples in the minority class.

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split, validation_curve)
from sklearn.metrics import (classification_report,
                             confusion_matrix, roc_curve, auc)

from sklearn.metrics import roc_curve, auc

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import imblearn
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN)
from imblearn.under_sampling import (RandomUnderSampler,
                                     TomekLinks,
                                     EditedNearestNeighbours,
                                     NearMiss)

In [2]:
np.random.seed(42)

In [3]:
data = pd.read_csv('diabetes_ml.csv', index_col=0) # import data
diabetes = data.copy() # save a copy of data as diabetes

In [4]:
diabetes.head()

Unnamed: 0,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,num_outpatient,num_inpatient,num_diagnoses,change,diabetesMed,...,No_insulin,Steady_insulin,Up_insulin,Elective,Emergency,Newborn,Trauma Center,Unknown_admission_type,Urgent,readmitted
0,1,3,59,0,18,0,0,9,1,1,...,0,0,1,0,1,0,0,0,0,YES
1,1,2,11,5,13,2,1,6,0,1,...,1,0,0,0,1,0,0,0,0,NO
2,0,2,44,1,16,0,0,7,1,1,...,0,0,1,0,1,0,0,0,0,NO
3,0,1,51,0,8,0,0,5,1,1,...,0,1,0,0,1,0,0,0,0,NO
4,0,3,31,6,16,0,0,9,0,1,...,0,1,0,0,0,0,0,0,1,YES


In [51]:
diabetes.shape

(61678, 66)

In [5]:
diabetes.columns

Index(['gender', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'num_outpatient', 'num_inpatient', 'num_diagnoses',
       'change', 'diabetesMed', 'AfricanAmerican', 'Asian', 'Hispanic',
       'Other_race', '[0-10)', '[10-20)', '[20-30)', '[30-40)', '[40-50)',
       '[50-60)', '[60-70)', '[70-80)', '[80-90)', '[90-100)', 'Circulatory_1',
       'Diabetes_1', 'Digestive_1', 'Genitourinary_1', 'Injury_1',
       'Musculoskeletal_1', 'Neoplasms_1', 'Other_1', 'Respiratory_1', '>200',
       '>300', 'Norm_glu', '>7', '>8', 'None_a1c', 'Norm_a1c',
       'Down_metformin', 'Steady_metformin', 'Up_metformin',
       'Down_repaglinide', 'Steady_repaglinide', 'Up_repaglinide',
       'Down_glipizide', 'Steady_glipizide', 'Up_glipizide',
       'Down_pioglitazone', 'Steady_pioglitazone', 'Up_pioglitazone',
       'Down_rosiglitazone', 'Steady_rosiglitazone', 'Up_rosiglitazone',
       'Down_insulin', 'No_insulin', 'Steady_insulin', 'Up_insulin',
       '

In [6]:
# independent and target variables
y = diabetes['readmitted'].values # target variable
X = diabetes.drop('readmitted', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(49342, 65) (49342,)
(12336, 65) (12336,)


# Baseline - DummyClassifier

In [7]:
# using the stratified strategy
stratified = DummyClassifier(strategy='stratified', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(stratified.score(X_test, y_test)))

Accuracy Score: 0.5238326848249028


In [8]:
# using the most frequent strategy
frequent = DummyClassifier(strategy='most_frequent', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(frequent.score(X_test, y_test)))

Accuracy Score: 0.6040045395590142


In [9]:
# using the uniform strategy
uniform = DummyClassifier(strategy='uniform', random_state=42).fit(X_train, y_train)
print('Accuracy Score: {}'.format(uniform.score(X_test, y_test)))

Accuracy Score: 0.49464980544747084


# Algorithm Functions

In [10]:
# logistic regression
def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42)
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [11]:
# decision tree classifier
def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42)
    decision_tree.fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [12]:
# random forest classifier
def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [13]:
# Gaussian naive bayes
def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB()
    gaussian.fit(X_train, y_train)
    y_pred = gaussian.predict(X_test)
    
    print('Gaussian Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
    print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [19]:
# SVC
def svc(X_train, X_test, y_train, y_test):
    svc = SVC(random_state=42)
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)
    
    print('Support Vector Classification (SVC) \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', svc.score(X_train, y_train))
    print('Accuracy Score, Test Set:', svc.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [53]:
# Extra Trees Classifier
def etc(X_train, X_test, y_train, y_test):
    etc = ExtraTreesClassifier(random_state=42)
    etc.fit(X_train, y_train)
    etc_pred = etc.predict(X_test)

    # accuracy scores
    print('Accuracy Score, Training Set:', etc.score(X_train, y_train))
    print('Accuracy Score, Test Set:', etc.score(X_test, y_test))

    # confusion matrix
    cm = confusion_matrix(y_test, etc_pred)
    print ('Confusion Matrix \n', cm)

    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, etc_pred))

# Under Sampling Methods

## Random Under Sampler

In [14]:
random_under = RandomUnderSampler(random_state=42)
X_rs, y_rs = random_under.fit_sample(X, y)

print('Random undersampling {}'.format(Counter(y_rs)))

# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

Random undersampling Counter({'NO': 24422, 'YES': 24422})
(36633, 65) (36633,)
(12211, 65) (12211,)


In [15]:
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.5815248546392597
Accuracy Score, Test Set:  0.5775120792727868
Confusion Matrix: 
 [[3552 2495]
 [2664 3500]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.57      0.59      0.58      6047
         YES       0.58      0.57      0.58      6164

    accuracy                           0.58     12211
   macro avg       0.58      0.58      0.58     12211
weighted avg       0.58      0.58      0.58     12211



In [16]:
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999454044167827
Accuracy Score, Test Set: 0.5226435181393825
Confusion Matrix: 
 [[3201 2846]
 [2983 3181]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.52      0.53      0.52      6047
         YES       0.53      0.52      0.52      6164

    accuracy                           0.52     12211
   macro avg       0.52      0.52      0.52     12211
weighted avg       0.52      0.52      0.52     12211



In [17]:
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999454044167827
Accuracy Score, Test Set: 0.5739087707804439
Confusion Matrix: 
 [[3477 2570]
 [2633 3531]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.57      0.57      0.57      6047
         YES       0.58      0.57      0.58      6164

    accuracy                           0.57     12211
   macro avg       0.57      0.57      0.57     12211
weighted avg       0.57      0.57      0.57     12211



In [18]:
gaussian(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.5295225616247645
Accuracy Score, Test Set: 0.5256735730079437
Confusion Matrix: 
 [[ 964 5083]
 [ 709 5455]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.58      0.16      0.25      6047
         YES       0.52      0.88      0.65      6164

    accuracy                           0.53     12211
   macro avg       0.55      0.52      0.45     12211
weighted avg       0.55      0.53      0.45     12211



In [20]:
svc(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Support Vector Classification (SVC) 

Accuracy Score, Training Set: 0.5755193404853547
Accuracy Score, Test Set: 0.5737449840307919
Confusion Matrix: 
 [[3149 2898]
 [2307 3857]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.58      0.52      0.55      6047
         YES       0.57      0.63      0.60      6164

    accuracy                           0.57     12211
   macro avg       0.57      0.57      0.57     12211
weighted avg       0.57      0.57      0.57     12211



In [68]:
etc(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Accuracy Score, Training Set: 0.9999821057905661
Accuracy Score, Test Set: 0.7087717414644621
Confusion Matrix 
 [[6802 2531]
 [2894 6401]]
Classification Report 

              precision    recall  f1-score   support

          NO       0.70      0.73      0.71      9333
         YES       0.72      0.69      0.70      9295

    accuracy                           0.71     18628
   macro avg       0.71      0.71      0.71     18628
weighted avg       0.71      0.71      0.71     18628



## Edited Nearest Neighbours

In [28]:
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_enn)))

# train test split
X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(X_enn, y_enn, test_size=0.2, random_state=42)

print(X_train_enn.shape, y_train_enn.shape)
print(X_test_enn.shape, y_test_enn.shape)

Resampled dataset shape: Counter({'YES': 24422, 'NO': 9504})
(27140, 65) (27140,)
(6786, 65) (6786,)


In [29]:
logreg(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.7372512896094325
Accuracy Score, Test Set:  0.7399056881815502
Confusion Matrix: 
 [[ 373 1531]
 [ 234 4648]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.61      0.20      0.30      1904
         YES       0.75      0.95      0.84      4882

    accuracy                           0.74      6786
   macro avg       0.68      0.57      0.57      6786
weighted avg       0.71      0.74      0.69      6786



In [24]:
decision_tree(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.6683565196887526
Confusion Matrix: 
 [[1038 1333]
 [1480 4631]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.41      0.44      0.42      2371
         YES       0.78      0.76      0.77      6111

    accuracy                           0.67      8482
   macro avg       0.59      0.60      0.60      8482
weighted avg       0.67      0.67      0.67      8482



In [25]:
random_forest(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.7552464041499646
Confusion Matrix: 
 [[ 660 1711]
 [ 365 5746]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.64      0.28      0.39      2371
         YES       0.77      0.94      0.85      6111

    accuracy                           0.76      8482
   macro avg       0.71      0.61      0.62      8482
weighted avg       0.74      0.76      0.72      8482



In [26]:
gaussian(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.6697060210658702
Accuracy Score, Test Set: 0.6658806885168592
Confusion Matrix: 
 [[1168 1203]
 [1631 4480]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.42      0.49      0.45      2371
         YES       0.79      0.73      0.76      6111

    accuracy                           0.67      8482
   macro avg       0.60      0.61      0.61      8482
weighted avg       0.68      0.67      0.67      8482



In [27]:
svc(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Support Vector Classification (SVC) 

Accuracy Score, Training Set: 0.7196588586700204
Accuracy Score, Test Set: 0.7204668710209856
Confusion Matrix: 
 [[   0 2371]
 [   0 6111]]

Classification Report 



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          NO       0.00      0.00      0.00      2371
         YES       0.72      1.00      0.84      6111

    accuracy                           0.72      8482
   macro avg       0.36      0.50      0.42      8482
weighted avg       0.52      0.72      0.60      8482



In [69]:
etc(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.7325375773651636
Confusion Matrix 
 [[ 504 1400]
 [ 415 4467]]
Classification Report 

              precision    recall  f1-score   support

          NO       0.55      0.26      0.36      1904
         YES       0.76      0.91      0.83      4882

    accuracy                           0.73      6786
   macro avg       0.65      0.59      0.59      6786
weighted avg       0.70      0.73      0.70      6786



## Tomek Links

In [30]:
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X, y)

print('TomekLinks undersampling {}'.format(Counter(y_tomek)))

# train test split
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tomek, y_tomek, test_size=0.25, random_state=42)

print(X_train_tl.shape, y_train_tl.shape)
print(X_test_tl.shape, y_test_tl.shape)

TomekLinks undersampling Counter({'NO': 31621, 'YES': 24422})
(42032, 65) (42032,)
(14011, 65) (14011,)


In [31]:
logreg(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.5982346783403122
Accuracy Score, Test Set:  0.597102276782528
Confusion Matrix: 
 [[6361 1554]
 [4091 2005]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.61      0.80      0.69      7915
         YES       0.56      0.33      0.42      6096

    accuracy                           0.60     14011
   macro avg       0.59      0.57      0.55     14011
weighted avg       0.59      0.60      0.57     14011



In [32]:
decision_tree(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.5418599671686533
Confusion Matrix: 
 [[4688 3227]
 [3192 2904]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.59      0.59      0.59      7915
         YES       0.47      0.48      0.48      6096

    accuracy                           0.54     14011
   macro avg       0.53      0.53      0.53     14011
weighted avg       0.54      0.54      0.54     14011



In [33]:
random_forest(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.6028120762258226
Confusion Matrix: 
 [[5947 1968]
 [3597 2499]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.62      0.75      0.68      7915
         YES       0.56      0.41      0.47      6096

    accuracy                           0.60     14011
   macro avg       0.59      0.58      0.58     14011
weighted avg       0.60      0.60      0.59     14011



In [34]:
gaussian(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.487557099352874
Accuracy Score, Test Set: 0.4874027549782314
Confusion Matrix: 
 [[1428 6487]
 [ 695 5401]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.67      0.18      0.28      7915
         YES       0.45      0.89      0.60      6096

    accuracy                           0.49     14011
   macro avg       0.56      0.53      0.44     14011
weighted avg       0.58      0.49      0.42     14011



In [35]:
svc(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Support Vector Classification (SVC) 

Accuracy Score, Training Set: 0.5724685953559193
Accuracy Score, Test Set: 0.5745485689815145
Confusion Matrix: 
 [[7661  254]
 [5707  389]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.57      0.97      0.72      7915
         YES       0.60      0.06      0.12      6096

    accuracy                           0.57     14011
   macro avg       0.59      0.52      0.42     14011
weighted avg       0.59      0.57      0.46     14011



In [70]:
etc(X_train_tl, X_test_tl, y_train_tl, y_test_tl)

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.5865391478124331
Confusion Matrix 
 [[5617 2298]
 [3495 2601]]
Classification Report 

              precision    recall  f1-score   support

          NO       0.62      0.71      0.66      7915
         YES       0.53      0.43      0.47      6096

    accuracy                           0.59     14011
   macro avg       0.57      0.57      0.57     14011
weighted avg       0.58      0.59      0.58     14011



## Near Miss

In [36]:
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)
print('Resampled dataset shape: {}'.format(Counter(y_nm)))

# train test split
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.25, random_state=42)

print(X_train_nm.shape, y_train_nm.shape)
print(X_test_nm.shape, y_test_nm.shape)

Resampled dataset shape: Counter({'NO': 24422, 'YES': 24422})
(36633, 65) (36633,)
(12211, 65) (12211,)


In [37]:
logreg(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.6261021483361996
Accuracy Score, Test Set:  0.6290230120383261
Confusion Matrix: 
 [[4220 1827]
 [2703 3461]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.61      0.70      0.65      6047
         YES       0.65      0.56      0.60      6164

    accuracy                           0.63     12211
   macro avg       0.63      0.63      0.63     12211
weighted avg       0.63      0.63      0.63     12211



In [38]:
decision_tree(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999727022083913
Accuracy Score, Test Set: 0.5618704446810253
Confusion Matrix: 
 [[3431 2616]
 [2734 3430]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.56      0.57      0.56      6047
         YES       0.57      0.56      0.56      6164

    accuracy                           0.56     12211
   macro avg       0.56      0.56      0.56     12211
weighted avg       0.56      0.56      0.56     12211



In [39]:
random_forest(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999727022083913
Accuracy Score, Test Set: 0.622471542052248
Confusion Matrix: 
 [[4086 1961]
 [2649 3515]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.61      0.68      0.64      6047
         YES       0.64      0.57      0.60      6164

    accuracy                           0.62     12211
   macro avg       0.62      0.62      0.62     12211
weighted avg       0.62      0.62      0.62     12211



In [40]:
gaussian(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.6146916714437802
Accuracy Score, Test Set: 0.6105970027024814
Confusion Matrix: 
 [[4207 1840]
 [2915 3249]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.59      0.70      0.64      6047
         YES       0.64      0.53      0.58      6164

    accuracy                           0.61     12211
   macro avg       0.61      0.61      0.61     12211
weighted avg       0.61      0.61      0.61     12211



In [42]:
svc(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Support Vector Classification (SVC) 

Accuracy Score, Training Set: 0.62711216662572
Accuracy Score, Test Set: 0.6290230120383261
Confusion Matrix: 
 [[4808 1239]
 [3291 2873]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.59      0.80      0.68      6047
         YES       0.70      0.47      0.56      6164

    accuracy                           0.63     12211
   macro avg       0.65      0.63      0.62     12211
weighted avg       0.65      0.63      0.62     12211



In [71]:
etc(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Accuracy Score, Training Set: 0.9999727022083913
Accuracy Score, Test Set: 0.60167062484645
Confusion Matrix 
 [[3826 2221]
 [2643 3521]]
Classification Report 

              precision    recall  f1-score   support

          NO       0.59      0.63      0.61      6047
         YES       0.61      0.57      0.59      6164

    accuracy                           0.60     12211
   macro avg       0.60      0.60      0.60     12211
weighted avg       0.60      0.60      0.60     12211



# Over Sampling Methods 

## Random Over Sampler

In [45]:
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ros)))

# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_ros, y_ros, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

Resampled dataset shape: Counter({'YES': 37256, 'NO': 37256})
(55884, 65) (55884,)
(18628, 65) (18628,)


In [46]:
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.5860711473767088
Accuracy Score, Test Set:  0.5759609190465965
Confusion Matrix: 
 [[5482 3851]
 [4048 5247]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.58      0.59      0.58      9333
         YES       0.58      0.56      0.57      9295

    accuracy                           0.58     18628
   macro avg       0.58      0.58      0.58     18628
weighted avg       0.58      0.58      0.58     18628



In [47]:
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999821057905661
Accuracy Score, Test Set: 0.6647519862572472
Confusion Matrix: 
 [[5618 3715]
 [2530 6765]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.69      0.60      0.64      9333
         YES       0.65      0.73      0.68      9295

    accuracy                           0.66     18628
   macro avg       0.67      0.66      0.66     18628
weighted avg       0.67      0.66      0.66     18628



In [48]:
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999821057905661
Accuracy Score, Test Set: 0.7116169207644406
Confusion Matrix: 
 [[6605 2728]
 [2644 6651]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.71      0.71      0.71      9333
         YES       0.71      0.72      0.71      9295

    accuracy                           0.71     18628
   macro avg       0.71      0.71      0.71     18628
weighted avg       0.71      0.71      0.71     18628



In [49]:
gaussian(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.5299191181733591
Accuracy Score, Test Set: 0.5266265836375349
Confusion Matrix: 
 [[1512 7821]
 [ 997 8298]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.60      0.16      0.26      9333
         YES       0.51      0.89      0.65      9295

    accuracy                           0.53     18628
   macro avg       0.56      0.53      0.45     18628
weighted avg       0.56      0.53      0.45     18628



In [50]:
svc(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Support Vector Classification (SVC) 

Accuracy Score, Training Set: 0.5784661083673324
Accuracy Score, Test Set: 0.5726325960919046
Confusion Matrix: 
 [[4766 4567]
 [3394 5901]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.58      0.51      0.54      9333
         YES       0.56      0.63      0.60      9295

    accuracy                           0.57     18628
   macro avg       0.57      0.57      0.57     18628
weighted avg       0.57      0.57      0.57     18628



In [72]:
etc(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Accuracy Score, Training Set: 0.9999821057905661
Accuracy Score, Test Set: 0.7087717414644621
Confusion Matrix 
 [[6802 2531]
 [2894 6401]]
Classification Report 

              precision    recall  f1-score   support

          NO       0.70      0.73      0.71      9333
         YES       0.72      0.69      0.70      9295

    accuracy                           0.71     18628
   macro avg       0.71      0.71      0.71     18628
weighted avg       0.71      0.71      0.71     18628



## SMOTE

In [54]:
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_sm)))

# train test split
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)

print(X_train_sm.shape, y_train_sm.shape)
print(X_test_sm.shape, y_test_sm.shape)

Resampled dataset shape: Counter({'YES': 37256, 'NO': 37256})
(59609, 65) (59609,)
(14903, 65) (14903,)


In [55]:
logreg(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.6668959385327718
Accuracy Score, Test Set:  0.658726430919949
Confusion Matrix: 
 [[5849 1621]
 [3465 3968]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.63      0.78      0.70      7470
         YES       0.71      0.53      0.61      7433

    accuracy                           0.66     14903
   macro avg       0.67      0.66      0.65     14903
weighted avg       0.67      0.66      0.65     14903



In [56]:
decision_tree(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999832240097972
Accuracy Score, Test Set: 0.609340401261491
Confusion Matrix: 
 [[4440 3030]
 [2792 4641]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.61      0.59      0.60      7470
         YES       0.61      0.62      0.61      7433

    accuracy                           0.61     14903
   macro avg       0.61      0.61      0.61     14903
weighted avg       0.61      0.61      0.61     14903



In [57]:
random_forest(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999832240097972
Accuracy Score, Test Set: 0.6721465476749647
Confusion Matrix: 
 [[5637 1833]
 [3053 4380]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.65      0.75      0.70      7470
         YES       0.70      0.59      0.64      7433

    accuracy                           0.67     14903
   macro avg       0.68      0.67      0.67     14903
weighted avg       0.68      0.67      0.67     14903



In [58]:
gaussian(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.5542787163012297
Accuracy Score, Test Set: 0.5492182781990204
Confusion Matrix: 
 [[1577 5893]
 [ 825 6608]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.66      0.21      0.32      7470
         YES       0.53      0.89      0.66      7433

    accuracy                           0.55     14903
   macro avg       0.59      0.55      0.49     14903
weighted avg       0.59      0.55      0.49     14903



In [59]:
svc(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Support Vector Classification (SVC) 

Accuracy Score, Training Set: 0.6421681289738127
Accuracy Score, Test Set: 0.6364490371066228
Confusion Matrix: 
 [[5142 2328]
 [3090 4343]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.62      0.69      0.65      7470
         YES       0.65      0.58      0.62      7433

    accuracy                           0.64     14903
   macro avg       0.64      0.64      0.64     14903
weighted avg       0.64      0.64      0.64     14903



In [60]:
etc(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Accuracy Score, Training Set: 0.9999832240097972
Accuracy Score, Test Set: 0.6547003958934443
Confusion Matrix 
 [[5436 2034]
 [3112 4321]]
Classification Report 

              precision    recall  f1-score   support

          NO       0.64      0.73      0.68      7470
         YES       0.68      0.58      0.63      7433

    accuracy                           0.65     14903
   macro avg       0.66      0.65      0.65     14903
weighted avg       0.66      0.65      0.65     14903



## ADASYN - Adaptive Synthetic

In [61]:
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ada)))

# train test split
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada, y_ada, test_size=0.25, random_state=42)

print(X_train_ada.shape, y_train_ada.shape)
print(X_test_ada.shape, y_test_ada.shape)

Resampled dataset shape: Counter({'YES': 40616, 'NO': 37256})
(58404, 65) (58404,)
(19468, 65) (19468,)


In [62]:
logreg(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.6793541538250805
Accuracy Score, Test Set:  0.6714608588452846
Confusion Matrix: 
 [[7230 2010]
 [4386 5842]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.62      0.78      0.69      9240
         YES       0.74      0.57      0.65     10228

    accuracy                           0.67     19468
   macro avg       0.68      0.68      0.67     19468
weighted avg       0.69      0.67      0.67     19468



In [63]:
decision_tree(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Decision Tree Classifier 

Accuracy Score, Training Set: 0.9999828778850763
Accuracy Score, Test Set: 0.6337065954386686
Confusion Matrix: 
 [[5675 3565]
 [3566 6662]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.61      0.61      0.61      9240
         YES       0.65      0.65      0.65     10228

    accuracy                           0.63     19468
   macro avg       0.63      0.63      0.63     19468
weighted avg       0.63      0.63      0.63     19468



In [64]:
random_forest(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Random Forest Classifier 

Accuracy Score, Training Set: 0.9999828778850763
Accuracy Score, Test Set: 0.6744401068419972
Confusion Matrix: 
 [[6851 2389]
 [3949 6279]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.63      0.74      0.68      9240
         YES       0.72      0.61      0.66     10228

    accuracy                           0.67     19468
   macro avg       0.68      0.68      0.67     19468
weighted avg       0.68      0.67      0.67     19468



In [65]:
gaussian(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.574618176837203
Accuracy Score, Test Set: 0.5752003287446066
Confusion Matrix: 
 [[2168 7072]
 [1198 9030]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.64      0.23      0.34      9240
         YES       0.56      0.88      0.69     10228

    accuracy                           0.58     19468
   macro avg       0.60      0.56      0.51     19468
weighted avg       0.60      0.58      0.52     19468



In [66]:
svc(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Support Vector Classification (SVC) 

Accuracy Score, Training Set: 0.652849119923293
Accuracy Score, Test Set: 0.6434148346003699
Confusion Matrix: 
 [[5850 3390]
 [3552 6676]]

Classification Report 

              precision    recall  f1-score   support

          NO       0.62      0.63      0.63      9240
         YES       0.66      0.65      0.66     10228

    accuracy                           0.64     19468
   macro avg       0.64      0.64      0.64     19468
weighted avg       0.64      0.64      0.64     19468



In [67]:
etc(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Accuracy Score, Training Set: 0.9999828778850763
Accuracy Score, Test Set: 0.6667865214711322
Confusion Matrix 
 [[6651 2589]
 [3898 6330]]
Classification Report 

              precision    recall  f1-score   support

          NO       0.63      0.72      0.67      9240
         YES       0.71      0.62      0.66     10228

    accuracy                           0.67     19468
   macro avg       0.67      0.67      0.67     19468
weighted avg       0.67      0.67      0.67     19468

