# Classification Modeling

In [1]:
import classifiers as clf

import pickle
import pandas as pd
import numpy  as np

import warnings
# NOTE(review): with no category or module argument this suppresses every
# warning for the rest of the kernel session, so anything emitted while the
# grid searches below are running (convergence, deprecation, numeric notices)
# will not be displayed. Consider narrowing the filter once the specific
# warnings to hide are known.
warnings.filterwarnings('ignore')

### Load Train and Test Sets

In [2]:
# NOTE: unpickling can execute arbitrary code -- only load files that were
# produced by this project's own preprocessing step.

# Original (unbalanced) training data plus the test split; the same test
# split is passed to every experiment further down.
X_train, y_train = (
    pd.read_pickle("../Data/X_train.pkl"),
    pd.read_pickle("../Data/y_train.pkl"),
)
X_test, y_test = (
    pd.read_pickle("../Data/X_test.pkl"),
    pd.read_pickle("../Data/y_test.pkl"),
)

# Training data rebalanced with SMOTE oversampling.
X_train_smote, y_train_smote = (
    pd.read_pickle("../Data/X_train_smote.pkl"),
    pd.read_pickle("../Data/y_train_smote.pkl"),
)

# Training data rebalanced with Near Miss undersampling.
X_train_under, y_train_under = (
    pd.read_pickle("../Data/X_train_under.pkl"),
    pd.read_pickle("../Data/y_train_under.pkl"),
)

# Label table; its column 0 is handed to clf.fit_predict_measure as a list.
y_labels = pd.read_pickle("../Data/y_labels.pkl")

### Classification Model Hyperparameter Grid Search

In [3]:
# Parameter grids for every classifier under comparison.
# Each clf.grid_search_* helper is given a dict that maps a parameter name to
# the list of candidate values to try; the resulting entries are collected in
# order and passed to clf.fit_predict_measure in the cells below.
classifiers = [
    # dummy classifier (baseline)
    clf.grid_search_dummy_classifier({
        'strategy': ['most_frequent', 'stratified'],
    }),

    # logistic regression
    clf.grid_search_logistic_regression({
        'C': [1e-2, 1e0, 1e2, 1e6, 1e12],
        'penalty': ['l1', 'l2'],
        'fit_intercept': [True, False],
        'multi_class': ['ovr'],
        'solver': ['liblinear'],
    }),

    # multinomial naive bayes classifier
    # NOTE(review): alpha=0.0 is presumably forwarded to scikit-learn's
    # MultinomialNB; depending on the installed version that value is either
    # clipped to a tiny positive number or used as-is (no smoothing) --
    # confirm which applies before comparing results across environments.
    clf.grid_search_multinomial_nb({
        'alpha': [0.0, 1.0],
        'fit_prior': [True],
    }),

    # k nearest neighbors classifier
    clf.grid_search_k_neighbors_classifier({
        'n_neighbors': [5, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree'],
        'leaf_size': [100, 200],
    }),

    # decision tree classifier
    clf.grid_search_decision_tree_classifier({
        'criterion': ['gini', 'entropy'],
        'max_depth': [6, 8],
        'min_samples_leaf': [20, 50, 100],
        'max_features': [5, 10, 20],
        'min_impurity_decrease': [0.01, 0.03, 0.05],
    }),

    # random forest classifier
    clf.grid_search_random_forest_classifier({
        'n_estimators': [100, 200, 300],
        'max_depth': [2, 3, 4],
        'min_samples_leaf': [100, 200],
        'max_features': [5, 10],
        'min_impurity_decrease': [0.01, 0.03, 0.05],
    }),

    # ada boost classifier
    clf.grid_search_ada_boost_classifier({
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.5, 1.0],
    }),
]

### Classification Model: Unbalanced Classes

In [4]:
# Run every grid-search job against the original (unbalanced) training data.
# The returned table has one row per parameter combination, tagged with the
# split (Train / Test) its metrics were computed on.
unbalanced = clf.fit_predict_measure(
    'Unbalanced',
    X_train,
    X_test,
    y_train,
    y_test,
    list(y_labels[0]),
    classifiers,
)
unbalanced

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Batch computation too fast (0.1039s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.1min finished


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0749s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 396 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    9.5s finished
[Parallel(n_jobs=-1)]: Usi

Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   35.1s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   44.2s
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:   34.0s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   37.7s finished


Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,Unbalanced,Dummy,{'strategy': 'most_frequent'},Train,0.764790,0.584904,0.764790,0.662860,
1,Unbalanced,Dummy,{'strategy': 'stratified'},Train,0.620359,0.619068,0.619698,0.616299,
2,Unbalanced,Dummy,{'strategy': 'most_frequent'},Test,0.763436,0.582835,0.763436,0.661022,"[[1733, 0, 0], [164, 0, 0], [373, 0, 0]]"
3,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.764790,0.584904,0.764790,0.662860,
4,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.767875,0.726059,0.767875,0.671165,
5,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.769748,0.709923,0.769748,0.678317,
6,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.779443,0.716726,0.779443,0.703343,
7,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.786824,0.721505,0.786824,0.721361,
8,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.786934,0.721696,0.786934,0.721554,
9,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': False, 'multi_clas...",Train,0.786163,0.719088,0.786163,0.721032,


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the metric
# columns over all unbalanced results -- Train and Test rows are pooled here.
unbalanced.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
count,269.0,269.0,269.0,269.0
mean,0.769938,0.628853,0.769935,0.68133
std,0.012594,0.058173,0.012623,0.02545
min,0.620359,0.582835,0.619698,0.616299
25%,0.76479,0.584904,0.76479,0.66286
50%,0.76479,0.584904,0.76479,0.66286
75%,0.776578,0.695244,0.776578,0.708008
max,0.789688,0.749627,0.789688,0.733284


In [6]:
# best estimators of unbalanced dataset
unbalanced_test = unbalanced[unbalanced['Split'] == 'Test']
unbalanced_test.sort_values(by=['F1 Score'], ascending=False)

Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
43,Unbalanced,K Nearest Neighbors,"{'algorithm': 'kd_tree', 'leaf_size': 100, 'n_...",Test,0.782379,0.729329,0.782379,0.727244,"[[1689, 14, 30], [149, 4, 11], [282, 8, 83]]"
268,Unbalanced,Ada Boost,"{'learning_rate': 1.0, 'n_estimators': 100}",Test,0.784581,0.734862,0.784581,0.722891,"[[1705, 6, 22], [151, 3, 10], [295, 5, 73]]"
23,Unbalanced,Logistic Regression,"{'C': 1000000.0, 'fit_intercept': True, 'multi...",Test,0.781498,0.709913,0.781498,0.712267,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
26,Unbalanced,Multinomial Naive Bayes,"{'alpha': 0.0, 'fit_prior': True}",Test,0.781498,0.709913,0.781498,0.712267,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
152,Unbalanced,Decision Tree,"{'criterion': 'gini', 'max_depth': 6, 'max_fea...",Test,0.781498,0.709913,0.781498,0.712267,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
2,Unbalanced,Dummy,{'strategy': 'most_frequent'},Test,0.763436,0.582835,0.763436,0.661022,"[[1733, 0, 0], [164, 0, 0], [373, 0, 0]]"
261,Unbalanced,Random Forest,"{'max_depth': 2, 'max_features': 5, 'min_impur...",Test,0.763436,0.582835,0.763436,0.661022,"[[1733, 0, 0], [164, 0, 0], [373, 0, 0]]"


### Classification Model: Classes Balanced with SMOTE Oversampling

In [7]:
# Run every grid-search job against the SMOTE-oversampled training data.
# The test split is the same one used for the unbalanced run.
balanced_smote = clf.fit_predict_measure(
    'SMOTE Oversampled',
    X_train_smote,
    X_test,
    y_train_smote,
    y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_smote

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   46.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   50.8s finished


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  7.4min finished


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1630s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 300 tas

Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   45.3s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  1.3min remaining:    8.5s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.4min finished


Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,SMOTE Oversampled,Dummy,{'strategy': 'most_frequent'},Train,0.333333,0.111111,0.333333,0.166667,
1,SMOTE Oversampled,Dummy,{'strategy': 'stratified'},Train,0.327715,0.335075,0.332901,0.337774,
2,SMOTE Oversampled,Dummy,{'strategy': 'stratified'},Test,0.329515,0.608010,0.329515,0.396527,"[[579, 581, 573], [59, 50, 55], [128, 126, 119]]"
3,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.540190,0.571752,0.540190,0.537853,
4,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.535533,0.552461,0.535533,0.530717,
5,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.527466,0.539646,0.527466,0.524760,
6,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.535100,0.550336,0.535100,0.529968,
7,SMOTE Oversampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.548689,0.569529,0.548689,0.543550,
8,SMOTE Oversampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.548449,0.569263,0.548449,0.543434,
9,SMOTE Oversampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': False, 'multi_clas...",Train,0.541295,0.553196,0.541295,0.537396,


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) of the metric
# columns over all SMOTE-balanced results -- Train and Test rows are pooled here.
balanced_smote.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
count,269.0,269.0,269.0,269.0
mean,0.440609,0.370494,0.440628,0.354746
std,0.087457,0.192748,0.087432,0.149421
min,0.327715,0.111111,0.329515,0.166667
25%,0.333333,0.111111,0.333333,0.166667
50%,0.437866,0.357704,0.437866,0.34013
75%,0.49563,0.562046,0.49563,0.464677
max,0.676211,0.740513,0.676211,0.689831


In [9]:
# best estimators of SMOTE balanced dataset
balanced_smote_test = balanced_smote[balanced_smote['Split'] == 'Test']
balanced_smote_test.sort_values(by=['F1 Score'], ascending=False)

Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
23,SMOTE Oversampled,Logistic Regression,"{'C': 100.0, 'fit_intercept': True, 'multi_cla...",Test,0.656828,0.740513,0.656828,0.689831,"[[1251, 268, 214], [73, 67, 24], [124, 76, 173]]"
43,SMOTE Oversampled,K Nearest Neighbors,"{'algorithm': 'kd_tree', 'leaf_size': 200, 'n_...",Test,0.676211,0.710974,0.676211,0.688653,"[[1310, 108, 315], [108, 16, 40], [139, 25, 209]]"
26,SMOTE Oversampled,Multinomial Naive Bayes,"{'alpha': 0.0, 'fit_prior': True}",Test,0.66652,0.724553,0.66652,0.677891,"[[1369, 306, 58], [77, 76, 11], [209, 96, 68]]"
152,SMOTE Oversampled,Decision Tree,"{'criterion': 'entropy', 'max_depth': 6, 'max_...",Test,0.589868,0.73861,0.589868,0.641933,"[[1088, 429, 216], [65, 85, 14], [106, 101, 166]]"
268,SMOTE Oversampled,Ada Boost,"{'learning_rate': 1.0, 'n_estimators': 300}",Test,0.564317,0.728031,0.564317,0.618791,"[[1024, 427, 282], [66, 77, 21], [99, 94, 180]]"
261,SMOTE Oversampled,Random Forest,"{'max_depth': 4, 'max_features': 10, 'min_impu...",Test,0.560352,0.731976,0.560352,0.618355,"[[1022, 476, 235], [65, 82, 17], [100, 105, 168]]"
2,SMOTE Oversampled,Dummy,{'strategy': 'stratified'},Test,0.329515,0.60801,0.329515,0.396527,"[[579, 581, 573], [59, 50, 55], [128, 126, 119]]"


### Classification Model: Classes Balanced with Near Miss Undersampling

In [10]:
# Run every grid-search job against the Near Miss undersampled training data.
# The test split is the same one used for the unbalanced run.
balanced_under = clf.fit_predict_measure(
    'Near Miss Undersampled',
    X_train_under,
    X_test,
    y_train_under,
    y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_under

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0203s.) Setting batch_size=18.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  82 out of 100 | elapsed:    1.4s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0308s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1576s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    2.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0204s.) Setting batch_size=18.


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 170 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 296 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    3.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.3s
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    8.9s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    9.8s finished


Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,Near Miss Undersampled,Dummy,{'strategy': 'most_frequent'},Train,0.333333,0.111111,0.333333,0.166667,
1,Near Miss Undersampled,Dummy,{'strategy': 'stratified'},Train,0.330128,0.332656,0.327991,0.353523,
2,Near Miss Undersampled,Dummy,{'strategy': 'stratified'},Test,0.329075,0.609456,0.329075,0.392951,"[[564, 602, 567], [56, 58, 50], [127, 121, 125]]"
3,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.477030,0.335472,0.477030,0.384808,
4,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.568376,0.574881,0.568376,0.565020,
5,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.487179,0.488976,0.487179,0.469753,
6,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.568910,0.574756,0.568910,0.567885,
7,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.581197,0.591412,0.581197,0.580523,
8,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.580662,0.590768,0.580662,0.579958,
9,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': False, 'multi_clas...",Train,0.577991,0.588250,0.577991,0.577250,


In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the metric
# columns over all Near Miss balanced results -- Train and Test rows are pooled here.
balanced_under.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
count,269.0,269.0,269.0,269.0
mean,0.532285,0.524359,0.532277,0.502967
std,0.05996,0.125517,0.059987,0.097518
min,0.329075,0.111111,0.327991,0.166667
25%,0.501603,0.460865,0.501603,0.429327
50%,0.565705,0.587305,0.565705,0.556822
75%,0.56891,0.597337,0.56891,0.559641
max,0.606303,0.746374,0.606303,0.618208


In [12]:
# best estimators of Near Miss balanced dataset
balanced_under_test = balanced_under[balanced_under['Split'] == 'Test']
balanced_under_test.sort_values(by=['F1 Score'], ascending=False)

Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
26,Near Miss Undersampled,Multinomial Naive Bayes,"{'alpha': 1.0, 'fit_prior': True}",Test,0.552423,0.74337,0.552423,0.618208,"[[1022, 567, 144], [65, 86, 13], [102, 125, 146]]"
152,Near Miss Undersampled,Decision Tree,"{'criterion': 'entropy', 'max_depth': 6, 'max_...",Test,0.492952,0.746374,0.492952,0.570614,"[[877, 722, 134], [53, 98, 13], [88, 141, 144]]"
23,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Test,0.464758,0.739861,0.464758,0.541849,"[[796, 704, 233], [54, 86, 24], [61, 139, 173]]"
261,Near Miss Undersampled,Random Forest,"{'max_depth': 2, 'max_features': 10, 'min_impu...",Test,0.459912,0.717761,0.459912,0.535518,"[[839, 689, 205], [59, 100, 5], [79, 189, 105]]"
268,Near Miss Undersampled,Ada Boost,"{'learning_rate': 0.5, 'n_estimators': 100}",Test,0.420705,0.729045,0.420705,0.49753,"[[711, 801, 221], [40, 109, 15], [69, 169, 135]]"
43,Near Miss Undersampled,K Nearest Neighbors,"{'algorithm': 'ball_tree', 'leaf_size': 100, '...",Test,0.381498,0.724419,0.381498,0.450721,"[[603, 793, 337], [35, 104, 25], [52, 162, 159]]"
2,Near Miss Undersampled,Dummy,{'strategy': 'stratified'},Test,0.329075,0.609456,0.329075,0.392951,"[[564, 602, 567], [56, 58, 50], [127, 121, 125]]"


### Save Results

In [13]:
# Persist the three result tables to disk.
unbalanced.to_pickle("../Data/unbalanced.pkl")
balanced_smote.to_pickle("../Data/balanced_smote.pkl")
balanced_under.to_pickle("../Data/balanced_under.pkl")