# Classification Modeling

In [1]:
import classifiers as clf

import pickle
import pandas as pd
import numpy  as np

import warnings
# Install a blanket 'ignore' filter: every warning category is suppressed for
# the remainder of the kernel session.
# NOTE(review): presumably done to keep the grid-search output readable, but it
# also hides any warning raised while models are fitted and scored below —
# consider narrowing or removing it when debugging.
warnings.filterwarnings('ignore')

### Load Train and Test Sets

In [2]:
# Read every pickled dataset this notebook works with from ../Data.
# NOTE: unpickling can execute arbitrary code, so these paths must only ever
# point at trusted, locally produced files.

# Original train/test split.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(path)
    for path in (
        "../Data/X_train.pkl",
        "../Data/X_test.pkl",
        "../Data/y_train.pkl",
        "../Data/y_test.pkl",
    )
)

# Training set used for the SMOTE-oversampled run further down.
X_train_smote, y_train_smote = (
    pd.read_pickle(path)
    for path in (
        "../Data/X_train_smote.pkl",
        "../Data/y_train_smote.pkl",
    )
)

# Training set used for the Near Miss undersampled run further down.
X_train_under, y_train_under = (
    pd.read_pickle(path)
    for path in (
        "../Data/X_train_under.pkl",
        "../Data/y_train_under.pkl",
    )
)

# Label lookup; column 0 is later turned into a list and handed to
# clf.fit_predict_measure (presumably the class names — confirm upstream).
y_labels = pd.read_pickle('../Data/y_labels.pkl')

### Classification Model Hyperparameter Grid Search

In [3]:
# Build the list of grid-search jobs, one per classifier family.
# Each clf.grid_search_* helper is handed its hyperparameter grid as a dict
# mapping a parameter name to the candidate values to try; whatever the helper
# returns is collected in `classifiers` in the order listed here.
classifiers = [
    # dummy classifier (baseline)
    clf.grid_search_dummy_classifier({
        'strategy': ['most_frequent', 'stratified'],
    }),

    # logistic regression
    clf.grid_search_logistic_regression({
        'C': [1e-2, 1e0, 1e2, 1e6, 1e12],
        'penalty': ['l1', 'l2'],
        'fit_intercept': [True, False],
        'multi_class': ['ovr'],
        'solver': ['liblinear'],
    }),

    # multinomial naive bayes classifier
    clf.grid_search_multinomial_nb({
        'alpha': [0.0, 1.0],
        'fit_prior': [True],
    }),

    # k nearest neighbors classifier
    clf.grid_search_k_neighbors_classifier({
        'n_neighbors': [5, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree'],
        'leaf_size': [100, 200],
    }),

    # decision tree classifier
    clf.grid_search_decision_tree_classifier({
        'criterion': ['gini', 'entropy'],
        'max_depth': [6, 8],
        'min_samples_leaf': [20, 50, 100],
        'max_features': [5, 10, 20],
        'min_impurity_decrease': [0.01, 0.03, 0.05],
    }),

    # random forest classifier
    clf.grid_search_random_forest_classifier({
        'n_estimators': [100, 200, 300],
        'max_depth': [2, 3, 4],
        'min_samples_leaf': [100, 200],
        'max_features': [5, 10],
        'min_impurity_decrease': [0.01, 0.03, 0.05],
    }),

    # ada boost classifier
    clf.grid_search_ada_boost_classifier({
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.5, 1.0],
    }),
]

### Classification Model:  Unbalanced Classes

In [4]:
# Run every configured grid search against the original (unbalanced) training
# set, passing the shared test set along for evaluation. The result is
# displayed as this cell's output.
class_labels = list(y_labels[0])
unbalanced = clf.fit_predict_measure(
    'Unbalanced',
    X_train, X_test,
    y_train, y_test,
    class_labels,
    classifiers,
)
unbalanced

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Batch computation too fast (0.0960s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   41.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.1min finished


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0743s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 396 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    9.2s finished
[Parallel(n_jobs=-1)]: Usi

Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   48.6s
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:   37.1s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   40.8s finished


Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
0,Unbalanced,Dummy,{'strategy': 'most_frequent'},Train,0.764790,0.254930,0.333333,0.288907,0.005008,0.012733,0.017741,
1,Unbalanced,Dummy,{'strategy': 'stratified'},Train,0.611215,0.331002,0.337882,0.334799,0.004689,0.019909,0.024598,
2,Unbalanced,Dummy,{'strategy': 'stratified'},Test,0.619824,0.337195,0.337247,0.337210,0.004689,0.019909,0.024598,"[[1325, 126, 282], [131, 8, 25], [266, 33, 74]]"
3,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.764790,0.254930,0.333333,0.288907,0.057564,0.044325,0.101889,
4,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.767875,0.534024,0.340372,0.303554,0.052870,0.046388,0.099258,
5,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.769748,0.498802,0.346884,0.317084,0.058769,0.044552,0.103321,
6,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.779443,0.499122,0.371473,0.360789,0.048593,0.043310,0.091904,
7,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.786824,0.497224,0.392296,0.392112,0.336856,0.043323,0.380179,
8,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.786934,0.497499,0.392517,0.392431,0.069416,0.044493,0.113909,
9,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': False, 'multi_clas...",Train,0.786163,0.492148,0.392181,0.391786,0.102225,0.040960,0.143185,


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) over the numeric
# columns of `unbalanced`. Train and Test rows are summarised together, so the
# figures describe the whole grid search rather than the best models.
unbalanced.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time
count,269.0,269.0,269.0,269.0,269.0,269.0,269.0
mean,0.769321,0.335684,0.353435,0.323774,0.44488,0.359454,0.804335
std,0.015948,0.109568,0.028546,0.048271,0.841594,0.804978,1.19885
min,0.611215,0.254479,0.333333,0.288617,0.004689,0.012733,0.017741
25%,0.76479,0.25493,0.333333,0.288907,0.022011,0.041753,0.063999
50%,0.76479,0.25493,0.333333,0.288907,0.11319,0.049368,0.380179
75%,0.777349,0.448079,0.379775,0.37045,0.686698,0.352777,1.133704
max,0.789688,0.610098,0.431055,0.438888,7.20207,4.760983,7.957126


In [6]:
# Unbalanced run: keep only the rows whose Split is 'Test', then display them
# ranked from highest to lowest F1 score.
unbalanced_test = unbalanced.loc[unbalanced['Split'].eq('Test')]
unbalanced_test.sort_values('F1 Score', ascending=False)

Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
43,Unbalanced,K Nearest Neighbors,"{'algorithm': 'kd_tree', 'leaf_size': 100, 'n_...",Test,0.782379,0.539966,0.407174,0.41761,0.110649,1.89013,2.000779,"[[1689, 14, 30], [149, 4, 11], [282, 8, 83]]"
268,Unbalanced,Ada Boost,"{'learning_rate': 1.0, 'n_estimators': 200}",Test,0.784141,0.562599,0.39909,0.405544,4.428128,0.490464,4.918592,"[[1704, 7, 22], [151, 3, 10], [295, 5, 73]]"
23,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Test,0.781498,0.484357,0.384701,0.380599,0.069416,0.044493,0.113909,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
26,Unbalanced,Multinomial Naive Bayes,"{'alpha': 0.0, 'fit_prior': True}",Test,0.781498,0.484357,0.384701,0.380599,0.048863,0.049368,0.098231,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
152,Unbalanced,Decision Tree,"{'criterion': 'gini', 'max_depth': 6, 'max_fea...",Test,0.781498,0.484357,0.384701,0.380599,0.026576,0.041183,0.06776,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
2,Unbalanced,Dummy,{'strategy': 'stratified'},Test,0.619824,0.337195,0.337247,0.33721,0.004689,0.019909,0.024598,"[[1325, 126, 282], [131, 8, 25], [266, 33, 74]]"
261,Unbalanced,Random Forest,"{'max_depth': 2, 'max_features': 5, 'min_impur...",Test,0.763436,0.254479,0.333333,0.288617,0.314799,0.159065,0.473865,"[[1733, 0, 0], [164, 0, 0], [373, 0, 0]]"


### Classification Model: Classes Balanced with SMOTE Oversampling

In [7]:
# Run every configured grid search against the SMOTE-oversampled training set.
# The test set is the same one used for the unbalanced run. The result is
# displayed as this cell's output.
class_labels = list(y_labels[0])
balanced_smote = clf.fit_predict_measure(
    'SMOTE Oversampled',
    X_train_smote, X_test,
    y_train_smote, y_test,
    class_labels,
    classifiers,
)
balanced_smote

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   46.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   49.1s finished


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1988s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.2s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  8.3min finished


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   31.4s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   51.1s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:  1.5min remaining:    9.7s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.6min finished


Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
0,SMOTE Oversampled,Dummy,{'strategy': 'most_frequent'},Train,0.333333,0.111111,0.333333,0.166667,0.015047,0.038950,0.053996,
1,SMOTE Oversampled,Dummy,{'strategy': 'stratified'},Train,0.336022,0.334946,0.330452,0.330916,0.011090,0.043585,0.054674,
2,SMOTE Oversampled,Dummy,{'strategy': 'stratified'},Test,0.349339,0.337767,0.360943,0.282121,0.011090,0.043585,0.054674,"[[603, 551, 579], [52, 66, 46], [142, 107, 124]]"
3,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.540190,0.571752,0.540190,0.537853,0.201957,0.098438,0.300395,
4,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.535533,0.552461,0.535533,0.530717,0.116871,0.087623,0.204494,
5,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.527466,0.539638,0.527466,0.524760,0.139790,0.084911,0.224701,
6,SMOTE Oversampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.535100,0.550336,0.535100,0.529968,0.107427,0.086733,0.194160,
7,SMOTE Oversampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.548689,0.569529,0.548689,0.543550,1.256680,0.092869,1.349550,
8,SMOTE Oversampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.548449,0.569263,0.548449,0.543434,0.229410,0.088135,0.317545,
9,SMOTE Oversampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': False, 'multi_clas...",Train,0.541295,0.553196,0.541295,0.537396,0.222911,0.099974,0.322885,


In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) over the numeric
# columns of `balanced_smote`. Train and Test rows are summarised together, so
# the figures describe the whole grid search rather than the best models.
balanced_smote.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time
count,269.0,269.0,269.0,269.0,269.0,269.0,269.0
mean,0.441096,0.362859,0.438536,0.350064,1.122183,1.784575,2.906758
std,0.087064,0.184604,0.083004,0.142486,2.293495,6.262955,6.63808
min,0.333333,0.111111,0.330452,0.166667,0.01109,0.03895,0.053996
25%,0.333333,0.111111,0.333333,0.166667,0.05592,0.092284,0.15455
50%,0.437482,0.354985,0.437482,0.338861,0.575759,0.173515,0.844684
75%,0.49563,0.555649,0.49563,0.463748,1.334995,0.597979,2.349658
max,0.676211,0.632455,0.60266,0.598817,16.476759,37.368858,38.077082


In [9]:
# SMOTE run: keep only the rows whose Split is 'Test', then display them
# ranked from highest to lowest F1 score.
balanced_smote_test = balanced_smote.loc[balanced_smote['Split'].eq('Test')]
balanced_smote_test.sort_values('F1 Score', ascending=False)

Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
23,SMOTE Oversampled,Logistic Regression,"{'C': 100.0, 'fit_intercept': True, 'multi_cla...",Test,0.656828,0.482631,0.531404,0.486972,8.913941,0.110586,9.024527,"[[1251, 268, 214], [73, 67, 24], [124, 76, 173]]"
152,SMOTE Oversampled,Decision Tree,"{'criterion': 'entropy', 'max_depth': 6, 'max_...",Test,0.6163,0.499489,0.537013,0.476402,0.070661,0.086502,0.157163,"[[1155, 429, 149], [65, 85, 14], [113, 101, 159]]"
43,SMOTE Oversampled,K Nearest Neighbors,"{'algorithm': 'kd_tree', 'leaf_size': 200, 'n_...",Test,0.676211,0.439771,0.471266,0.448231,0.585003,14.350573,14.935576,"[[1310, 108, 315], [108, 16, 40], [139, 25, 209]]"
268,SMOTE Oversampled,Ada Boost,"{'learning_rate': 1.0, 'n_estimators': 300}",Test,0.564317,0.45422,0.514323,0.441183,15.048985,1.149184,16.198169,"[[1024, 427, 282], [66, 77, 21], [99, 94, 180]]"
261,SMOTE Oversampled,Random Forest,"{'max_depth': 4, 'max_features': 10, 'min_impu...",Test,0.560352,0.461558,0.513377,0.440672,2.174344,0.564954,2.739298,"[[1022, 476, 235], [65, 82, 17], [100, 105, 168]]"
26,SMOTE Oversampled,Multinomial Naive Bayes,"{'alpha': 0.0, 'fit_prior': True}",Test,0.66652,0.494179,0.47856,0.437191,0.10678,0.067284,0.174064,"[[1369, 306, 58], [77, 76, 11], [209, 96, 68]]"
2,SMOTE Oversampled,Dummy,{'strategy': 'stratified'},Test,0.349339,0.337767,0.360943,0.282121,0.01109,0.043585,0.054674,"[[603, 551, 579], [52, 66, 46], [142, 107, 124]]"


### Classification Model:  Classes Balanced with Near Miss Undersampling

In [10]:
# Run every configured grid search against the Near Miss undersampled training
# set. The test set is the same one used for the unbalanced run. The result is
# displayed as this cell's output.
class_labels = list(y_labels[0])
balanced_under = clf.fit_predict_measure(
    'Near Miss Undersampled',
    X_train_under, X_test,
    y_train_under, y_test,
    class_labels,
    classifiers,
)
balanced_under

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0203s.) Setting batch_size=18.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  82 out of 100 | elapsed:    1.4s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0296s.) Setting batch_size=12.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.1s remaining:    0.1s


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1597s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0177s.) Setting batch_size=22.


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 206 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 360 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    3.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   22.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   26.0s
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    9.1s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   10.0s finished


Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
0,Near Miss Undersampled,Dummy,{'strategy': 'most_frequent'},Train,0.333333,0.111111,0.333333,0.166667,0.002060,0.004182,0.006242,
1,Near Miss Undersampled,Dummy,{'strategy': 'stratified'},Train,0.339744,0.314064,0.317842,0.339204,0.001506,0.004657,0.006164,
2,Near Miss Undersampled,Dummy,{'strategy': 'stratified'},Test,0.326872,0.331467,0.330364,0.263198,0.001506,0.004657,0.006164,"[[567, 590, 576], [56, 57, 51], [120, 135, 118]]"
3,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.491987,0.343286,0.491987,0.395073,0.006549,0.011436,0.017985,
4,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.568376,0.574881,0.568376,0.565020,0.009331,0.011178,0.020509,
5,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.487179,0.488976,0.487179,0.469753,0.007972,0.011420,0.019392,
6,Near Miss Undersampled,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.568910,0.574756,0.568910,0.567885,0.007981,0.011420,0.019401,
7,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.581197,0.591412,0.581197,0.580523,0.052325,0.011147,0.063472,
8,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.580662,0.590768,0.580662,0.579958,0.013481,0.011818,0.025300,
9,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': False, 'multi_clas...",Train,0.577991,0.588250,0.577991,0.577250,0.020281,0.011617,0.031899,


In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) over the numeric
# columns of `balanced_under`. Train and Test rows are summarised together, so
# the figures describe the whole grid search rather than the best models.
balanced_under.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time
count,269.0,269.0,269.0,269.0,269.0,269.0,269.0
mean,0.531674,0.515519,0.532024,0.497643,0.18912,0.087517,0.276637
std,0.061349,0.124931,0.060675,0.102139,0.256493,0.088105,0.336986
min,0.326872,0.111111,0.317842,0.166667,0.001506,0.004182,0.006164
25%,0.501603,0.443991,0.501603,0.42283,0.006549,0.013595,0.020556
50%,0.563568,0.583652,0.563568,0.554963,0.01148,0.019905,0.063472
75%,0.56891,0.596087,0.56891,0.559636,0.370821,0.153547,0.530849
max,0.606303,0.626688,0.606303,0.602706,1.609996,0.368339,1.916706


In [12]:
# Near Miss run: keep only the rows whose Split is 'Test', then display them
# ranked from highest to lowest F1 score.
balanced_under_test = balanced_under.loc[balanced_under['Split'].eq('Test')]
balanced_under_test.sort_values('F1 Score', ascending=False)

Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
26,Near Miss Undersampled,Multinomial Naive Bayes,"{'alpha': 1.0, 'fit_prior': True}",Test,0.552423,0.483978,0.501847,0.438021,0.012275,0.010982,0.023257,"[[1022, 567, 144], [65, 86, 13], [102, 125, 146]]"
152,Near Miss Undersampled,Decision Tree,"{'criterion': 'entropy', 'max_depth': 8, 'max_...",Test,0.492952,0.486105,0.49656,0.415181,0.006574,0.016157,0.022731,"[[877, 722, 134], [53, 98, 13], [88, 141, 144]]"
23,Near Miss Undersampled,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Test,0.464758,0.456221,0.482505,0.396789,0.052325,0.011147,0.063472,"[[796, 704, 233], [54, 86, 24], [61, 139, 173]]"
261,Near Miss Undersampled,Random Forest,"{'max_depth': 2, 'max_features': 10, 'min_impu...",Test,0.463877,0.434642,0.459493,0.368877,0.374579,0.160983,0.535563,"[[849, 689, 195], [60, 100, 4], [80, 189, 104]]"
268,Near Miss Undersampled,Ada Boost,"{'learning_rate': 0.5, 'n_estimators': 100}",Test,0.420705,0.443991,0.478945,0.365092,0.552172,0.130602,0.682774,"[[711, 801, 221], [40, 109, 15], [69, 169, 135]]"
43,Near Miss Undersampled,K Nearest Neighbors,"{'algorithm': 'ball_tree', 'leaf_size': 100, '...",Test,0.381498,0.425767,0.469457,0.341169,0.007972,0.127165,0.135137,"[[603, 793, 337], [35, 104, 25], [52, 162, 159]]"
2,Near Miss Undersampled,Dummy,{'strategy': 'stratified'},Test,0.326872,0.331467,0.330364,0.263198,0.001506,0.004657,0.006164,"[[567, 590, 576], [56, 57, 51], [120, 135, 118]]"


### Save Results

In [13]:
# Persist the result frame of each run as a pickle under ../Data.
for results_frame, pickle_path in (
    (unbalanced,     '../Data/unbalanced.pkl'),
    (balanced_smote, '../Data/balanced_smote.pkl'),
    (balanced_under, '../Data/balanced_under.pkl'),
):
    results_frame.to_pickle(pickle_path)