# Classification Modeling

In [1]:
import classifiers as clf

import pickle
import pandas as pd
import numpy  as np

import warnings
# Blanket-suppress every warning category in this kernel for the rest of the session.
# NOTE(review): this also hides any convergence / deprecation / numerical warnings the
# model fits below may raise — consider a narrower filter (specific category or module).
warnings.filterwarnings('ignore')

### Load Train and Test Sets

In [2]:
# Train / test feature matrices and targets, read in the order
# X_train, X_test, y_train, y_test.
# NOTE: read_pickle unpickles arbitrary objects — only point it at trusted files.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "../Data/X_train.pkl",
        "../Data/X_test.pkl",
        "../Data/y_train.pkl",
        "../Data/y_test.pkl",
    )
)

# Training set used by the "SMOTE Oversampling" section of this notebook.
X_train_smote, y_train_smote = (
    pd.read_pickle("../Data/X_train_smote.pkl"),
    pd.read_pickle("../Data/y_train_smote.pkl"),
)

# Training set used by the "Near Miss Undersampling" section of this notebook.
X_train_under, y_train_under = (
    pd.read_pickle("../Data/X_train_under.pkl"),
    pd.read_pickle("../Data/y_train_under.pkl"),
)

# Label lookup; element/column 0 is later converted to a list and passed
# to clf.fit_predict_measure.
y_labels = pd.read_pickle('../Data/y_labels.pkl')

### Classification Model Hyperparameter Grid Search

In [3]:
# Hyper-parameter search spaces, one entry per model family, in the order they are run.
# Each clf.grid_search_* helper comes from the project-local `classifiers` module; what it
# returns is not visible from this notebook. The candidate counts quoted below are simply
# the product of the option counts in each grid.
classifiers = [
    # dummy classifier (baseline): 3 candidates
    clf.grid_search_dummy_classifier({
        'strategy': ['most_frequent', 'stratified', 'uniform'],
    }),

    # logistic regression: two sub-grids, 28 + 14 = 42 candidates
    clf.grid_search_logistic_regression([
        {
            # one-vs-rest with liblinear, both penalties
            'C': [1e-2, 1e0, 1e1, 1e2, 1e4, 1e6, 1e12],
            'penalty': ['l1', 'l2'],
            'fit_intercept': [True, False],
            'multi_class': ['ovr'],
            'solver': ['liblinear'],
        },
        {
            # multinomial with newton-cg, l2 penalty only
            'C': [1e-2, 1e0, 1e1, 1e2, 1e4, 1e6, 1e12],
            'penalty': ['l2'],
            'fit_intercept': [True, False],
            'multi_class': ['multinomial'],
            'solver': ['newton-cg'],
        },
    ]),

    # multinomial naive bayes classifier: 12 candidates
    # NOTE(review): alpha=0.0 means no additive smoothing at all; with warnings silenced
    # above, any numerical complaint from that setting would go unnoticed — confirm intended.
    clf.grid_search_multinomial_nb({
        'alpha': [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
        'fit_prior': [True, False],
    }),

    # k nearest neighbors classifier: 120 candidates
    clf.grid_search_k_neighbors_classifier({
        'n_neighbors': [3, 7, 11, 15, 19],
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree'],
        'leaf_size': [10, 20, 30],
        'p': [1, 2],
    }),

    # decision tree classifier: 96 candidates
    clf.grid_search_decision_tree_classifier({
        'criterion': ['gini', 'entropy'],
        'max_depth': [1, 4, 7],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 4],
        'max_features': [300, 600],
        'min_impurity_decrease': [0.01, 0.03],
    }),

    # random forest classifier: the decision-tree grid times 3 forest sizes = 288 candidates
    clf.grid_search_random_forest_classifier({
        'n_estimators': [100, 200, 300],
        'criterion': ['gini', 'entropy'],
        'max_depth': [1, 4, 7],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 4],
        'max_features': [300, 600],
        'min_impurity_decrease': [0.01, 0.03],
    }),

    # ada boost classifier: 40 candidates
    clf.grid_search_ada_boost_classifier({
        'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
        'learning_rate': [0.1, 0.5, 1.0, 2.0],
    }),
]

### Classification Model: Imbalanced Classes

In [4]:
# Run every entry of `classifiers` against the original (imbalanced) training set.
# clf.fit_predict_measure is project-local; it returns a results DataFrame
# (one row per candidate/split, shown below) plus a second object kept as
# `imbalanced_classifiers`.
# NOTE(review): the same `classifiers` list is reused by the later runs — confirm the
# helper does not refit these objects in place, or the saved results may alias each other.
imbalanced, imbalanced_classifiers = clf.fit_predict_measure(
    'Imbalanced',
    X_train,
    X_test,
    y_train,
    y_test,
    list(y_labels[0]),
    classifiers,
)
imbalanced

Running jobs: Dummy
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   18.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | ela

Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1163s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 14.1min
[Paralle

Running jobs: Decision Tree
Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1202s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 202 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 300 tas

Running jobs: Random Forest
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   30.0s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   39.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   52.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  5

Unnamed: 0,Balance,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
0,Imbalanced,Dummy,{'strategy': 'most_frequent'},Train,0.705479,0.235160,0.333333,0.275770,0.018400,0.020391,0.038792,
1,Imbalanced,Dummy,{'strategy': 'stratified'},Train,0.534247,0.342899,0.318229,0.323173,0.014031,0.017121,0.031151,
2,Imbalanced,Dummy,{'strategy': 'uniform'},Train,0.340868,0.336295,0.327415,0.292747,0.008418,0.009534,0.017953,
3,Imbalanced,Dummy,{'strategy': 'stratified'},Test,0.544292,0.339577,0.339593,0.339569,0.014031,0.017121,0.031151,"[[544, 97, 128], [94, 18, 28], [124, 28, 34]]"
4,Imbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.705479,0.235160,0.333333,0.275770,0.088994,0.040976,0.129970,
5,Imbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.705479,0.235160,0.333333,0.275770,0.084963,0.045544,0.130507,
6,Imbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.705479,0.235160,0.333333,0.275770,0.079071,0.037160,0.116231,
7,Imbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.705479,0.235160,0.333333,0.275770,0.068594,0.037665,0.106259,
8,Imbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.746119,0.647832,0.492661,0.526445,0.265630,0.037240,0.302870,
9,Imbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.743836,0.642274,0.485224,0.517756,0.097978,0.036570,0.134548,


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric metric and
# timing columns over ALL rows of `imbalanced`, i.e. Train and Test splits mixed together.
imbalanced.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time
count,608.0,608.0,608.0,608.0,608.0,608.0,608.0
mean,0.716023,0.390587,0.407929,0.378591,1.852117,3.553671,5.405789
std,0.033263,0.180926,0.1026,0.136263,3.022015,6.868807,6.782086
min,0.340868,0.23516,0.318229,0.27577,0.008418,0.009534,0.017953
25%,0.705479,0.23516,0.333333,0.27577,0.180338,0.066938,0.69126
50%,0.705479,0.23516,0.333333,0.27577,0.937331,0.176226,2.019142
75%,0.740297,0.605898,0.520476,0.547684,2.283555,0.495381,8.372385
max,0.778082,0.822644,0.616496,0.629215,23.042675,26.114778,26.292144


In [6]:
# Keep only the rows scored on the Test split of the imbalanced run,
# then list them from highest to lowest F1 score.
imbalanced_test = imbalanced.loc[imbalanced['Split'].eq('Test')]
imbalanced_test.sort_values('F1 Score', ascending=False)

Unnamed: 0,Balance,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Total Time,Confusion Matrix
180,Imbalanced,K Nearest Neighbors,"{'algorithm': 'ball_tree', 'leaf_size': 10, 'n...",Test,0.774429,0.668664,0.599096,0.625391,0.24125,18.027862,18.269112,"[[701, 25, 43], [68, 54, 18], [78, 15, 93]]"
46,Imbalanced,Logistic Regression,"{'C': 1000000.0, 'fit_intercept': True, 'multi...",Test,0.747945,0.628553,0.546944,0.5744,19.679999,0.04078,19.720779,"[[702, 34, 33], [61, 56, 23], [116, 9, 61]]"
59,Imbalanced,Multinomial Naive Bayes,"{'alpha': 0.6, 'fit_prior': True}",Test,0.731507,0.60452,0.547882,0.566858,0.044606,0.031428,0.076034,"[[678, 54, 37], [70, 57, 13], [104, 16, 66]]"
607,Imbalanced,Ada Boost,"{'learning_rate': 1.0, 'n_estimators': 350}",Test,0.7379,0.604464,0.530898,0.555372,8.626899,1.097643,9.724541,"[[698, 28, 43], [69, 53, 18], [111, 18, 57]]"
566,Imbalanced,Random Forest,"{'criterion': 'entropy', 'max_depth': 7, 'max_...",Test,0.723288,0.471217,0.385421,0.368374,6.076363,0.181682,6.258045,"[[761, 0, 8], [134, 0, 6], [155, 0, 31]]"
3,Imbalanced,Dummy,{'strategy': 'stratified'},Test,0.544292,0.339577,0.339593,0.339569,0.014031,0.017121,0.031151,"[[544, 97, 128], [94, 18, 28], [124, 28, 34]]"
277,Imbalanced,Decision Tree,"{'criterion': 'entropy', 'max_depth': 7, 'max_...",Test,0.712329,0.449296,0.363916,0.335269,0.060282,0.039252,0.099534,"[[761, 0, 8], [137, 0, 3], [167, 0, 19]]"


### Classification Model: Classes Balanced with SMOTE Oversampling

In [7]:
# Repeat the search using the SMOTE-oversampled training set; the test set is unchanged.
# Returns the results DataFrame and a second object kept as `smote_classifiers`.
# NOTE(review): the captured output ends in a KeyboardInterrupt, so in the saved state of
# this notebook `balanced_smote` / `smote_classifiers` were never assigned.
balanced_smote, smote_classifiers = clf.fit_predict_measure(
    'SMOTE Oversampled',
    X_train_smote,
    X_test,
    y_train_smote,
    y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_smote

Running jobs: Dummy
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   56.0s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 11.4min
[Paralle

Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 24.3min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 28.0min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 32.9min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 42.6min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 47.5min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 53.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 59.8min
[Paralle

KeyboardInterrupt: 

In [None]:
# Summary statistics of the numeric columns over all rows of `balanced_smote`
# (Train and Test splits together). Requires the SMOTE run above to have completed.
balanced_smote.describe()

In [None]:
# Keep only the rows scored on the Test split of the SMOTE run,
# then list them from highest to lowest F1 score.
balanced_smote_test = balanced_smote.loc[balanced_smote['Split'].eq('Test')]
balanced_smote_test.sort_values('F1 Score', ascending=False)

### Classification Model: Classes Balanced with Near Miss Undersampling

In [None]:
# Repeat the search using the Near Miss-undersampled training set; the test set is unchanged.
# Returns the results table (used as a DataFrame below) and a second object kept as
# `under_classifiers`.
balanced_under, under_classifiers = clf.fit_predict_measure(
    'Near Miss Undersampled',
    X_train_under,
    X_test,
    y_train_under,
    y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_under

In [None]:
# Summary statistics of the numeric columns over all rows of `balanced_under`
# (Train and Test splits together). Requires the Near Miss run above to have completed.
balanced_under.describe()

In [None]:
# Keep only the rows scored on the Test split of the Near Miss run,
# then list them from highest to lowest F1 score.
balanced_under_test = balanced_under.loc[balanced_under['Split'].eq('Test')]
balanced_under_test.sort_values('F1 Score', ascending=False)

### Data Saving

In [None]:
# Persist the three results DataFrames alongside the input data.
imbalanced.to_pickle(    '../Data/imbalanced.pkl')
balanced_smote.to_pickle('../Data/balanced_smote.pkl')
balanced_under.to_pickle('../Data/balanced_under.pkl')

# Persist the second object returned by each fit_predict_measure call.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically, instead of being left to the garbage collector.
# NOTE(review): these three files are written to the current working directory, not
# '../Data/' like the tables above — paths kept as-is; confirm that is intentional.
with open('imbalanced_classifiers.pkl', 'wb') as out_file:
    pickle.dump(imbalanced_classifiers, out_file)
with open('smote_classifiers.pkl', 'wb') as out_file:
    pickle.dump(smote_classifiers, out_file)
with open('under_classifiers.pkl', 'wb') as out_file:
    pickle.dump(under_classifiers, out_file)