# Classification Modeling

In [1]:
import classifiers as clf

import pickle
import pandas as pd
import numpy  as np

import warnings
# Install a blanket "ignore" filter so that no Python warnings are shown in
# any later cell output.
# NOTE(review): this hides every warning category, including any emitted by
# the model fits below — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

### Load Train and Test Sets

In [2]:
# Original (unbalanced) train/test features and targets.
X_train, X_test, y_train, y_test = (
    pd.read_pickle("../Data/X_train.pkl"),
    pd.read_pickle("../Data/X_test.pkl"),
    pd.read_pickle("../Data/y_train.pkl"),
    pd.read_pickle("../Data/y_test.pkl"),
)

# Training set variant used in the "SMOTE Oversampling" section below.
X_train_smote, y_train_smote = (
    pd.read_pickle("../Data/X_train_smote.pkl"),
    pd.read_pickle("../Data/y_train_smote.pkl"),
)

# Training set variant used in the "Near Miss Undersampling" section below.
X_train_under, y_train_under = (
    pd.read_pickle("../Data/X_train_under.pkl"),
    pd.read_pickle("../Data/y_train_under.pkl"),
)

# Label data; `list(y_labels[0])` is what gets passed to each run below.
y_labels = pd.read_pickle('../Data/y_labels.pkl')

### Classification Model Hyperparameter Grid Search

In [3]:
# Parameter grids for every classifier under comparison. Each entry is the
# object returned by the matching `clf.grid_search_*` helper, built from a
# dict that maps a hyper-parameter name to the list of values to try.
classifiers = [
    # dummy classifier
    clf.grid_search_dummy_classifier({
        'strategy': ['most_frequent', 'stratified'],
    }),

    # logistic regression
    clf.grid_search_logistic_regression({
        'C': [1e-2, 1e0, 1e2, 1e6, 1e12],
        'penalty': ['l1', 'l2'],
        'fit_intercept': [True, False],
        'multi_class': ['ovr'],
        'solver': ['liblinear'],
    }),

    # multinomial naive bayes classifier
    clf.grid_search_multinomial_nb({
        'alpha': [0.0, 1.0],
        'fit_prior': [True],
    }),

    # k nearest neighbors classifier
    clf.grid_search_k_neighbors_classifier({
        'n_neighbors': [5, 11],
        'weights': ['uniform', 'distance'],
        'algorithm': ['ball_tree', 'kd_tree'],
        'leaf_size': [100, 200],
    }),

    # decision tree classifier
    clf.grid_search_decision_tree_classifier({
        'criterion': ['gini', 'entropy'],
        'max_depth': [6, 8],
        'min_samples_leaf': [20, 50, 100],
        'max_features': [5, 10, 20],
        'min_impurity_decrease': [0.01, 0.03, 0.05],
    }),

    # random forest classifier
    clf.grid_search_random_forest_classifier({
        'n_estimators': [100, 200, 300],
        'max_depth': [2, 3, 4],
        'min_samples_leaf': [100, 200],
        'max_features': [5, 10],
        'min_impurity_decrease': [0.01, 0.03, 0.05],
    }),

    # ada boost classifier
    clf.grid_search_ada_boost_classifier({
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.5, 1.0],
    }),
]

### Classification Model:  Unbalanced Classes

In [4]:
# Fit, predict and score every classifier grid on the original
# (unbalanced) training data, then display the resulting table.
unbalanced = clf.fit_predict_measure(
    'Unbalanced',
    X_train, X_test,
    y_train, y_test,
    list(y_labels[0]),
    classifiers,
)
unbalanced

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.3s finished


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1110s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.2min finished


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0721s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 396 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 456 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   10.1s finished
[Parallel(n_jobs=-1)]: Usi

Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   41.9s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   52.4s
[Paralle

Running jobs: Ada Boost
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:   35.1s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   39.3s finished


Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Confusion Matrix
0,Unbalanced,Dummy,{'strategy': 'most_frequent'},Train,0.764790,0.584904,0.764790,0.662860,0.004948,0.012792,
1,Unbalanced,Dummy,{'strategy': 'stratified'},Train,0.622783,0.619765,0.610995,0.617685,0.004675,0.019233,
2,Unbalanced,Dummy,{'strategy': 'most_frequent'},Test,0.763436,0.582835,0.763436,0.661022,,,"[[1733, 0, 0], [164, 0, 0], [373, 0, 0]]"
3,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.764790,0.584904,0.764790,0.662860,0.083684,0.056728,
4,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': True, 'multi_clas...",Train,0.767875,0.726059,0.767875,0.671165,0.051202,0.044381,
5,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.769748,0.709923,0.769748,0.678317,0.080370,0.078228,
6,Unbalanced,Logistic Regression,"{'C': 0.01, 'fit_intercept': False, 'multi_cla...",Train,0.779443,0.716726,0.779443,0.703343,0.058564,0.049560,
7,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.786824,0.721505,0.786824,0.721361,0.461737,0.042975,
8,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': True, 'multi_class...",Train,0.786934,0.721696,0.786934,0.721554,0.070315,0.044069,
9,Unbalanced,Logistic Regression,"{'C': 1.0, 'fit_intercept': False, 'multi_clas...",Train,0.786163,0.719088,0.786163,0.721032,0.108279,0.041428,


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the unbalanced results; rows from both the Train and Test
# splits are included.
unbalanced.describe()

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time
count,269.0,269.0,269.0,269.0,262.0,262.0
mean,0.769965,0.628898,0.769921,0.681361,0.421741,0.371432
std,0.012471,0.057927,0.013,0.025348,0.77224,0.873985
min,0.622783,0.582835,0.610995,0.617685,0.004675,0.012792
25%,0.76479,0.584904,0.76479,0.66286,0.022013,0.042651
50%,0.76479,0.584904,0.76479,0.66286,0.122168,0.082516
75%,0.777349,0.693925,0.777349,0.708008,0.646595,0.33144
max,0.789688,0.749627,0.789688,0.733284,6.413338,5.412229


In [6]:
# Keep only the rows scored on the Test split, then rank them from highest
# to lowest F1 score to surface the best estimators on the unbalanced data.
unbalanced_test = unbalanced.loc[unbalanced['Split'].eq('Test')]
unbalanced_test.sort_values('F1 Score', ascending=False)

Unnamed: 0,Data,Classifier,Parameters,Split,Accuracy,Precision,Recall,F1 Score,Fit Time,Score Time,Confusion Matrix
43,Unbalanced,K Nearest Neighbors,"{'algorithm': 'kd_tree', 'leaf_size': 100, 'n_...",Test,0.782379,0.729329,0.782379,0.727244,,,"[[1689, 14, 30], [149, 4, 11], [282, 8, 83]]"
268,Unbalanced,Ada Boost,"{'learning_rate': 1.0, 'n_estimators': 100}",Test,0.784581,0.734862,0.784581,0.722891,,,"[[1705, 6, 22], [151, 3, 10], [295, 5, 73]]"
23,Unbalanced,Logistic Regression,"{'C': 100.0, 'fit_intercept': True, 'multi_cla...",Test,0.781498,0.709913,0.781498,0.712267,,,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
26,Unbalanced,Multinomial Naive Bayes,"{'alpha': 0.0, 'fit_prior': True}",Test,0.781498,0.709913,0.781498,0.712267,,,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
152,Unbalanced,Decision Tree,"{'criterion': 'gini', 'max_depth': 6, 'max_fea...",Test,0.781498,0.709913,0.781498,0.712267,,,"[[1712, 0, 21], [154, 0, 10], [311, 0, 62]]"
2,Unbalanced,Dummy,{'strategy': 'most_frequent'},Test,0.763436,0.582835,0.763436,0.661022,,,"[[1733, 0, 0], [164, 0, 0], [373, 0, 0]]"
261,Unbalanced,Random Forest,"{'max_depth': 2, 'max_features': 10, 'min_impu...",Test,0.763436,0.582835,0.763436,0.661022,,,"[[1733, 0, 0], [164, 0, 0], [373, 0, 0]]"


### Classification Model: Classes Balanced with SMOTE Oversampling

In [7]:
# Fit, predict and score every classifier grid on the SMOTE-oversampled
# training data (the test set is the same one used for the unbalanced run),
# then display the resulting table.
balanced_smote = clf.fit_predict_measure(
    'SMOTE Oversampled',
    X_train_smote, X_test,
    y_train_smote, y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_smote

Running jobs: Dummy
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   41.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   42.7s finished


Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.0min


KeyboardInterrupt: 

In [None]:
# Summary statistics of the numeric columns of the SMOTE-balanced results
# (rows from both the test and training splits are included).
balanced_smote.describe()

In [None]:
# Keep only the rows scored on the Test split, then rank them from highest
# to lowest F1 score to surface the best estimators on the SMOTE data.
balanced_smote_test = balanced_smote.loc[balanced_smote['Split'].eq('Test')]
balanced_smote_test.sort_values('F1 Score', ascending=False)

### Classification Model:  Classes Balanced with Near Miss Undersampling

In [None]:
# Fit, predict and score every classifier grid on the Near Miss
# undersampled training data (the test set is the same one used for the
# unbalanced run), then display the resulting table.
balanced_under = clf.fit_predict_measure(
    'Near Miss Undersampled',
    X_train_under, X_test,
    y_train_under, y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_under

In [None]:
# Summary statistics of the numeric columns of the Near Miss-balanced
# results (rows from both the test and training splits are included).
balanced_under.describe()

In [None]:
# Keep only the rows scored on the Test split, then rank them from highest
# to lowest F1 score to surface the best estimators on the Near Miss data.
balanced_under_test = balanced_under.loc[balanced_under['Split'].eq('Test')]
balanced_under_test.sort_values('F1 Score', ascending=False)

### Save Results

In [None]:
# Persist the results table of each run as a pickle in ../Data.
# All three result frames must already exist in the kernel, i.e. each run
# cell above has to have completed before this cell is executed.
unbalanced.to_pickle('../Data/unbalanced.pkl')
balanced_smote.to_pickle('../Data/balanced_smote.pkl')
balanced_under.to_pickle('../Data/balanced_under.pkl')