# Classification Modeling

In [1]:
import classifiers as clf

import pickle
import pandas as pd
import numpy  as np

import warnings
warnings.filterwarnings('ignore')

### Load Train and Test Sets

In [2]:
# Original training split, plus the test split that every run below is scored on.
X_train, X_test = (
    pd.read_pickle("../Data/X_train.pkl"),
    pd.read_pickle("../Data/X_test.pkl"),
)
y_train, y_test = (
    pd.read_pickle("../Data/y_train.pkl"),
    pd.read_pickle("../Data/y_test.pkl"),
)

# Training split stored under the "smote" file names (used by the SMOTE run).
X_train_smote, y_train_smote = (
    pd.read_pickle("../Data/X_train_smote.pkl"),
    pd.read_pickle("../Data/y_train_smote.pkl"),
)

# Training split stored under the "under" file names (used by the Near Miss run).
X_train_under, y_train_under = (
    pd.read_pickle("../Data/X_train_under.pkl"),
    pd.read_pickle("../Data/y_train_under.pkl"),
)

# Label data; the run cells pass list(y_labels[0]) to clf.fit_predict_measure.
y_labels = pd.read_pickle('../Data/y_labels.pkl')

### Classification Model Hyperparameter Grid Search

In [3]:
# Define the hyperparameter grid for every classifier family.
# Each clf.grid_search_* helper receives the grid to explore and its return
# value is collected in `classifiers`, which the run cells below hand to
# clf.fit_predict_measure.
classifiers = []

# dummy classifier
classifiers.append(
    clf.grid_search_dummy_classifier(dict(
        strategy=['most_frequent','stratified','uniform'])))

# logistic regression
# Two separate sub-grids so that each solver is only combined with the
# penalty / multi_class values listed alongside it.
classifiers.append(
    clf.grid_search_logistic_regression([dict(
        C=[1e-2,1e0,1e1,1e2,1e4,1e6,1e12],
        penalty=['l1', 'l2'],
        fit_intercept=[True, False],
        multi_class=['ovr'],
        solver=['liblinear']),
        dict(
        C=[1e-2,1e0,1e1,1e2,1e4,1e6,1e12],
        penalty=['l2'],
        fit_intercept=[True, False],
        multi_class=['multinomial'],
        solver=['newton-cg'])]))

# multinomial naive bayes classifier
# NOTE(review): alpha=0.0 means no additive smoothing; depending on the
# scikit-learn version it may be clamped to a tiny positive value or emit a
# warning (silenced by the filterwarnings call above) -- confirm it is intended.
classifiers.append(
    clf.grid_search_multinomial_nb(dict(
        alpha=[0.0,0.2,0.4,0.6,0.8,1.0],
        fit_prior=[True,False])))

# k nearest neighbors classifier
classifiers.append(
    clf.grid_search_k_neighbors_classifier(dict(
        n_neighbors=list(range(3,21,2)),
        weights=['uniform', 'distance'],
        algorithm=['ball_tree','kd_tree'],
        leaf_size=list(range(30,210,30)),
        p=[1,2])))

# decision tree classifier
# min_impurity_decrease is listed explicitly: range() only accepts integers,
# so range(0.01, 0.05, 0.02) raises TypeError. [0.01, 0.03] are the values that
# start/stop/step describe (stop is exclusive).
classifiers.append(
    clf.grid_search_decision_tree_classifier(dict(
        criterion=['gini','entropy'],
        max_depth=list(range(1,10)),
        min_samples_leaf=list(range(1,10,2)),
        max_features=list(range(10,50,10)),
        min_impurity_decrease=[0.01, 0.03])))

# random forest classifier
# Same explicit float list for min_impurity_decrease as the decision tree grid.
classifiers.append(
    clf.grid_search_random_forest_classifier(dict(
        n_estimators=list(range(100,500,100)),
        criterion=['gini','entropy'],
        max_depth=list(range(1,5)),
        min_samples_leaf=list(range(1,50,10)),
        max_features=list(range(10,50,10)),
        min_impurity_decrease=[0.01, 0.03])))

# ada boost classifier
classifiers.append(
    clf.grid_search_ada_boost_classifier(dict(
        n_estimators=list(range(100,500,100)),
        learning_rate=[0.1,0.5,1.0,2.0])))

### Classification Model:  Unbalanced Classes

In [4]:
# Run every entry of `classifiers` on the original (unbalanced) training split
# and evaluate against the test split; the result is displayed as the cell output.
unbalanced = clf.fit_predict_measure(
    'Unbalanced',
    X_train, X_test,
    y_train, y_test,
    list(y_labels[0]),
    classifiers,
)
unbalanced

Running jobs: Dummy
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Running jobs: Logistic Regression
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | ela

Running jobs: Multinomial Naive Bayes
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Batch computation too fast (0.0933s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: K Nearest Neighbors
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   43.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   56.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.2min finished


Running jobs: Decision Tree
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0595s.) Setting batch_size=6.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 344 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 422 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 500 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    9.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Running jobs: Random Forest
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   19.9s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   37.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   51.4s
[Paralle

KeyboardInterrupt: 

In [None]:
# Summary statistics of the unbalanced run's results. No filtering is applied,
# so rows from both the training and the test split are included.
unbalanced.describe()

In [None]:
# Keep only the rows whose 'Split' is 'Test' from the unbalanced run,
# then show them ranked by F1 score, best first.
unbalanced_test = unbalanced.loc[unbalanced['Split'] == 'Test']
unbalanced_test.sort_values(by='F1 Score', ascending=False)

### Classification Model: Classes Balanced with SMOTE Oversampling

In [None]:
# Run every entry of `classifiers` on the SMOTE training split; evaluation still
# uses the same X_test / y_test as the unbalanced run.
balanced_smote = clf.fit_predict_measure(
    'SMOTE Oversampled',
    X_train_smote, X_test,
    y_train_smote, y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_smote

In [None]:
# Summary statistics of the SMOTE run's results. No filtering is applied,
# so rows from both the training and the test split are included.
balanced_smote.describe()

In [None]:
# Keep only the rows whose 'Split' is 'Test' from the SMOTE run,
# then show them ranked by F1 score, best first.
balanced_smote_test = balanced_smote.loc[balanced_smote['Split'] == 'Test']
balanced_smote_test.sort_values(by='F1 Score', ascending=False)

### Classification Model:  Classes Balanced with Near Miss Undersampling

In [None]:
# Run every entry of `classifiers` on the Near Miss training split; evaluation
# still uses the same X_test / y_test as the unbalanced run.
balanced_under = clf.fit_predict_measure(
    'Near Miss Undersampled',
    X_train_under, X_test,
    y_train_under, y_test,
    list(y_labels[0]),
    classifiers,
)
balanced_under

In [None]:
# Summary statistics of the Near Miss run's results. No filtering is applied,
# so rows from both the training and the test split are included.
balanced_under.describe()

In [None]:
# Keep only the rows whose 'Split' is 'Test' from the Near Miss run,
# then show them ranked by F1 score, best first.
balanced_under_test = balanced_under.loc[balanced_under['Split'] == 'Test']
balanced_under_test.sort_values(by='F1 Score', ascending=False)

### Save Results

In [None]:
# Write the results of each of the three runs to its own pickle under ../Data.
unbalanced.to_pickle('../Data/unbalanced.pkl')
balanced_smote.to_pickle('../Data/balanced_smote.pkl')
balanced_under.to_pickle('../Data/balanced_under.pkl')