In [1]:
import pickle
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import statistics

# Load sklearn utilities
# ----------------------
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, brier_score_loss, mean_squared_error, r2_score
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.calibration import calibration_curve

# Load classifiers
# ----------------
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
# Load the final dataframe from <working directory>/data/final_df.pkl.
# The path is assembled with os.path.join so it works on any OS, and the file
# is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
cwd = os.getcwd()
final_df_path = os.path.join(cwd, "data", "final_df.pkl")
with open(final_df_path, "rb") as final_df_file:
    final_df = pickle.load(final_df_file)

# 1. Handling Categorical Features via One-Hot Encoding

In [3]:
# final_df = final_df.sample(frac=0.7)

In [4]:
# Target variable: the 'Y' column
y = final_df["Y"]

# Predictors: everything except the target and the two industry columns.
# Rebinding final_df lets the original frame be released before encoding.
final_df = final_df.drop(columns=["Y", "sub_industry", "industry"])

# One-hot encode the remaining categorical columns to get the feature matrix
X = pd.get_dummies(final_df)

# Free up memory: the un-encoded frame is no longer needed
del final_df

In [5]:
import sklearn.model_selection as model_selection
from sklearn import preprocessing

def prepare_data(X, y, train_size=0.8, test_size=0.2, random_state=71):
    '''
    This function will prepare the data for classification: it splits the
    data into training and testing sets and min-max scales the features.
    It expects the following parameters:
      - X: feature columns
      - y: target variable column
      - train_size: proportion of dataset used for training (default 0.8)
      - test_size: proportion of dataset used for testing (default 0.2);
                   train_size + test_size must not exceed 1. Pass None to
                   use the complement of train_size.
      - random_state: the random seed to use when selecting a subset of rows
                      (default 71)
    
    The defaults reproduce the split that was previously hard-coded.
    
    This function returns a dictionary with the following entries
      - X_train: the matrix of (scaled) training data
      - y_train: the array of training labels
      - X_test: the matrix of (scaled) testing data
      - y_test: the array of testing labels
    '''
    # Split data
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state)
    
    # Scale the variables. The scaler is fitted on the training split only and
    # then applied to the test split, so no test-set statistics leak into training.
    scaler = preprocessing.MinMaxScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # return training and testing data
    return {'X_train':X_train, 'y_train':y_train,
            'X_test':X_test, 'y_test':y_test}

In [6]:
def fit_classification(model, data_dict,
                          cv_parameters = None,
                          model_name = None,
                          random_state = 0,
                          output_to_file = True,
                          print_to_screen = True):
    '''
    This function will fit a classification model to data using a
    cross-validated grid search and print various evaluation measures.
    It expects the following parameters
      - model: an sklearn model object
      - data_dict: the dictionary containing both training and testing data;
                   returned by the prepare_data function
      - cv_parameters: a dictionary of parameters that should be optimized
                       over using cross-validation. Specifically, each named
                       entry in the dictionary should correspond to a parameter,
                       and each element should be a list containing the values
                       to optimize over. None (the default) means an empty
                       grid, i.e. the model is fitted with its own parameters.
      - model_name: the name of the model being fit, for printouts
      - random_state: the seed passed to np.random.seed before fitting
      - output_to_file: accepted for backward compatibility; writing results
                        to a file is not implemented, so it has no effect
      - print_to_screen: if the results will be printed on screen
    
    The grid search uses 3-fold cross-validation with 6 parallel jobs. The
    best estimator is then evaluated on the test set; when print_to_screen
    is True the classification report, accuracy and the weighted
    precision / recall / F1 are printed.
    
    This function returns a tuple (prob_per_class_dictionary, best_model)
      - prob_per_class_dictionary: {class label: predicted probability} for
                                   the FIRST row of the test set only, or
                                   None if the best model does not provide
                                   predict_proba
      - best_model: the best fitted model found by the grid search
    '''
        
    np.random.seed(random_state)

    # Avoid a shared mutable default argument
    if cv_parameters is None:
        cv_parameters = {}
    
    # --------------------------
    #   Step 1 - Load the data
    # --------------------------
    X_train = data_dict['X_train']
    y_train = data_dict['y_train']
    
    X_test = data_dict['X_test']
    y_test = data_dict['y_test']
      
    # --------------------------
    #   Step 2 - Fit the model
    # --------------------------

    cv_model = GridSearchCV(model, cv_parameters, verbose=10, n_jobs=6, cv=3)
    
    start_time = time.time()

    cv_model.fit(X_train, y_train)
    end_time = time.time()
    
    best_model = cv_model.best_estimator_
    
    if print_to_screen:

        if model_name is not None:
            print("=========================================================")
            print("  Model: " + model_name)
            print("=========================================================")

        print("Fit time: " + str(round(end_time - start_time, 2)) + " seconds")
        print("Optimal parameters:")
        print(cv_model.best_params_)
        print("")
    
    # -------------------------------
    #   Step 3 - Evaluate the model
    # -------------------------------
    
    y_pred = best_model.predict(X_test)

    if hasattr(best_model, 'predict_proba'):
        # Only the first test row is kept, as a {class: probability} mapping
        first_row_probs = best_model.predict_proba(X_test)[0]
        prob_per_class_dictionary = dict(zip(best_model.classes_, first_row_probs))
    else:
        # Models without probability output still get the accuracy diagnostics
        prob_per_class_dictionary = None
    
    if print_to_screen:
        print(classification_report(y_test, y_pred, digits = 4))
        precision,recall,fscore,support=score(y_test,y_pred,average='weighted')
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F1: ", fscore)

    return prob_per_class_dictionary, best_model 

In [7]:
def calc_accuracy_windows(max_windows, y_test, y_pred):
    '''
    Compute and print the accuracy when a prediction counts as correct if it
    lies within +- k classes of the target, for k = 0, 1, ..., max_windows.
    E.g. with predicted class = 4 and target class = 6, the instance counts
    as accurate for every window >= 2.

    It expects the following parameters:
      - max_windows: largest window to evaluate (non-negative integer)
      - y_test: target classes; any 1-D array-like (list, numpy array,
                pandas Series). Values are compared by position, so a
                pandas index is ignored.
      - y_pred: predicted classes, same length as y_test

    Labels are expected to be integer-valued class indices.

    This function returns a list of max_windows + 1 accuracies (fractions in
    [0, 1]); entry k is the accuracy with a +- k window, so entry 0 is the
    exact-match accuracy.

    Raises ValueError if max_windows is negative, if the inputs are empty,
    or if y_test and y_pred have different lengths.
    '''
    if max_windows < 0:
        raise ValueError("max_windows must be non-negative")

    targets = np.asarray(y_test).ravel()
    predictions = np.asarray(y_pred).ravel()

    if targets.size != predictions.size:
        raise ValueError("y_test and y_pred must have the same length")
    if targets.size == 0:
        raise ValueError("y_test and y_pred must not be empty")

    # Distance between predicted and target class for every sample, computed
    # once. Casting to float avoids wrap-around with unsigned integer labels.
    class_distance = np.abs(targets.astype(float) - predictions.astype(float))

    # Print accuracy for each time window
    accuracy_list = []
    for window in range(max_windows + 1):
        # Fraction of samples whose prediction falls inside the window
        accuracy = float(np.mean(class_distance <= window))
        print(f"Accuracy with +- {window} time window(s): {accuracy*100:.4f}%")
        accuracy_list.append(accuracy)

    return accuracy_list

In [None]:
# Build the scaled train/test split that the model cells below read from
data_dict = prepare_data(X=X, y=y)

  return self.partial_fit(X, y)


# 2.1. Logistic Regression with L2 penalty

In [18]:
# Unpack the train/test split produced by prepare_data
X_train = data_dict['X_train']
y_train = data_dict['y_train']

X_test = data_dict['X_test']
y_test = data_dict['y_test']

# One-vs-rest logistic regression with an L2 penalty.
# The 'sag' solver is stochastic, so random_state is fixed to make the fit
# reproducible (71 is the same seed prepare_data uses for the split).
model = LogisticRegression(penalty='l2', solver='sag', multi_class='ovr', verbose=20, n_jobs=6, C=1, random_state=71)
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)

# Test-set metrics; precision / recall / F1 are support-weighted averages
precision,recall,fscore,support=score(y_test,y_pred,average='weighted')
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", fscore)


[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.


convergence after 50 epochs took 867 seconds


[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed: 14.4min


convergence after 61 epochs took 1004 seconds


[Parallel(n_jobs=6)]: Done   2 tasks      | elapsed: 16.7min


convergence after 68 epochs took 1049 seconds


[Parallel(n_jobs=6)]: Done   3 tasks      | elapsed: 17.5min


convergence after 60 epochs took 1051 seconds


[Parallel(n_jobs=6)]: Done   4 tasks      | elapsed: 17.5min


convergence after 62 epochs took 1077 seconds


[Parallel(n_jobs=6)]: Done   5 tasks      | elapsed: 17.9min


convergence after 74 epochs took 1294 seconds


[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed: 21.6min


convergence after 57 epochs took 954 seconds


[Parallel(n_jobs=6)]: Done   7 tasks      | elapsed: 33.4min


convergence after 59 epochs took 1041 seconds


[Parallel(n_jobs=6)]: Done   8 tasks      | elapsed: 34.1min


convergence after 58 epochs took 1075 seconds


[Parallel(n_jobs=6)]: Done   9 tasks      | elapsed: 35.9min


convergence after 72 epochs took 1323 seconds


[Parallel(n_jobs=6)]: Done  10 tasks      | elapsed: 36.5min


convergence after 64 epochs took 1178 seconds


[Parallel(n_jobs=6)]: Done  11 tasks      | elapsed: 37.2min


convergence after 72 epochs took 1323 seconds


[Parallel(n_jobs=6)]: Done  12 tasks      | elapsed: 43.6min


convergence after 63 epochs took 1035 seconds


[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed: 50.6min


convergence after 62 epochs took 1074 seconds


[Parallel(n_jobs=6)]: Done  14 tasks      | elapsed: 52.0min


convergence after 59 epochs took 1060 seconds


[Parallel(n_jobs=6)]: Done  15 tasks      | elapsed: 54.2min


convergence after 64 epochs took 1176 seconds


[Parallel(n_jobs=6)]: Done  16 tasks      | elapsed: 55.5min


convergence after 79 epochs took 1402 seconds


[Parallel(n_jobs=6)]: Done  17 tasks      | elapsed: 60.5min


convergence after 66 epochs took 1167 seconds


[Parallel(n_jobs=6)]: Done  18 tasks      | elapsed: 63.1min


convergence after 66 epochs took 1053 seconds


[Parallel(n_jobs=6)]: Done  19 tasks      | elapsed: 68.2min


convergence after 64 epochs took 1082 seconds


[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed: 70.0min


convergence after 69 epochs took 1226 seconds
convergence after 74 epochs took 1337 seconds


[Parallel(n_jobs=6)]: Done  22 out of  31 | elapsed: 77.7min remaining: 31.8min


convergence after 69 epochs took 1216 seconds
convergence after 69 epochs took 1216 seconds


[Parallel(n_jobs=6)]: Done  24 out of  31 | elapsed: 83.3min remaining: 24.3min


convergence after 67 epochs took 1069 seconds
convergence after 69 epochs took 1165 seconds


[Parallel(n_jobs=6)]: Done  26 out of  31 | elapsed: 89.4min remaining: 17.2min


convergence after 79 epochs took 1324 seconds
convergence after 76 epochs took 1226 seconds


[Parallel(n_jobs=6)]: Done  28 out of  31 | elapsed: 98.2min remaining: 10.5min


convergence after 74 epochs took 1149 seconds
convergence after 72 epochs took 1055 seconds
convergence after 73 epochs took 909 seconds


[Parallel(n_jobs=6)]: Done  31 out of  31 | elapsed: 101.1min finished


{0: 8.47858058125836e-05,
 1: 0.0009922351428459334,
 2: 0.013408518200392102,
 3: 0.022364301176052082,
 4: 0.044288002577217815,
 5: 0.005314977582624015,
 6: 0.0019306550222173218,
 7: 0.14199656706548383,
 8: 0.20355979636443972,
 9: 0.39674670806338286,
 10: 0.08086238038945956,
 11: 0.006144989773795232,
 12: 0.030046417565547087,
 13: 0.00872691084315195,
 14: 0.029255301591734204,
 15: 0.003928010487908931,
 16: 8.096938847653274e-05,
 17: 0.00310988967087442,
 18: 0.003380081987142845,
 19: 0.0023314574555985974,
 20: 0.00043016568635260375,
 21: 0.00014189695509640916,
 22: 9.46295768110013e-05,
 23: 0.00013887505327397256,
 24: 0.00020235274391294515,
 25: 6.256897889116951e-05,
 26: 9.363483815838351e-06,
 27: 2.0641976544530716e-05,
 28: 4.133743771359914e-05,
 29: 0.00019811337518082744,
 30: 0.00010709857824932537}

Accuracy:  0.31387592023317
Precision:  0.2898588102503505
Recall:  0.31387592023317
F1:  0.22787249899016807


  'precision', 'predicted', average, warn_for)


# 2.2. Random Forest

In [9]:
# Unpack the train/test split produced by prepare_data
X_train = data_dict['X_train']
y_train = data_dict['y_train']

X_test = data_dict['X_test']
y_test = data_dict['y_test']

# Random forest with 100 trees of depth <= 50.
# Bootstrap sampling and feature selection are random, so random_state is
# fixed to make the fit reproducible (71 is the same seed prepare_data uses
# for the split).
model_rf = RandomForestClassifier(verbose=20, n_jobs=1, max_depth=50, n_estimators= 100, random_state=71)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
y_pred_proba_rf = model_rf.predict_proba(X_test)

# Test-set metrics; precision / recall / F1 are support-weighted averages
precision,recall,fscore,support=score(y_test,y_pred_rf,average='weighted')
print("Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", fscore)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   20.5s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   39.4s remaining:    0.0s


building tree 3 of 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   59.6s remaining:    0.0s


building tree 4 of 100


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.3min remaining:    0.0s


building tree 5 of 100


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.6min remaining:    0.0s


building tree 6 of 100


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.0min remaining:    0.0s


building tree 7 of 100


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  2.3min remaining:    0.0s


building tree 8 of 100


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  2.6min remaining:    0.0s


building tree 9 of 100


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  3.0min remaining:    0.0s


building tree 10 of 100


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.3min remaining:    0.0s


building tree 11 of 100


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:  3.7min remaining:    0.0s


building tree 12 of 100


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  4.0min remaining:    0.0s


building tree 13 of 100


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:  4.3min remaining:    0.0s


building tree 14 of 100


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:  4.6min remaining:    0.0s


building tree 15 of 100


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  5.0min remaining:    0.0s


building tree 16 of 100


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:  5.3min remaining:    0.0s


building tree 17 of 100


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:  5.6min remaining:    0.0s


building tree 18 of 100


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  6.0min remaining:    0.0s


building tree 19 of 100


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:  6.3min remaining:    0.0s


building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 31.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elaps

Accuracy:  0.3833407749169023
Precision:  0.398087300655154
Recall:  0.3833407749169023
F1:  0.3400535963492192


In [12]:
# Random-forest accuracy when predictions up to 2 classes away from the target still count
accuracy_rf = calc_accuracy_windows(max_windows=2, y_test=y_test, y_pred=y_pred_rf)

Accuracy with +- 0 time window(s): 38.3341%
Accuracy with +- 1 time window(s): 67.7240%
Accuracy with +- 2 time window(s): 79.3750%


# 2.3. SVM

In [None]:
from sklearn.svm import SVC

# Unpack the train/test split produced by prepare_data
X_train = data_dict['X_train']
y_train = data_dict['y_train']

X_test = data_dict['X_test']
y_test = data_dict['y_test']

# Support vector classifier with probability estimates enabled.
# The probability calibration shuffles the data internally, so random_state
# is fixed to make predict_proba reproducible (71 is the same seed
# prepare_data uses for the split).
model_svm = SVC(verbose=20, probability=True, random_state=71)
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)
y_pred_proba_svm = model_svm.predict_proba(X_test)

# Test-set metrics; precision / recall / F1 are support-weighted averages
precision,recall,fscore,support=score(y_test,y_pred_svm,average='weighted')
print("Accuracy: ", accuracy_score(y_test, y_pred_svm))
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", fscore)



[LibSVM]

In [61]:
# random_forest = 
# cv_parameters = {'max_depth': [2, 20], 'n_estimators': [100]}

# prob_per_class_dictionary_rf, best_model_rf = fit_classification(random_forest, data_dict, cv_parameters = cv_parameters, model_name='Random forest')

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:   25.3s remaining:   50.6s
[Parallel(n_jobs=6)]: Done   3 out of   6 | elapsed:   25.3s remaining:   25.3s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:  1.6min remaining:   47.1s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:  1.6min remaining:    0.0s


MemoryError: 