In [83]:
import pickle
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import statistics


In [84]:
# Load the preprocessed dataframe from <cwd>/data/final_df.pkl.
# os.path.join keeps the path portable (the previous hard-coded "\\" separators
# only resolved on Windows) and the with-block closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source.
cwd = os.getcwd()
final_df_path = os.path.join(cwd, "data", "final_df.pkl")
with open(final_df_path, "rb") as pkl_file:
    final_df = pickle.load(pkl_file)

# 1. Handling Categorical Features via One-Hot Encoding

In [85]:
# Preview the first rows of the loaded dataframe (features plus the target column Y)
final_df.head()

Unnamed: 0,industry,sub_industry,shipper,std_service_type,std_weight,freight_charges,zone,sender_state,recipient_state,same_MSA,sender_in_MSA,rec_in_MSA,sender_MSA_num,rec_MSA_num,week_number,day_of_week,Y
0,OTHER,OTHER,ups,Ground,48.0,34.39,2,IN,MI,0,0,1,99014,24340,25,0,3
1,RETAIL,ECOMMERCE,fedex,Ground,3.0,22.22,6,MT,OH,0,0,1,99026,17460,23,2,18
2,OTHER,OTHER,fedex,Home Delivery,11.0,13.19,5,IN,FL,0,1,1,34620,36740,22,1,9
3,RETAIL,FOOD STORES,ups,Ground,50.0,109.72,4,WI,,0,1,1,31540,16020,23,0,7
4,SERVICES,TRAVEL SERVICES & TRAVEL DOCUMENTS,fedex,Ground,44.0,31.57,5,MN,NY,0,1,1,33460,10580,20,2,12


In [86]:
# Keep a random 50% subsample of the rows.
# random_state is fixed (same seed as the train/test split) so that a fresh
# "Restart & Run All" selects the same rows and the reported metrics are reproducible.
# NOTE: re-running this cell alone halves the data again; re-run from the load cell instead.
final_df = final_df.sample(frac=0.5, random_state=71)

In [87]:
# Target variable
y = final_df['Y']

# Drop the target and the columns excluded from modelling, then one-hot encode
# the remaining categorical columns
ohe_df = pd.get_dummies(
    final_df.drop(
        columns=[
            'Y',
            'sub_industry',
            'sender_MSA_num',
            'rec_MSA_num',
            'industry',
            'week_number',
        ]
    )
)
X = ohe_df

# Release the names that are no longer needed (X still references the encoded frame)
del final_df, ohe_df

In [88]:
# To test
# from sklearn import preprocessing

# # Min max feature scaling
# min_max_scaler = preprocessing.MinMaxScaler()
# X = min_max_scaler.fit_transform(X)

In [89]:
import sklearn.model_selection as model_selection
from sklearn import preprocessing

def prepare_data(X, y, train_size=0.8, random_state=71):
    '''
    Split the data into training and testing sets and min-max scale the features.

    It expects the following parameters:
      - X: feature columns
      - y: target variable column
      - train_size: proportion of dataset used for training (default 0.8);
                    the remaining rows form the test set
      - random_state: the random seed to use when shuffling the rows for the
                      split (default 71)

    This function returns a dictionary with the following entries
      - X_train: the matrix of training data (min-max scaled)
      - y_train: the array of training labels
      - X_test: the matrix of testing data (scaled with the scaler fitted on
                the training data)
      - y_test: the array of testing labels
    '''
    # Split data. test_size is left unset so that it is always the complement
    # of train_size (0.2 with the default), whatever train_size is passed.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    # Scale the variables. The scaler is fitted on the training split only and
    # then applied to the test split, so no test-set statistics are used in training.
    scaler = preprocessing.MinMaxScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # return training and testing data
    out = {'X_train':X_train, 'y_train':y_train,
           'X_test':X_test, 'y_test':y_test}

    return out

In [90]:
def fit_classification(model, data_dict,
                          cv_parameters = {},
                          model_name = None,
                          random_state = 0,
                          output_to_file = True,
                          print_to_screen = True):
    '''
    Fit a classification model with grid-search cross-validation and print
    evaluation measures computed on the test set.

    It expects the following parameters
      - model: an sklearn model object
      - data_dict: the dictionary containing both training and testing data
                   ('X_train', 'y_train', 'X_test', 'y_test');
                   returned by the prepare_data function
      - cv_parameters: a dictionary of parameters that should be optimized
                       over using cross-validation. Specifically, each named
                       entry in the dictionary should correspond to a parameter,
                       and each element should be a list containing the values
                       to optimize over
      - model_name: the name of the model being fit, for printouts
      - random_state: the seed passed to np.random.seed before fitting
      - output_to_file: currently unused -- the file-output code at the end of
                        this function is commented out
      - print_to_screen: if the results will be printed on screen

    The grid search uses 3-fold cross-validation with 6 parallel jobs.

    This function returns a tuple (prob_per_class_dictionary, out):
      - prob_per_class_dictionary: {class label: probability} for the FIRST
                       test sample; an empty dict if the model has no
                       predict_proba function
      - out: a dictionary with the following entries
          - model: the best fitted model
          - y_pred_labels: predicted labels for the test set
          - y_pred_probs: probability predictions for every test sample (one
                          row per sample, one column per class), if the model
                          supports them
          - y_pred_score: decision_function scores for the test set, if the
                          model does not output probabilities but does
                          provide decision_function
    '''

    np.random.seed(random_state)

    # --------------------------
    #   Step 1 - Load the data
    # --------------------------
    X_train = data_dict['X_train']
    y_train = data_dict['y_train']

    X_test = data_dict['X_test']
    y_test = data_dict['y_test']

    # --------------------------
    #   Step 2 - Fit the model
    # --------------------------

    cv_model = GridSearchCV(model, cv_parameters, verbose=10, n_jobs=6, cv=3)

    start_time = time.time()

    cv_model.fit(X_train, y_train)
    end_time = time.time()

    best_model = cv_model.best_estimator_

    if print_to_screen:

        if model_name is not None:
            print("=========================================================")
            print("  Model: " + model_name)
            print("=========================================================")

        print("Fit time: " + str(round(end_time - start_time, 2)) + " seconds")
        print("Optimal parameters:")
        print(cv_model.best_params_)
        print("")

    # -------------------------------
    #   Step 3 - Evaluate the model
    # -------------------------------

    y_pred = best_model.predict(X_test)

    out = {'model':best_model, 'y_pred_labels':y_pred}

    if hasattr(best_model, 'predict_proba'):
        # Keep the probabilities of the whole test set. Previously only row 0
        # was kept, so 'y_pred_probs' described a single sample.
        y_pred_probs = best_model.predict_proba(X_test)
        # gets a dictionary of {'class_name': probability} for the first test sample
        prob_per_class_dictionary = dict(zip(best_model.classes_, y_pred_probs[0]))
        out.update({'y_pred_probs':y_pred_probs})
    else:
        # Models without probability output: nothing to report per class;
        # fall back to raw decision scores when they are available.
        prob_per_class_dictionary = {}
        if hasattr(best_model, 'decision_function'):
            y_pred_score = best_model.decision_function(X_test)
            out.update({'y_pred_score':y_pred_score})

    if print_to_screen:
        print(classification_report(y_test, y_pred, digits = 4))
        precision,recall,fscore,support=score(y_test,y_pred,average='weighted')
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F1: ", fscore)

    # Output results to file
    # ----------------------
    # TODO: disabled. The helpers used below (find_opt_params_on_edge,
    # find_score_variation, dump_to_output) and the probs_predicted flag are
    # not defined in the visible notebook.
#     if probs_predicted and output_to_file:
#         # Check whether any of the CV parameters are on the edge of
#         # the search space
#         opt_params_on_edge = find_opt_params_on_edge(cv_model)
#         dump_to_output(model_name + "::search_on_edge", opt_params_on_edge)
#         if print_to_screen:
#             print("Were parameters on edge? : " + str(opt_params_on_edge))

#         # Find out how different the scores are for the different values
#         # tested for by cross-validation. If they're not too different, then
#         # even if the parameters are off the edge of the search grid, we should
#         # be ok
#         score_variation = find_score_variation(cv_model)
#         dump_to_output(model_name + "::score_variation", score_variation)
#         if print_to_screen:
#             print("Score variations around CV search grid : " + str(score_variation))

#         # Print out all the scores
#         dump_to_output(model_name + "::all_cv_scores", str(cv_model.cv_results_['mean_test_score']))
#         if print_to_screen:
#             print( str(cv_model.cv_results_['mean_test_score']) )

#         # Dump the AUC to file
#         dump_to_output(model_name + "::roc_auc", roc_auc_score(y_test, y_pred_probs) )

    return prob_per_class_dictionary, out 

In [91]:
# Load sklearn utilities
# ----------------------
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, brier_score_loss, mean_squared_error, r2_score
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.calibration import calibration_curve

# Load classifiers
# ----------------
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier


In [92]:
# Split into train/test sets and scale the features
data_dict = prepare_data(X=X, y=y)

  return self.partial_fit(X, y)


In [93]:
# Candidate values of C explored by the grid search
cv_parameters = {'C': [0.1, 1, 10]}

# One-vs-rest logistic regression with an L2 penalty, fitted with the 'sag' solver
l2_logistic = LogisticRegression(
    penalty='l2',
    solver='sag',
    multi_class='ovr',
)

# NOTE: the name is rebound from the estimator to the tuple returned by fit_classification
l2_logistic = fit_classification(
    l2_logistic,
    data_dict,
    cv_parameters=cv_parameters,
    model_name='L2 Logistic Regression',
)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   9 | elapsed: 12.6min remaining: 44.1min
[Parallel(n_jobs=6)]: Done   3 out of   9 | elapsed: 12.7min remaining: 25.4min
[Parallel(n_jobs=6)]: Done   4 out of   9 | elapsed: 42.6min remaining: 53.2min
[Parallel(n_jobs=6)]: Done   5 out of   9 | elapsed: 43.3min remaining: 34.7min
[Parallel(n_jobs=6)]: Done   6 out of   9 | elapsed: 45.2min remaining: 22.6min
[Parallel(n_jobs=6)]: Done   7 out of   9 | elapsed: 73.0min remaining: 20.9min
[Parallel(n_jobs=6)]: Done   9 out of   9 | elapsed: 73.4min remaining:    0.0s
[Parallel(n_jobs=6)]: Done   9 out of   9 | elapsed: 73.4min finished


  Model: L2 Logistic Regression
Fit time: 6677.77 seconds
Optimal parameters:
{'C': 1}

              precision    recall  f1-score   support

           0     0.4967    0.2239    0.3087      1706
           1     0.0000    0.0000    0.0000        97
           2     0.3910    0.1286    0.1936      7930
           3     0.2325    0.0093    0.0178     10047
           4     0.3366    0.8291    0.4788     16713
           5     0.0000    0.0000    0.0000      5587
           6     0.0000    0.0000    0.0000       159
           7     0.2745    0.0394    0.0689      9323
           8     0.1983    0.0018    0.0037     12443
           9     0.3042    0.7328    0.4300     21474
          10     0.0000    0.0000    0.0000      7435
          11     0.0000    0.0000    0.0000       128
          12     0.2789    0.0344    0.0613      5610
          13     0.2000    0.0003    0.0005      7734
          14     0.2717    0.5214    0.3572     14062
          15     0.0000    0.0000    0.0000    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
from sklearn import svm

# Unpack the train/test splits stored in data_dict
X_train = data_dict['X_train']
y_train = data_dict['y_train']

X_test = data_dict['X_test']
y_test = data_dict['y_test']

# probability=True enables predict_proba on the fitted SVC
model = svm.SVC(probability=True, verbose=10)
model.fit(X_train, y_train)

# Class probabilities of the FIRST test sample only
results = model.predict_proba(X_test)[0]

# gets a dictionary of {'class_name': probability}
prob_per_class_dictionary = dict(zip(model.classes_, results))

# gets a list of ['most_probable_class', 'second_most_probable_class', ..., 'least_class']
# (built as a real list: map() would only give a lazy, single-use iterator in Python 3)
results_ordered_by_probability = [
    class_name
    for class_name, _ in sorted(zip(model.classes_, results), key=lambda pair: pair[1], reverse=True)
]

prob_per_class_dictionary



[LibSVM]

In [None]:
# Display the {class: probability} mapping computed from the SVC for the first test sample
prob_per_class_dictionary