# Delivery Prediction - Model - Random Forest 
This notebook covers the following:
- Loading of preprocessed data for training the model
- Training the random forest model
- Manual hyperparameter tuning
- Automated hyperparameter tuning with RandCV
- Calculating and viewing overall accuracy 
- Calculating and viewing accuracy for individual classes

In [3]:
import time
from datetime import timedelta
from datetime import datetime
import pickle
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import statistics
import joblib 
from imblearn.over_sampling import SMOTE

# Load sklearn utilities
# ----------------------
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, brier_score_loss, mean_squared_error, r2_score
from sklearn.metrics import precision_recall_fscore_support as score

from sklearn.calibration import calibration_curve

# Load classifiers
# ----------------
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier

## Loading preprocessed data

In [6]:
data_dict = np.load('data/data_dict_12windows_SMOTEno_MSAno.npz')

In [7]:
X_train = data_dict['X_train']
y_train = data_dict['y_train']

X_test = data_dict['X_test']
y_test = data_dict['y_test']

In [None]:
start_time = time.time()
model_rf = RandomForestClassifier(verbose=0, n_jobs=-1, max_depth=100, n_estimators= 25)
model_rf.fit(X_train, y_train)
print("Predicting...")
y_pred_rf = model_rf.predict(X_test)
y_pred_proba_rf = model_rf.predict_proba(X_test)

print("Calculating accuracy...")
calc_accuracy_windows(2, y_test, y_pred_rf)

elapsed_time_secs = time.time() - start_time
msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
print(msg)
print("-----")

## Manual Hyperparameter tuning

### Number of estimators

In [16]:
n_estimators = [int(x) for x in np.linspace(start = 25, stop = 90, num = 4)]
for i in n_estimators:
    start_time = time.time()
    print("Training for n_estimators = ", i)
    model_rf = RandomForestClassifier(verbose=0, n_jobs=-1, max_depth=100, n_estimators= i)
    model_rf.fit(X_train, y_train)
    print("Predicting...")
    y_pred_rf = model_rf.predict(X_test)
    y_pred_proba_rf = model_rf.predict_proba(X_test)

    print("Calculating accuracy...")
    calc_accuracy_windows(2, y_test, y_pred_rf)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
    print(msg)
    print("-----")

Training for n_estimators =  25
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.4555%
Accuracy with +- 1 time window(s): 87.8412%
Accuracy with +- 2 time window(s): 89.6862%
Execution took: 0:04:20 secs (Wall clock time)
Training for n_estimators =  46
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.8355%
Accuracy with +- 1 time window(s): 88.0584%
Accuracy with +- 2 time window(s): 89.8530%
Execution took: 0:05:38 secs (Wall clock time)
Training for n_estimators =  68
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 74.0563%
Accuracy with +- 1 time window(s): 88.2213%
Accuracy with +- 2 time window(s): 89.9944%
Execution took: 0:09:05 secs (Wall clock time)
Training for n_estimators =  90
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 74.1249%
Accuracy with +- 1 time window(s): 88.2441%
Accuracy with +- 2 time window(s): 90.0074%
Execution took: 0:09:54 secs (Wall clock time)


### Class weights

In [17]:
class_weight =['balanced', 'balanced_subsample', None]
for i in class_weight:
    start_time = time.time()
    print("Training for class_weight = ", i)
    model_rf = RandomForestClassifier(verbose=0, n_jobs=-1, max_depth=100, n_estimators=25, class_weight=i)
    model_rf.fit(X_train, y_train)
    print("Predicting...")
    y_pred_rf = model_rf.predict(X_test)
#     y_pred_proba_rf = model_rf.predict_proba(X_test)

    print("Calculating accuracy...")
    calc_accuracy_windows(2, y_test, y_pred_rf)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
    print(msg)
    print("-----")

Training for class_weight =  balanced
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 72.5942%
Accuracy with +- 1 time window(s): 86.9702%
Accuracy with +- 2 time window(s): 88.8846%
Execution took: 0:04:29 secs (Wall clock time)
-----
Training for class_weight =  balanced_subsample
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 72.5493%
Accuracy with +- 1 time window(s): 86.9074%
Accuracy with +- 2 time window(s): 88.8319%
Execution took: 0:04:53 secs (Wall clock time)
-----
Training for class_weight =  None
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.4173%
Accuracy with +- 1 time window(s): 87.8087%
Accuracy with +- 2 time window(s): 89.6655%
Execution took: 0:04:17 secs (Wall clock time)
-----


### Maximum features

In [18]:
max_features = ['auto', 'sqrt']
for i in max_features:
    start_time = time.time()
    print("Training for max_features = ", i)
    model_rf = RandomForestClassifier(verbose=0, n_jobs=-1, max_depth=100, n_estimators=25, max_features=i)
    model_rf.fit(X_train, y_train)
    print("Predicting...")
    y_pred_rf = model_rf.predict(X_test)
#     y_pred_proba_rf = model_rf.predict_proba(X_test)

    print("Calculating accuracy...")
    calc_accuracy_windows(2, y_test, y_pred_rf)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
    print(msg)
    print("-----")

Training for max_features =  auto
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.4561%
Accuracy with +- 1 time window(s): 87.8489%
Accuracy with +- 2 time window(s): 89.6959%
Execution took: 0:04:19 secs (Wall clock time)
-----
Training for max_features =  sqrt
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.4036%
Accuracy with +- 1 time window(s): 87.7999%
Accuracy with +- 2 time window(s): 89.6572%
Execution took: 0:04:18 secs (Wall clock time)
-----


### Min Sample Split

In [19]:
min_samples_split = [2, 5, 10]
for i in min_samples_split:
    start_time = time.time()
    print("Training for min_samples_split = ", i)
    model_rf = RandomForestClassifier(verbose=0, n_jobs=-1, max_depth=100, n_estimators=25, min_samples_split=i)
    model_rf.fit(X_train, y_train)
    print("Predicting...")
    y_pred_rf = model_rf.predict(X_test)
#     y_pred_proba_rf = model_rf.predict_proba(X_test)

    print("Calculating accuracy...")
    calc_accuracy_windows(2, y_test, y_pred_rf)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
    print(msg)
    print("-----")

Training for min_samples_split =  2
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.3567%
Accuracy with +- 1 time window(s): 87.7783%
Accuracy with +- 2 time window(s): 89.6312%
Execution took: 0:04:18 secs (Wall clock time)
-----
Training for min_samples_split =  5
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.2551%
Accuracy with +- 1 time window(s): 87.7496%
Accuracy with +- 2 time window(s): 89.5448%
Execution took: 0:04:20 secs (Wall clock time)
-----
Training for min_samples_split =  10
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 72.3045%
Accuracy with +- 1 time window(s): 87.2390%
Accuracy with +- 2 time window(s): 89.0391%
Execution took: 0:04:13 secs (Wall clock time)
-----


### Min Sample Leaf

In [20]:
min_samples_leaf = [1, 2, 4]
for i in min_samples_leaf:
    start_time = time.time()
    print("Training for min_samples_leaf = ", i)
    model_rf = RandomForestClassifier(verbose=0, n_jobs=-1, max_depth=100, n_estimators=25, min_samples_leaf=i)
    model_rf.fit(X_train, y_train)
    print("Predicting...")
    y_pred_rf = model_rf.predict(X_test)
#     y_pred_proba_rf = model_rf.predict_proba(X_test)

    print("Calculating accuracy...")
    calc_accuracy_windows(2, y_test, y_pred_rf)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
    print(msg)
    print("-----")


Training for min_samples_leaf =  1
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.4013%
Accuracy with +- 1 time window(s): 87.8180%
Accuracy with +- 2 time window(s): 89.6644%
Execution took: 0:04:23 secs (Wall clock time)
-----
Training for min_samples_leaf =  2
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 71.5882%
Accuracy with +- 1 time window(s): 86.7669%
Accuracy with +- 2 time window(s): 88.5813%
Execution took: 0:04:08 secs (Wall clock time)
-----
Training for min_samples_leaf =  4
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 69.2795%
Accuracy with +- 1 time window(s): 85.4237%
Accuracy with +- 2 time window(s): 87.3183%
Execution took: 0:04:04 secs (Wall clock time)
-----


### Bootstrap

In [21]:
bootstrap = [True, False]
for i in bootstrap:
    start_time = time.time()
    print("Training for bootstrap = ", i)
    model_rf = RandomForestClassifier(verbose=0, n_jobs=-1, max_depth=100, n_estimators=25, bootstrap=i)
    model_rf.fit(X_train, y_train)
    print("Predicting...")
    y_pred_rf = model_rf.predict(X_test)
#     y_pred_proba_rf = model_rf.predict_proba(X_test)

    print("Calculating accuracy...")
    calc_accuracy_windows(2, y_test, y_pred_rf)

    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=round(elapsed_time_secs))
    print(msg)
    print("-----")

Training for bootstrap =  True
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.4590%
Accuracy with +- 1 time window(s): 87.8690%
Accuracy with +- 2 time window(s): 89.7166%
Execution took: 0:04:24 secs (Wall clock time)
-----
Training for bootstrap =  False
Predicting...
Calculating accuracy...
Accuracy with +- 0 time window(s): 73.6011%
Accuracy with +- 1 time window(s): 88.0793%
Accuracy with +- 2 time window(s): 90.0047%
Execution took: 0:05:47 secs (Wall clock time)
-----


## Automated Hyperparameter tuning with RandCV

In [32]:
data_dict = np.load('data/data_dict_14windows_SMOTEno_MSAno.npz')

In [9]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 250, num = 3)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(50, 130, num = 4)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Class weight
class_weight =['balanced', 'balanced_subsample', None]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'class_weight': class_weight}

print(random_grid)

{'n_estimators': [50, 150, 250], 'max_features': ['auto', 'sqrt'], 'max_depth': [50, 76, 103, 130], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False], 'class_weight': ['balanced', 'balanced_subsample', None]}


### Saving best results 

In [None]:
joblib.dump(rf_random.best_estimator_ , 'rf_random_model.pkl.z')
joblib.dump(rf_random.best_params_  , 'rf_random_params.pkl')
joblib.dump(rf_random.cv_results_  , 'rf_random_results.pkl')

## Visualizing overall accuracy 

In [4]:
def calc_accuracy_windows(max_windows, y_test, y_pred):

    # Initialize array to hold counts for each window
    count_arr = np.zeros(max_windows)

    # For each class window, if predicted class is in window, increment count 
    # E.g. if predicted class = 4 and target class = 6, since max window allowed = 2, consider instance as accurate and increment count 
    for idx, value in enumerate(y_test):
        for window in np.arange(1,max_windows+1): 
             # window_arr calculates window that predicted value can fall into 
             # e.g. target value = 4, window = 2, window_arr = {2,3,4,5,6}
             window_arr = np.arange(value - window, value + window + 1)
             if (y_pred[idx] in window_arr):
                count_arr[window-1] += 1

    # Print accuracy for each time window     
    accuracy_list = []
    print(f"Accuracy with +- 0 time window(s): {accuracy_score(y_test, y_pred)*100:.4f}%")
    accuracy_list.append(accuracy_score(y_test, y_pred))
    for idx, count in enumerate(count_arr):
        print(f"Accuracy with +- {idx+1} time window(s): {(count/len(y_pred))*100:.4f}%")   
        accuracy_list.append(count/len(y_pred))
    
    return accuracy_list

## Visualizing accuracy and classification results for individual results 

In [31]:
from sklearn.metrics import classification_report

target_names = ['0D', '0D - 1D 8am', '1D 8am - 1D 10.30am', '1D 10.30am - 1D 3pm',
               '1D 3pm - 2D', '2D - 2D 10.30am', '2D 10.30am - 2D 4.30pm',
               '2D 4.30pm - 2D 6.30pm', '2D 6.30pm - 3D', '3D - 3D 4.30pm',
               '3D 4.30pm - 3D 6.30pm', '3D 6.30pm - 4D', '4D - 5D', '5D - 6D']
print(classification_report(y_test, y_pred_rf, target_names=target_names))

                        precision    recall  f1-score   support

                    0D       0.88      0.68      0.77     24483
           0D - 1D 8am       0.79      0.52      0.63      1834
   1D 8am - 1D 10.30am       0.80      0.48      0.60    111575
   1D 10.30am - 1D 3pm       0.67      0.86      0.76    358761
           1D 3pm - 2D       0.56      0.27      0.36    101900
       2D - 2D 10.30am       0.84      0.40      0.54    135671
2D 10.30am - 2D 4.30pm       0.71      0.88      0.79    533489
 2D 4.30pm - 2D 6.30pm       0.68      0.13      0.22     48106
        2D 6.30pm - 3D       0.68      0.13      0.21     12266
        3D - 3D 4.30pm       0.76      0.82      0.79    423403
 3D 4.30pm - 3D 6.30pm       0.70      0.12      0.20     33862
        3D 6.30pm - 4D       0.69      0.12      0.20      9243
               4D - 5D       0.81      0.85      0.83    353810
               5D - 6D       0.85      0.73      0.79    134461

              accuracy                