In [None]:
# This is a model selection notebook for predicting delivery times using Lasso, Ridge, and SVR (sigmoid kernel)

In [None]:
""""
Some code attribution:
1. Raschka, S. (2015). Python Machine Learning. Birmingham, England: Packt Publishing.
2. Lopez de Prado, M. (2018). Advances in financial machine learning. Nashville, TN: John Wiley & Sons.
3. Hull, J.C. (2019). Machine Learning in Business.

"""

In [26]:
# imports
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import time
from tabulate import tabulate
from sklearn.preprocessing import RobustScaler
from sklearn import linear_model, decomposition
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

In [9]:
# load data
# The folder defaults to the original hard-coded location; set the
# DOORDASH_DATA_FOLDER environment variable to point the notebook at the CSV
# on another machine without editing this cell.
DATA_FOLDER = os.environ.get(
    'DOORDASH_DATA_FOLDER',
    'C:/Users/bscot/OneDrive/Documents/DoorDash/DoorDash2',
)
data = pd.read_csv(os.path.join(DATA_FOLDER, 'historical_imputed_1000.csv'))

# partition data: every column except the target is treated as a feature
TARGET_COLUMN = 'total delivery duration seconds'
data_ind = data.drop(TARGET_COLUMN, axis=1)
# double brackets keep the target as a single-column DataFrame, not a Series
data_dep = data[[TARGET_COLUMN]]

# seeded 67/33 hold-out split
# NOTE(review): the model cells visible in this notebook cross-validate on the
# full data_ind/data_dep and do not use these four frames -- confirm they are
# still needed.
X_train, X_test, y_train, y_test = train_test_split(data_ind, data_dep, test_size=0.33, random_state=50)
print('X dataframe is: ',data_ind.shape)
print('y dataframe is: ',data_dep.shape)

X dataframe is:  (1000, 90)
y dataframe is:  (1000, 1)


In [4]:
# make a scorer to only punish positive errors
# the objective function for delivery prediction is asymmetric
def mean_delinquency(y_predict, y_true):
    """Return the mean of the positive part of ``y_predict - y_true``.

    Negative differences are floored at zero but still count in the
    denominator, so only positive differences add to the result.

    Both inputs are flattened to 1-D float arrays before subtracting. This
    makes a column vector (or single-column DataFrame) line up element-wise
    with a flat array instead of broadcasting to an (n, n) matrix, and it
    guarantees a scalar result rather than a per-column pandas Series.

    NOTE(review): scikit-learn's ``make_scorer`` calls the wrapped function
    positionally as ``score_func(y_true, y_pred)``. When this function is used
    through a scorer, the first parameter therefore receives the observed
    values and the second the predictions, i.e. the penalty is applied where
    the observed value exceeds the prediction. The parameter names are kept
    as-is for compatibility; confirm which direction is intended.

    Parameters
    ----------
    y_predict : array-like
        Minuend of the difference.
    y_true : array-like
        Subtrahend of the difference; must hold the same number of elements.

    Returns
    -------
    float
        Mean of ``max(y_predict - y_true, 0)`` over all elements.
    """
    minuend = np.asarray(y_predict, dtype=float).ravel()
    subtrahend = np.asarray(y_true, dtype=float).ravel()
    exclude_early = np.subtract(minuend, subtrahend)
    # limit early reward to zero; keep those rows in N so the mean is over all samples
    return float(np.maximum(exclude_early, 0.0).mean())

# Wrap the loss as a scikit-learn scorer; greater_is_better=False declares it
# a loss (lower is better).
mean_delinquency_scorer = make_scorer(
    score_func=mean_delinquency,
    greater_is_better=False,
)

In [12]:
t0_lasso = time()
# --- Lasso: RobustScaler -> PCA -> Lasso, tuned with nested cross validation ---
sc = RobustScaler()   # scaling step
pca = PCA()           # component-reduction step
lasso = Lasso()       # regression step

pipeline_lasso = Pipeline(steps=[
    ('sc', sc),
    ('pca', pca),
    ('lasso', lasso),
])

# Search grid: number of retained components x Lasso alpha.
# (An exhaustive alternative would be every component count from 1 to data_ind.shape[1].)
parameters_lasso = {
    'pca__n_components': [3, 6, 15, 35, 50, 70, 90],
    'lasso__alpha': [a / 2 for a in (0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1)],
}

# Inner loop: 2-fold grid search chooses the hyper-parameters.
optimize_lasso = GridSearchCV(
    pipeline_lasso,
    parameters_lasso,
    scoring='neg_root_mean_squared_error',
    cv=2,
)
# Fit once on all rows so the selected hyper-parameters can be reported below.
optimize_lasso.fit(data_ind, data_dep)

# Outer loop: 5-fold CV wrapped around the grid search (5x2 nested cross validation).
# Scores are negated RMSE values, as requested by the scoring string.
scores_lasso = cross_val_score(
    optimize_lasso,
    data_ind,
    data_dep,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

best_lasso = optimize_lasso.best_params_
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores_lasso),np.std(scores_lasso)))
print('Best Alpha:', best_lasso['lasso__alpha'])
print('Best Number Of Components:', best_lasso['pca__n_components'])
print("done in %0.3fs" % (time() - t0_lasso))

CV accuracy: -937.411 +/- 89.019
Best Alpha: 0.05
Best Number Of Components: 6
done in 22.237s


In [10]:
t0_ridge = time()
# create ridge pipeline: RobustScaler -> PCA -> Ridge
#
# scaler object
sc = RobustScaler()
# pca object
pca = PCA()
# regression object
ridge = Ridge()

# Pipeline of three steps
pipeline_ridge = Pipeline(steps=[('sc', sc),
                       ('pca', pca),
                       ('ridge', ridge)])
# ridge parameter space: alpha candidates are expressed as fractions of the row count
n_rows = len(data)
parameters_ridge = {
    'pca__n_components': [3, 6, 15, 35, 50, 70, 90],
    'ridge__alpha': [frac * n_rows for frac in (0.01, 0.02, 0.03, 0.04, 0.05, 0.075, 0.1, 0.2, 0.6, 1.0)],
}
# perform 5x2 nested cross validation
# inner loop: 2-fold grid search chooses the hyper-parameters
optimize_ridge = GridSearchCV(pipeline_ridge, parameters_ridge,scoring='neg_root_mean_squared_error', cv=2)
# fit once on all rows so the selected hyper-parameters can be reported below
optimize_ridge.fit(data_ind, data_dep)
# outer loop: 5-fold CV wrapped around the grid search; scores are negated RMSE values
scores_ridge = cross_val_score(optimize_ridge,data_ind, data_dep, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
#
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores_ridge),np.std(scores_ridge)))
# Pipeline parameters are keyed '<step name>__<param>'; the step is named 'ridge',
# and the fitted search object in this cell is optimize_ridge.
print('Best Alpha:', optimize_ridge.best_estimator_.get_params()['ridge__alpha'])
print('Best Number Of Components:', optimize_ridge.best_estimator_.get_params()['pca__n_components'])

print("done in %0.3fs" % (time() - t0_ridge))

CV accuracy: -868.473 +/- 66.235
Best Alpha: 10.0
Best Number Of Components: 50
done in 29.934s


In [19]:
t0_svr = time()
# --- SVR (sigmoid kernel): RobustScaler -> PCA -> SVR, tuned with nested cross validation ---
sc = RobustScaler()            # scaling step
pca = PCA()                    # component-reduction step
svr = SVR(kernel='sigmoid')    # regression step

pipeline_svr = Pipeline(steps=[
    ('sc', sc),
    ('pca', pca),
    ('svr', svr),
])

# Search grid: number of retained components x SVR regularisation parameter C.
# (An exhaustive alternative would be every component count from 1 to data_ind.shape[1].)
parameters_svr = {
    'pca__n_components': [5, 15, 30, 45, 60, 75, 90],
    'svr__C': [1.0, 50.0, 100.0],
}

# Inner loop: 2-fold grid search chooses the hyper-parameters.
optimize_svr = GridSearchCV(
    pipeline_svr,
    parameters_svr,
    scoring='neg_root_mean_squared_error',
    cv=2,
)

# The target is passed as a flat 1-D array in this cell.
y_svr = data_dep.values.ravel()
# Fit once on all rows so the selected hyper-parameters can be reported below.
optimize_svr.fit(data_ind, y_svr)

# Outer loop: 5-fold CV wrapped around the grid search (5x2 nested cross validation).
# Scores are negated RMSE values, as requested by the scoring string.
scores_svr = cross_val_score(
    optimize_svr,
    data_ind,
    y_svr,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

best_svr = optimize_svr.best_params_
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores_svr),np.std(scores_svr)))
print('Best C:', best_svr['svr__C'])
print('Best Number Of Components:', best_svr['pca__n_components'])
print("done in %0.3fs" % (time() - t0_svr))

CV accuracy: -1001.573 +/- 69.773
Best C: 50.0
Best Number Of Components: 45
done in 13.181s


In [36]:
# Side-by-side summary of the three nested-CV runs; the NN column is a placeholder.
model_scores = (scores_lasso, scores_ridge, scores_svr)
model_starts = (t0_lasso, t0_ridge, t0_svr)

# NOTE(review): the "Time" row is time() minus each model cell's start stamp,
# evaluated when THIS cell runs. It therefore measures the time since each model
# cell started, not how long that model took to fit (compare with the
# "done in ...s" lines printed by the model cells).
table = [
    ["Mean_RMSE", *[np.mean(s) for s in model_scores], 'other wb'],
    ["Sigma_RMSE", *[np.std(s) for s in model_scores], 'other wb'],
    ["Time", *[(time() - start) for start in model_starts], 'other wb'],
]
headers = ["Metric", "Lasso","Ridge","SVR","NN"]
print(tabulate(table, headers, tablefmt="plain",numalign="right"))

Metric         Lasso     Ridge       SVR  NN
Mean_RMSE   -937.411  -868.473  -1001.57  other wb
Sigma_RMSE   89.0191   66.2353   69.7732  other wb
Time         27774.6   28263.4   25808.4  other wb
