In [7]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, RidgeCV, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [2]:
# Import sim data to df
df = pd.read_csv('../combined/miami_req_100_term_2.csv')
df['vehicles_available_perc'] = df['vehiclesAvailable'] / df['acceptanceData_ridesRequested']

In [None]:
# Relevant input parameters
#  - Solver
#  - Load Time
#  - Unload Time
#  - Number of Vehicles Available (relative to number of requests)
#  - Methodology to incorporate number of depots and distance to requests

# Relevant 'target' variables
#  - Wait time
#  - % of Late Rides 
#  - % of Time Utilized
#  - % of Time Idle
#  - % of Time Deadheading

In [10]:
# Subset df, create correlation matrix between variables

df_sub = df[[
    
    # Input parameters
    "dwellTimes_average_loadMinutes"
    ,"dwellTimes_average_unloadMinutes"
    ,"vehicles_available_perc"
#     ,"scenarioData_solver"
    
    # Target parameters
    ,"waitTimeKpi_overall_averageMinutes"
#     ,"firstMileLastMileKpi_percentage_late_rides"
#     ,"fleetUtilizationTimePercent"
#     ,"fleetIdleTimePercent"
#     ,"fleetDeadheadTimePercent"
    
    ]]

In [11]:
df_sub.head()

Unnamed: 0,dwellTimes_average_loadMinutes,dwellTimes_average_unloadMinutes,vehicles_available_perc,waitTimeKpi_overall_averageMinutes
0,1.0,1.0,0.010101,245.54
1,1.0,1.0,0.020202,49.86
2,1.0,1.0,0.030303,16.63
3,1.0,1.0,0.040404,13.71
4,1.0,1.0,0.050505,12.21


In [12]:
df_sub.corr()

Unnamed: 0,dwellTimes_average_loadMinutes,dwellTimes_average_unloadMinutes,vehicles_available_perc,waitTimeKpi_overall_averageMinutes
dwellTimes_average_loadMinutes,1.0,0.004838,0.001416,0.027676
dwellTimes_average_unloadMinutes,0.004838,1.0,-0.000954,0.027358
vehicles_available_perc,0.001416,-0.000954,1.0,-0.453813
waitTimeKpi_overall_averageMinutes,0.027676,0.027358,-0.453813,1.0


In [13]:
# Build predictive models to understand relationship between parameters and waitTime

# Split into train/test
X = df_sub.drop('waitTimeKpi_overall_averageMinutes', axis=1)
y = df_sub['waitTimeKpi_overall_averageMinutes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

# Create steps
steps = [
    ('scalar', StandardScaler()),
    ('model', Ridge())
]

# Create param grid
params = [
    {'model__alpha': [0.1, 0.3, 0.5, 0.7, 0.9],
     'model__normalize': [True, False]}
]

# Initialize pipe and pass to GridSearchCV object
pipe = Pipeline(steps)
gm_cv = GridSearchCV(pipe, params, cv=5)
gm_cv.fit(X_train, y_train)

# R^2 
r2 = gm_cv.score(X_test, y_test)

print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))

# Run best model and obtain coefficients
ridge = RidgeCV(alphas=[0.9], normalize=True, cv=10)
ridge.fit(X, y)

coef_table = pd.DataFrame(list(X.columns)).copy()
coef_table.columns = ["variable"]
coef_table.insert(len(coef_table.columns),"coefficient",ridge.coef_.transpose())
coef_table.sort_values(by="coefficient", inplace = True)

coef_table

Tuned ElasticNet Alpha: {'model__alpha': 0.5, 'model__normalize': True}
Tuned ElasticNet R squared: 0.18759907336555115


Unnamed: 0,variable,coefficient
2,vehicles_available_perc,-184.577677
1,dwellTimes_average_unloadMinutes,0.568542
0,dwellTimes_average_loadMinutes,0.586464
