-Import all necessary libraries and data

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [7]:
data = pd.read_csv('/content/LargeOrderLevelDataV2.csv')

-Reformat all date/time features to exclude the date and track time as seconds past midnight rather than HH:MM:SS

In [8]:
fix_dates = ['business_timestamp_order_taken', 'order_completed_timestamp', 'plan_ahead_requested_timestamp']
            
for x in fix_dates:
  st_times = []
  fl_times = []
  if x in data:
    for t in data[x]:
      if type(t) != str:
        t = "0.0"
        st_times.append(t)
        fl_times.append(t)
        continue
      st_times.append(t)
      nt = t[11: 19]
      nt = nt.replace(":", "")
      hr_s = nt[0: 2]
      min_s = nt[2: 4]
      sec = nt[4:]
      ts = float(hr_s) * 3600 + float(min_s) * 60 + float(sec)
      fl_times.append(str(ts))

    for k in range(len(st_times)):
      data[x] = data[x].replace([st_times[k]], float(fl_times[k]))

-Data cleaning: 
-convert features tracked in minutes to be tracked in seconds instead
-use numbers to represent labels for discrete features


In [9]:
fix = []
fix_mins = ['make_time', 'bake_time', 'rack_time', 'otd_time']

del data['store_order_number']
del data['make_time_v2']

data = data[data['make_time'].notna()]

for x in data:
  af = bool(True)
  for y in data[x]:
    if af == bool(False):
      break
    if type(y) != float:
      af = bool(False)
      fix.append(x)    

for a in fix:
  options = {}
  n = 0
  for x in data[a]:
    if not x in options.values():
      options[n] = x
      n += 1
  if len(options) == 1:
    del data[a]

  for i in range(len(options)):
    data[a] = data[a].replace([options[i]], float(i))

data = data.fillna(0.0)

for c in fix_mins:
  data[c] = data[c].apply(lambda x: x*60)

ctime = data['make_time']
del data['make_time']

-Train a random forest model on the data using hyperparameters found from grid search with 5-fold cross validation and check its accuracy. 

-Hyperparameters are currently hardcoded from a past grid search since grid search takes so long

In [12]:
train_data, test_data, train_time, test_time = train_test_split(data, ctime, test_size = 0.1, )
model = RandomForestRegressor(criterion = 'squared_error', max_depth = 8, min_samples_leaf = 5, n_estimators = 140)
model.fit(train_data, train_time)

guesses = model.predict(test_data)
if test_time.size == guesses.size:
  total_err = 0.0
  i = 0
  for x in test_time:
    total_err += abs(x - guesses[i])
    i += 1
  avg_err = total_err / i
  print(f"average error: {avg_err} seconds")
else:
  print("uh-oh")

average error: 45.44074049467927 seconds


-Check loss value of the model for overfitting

In [13]:
mean_squared_log_error(test_time, guesses)

0.16865098510987278

-Optimize hyperparameters with gridsearch

-This grid search was limited by time constraints. Even after eliminating some hyperparameters and possible values, this took about 4 hours

In [None]:
param_grid = {
    'n_estimators': [100, 110, 120, 130, 140, 150],
    'criterion': ["squared_error", "poisson"],#, "absolute_error"],
    'max_depth': [4, 5, 6, 7, 8],
    #'min_samples_split': [],
    'min_samples_leaf': [5, 10, 15, 20, 25],
    #'min_weight_fraction_leaf': [],
    #'max_features': [],
    #'max_leaf_nodes': [],
    #'min_impurity_decrease': [],
    #'bootstrap': [],
    #'oob_score': [],
    #'max_samples': [],
}

model = RandomForestRegressor()
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv = 5)
grid_search.fit(data, ctime)
grid_search.best_params_