In [1]:
# autoreload
%load_ext autoreload
%autoreload 2

from datetime import datetime
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import optuna

# set current working directory to project root
import os
os.chdir(os.path.dirname(os.getcwd()))
from src.paths import TRANSFORMED_DATA_DIR
from src import data_split
from src import model
from src import plot

In [2]:
# load dataset
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,11.0,15.0,26.0,8.0,9.0,7.0,3.0,1.0,0.0,3.0,...,11.0,7.0,4.0,3.0,4.0,9.0,19.0,2022-01-29,4,17.0
1,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,5.0,4.0,10.0,7.0,5.0,9.0,10.0,2022-01-30,4,9.0
2,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,...,8.0,7.0,8.0,5.0,5.0,10.0,0.0,2022-01-31,4,3.0
3,1.0,1.0,0.0,0.0,0.0,3.0,2.0,3.0,4.0,5.0,...,3.0,16.0,7.0,1.0,0.0,1.0,3.0,2022-02-01,4,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,1.0,2.0,...,3.0,8.0,3.0,0.0,4.0,4.0,3.0,2022-02-02,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11-26,199,0.0
80168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11-27,199,0.0
80169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11-28,199,0.0
80170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-11-29,199,0.0


In [3]:
# train test split
X_train, y_train, X_test, y_test = data_split.train_test_split(df, datetime(2022, 6, 1, 0, 0, 0), 'target_rides_next_hour')

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32226, 674)
y_train.shape=(32226,)
X_test.shape=(47946, 674)
y_test.shape=(47946,)


In [4]:
# define objective function
def objective(trial: optuna.trial.Trial) -> float:
    '''Takes in hyperparameters as input, and trains a model that computes the average validation error based on TimeSeriesSplit cross validation'''

    # define hyperparameters
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }

    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    for train_index, val_index in tss.split(X_train):
        # split data
        X_train_, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        # create model
        pipeline = model.get_pipeline(**params)

        # fit model
        pipeline.fit(X_train_, y_train_)

        # compute validation error
        y_pred = pipeline.predict(X_val)
        mae = mean_absolute_error(y_val, y_pred)

        scores.append(mae)
    
    return np.mean(scores)

In [5]:
import warnings
warnings.filterwarnings('ignore')

# optuna study
study = optuna.create_study(direction='minimize', study_name='lightgbm')
study.optimize(objective, n_trials=10)

[32m[I 2023-03-06 13:27:39,019][0m A new study created in memory with name: lightgbm[0m
[32m[I 2023-03-06 13:27:46,489][0m Trial 0 finished with value: 1.6154221889593725 and parameters: {'num_leaves': 33, 'colsample_bytree': 0.3689506953116249, 'subsample': 0.8787489345369892, 'min_child_samples': 34}. Best is trial 0 with value: 1.6154221889593725.[0m
[32m[I 2023-03-06 13:28:07,621][0m Trial 1 finished with value: 1.4602736496738657 and parameters: {'num_leaves': 178, 'colsample_bytree': 0.9001691960101477, 'subsample': 0.4412345562104535, 'min_child_samples': 86}. Best is trial 1 with value: 1.4602736496738657.[0m
[32m[I 2023-03-06 13:28:14,200][0m Trial 2 finished with value: 1.6113077514456804 and parameters: {'num_leaves': 26, 'colsample_bytree': 0.7778005818373854, 'subsample': 0.6982971355214072, 'min_child_samples': 67}. Best is trial 1 with value: 1.4602736496738657.[0m
[32m[I 2023-03-06 13:28:25,520][0m Trial 3 finished with value: 1.427971940528273 and paramet

In [6]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 220, 'colsample_bytree': 0.57894966988321, 'subsample': 0.6731226675537886, 'min_child_samples': 34}


In [7]:
# fit best params on full training set
pipeline = model.get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [8]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5062


In [9]:
plot.plot_one_sample(features=X_test, targets=y_test, predictions=pd.Series(predictions), example_id=2000)