In [12]:
import warnings
warnings.filterwarnings("ignore")

# import libraries
from datetime import datetime
import pandas as pd
import optuna
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from src.paths import TRANSFORMED_DATA_DIR
from src.data_split import train_test_split
from src.model import get_pipeline

In [13]:
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29 00:00:00,1,0.0
1,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 01:00:00,1,0.0
2,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 02:00:00,1,0.0
3,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 03:00:00,1,0.0
4,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 04:00:00,1,0.0


In [14]:
df.pickup_hour.max()

Timestamp('2022-12-31 22:00:00')

In [3]:
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    cutoff_date=datetime(2022, 8, 1, 0, 0),
    target_col_name="target_rides_next_hour",
)

# print shapes of train and test data
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1170240, 674)
y_train shape: (1170240,)
X_test shape: (972815, 674)
y_test shape: (972815,)


In [9]:
def create_objective(X_train, y_train):
    def objective(trial: optuna.trial.Trial) -> float:
        """
        Train a model with a set of hyperparameters and compute and validation error based on a TimeSeriesSplit.
        """
        # define hyperparameters to tune
        hyperparams = {
            "metric": "mae",
            "verbose": -1,
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
            "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),    
        }

        tss = TimeSeriesSplit(n_splits=2)
        scores = []
        for train_index, val_index in tss.split(X_train):
            X_train_, X_val_ = X_train.iloc[train_index], X_train.iloc[val_index]
            y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

            # create pipeline
            pipeline = get_pipeline(**hyperparams)
            pipeline.fit(X_train_, y_train_)

            # evaluate the model
            y_pred = pipeline.predict(X_val_)
            mae = mean_absolute_error(y_val_, y_pred)
            scores.append(mae)

        # return the mean scores
        return np.array(scores).mean()

    return objective

In [10]:
study = optuna.create_study(direction="minimize")
study.optimize(create_objective(X_train, y_train), n_trials=5)

[I 2023-09-30 06:21:05,219] A new study created in memory with name: no-name-c89fb76f-faed-4479-80d0-fbff3f1dc200




[I 2023-09-30 06:22:42,951] Trial 0 finished with value: 3.072244159382656 and parameters: {'num_leaves': 94, 'feature_fraction': 0.3862504312836544, 'bagging_fraction': 0.8293642062420745, 'min_child_samples': 72}. Best is trial 0 with value: 3.072244159382656.




[I 2023-09-30 06:24:48,789] Trial 1 finished with value: 3.0837980014182467 and parameters: {'num_leaves': 102, 'feature_fraction': 0.9238433765735197, 'bagging_fraction': 0.6948025161038978, 'min_child_samples': 92}. Best is trial 0 with value: 3.072244159382656.




[I 2023-09-30 06:27:03,201] Trial 2 finished with value: 3.093753199729105 and parameters: {'num_leaves': 205, 'feature_fraction': 0.5329351850285253, 'bagging_fraction': 0.5544851277021376, 'min_child_samples': 35}. Best is trial 0 with value: 3.072244159382656.




[I 2023-09-30 06:29:24,019] Trial 3 finished with value: 3.0898218126032306 and parameters: {'num_leaves': 251, 'feature_fraction': 0.7099724755493637, 'bagging_fraction': 0.2745847262263166, 'min_child_samples': 55}. Best is trial 0 with value: 3.072244159382656.




[I 2023-09-30 06:31:46,805] Trial 4 finished with value: 3.0652738682202063 and parameters: {'num_leaves': 229, 'feature_fraction': 0.6187251349221514, 'bagging_fraction': 0.47045698322260165, 'min_child_samples': 85}. Best is trial 4 with value: 3.0652738682202063.


In [11]:
best_params = study.best_trial.params
print(f'{best_params=}')

pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

best_params={'num_leaves': 229, 'feature_fraction': 0.6187251349221514, 'bagging_fraction': 0.47045698322260165, 'min_child_samples': 85}
test_mae=2.5144
