In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from taxi_demand_predictor.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

from taxi_demand_predictor.data_split import train_test_split
from datetime import datetime
X_train, y_train, X_test, y_test = train_test_split(
    df, 
    target_column_name='target_rides_next_hour', 
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0))

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(40545, 674)
y_test.shape=(40545,)


In [6]:
from taxi_demand_predictor.model import get_pipeline
from sklearn.model_selection import TimeSeriesSplit, KFold
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
import optuna

def objective(trial: optuna.trial.Trial, n_splits: int = 4) -> float:
    hyperparams = {
        "metric": "mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    for train_index, val_index in tss.split(X_train):
        # Split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        # train model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        # evaluate
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    return np.array(scores).mean()

In [7]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

[I 2023-08-06 01:24:23,893] A new study created in memory with name: no-name-e65e5b42-7aca-45b7-8627-922a7560df63
[I 2023-08-06 01:24:41,474] Trial 0 finished with value: 3.112308484128409 and parameters: {'num_leaves': 121, 'feature_fraction': 0.7356102884826023, 'bagging_fraction': 0.2569512573285109, 'min_child_samples': 20}. Best is trial 0 with value: 3.112308484128409.
[I 2023-08-06 01:25:09,832] Trial 1 finished with value: 3.3703516080640017 and parameters: {'num_leaves': 175, 'feature_fraction': 0.5710355538087806, 'bagging_fraction': 0.7989623558707164, 'min_child_samples': 73}. Best is trial 0 with value: 3.112308484128409.
[I 2023-08-06 01:25:42,255] Trial 2 finished with value: 3.2270415402074164 and parameters: {'num_leaves': 210, 'feature_fraction': 0.8775540217822762, 'bagging_fraction': 0.6810932391617193, 'min_child_samples': 38}. Best is trial 0 with value: 3.112308484128409.
[I 2023-08-06 01:25:51,895] Trial 3 finished with value: 3.622255339534966 and parameters: {

In [9]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 121, 'feature_fraction': 0.7356102884826023, 'bagging_fraction': 0.2569512573285109, 'min_child_samples': 20}


In [10]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154527
[LightGBM] [Info] Number of data points in the train set: 32595, number of used features: 675
[LightGBM] [Info] Start training from score 11.571069


In [11]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.4338
