In [1]:
import warnings

import pandas as pd
from paths import GOLD_DATA_DIR
from preprocessing import (
    split_data,
    period_avg, 
    TemporalFeaturesEngineer, 
    ColumnDropper
)
from sklearn.metrics import mean_absolute_error
from plots import plot_train_and_target
from model import get_pipeline, objective
from datetime import datetime


import optuna

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_parquet(GOLD_DATA_DIR/str(2023)/'model_data_2023.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_time,pickup_location_id,rides_next_hour
0,18.0,28.0,43.0,33.0,12.0,3.0,2.0,1.0,1.0,1.0,...,5.0,5.0,7.0,5.0,8.0,15.0,26.0,2023-01-29,4,53.0
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,...,3.0,1.0,5.0,2.0,1.0,2.0,0.0,2023-01-30,4,2.0
2,2.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,2.0,3.0,...,2.0,1.0,1.0,1.0,0.0,0.0,2.0,2023-01-31,4,0.0
3,3.0,16.0,27.0,19.0,12.0,5.0,4.0,2.0,2.0,3.0,...,1.0,1.0,0.0,2.0,1.0,2.0,4.0,2023-01-29,7,5.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,...,1.0,1.0,3.0,1.0,1.0,2.0,1.0,2023-01-30,7,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-09-30,109,0.0
7484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-09-29,115,0.0
7485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-09-30,115,0.0
7486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023-09-29,172,0.0


In [3]:
X_train, y_train, X_test, y_test = split_data(df,
                                              'pickup_time',
                                              datetime(2023, 10, 30, 0, 0, 0),
                                              'rides_next_hour')

In [4]:
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(5683, 674)
y_train.shape=(5683,)
X_test.shape=(1805, 674)
y_test.shape=(1805,)


In [5]:
# Instantiate a study of the best parameters
study = optuna.create_study(direction="minimize")

# Optimize the objective function, find the best parameters
study.optimize(lambda trial: objective(trial, X_train, y_train, get_pipeline, mean_absolute_error, 5), n_trials=10)

[I 2024-08-01 20:38:38,847] A new study created in memory with name: no-name-cff68674-7295-4796-bcfd-20af01f07d3a
[I 2024-08-01 20:38:56,293] Trial 0 finished with value: 3.0707165991947827 and parameters: {'num_leaves': 99, 'feature_fraction': 0.9294121700337195, 'bagging_fraction': 0.5333781157239745, 'min_child_samples': 8}. Best is trial 0 with value: 3.0707165991947827.
[I 2024-08-01 20:39:03,404] Trial 1 finished with value: 3.3859349419883493 and parameters: {'num_leaves': 243, 'feature_fraction': 0.9200786468140287, 'bagging_fraction': 0.3629221535575261, 'min_child_samples': 76}. Best is trial 0 with value: 3.0707165991947827.
[I 2024-08-01 20:39:09,964] Trial 2 finished with value: 3.3569419766062594 and parameters: {'num_leaves': 176, 'feature_fraction': 0.8573649031114565, 'bagging_fraction': 0.30472018156927216, 'min_child_samples': 88}. Best is trial 0 with value: 3.0707165991947827.
[I 2024-08-01 20:39:34,885] Trial 3 finished with value: 3.2600879332811417 and parameter

In [6]:
# Retrieve the best parameters
best_params = study.best_params

# Print the best parameters
print(f"Best params: {best_params}")

Best params: {'num_leaves': 99, 'feature_fraction': 0.9294121700337195, 'bagging_fraction': 0.5333781157239745, 'min_child_samples': 8}


In [7]:
# train the model
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

In [8]:
# Predict the test set
predictions = pipeline.predict(X_test)

# Evaluate the model
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=: .4f}')

test_mae= 3.2439
