In [1]:
# import libraries
from datetime import datetime
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error
from src.paths import TRANSFORMED_DATA_DIR
from src.data_split import train_test_split

In [2]:
# Read the pre-built tabular dataset (hourly lag features + target) from the
# project's transformed-data directory
tabular_data_path = TRANSFORMED_DATA_DIR / 'tabular_data.parquet'
df = pd.read_parquet(tabular_data_path)

# Preview the first rows as the cell output
df.head()

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29 00:00:00,1,0.0
1,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 01:00:00,1,0.0
2,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 02:00:00,1,0.0
3,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 03:00:00,1,0.0
4,1.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-01-29 04:00:00,1,0.0


In [3]:
# Split features and target at a fixed cutoff timestamp with the project helper
# (presumably rows before the cutoff form the training set; confirm in src.data_split)
cutoff_date = datetime(2022, 8, 1, 0, 0)
X_train, y_train, X_test, y_test = train_test_split(
    target_col_name='target_rides_next_hour',
    cutoff_date=cutoff_date,
    df=df,
)

# Report the shapes of the four resulting objects
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(1170240, 674)
y_train.shape=(1170240,)
X_test.shape=(972815, 674)
y_test.shape=(972815,)


In [4]:
def average_n_last_weeks(X: pd.DataFrame, n_weeks: int=4) -> pd.DataFrame:
    """
    Adds the average rides of the same hour over the last n weeks as a feature.

    The average is taken over the lag columns `rides_previous_{k}_hour` for
    k = 168, 336, ..., n_weeks * 168 (168 hours = 1 week). The input dataframe
    is left untouched; a copy carrying the extra column
    `average_rides_{n_weeks}_weeks` is returned.

    Args:
        X (pd.DataFrame): dataframe with the features; it must contain the
            weekly lag columns listed above
        n_weeks (int): number of weeks to average, must be >= 1

    Returns:
        pd.DataFrame: copy of X with the new feature column

    Raises:
        ValueError: if n_weeks is smaller than 1
        KeyError: if one of the required lag columns is missing from X
    """
    # Without this guard an empty list of lags would be averaged, which
    # silently produces an all-NaN feature instead of failing.
    if n_weeks < 1:
        raise ValueError(f'n_weeks must be >= 1, got {n_weeks}')

    hours_per_week = 7 * 24
    lag_columns = [
        f'rides_previous_{week * hours_per_week}_hour'
        for week in range(1, n_weeks + 1)
    ]

    X_ = X.copy()
    # The weekly lags are stacked as rows and averaged across them (axis=0),
    # giving one value per dataframe row.
    X_[f'average_rides_{n_weeks}_weeks'] = np.average(
        [X_[column].values for column in lag_columns],
        axis=0
    )

    return X_

In [5]:
# Wrap the plain function in a scikit-learn transformer, fixing n_weeks to 4
add_feature_average_4_last_weeks = FunctionTransformer(
    average_n_last_weeks,
    validate=False,
    kw_args={'n_weeks': 4},
)


In [6]:
class PastWeeksHourlyAverage(BaseEstimator, TransformerMixin):
    """
    Adds the average rides of the same hour over the last n weeks as a feature.

    The average is taken over the lag columns `rides_previous_{k}_hour` for
    k = 168, 336, ..., n_weeks * 168 (168 hours = 1 week) and stored in a new
    column named `average_rides_{n_weeks}_weeks`.

    Args:
        n_weeks (int): Number of weeks to average. Default is 4. Must be >= 1.
    """

    def __init__(self, n_weeks: int = 4):
        # scikit-learn convention: __init__ only stores the parameter;
        # validation is deferred to fit/transform.
        self.n_weeks = n_weeks

    def _lag_columns(self):
        """Returns the weekly lag column names after validating `n_weeks`."""
        # Without this guard an empty list of lags would be averaged, which
        # silently produces an all-NaN feature instead of failing.
        if self.n_weeks < 1:
            raise ValueError(f'n_weeks must be >= 1, got {self.n_weeks}')

        hours_per_week = 7 * 24
        return [
            f'rides_previous_{week * hours_per_week}_hour'
            for week in range(1, self.n_weeks + 1)
        ]

    def fit(self, X, y=None):
        """Learns nothing; only checks `n_weeks` so an invalid value fails early."""
        self._lag_columns()
        return self

    def transform(self, X):
        """
        Returns a copy of X with the `average_rides_{n_weeks}_weeks` column added.

        Raises:
            ValueError: if n_weeks is smaller than 1
            KeyError: if one of the required lag columns is missing from X
        """
        lag_columns = self._lag_columns()
        X_ = X.copy()

        # The weekly lags are stacked as rows and averaged across them
        # (axis=0), giving one value per dataframe row.
        X_[f'average_rides_{self.n_weeks}_weeks'] = np.average(
            [X_[column].values for column in lag_columns],
            axis=0
        )
        return X_

In [9]:
class DatetimeComponentsExtractor(BaseEstimator, TransformerMixin):
    """
    Replaces the `pickup_hour` datetime column with numeric calendar features.

    The transformed dataframe gains the columns `hour`, `day_of_week` and
    `is_weekend` (1 when the day of week is 5 or 6, else 0) and no longer
    contains `pickup_hour`.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Returns a new dataframe with the calendar features; X is not modified."""
        pickup = X['pickup_hour']
        weekday = pickup.dt.dayofweek

        return (
            X.assign(
                hour=pickup.dt.hour,
                day_of_week=weekday,
                is_weekend=weekday.isin([5, 6]).astype(int),
            )
            .drop(columns=['pickup_hour'])
        )

In [10]:
# Instantiate the two feature-engineering steps used by the pipeline
past_weeks_averager = PastWeeksHourlyAverage(n_weeks=4)
datetime_extractor = DatetimeComponentsExtractor()

In [11]:
# Feature-engineering steps followed by a LightGBM regressor with default hyper-parameters
regressor = lgb.LGBMRegressor()
pipeline = make_pipeline(datetime_extractor, past_weeks_averager, regressor)

# Fit on the training split; the fitted pipeline is the cell's displayed value
pipeline.fit(X_train, y_train)

In [12]:
# Generate predictions for the test split with the fitted pipeline
y_pred = pipeline.predict(X_test)

# Mean absolute error between the true and the predicted target values
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'{mae=:.4f}')

mae=2.6251
