In [None]:
# Reload edited project modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Move the working directory up one level — presumably to the repository root
# so that the `src` package is importable (TODO confirm).
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up one more level each time.
%cd ..

In [None]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

In [None]:
# Read tabular_data.parquet from TRANSFORMED_DATA_DIR into a DataFrame.
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')

In [None]:
# Preview the first five rows as a quick check of the loaded data.
df.head(5)

In [None]:
from datetime import datetime
from src.data_split import train_test_split

# Split df into train/test features and targets around a fixed cutoff timestamp.
# NOTE(review): train_test_split is the project's src.data_split helper, not
# sklearn's. Presumably rows before the cutoff go to train and the rest to
# test; which side the cutoff instant itself falls on is decided there — confirm.
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0 ,0),
    target_column_name='target_rides_next_hour'
)

In [None]:
# Report the dimensions of each split; the `=` specifier prints the
# expression text alongside its value.
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

In [None]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """
    Add an `average_rides_last_4_weeks` feature column.

    The new column is the mean of the four columns
    `rides_previous_{168, 336, 504, 672}_hour`, i.e. the lags that are
    exactly 1, 2, 3 and 4 weeks (in hours) back.

    Parameters
    ----------
    X : pd.DataFrame
        Frame containing the four lag columns listed above.

    Returns
    -------
    pd.DataFrame
        A copy of `X` with the extra column appended. The input frame is
        left untouched, so applying this to `X_train` / `X_test` (directly or
        through a pipeline) does not silently alter them.

    Raises
    ------
    KeyError
        If any of the four lag columns is missing from `X`.
    """
    hours_per_week = 7 * 24
    week_1, week_2, week_3, week_4 = (
        f'rides_previous_{weeks_back * hours_per_week}_hour'
        for weeks_back in range(1, 5)
    )

    # Work on a copy: assigning a column on the caller's frame would mutate it
    # in place (and can raise SettingWithCopyWarning when X is a slice).
    X_ = X.copy()
    X_['average_rides_last_4_weeks'] = 0.25 * (
        X_[week_1] + X_[week_2] + X_[week_3] + X_[week_4]
    )
    return X_

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the plain function as a scikit-learn transformer so it can be used as a
# pipeline step. With validate=False scikit-learn does no input checking, so
# the function receives the DataFrame as passed in.
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    func = average_rides_last_4_weeks,
    validate = False
)

In [None]:
# Run the transformer on the training features to inspect the new column.
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that replaces the `pickup_hour` datetime column
    with two numeric columns, `hour` and `day_of_week`.
    """

    def fit(self, X: pd.DataFrame, y=None):
        # Nothing is learned from the data; fit only returns the estimator.
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        pickup_hour = X['pickup_hour']

        # assign() returns a new frame, so the caller's X is left unchanged
        features = X.assign(
            hour=pickup_hour.dt.hour,
            day_of_week=pickup_hour.dt.dayofweek,
        )

        # The raw datetime column is not passed on to later steps
        return features.drop(columns=['pickup_hour'])

In [None]:
# Instantiate the temporal transformer and preview its output on the training features.
add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train)

In [None]:
import lightgbm as lgb

In [None]:
from sklearn.pipeline import make_pipeline

# Chain the two feature-engineering steps ahead of a LightGBM regressor
# created with its default hyperparameters. Steps run in the order listed.
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

In [None]:
# Fit every pipeline step, ending with the regressor, on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Push the test features through the fitted pipeline to get predictions.
predictions = pipeline.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Mean absolute error of the test-set predictions, printed to 4 decimal places.
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')