In [2]:
import sys
import os
import pandas as pd
from datetime import datetime
import lightgbm as lgb
from dotenv import load_dotenv
load_dotenv()
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import train_test_split #Removed
import matplotlib.pyplot as plt
# from sklearn.model_selection import cross_val_score #Removed

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow

In [3]:
def split_time_series_data_cutoff(
    df: pd.DataFrame,
    target_column: str,
    cutoff_date: str
) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    """
    Splits a time series DataFrame into training and testing sets based on a cutoff date.

    Args:
        df (pd.DataFrame): The input DataFrame containing the time series data.
        target_column (str): The name of the target column to separate from the features.
        cutoff_date (str):  Date string ("YYYY-MM-DD HH:MM:SS")

    Returns:
        Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
        - X_train (pd.DataFrame): Training features.
        - y_train (pd.Series): Training target values.
        - X_test (pd.DataFrame): Testing features.
        - y_test (pd.Series): Testing target values.
    """
    # Sort the DataFrame by date
    df_sorted = df.sort_values("pickup_hour")

    # Convert the cutoff date string to a datetime object
    cutoff_datetime = pd.to_datetime(cutoff_date)

    # Split the data into training and testing sets
    train_data = df_sorted[df_sorted["pickup_hour"] < cutoff_datetime].reset_index(drop=True)
    test_data = df_sorted[df_sorted["pickup_hour"] >= cutoff_datetime].reset_index(drop=True)

    # Separate features (X) and target (y) for both sets
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]
    X_test = test_data.drop(columns=[target_column])
    y_test = test_data[target_column]

    return X_train, y_train, X_test, y_test

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

In [4]:
# Define the cutoff date
CUTOFF_DATE = "2023-03-01 00:00:00"

# Split the data using the new function
X_train, y_train, X_test, y_test = split_time_series_data_cutoff(df, target_column="target", cutoff_date=CUTOFF_DATE)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
        f"rides_t-{7*24}", # 1 week ago
        f"rides_t-{14*24}", # 2 weeks ago
        f"rides_t-{21*24}", # 3 weeks ago
        f"rides_t-{28*24}" # 4 weeks ago
    ]

    # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X


add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)


Train set shape: (8060, 674)
Test set shape: (79560, 674)


In [None]:
class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

add_temporal_features = TemporalFeatureEngineer()

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

learning_rates = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.5, 0.65, 0.75, 0.85, 0.95, 1]

for lr in learning_rates:
    model = pipeline.set_params(lgbmregressor__learning_rate=lr)
    model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)
print(f"Test mae: {test_mae}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.048705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 134340
[LightGBM] [Info] Number of data points in the train set: 8060, number of used features: 674
[LightGBM] [Info] Start training from score 10.315261
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039815 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 134340
[LightGBM] [Info] Number of data points in the train set: 8060, number of used features: 674
[LightGBM] [Info] Start training from score 10.315261
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037554 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 134340
[LightGBM] [Info] Number of data points in the train set: 8060, number of used features: 674
[LightGBM] [Info] S