In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [28]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

In [29]:
df["pickup_hour"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 87620 entries, 0 to 87619
Series name: pickup_hour
Non-Null Count  Dtype         
--------------  -----         
87620 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 684.7 KB


In [30]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(55900, 674)
(55900,)
(31720, 674)
(31720,)


In [31]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour", "location_id"])

add_temporal_features = TemporalFeatureEngineer()

In [33]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

In [34]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error

# Ensure "pickup_location_id" and "location_id" are removed if they exist
drop_columns = ['location_id', 'pickup_location_id']
X_train = X_train.drop(columns=[col for col in drop_columns if col in X_train.columns], errors='ignore')
X_test = X_test.drop(columns=[col for col in drop_columns if col in X_test.columns], errors='ignore')

# Ensure column consistency
X_test = X_test[X_train.columns]  

# Debugging: Check columns before fitting
print("Columns in X_train:", X_train.columns)
print("Columns in X_test:", X_test.columns)

# Define the parameter grid for LGBMRegressor
param_distributions = {
    "lgbmregressor__num_leaves": [2, 50, 70, 256],
}

# Initialize the RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)

# Fit the model
random_search.fit(X_train, y_train)

# Get best parameters
print("Best Parameters:", random_search.best_params_)
print("Best Score (Negative MAE):", random_search.best_score_)

# Evaluate on test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)


Columns in X_train: Index(['rides_t-672', 'rides_t-671', 'rides_t-670', 'rides_t-669',
       'rides_t-668', 'rides_t-667', 'rides_t-666', 'rides_t-665',
       'rides_t-664', 'rides_t-663',
       ...
       'rides_t-9', 'rides_t-8', 'rides_t-7', 'rides_t-6', 'rides_t-5',
       'rides_t-4', 'rides_t-3', 'rides_t-2', 'rides_t-1', 'pickup_hour'],
      dtype='object', length=673)
Columns in X_test: Index(['rides_t-672', 'rides_t-671', 'rides_t-670', 'rides_t-669',
       'rides_t-668', 'rides_t-667', 'rides_t-666', 'rides_t-665',
       'rides_t-664', 'rides_t-663',
       ...
       'rides_t-9', 'rides_t-8', 'rides_t-7', 'rides_t-6', 'rides_t-5',
       'rides_t-4', 'rides_t-3', 'rides_t-2', 'rides_t-1', 'pickup_hour'],
      dtype='object', length=673)
Fitting 3 folds for each of 4 candidates, totalling 12 fits




[CV] END ........................lgbmregressor__num_leaves=2; total time=   0.1s
[CV] END ........................lgbmregressor__num_leaves=2; total time=   0.2s
[CV] END ........................lgbmregressor__num_leaves=2; total time=   0.1s
[CV] END .......................lgbmregressor__num_leaves=50; total time=   0.1s
[CV] END .......................lgbmregressor__num_leaves=50; total time=   0.1s
[CV] END .......................lgbmregressor__num_leaves=50; total time=   0.1s
[CV] END .......................lgbmregressor__num_leaves=70; total time=   0.1s
[CV] END .......................lgbmregressor__num_leaves=70; total time=   0.1s
[CV] END .......................lgbmregressor__num_leaves=70; total time=   0.2s
[CV] END ......................lgbmregressor__num_leaves=256; total time=   0.1s
[CV] END ......................lgbmregressor__num_leaves=256; total time=   0.1s
[CV] END ......................lgbmregressor__num_leaves=256; total time=   0.1s


ValueError: 
All the 12 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "C:\Users\Public\Documents\Wondershare\CreatorTemp\ipykernel_26436\1703110252.py", line 13, in transform
    return X_.drop(columns=["pickup_hour", "location_id"])
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py", line 5581, in drop
    return super().drop(
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 4788, in drop
    obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 4830, in _drop_axis
    new_axis = axis.drop(labels, errors=errors)
  File "c:\Users\Sai Abheesh\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\indexes\base.py", line 7070, in drop
    raise KeyError(f"{labels[mask].tolist()} not found in axis")
KeyError: "['location_id'] not found in axis"
