In [1]:
# Load IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell is executed, so edits to project modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Show all output for a cell: with "all", the value of every top-level
# expression statement is displayed, not only the last one. As a consequence,
# calls whose return value is otherwise unused are echoed in the output too.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import sys
import os

# Add the parent directory to the Python path so the project-local `src`
# package can be imported from this notebook.
# NOTE(review): this assumes the kernel's working directory is a direct child
# of the project root -- confirm if the notebook is launched from elsewhere.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so that re-running this cell is idempotent and does not
# stack duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [4]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the tabular Citi Bike dataset stored as parquet under the project's
# transformed-data directory.
tabular_data_path = TRANSFORMED_DATA_DIR / "citi_bike_tabular_data.parquet"
citi_bike_df = pd.read_parquet(tabular_data_path)

In [5]:
from datetime import datetime

from src.data_utils import split_time_series_data

# Timestamp handed to the project helper as the train/test boundary.
# NOTE(review): which side of the cutoff is inclusive is decided inside
# split_time_series_data -- check the helper if that matters.
split_cutoff = datetime(year=2024, month=9, day=1)

X_train, y_train, X_test, y_test = split_time_series_data(
    citi_bike_df, cutoff_date=split_cutoff, target_column="target"
)

# Report the shape of each split, in the order: train features, train target,
# test features, test target.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(648, 674)
(648,)
(366, 674)
(366,)


In [6]:
# Collect the columns whose name starts with the lag prefix, then take the
# same column subset from both the training and the test features.
lag_columns = list(
    filter(lambda name: name.startswith("ride_count_t-"), X_train.columns)
)
X_train_lags = X_train.loc[:, lag_columns]
X_test_lags = X_test.loc[:, lag_columns]

In [7]:
import lightgbm as lgb

# Number of top-ranked lag features to keep for the reduced model.
N_TOP_FEATURES = 10

# Baseline model fitted on every lag feature; it is only used to obtain a
# feature-importance ranking.
lgb_base = lgb.LGBMRegressor()
lgb_base.fit(X_train_lags, y_train)

# Get the N_TOP_FEATURES most important features.
# The importances are indexed by the columns of the frame the model was
# actually fitted on, so labels and values cannot drift apart.
importances = lgb_base.feature_importances_
# A stable sort is requested explicitly: many features can share the same
# importance value, and the default (unstable) algorithm gives no guarantee
# about the relative order of ties, which would make the cut-off at
# N_TOP_FEATURES depend on the sorting implementation.
feature_ranks = pd.Series(importances, index=X_train_lags.columns).sort_values(
    ascending=False, kind="stable"
)
top_features = feature_ranks.head(N_TOP_FEATURES).index.tolist()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27672
[LightGBM] [Info] Number of data points in the train set: 648, number of used features: 672
[LightGBM] [Info] Start training from score 4.978395


In [8]:
# Keep only the top-ranked feature columns, applying the same selection to the
# training and the test features.
X_train_reduced, X_test_reduced = (
    split_frame[top_features] for split_frame in (X_train, X_test)
)

In [9]:
# Final model: a LightGBM regressor with default settings, trained on the
# reduced feature set only. The fit call is kept as a bare expression so the
# cell's displayed result is unchanged.
lgb_model = lgb.LGBMRegressor()
lgb_model.fit(X=X_train_reduced, y=y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 313
[LightGBM] [Info] Number of data points in the train set: 648, number of used features: 10
[LightGBM] [Info] Start training from score 4.978395


In [12]:
from sklearn.metrics import mean_absolute_error

# Score the reduced-feature model on the held-out test split using mean
# absolute error, and report it to four decimal places.
predictions = lgb_model.predict(X_test_reduced)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"MAE on reduced feature set: {test_mae:.4f}")

MAE on reduced feature set: 3.3224


In [13]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
import warnings

# Load environment variables from a .env file. The boolean result is captured
# instead of being discarded: this keeps a stray `True` out of the cell output
# and lets the notebook warn when nothing was loaded, rather than failing later
# with a less obvious tracking/authentication error.
dotenv_loaded = load_dotenv()
if not dotenv_loaded:
    warnings.warn(
        "load_dotenv() did not load any variables from a .env file; "
        "MLflow tracking settings must already be present in the environment."
    )

# Configure MLflow tracking through the project helper, then log the trained
# model together with its test MAE. X_test_reduced is passed alongside the
# model (the run log reports an inferred model signature).
mlflow = set_mlflow_tracking()
log_model_to_mlflow(
    lgb_model,
    X_test_reduced,
    "LGBM_Regressor_Top10Features",
    "mean_absolute_error",
    score=test_mae,
)

True

INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/11 05:23:15 INFO mlflow.tracking.fluent: Experiment with name 'LGBM_Regressor_Top10Features' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBM_Regressor_Top10Features
INFO:src.experiment_utils:Logged mean_absolute_error: 3.3224286930210147
INFO:src.experiment_utils:Model signature inferred.
Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/11 05:23:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 2
Created version '2' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run masked-mare-918 at: https://dagshub.com/YashMathur3010/Citi_Bike.mlflow:5000/#/experiments/4/runs/d2b6c326ec444e59a21010e83914f93e
🧪 View experiment at: https://dagshub.com/YashMathur3010/Citi_Bike.mlflow:5000/#/experiments/4


<mlflow.models.model.ModelInfo at 0x1350ca150>