In [3]:
%load_ext autoreload
%autoreload 2
import os
import sys
from datetime import datetime

# Put the parent of the current working directory on sys.path BEFORE the
# `src.*` imports below. Those imports are the ones that need it, so doing
# this afterwards has no effect in a fresh kernel.
# NOTE(review): presumably the parent directory is the project root that
# contains `src/` - confirm against the repository layout.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
if _parent_dir not in sys.path:  # avoid stacking duplicates on cell re-runs
    sys.path.append(_parent_dir)

import mlflow
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from mlflow.models.signature import infer_signature
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA

from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking  # Assuming this sets up MLflow

# Read the tabular dataset from the transformed-data directory
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

# Split into train/test features and targets around the cutoff date
# (how the boundary itself is treated is decided by split_time_series_data)
cutoff = datetime(2023, 9, 1, 0, 0, 0)
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=cutoff, target_column="target"
)

# Report the shape of each piece, one per line
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

# Give both target series a DatetimeIndex when they do not already have one.
# NOTE(review): these are synthetic daily labels starting 2022-01-01, not
# real timestamps. With roughly 87.6k rows they already reach the year 2261,
# close to pandas' Timestamp upper bound (2262) - confirm whether a real
# timestamp column should be used instead.
if not isinstance(y_train.index, pd.DatetimeIndex):
    y_train.index = pd.date_range(start="2022-01-01", periods=len(y_train), freq="D")
if not isinstance(y_test.index, pd.DatetimeIndex):
    # Continue one day after the last training label
    test_start = y_train.index[-1] + pd.Timedelta(days=1)
    y_test.index = pd.date_range(start=test_start, periods=len(y_test), freq="D")

# Fit an ARMA(p, q) model, expressed as ARIMA with d=0 (no differencing)
p, q = 2, 2
arma_model = ARIMA(y_train, order=(p, 0, q))
fitted_arma = arma_model.fit()

# Forecast one value per test observation
forecast_steps = y_test.shape[0]
predictions = fitted_arma.forecast(forecast_steps)

# Score the forecast with Mean Absolute Error (MAE)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"Test MAE: {test_mae:.4f}")

# Load environment variables, then configure MLflow tracking.
# `mlflow` is rebound to whatever set_mlflow_tracking() returns.
load_dotenv()
mlflow = set_mlflow_tracking()

# Custom function to log ARMA model to MLflow
def log_arma_to_mlflow(model, experiment_name, metric_name, score, forecast_steps):
    """Log a fitted statsmodels ARMA/ARIMA result, its order and one metric to MLflow.

    Args:
        model: Fitted statsmodels results object that provides
            ``forecast(steps=...)`` and ``model.order``, e.g. the value
            returned by ``ARIMA(...).fit()``.
        experiment_name: Name of the MLflow experiment the run is recorded
            under.
        metric_name: Name under which ``score`` is logged.
        score: Metric value to log.
        forecast_steps: Number of out-of-sample steps used to build the
            example input/output pair for the model signature.
    """
    # Read (p, d, q) from the fitted model itself so the logged parameters
    # always describe the model being stored, not unrelated module globals.
    ar_order, diff_order, ma_order = model.model.order

    # Route the run to the requested experiment; previously this argument was
    # ignored and the run went to whichever experiment was currently active.
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run():
        # Log parameters
        mlflow.log_param("p", ar_order)
        mlflow.log_param("d", diff_order)
        mlflow.log_param("q", ma_order)

        # Log metric
        mlflow.log_metric(metric_name, score)

        # Placeholder input for the signature: one row per forecast step,
        # shape (forecast_steps, 1). The model takes a step count, not
        # feature rows, so this only documents the output length.
        dummy_input = np.arange(forecast_steps).reshape(-1, 1)
        example_output = model.forecast(steps=forecast_steps)

        # Infer signature with dummy input and example forecast
        signature = infer_signature(dummy_input, example_output)

        # Log the model using statsmodels flavor
        mlflow.statsmodels.log_model(model, "model", signature=signature)

# Record the fitted ARMA model and its test MAE in MLflow
log_arma_to_mlflow(
    model=fitted_arma,
    experiment_name="ARMA",
    metric_name="mean_absolute_error",
    score=test_mae,
    forecast_steps=forecast_steps,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
(55900, 674)
(55900,)
(31720, 674)
(31720,)


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
INFO:src.experiment_utils:MLflow tracking URI and credentials set.


Test MAE: 18.3437
🏃 View run burly-elk-702 at: https://dagshub.com/s3akash/USTAXIMODEL.mlflow/#/experiments/6/runs/19cf0d0b107d4a3aa524d0aaa07cb290
🧪 View experiment at: https://dagshub.com/s3akash/USTAXIMODEL.mlflow/#/experiments/6
