In [None]:
%load_ext autoreload
%autoreload 2
import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
#from src.config import TRANSFORMED_DATA_DIR
#from src.data_utils import split_time_series_data
#from src.experiment_utils import set_mlflow_tracking  # Assuming this sets up MLflow
from dotenv import load_dotenv
import mlflow
from mlflow.models.signature import infer_signature
from prophet import Prophet

# Make the directory above the current working directory importable
parent_dir = os.path.join(os.getcwd(), "..")
sys.path.append(os.path.abspath(parent_dir))

# Read the transformed tabular dataset from its parquet file
df = pd.read_parquet(
    "D:/Code/Git/ESDS_500_NY_CAB_TAXI/data/transformed/tabular_data.parquet"
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
def split_time_series_data(df, cutoff_date, target_column):
    """Split a DataFrame into train and test sets around a cutoff date.

    Rows whose index timestamp is strictly before ``cutoff_date`` form the
    training set; rows at or after it form the test set. The input frame is
    not modified.

    Args:
        df: Frame to split. Its index is converted with ``pd.to_datetime``,
            so it is assumed to hold timestamps; adjust if the timestamps
            live in a column instead.
        cutoff_date: First timestamp that belongs to the test set.
        target_column: Name of the column holding the value to predict.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. The feature frames
        exclude ``target_column`` and any column named ``'datetime'``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Derive the masks from the index instead of writing a helper column
    # into ``df``, so the caller's frame is left untouched.
    timestamps = pd.to_datetime(df.index)
    # Two explicit comparisons: a missing timestamp (NaT) fails both and is
    # therefore excluded from both sets.
    train_df = df[timestamps < cutoff_date]
    test_df = df[timestamps >= cutoff_date]

    # Extract the targets first so a missing target column raises KeyError.
    y_train = train_df[target_column]
    y_test = test_df[target_column]

    # 'datetime' is dropped only if present, keeping features free of a
    # raw timestamp column whether or not the input already had one.
    non_feature_columns = [target_column, 'datetime']
    X_train = train_df.drop(columns=non_feature_columns, errors="ignore")
    X_test = test_df.drop(columns=non_feature_columns, errors="ignore")
    return X_train, y_train, X_test, y_test

# Local stand-in for the project's set_mlflow_tracking helper
def set_mlflow_tracking():
    """Configure MLflow for local tracking and return the ``mlflow`` module.

    Points the tracking URI at a file store under ``./mlruns`` (change it
    when using a tracking server) and selects the experiment that
    subsequent runs are recorded under.
    """
    tracking_uri = "file:./mlruns"
    experiment = "Prophet_Experiment"
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment)
    return mlflow

# Load data
df = pd.read_parquet("D:/Code/Git/ESDS_500_NY_CAB_TAXI/data/transformed/tabular_data.parquet")

# Split data: rows before 2023-09-01 are used for training, the rest for testing
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Ensure y_train and y_test have a datetime index.
# NOTE(review): when the index is not already a DatetimeIndex, this fallback
# assigns consecutive daily dates starting 2022-01-01 as placeholders; they
# are not the real observation times -- confirm that is acceptable.
if not isinstance(y_train.index, pd.DatetimeIndex):
    y_train.index = pd.date_range(start="2022-01-01", periods=len(y_train), freq="D")
if not isinstance(y_test.index, pd.DatetimeIndex):
    y_test.index = pd.date_range(start=y_train.index[-1] + pd.Timedelta(days=1),
                                 periods=len(y_test), freq="D")

# Prepare data for Prophet (requires 'ds' and 'y' columns)
train_df = pd.DataFrame({
    'ds': y_train.index,
    'y': y_train.values
})
test_df = pd.DataFrame({
    'ds': y_test.index,
    'y': y_test.values
})

# Train the Prophet model
prophet_model = Prophet(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=True
)
prophet_model.fit(train_df)

# Predict at the test timestamps themselves. Generating `len(y_test)` daily
# steps after the last training date only lines up with y_test when the test
# rows are gap-free daily observations; using the actual 'ds' values keeps
# each prediction paired with its own target for any frequency.
# NOTE(review): assumes test_df is in chronological order so the forecast
# rows come back in the same order as y_test -- TODO confirm.
future = test_df[['ds']]
forecast = prophet_model.predict(future)

# One prediction per test row
predictions = forecast['yhat'].values

# Compute Mean Absolute Error (MAE)
test_mae = mean_absolute_error(y_test, predictions)
print(f"Test MAE: {test_mae:.4f}")

# Set up MLflow
load_dotenv()
mlflow = set_mlflow_tracking()

# Custom function to log Prophet model to MLflow
def log_prophet_to_mlflow(model, experiment_name, metric_name, score, test_df):
    """Record a fitted Prophet model, its settings and one metric in an MLflow run.

    Args:
        model: Fitted Prophet model; it is used to predict on ``test_df``
            and is stored as the run's model artifact.
        experiment_name: Currently unused; kept so existing calls keep
            working. This function does not select an experiment itself.
        metric_name: Name under which ``score`` is logged.
        score: Metric value to log.
        test_df: Frame with a ``'ds'`` column; serves as the example input
            for inferring the model signature.
    """
    seasonality_settings = (
        "yearly_seasonality",
        "weekly_seasonality",
        "daily_seasonality",
    )
    with mlflow.start_run():
        # Log the seasonality settings stored on the model itself so the run
        # cannot disagree with the model that is saved; fall back to True
        # (the value that used to be hard-coded) if an attribute is missing.
        for setting in seasonality_settings:
            mlflow.log_param(setting, getattr(model, setting, True))

        # Log metric
        mlflow.log_metric(metric_name, score)

        # Use test_df['ds'] as the example input and the model's 'yhat'
        # column as the example output for signature inference
        input_df = test_df[['ds']]
        sample_output = model.predict(input_df)['yhat']
        signature = infer_signature(input_df, sample_output)

        # Log the model
        mlflow.prophet.log_model(model, "model", signature=signature)

# Record the trained Prophet model and its test MAE in MLflow
log_prophet_to_mlflow(
    model=prophet_model,
    experiment_name="Prophet",
    metric_name="mean_absolute_error",
    score=test_mae,
    test_df=test_df,
)

NameError: name 'DATA_FILE' is not defined