In [1]:
import pandas as pd
import numpy as np
import pickle
import optuna
from prophet import Prophet
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from prophet.diagnostics import cross_validation, performance_metrics
import io
from contextlib import redirect_stdout, redirect_stderr

from utils.load_data import load_data
from utils.data_splitter import DataSplitter
from utils.model_filters import filter_for_prophet

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
optuna.logging.set_verbosity(optuna.logging.WARNING)

df = load_data('data/commodity_prices.csv')

In [2]:
date_dict = {
    'train_start': "2023-06-01", 'train_end': "2025-06-30",
    'valid_start': "2025-07-01", 'valid_end': "2025-07-31",
    'test_start': "2025-08-01", 'test_end': "2025-08-18"
}

prophet_thresholds = {
    'train': 250,
    'valid': 10,
    'test': 5
}

splitter = DataSplitter(df, date_dict, prophet_thresholds)
train_df, valid_df, test_df = splitter.run()
train_df, valid_df, test_df = filter_for_prophet(train_df, valid_df, test_df, cutoff="2025-06-15")


In [3]:

train_df = train_df.rename(columns={
    'Arrival_Date': 'ds',
    'log_Modal_Price': 'y'
})

pairs = train_df[['Product_Type', 'Market']].drop_duplicates()
results = []

for _, row in pairs.iterrows():
    product = row['Product_Type']
    market = row['Market']

    subset = train_df[(train_df['Product_Type'] == product) & (train_df['Market'] == market)]
    prophet_df = subset[['ds', 'y']]

    # Define objective function
    def objective(trial):
        model = Prophet(
            changepoint_prior_scale=trial.suggest_float('changepoint_prior_scale', 0.001, 0.5, log=True),
            seasonality_prior_scale=trial.suggest_float('seasonality_prior_scale', 1, 20, log=True),
            seasonality_mode=trial.suggest_categorical("seasonality_mode", ["additive", "multiplicative"]),
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False
        )

        # Redirect stdout/stderr to suppress CmdStanPy logs
        f = io.StringIO()
        with redirect_stdout(f), redirect_stderr(f):
            model.fit(prophet_df)

            df_cv = cross_validation(
                model,
                initial='365 days',
                period='90 days',
                horizon='30 days',
                disable_tqdm=True  # disables Prophet CV progress bar
            )

        df_perf = performance_metrics(df_cv)
        return df_perf['rmse'].mean()

    # Run Optuna optimization silently
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=30, show_progress_bar=False, n_jobs=1)

    best_params = study.best_params
    best_rmse = study.best_value

    # Fit final model silently
    f = io.StringIO()
    with redirect_stdout(f), redirect_stderr(f):
        best_model = Prophet(
            **best_params,
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False
        )
        best_model.fit(prophet_df)

    # Save model
    filename = f"models/prophet/prophet_{product}_{market}.pkl"
    with open(filename, "wb") as f_model:
        pickle.dump(best_model, f_model)

    # Save results
    results.append({
        "Product_Type": product,
        "Market": market,
        "best_params": best_params,
        "best_rmse": best_rmse
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

In [4]:
valid_df = valid_df.rename(columns ={
    'Arrival_Date': 'ds',
    'log_Modal_Price': 'y'
})

val_results = []

pairs = valid_df[['Product_Type', 'Market']].drop_duplicates()

for _, row in pairs.iterrows():
    product = row['Product_Type']
    market = row['Market']

    filename = f"models/prophet/prophet_{product}_{market}.pkl"
    try:
        with open(filename, "rb") as f:
            model = pickle.load(f)
    except FileNotFoundError:
        print(f"Model not found for {product}, {market}, skipping...")
        continue

    subset = valid_df[(valid_df['Product_Type'] == product) & (valid_df['Market'] == market)]
    
    future = model.make_future_dataframe(periods=60)
    forecast = model.predict(future)

    forecast_val = forecast[forecast['ds'].isin(subset['ds'])]
    merged = forecast_val[["ds", "yhat"]].merge(subset[["ds", "y"]], on="ds")

    rmse = root_mean_squared_error(merged["y"], merged["yhat"])
    mae = mean_absolute_error(merged["y"], merged["yhat"])
    mape = np.mean(np.abs((merged["y"] - merged["yhat"]) / merged["y"])) * 100

    val_results.append({
        "Product_Type": product,
        "Market": market,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape
    })

val_results_df = pd.DataFrame(val_results)


In [6]:
results_df.head()

Unnamed: 0,Product_Type,Market,best_params,best_rmse
0,Alsandikai|Alsandikai|FAQ,North Paravur,{'changepoint_prior_scale': 0.0160914774492605...,0.090032
1,Amaranthus|Amaranthus|FAQ,Aluva,{'changepoint_prior_scale': 0.0111509110859437...,0.12435
2,Amaranthus|Amaranthus|FAQ,Angamaly,{'changepoint_prior_scale': 0.0037172801925956...,0.098268
3,Amaranthus|Amaranthus|FAQ,Broadway market,{'changepoint_prior_scale': 0.2900761911118907...,0.065033
4,Amaranthus|Amaranthus|FAQ,Ernakulam,{'changepoint_prior_scale': 0.0096480530972131...,0.11236


In [8]:
val_results_df.head()

Unnamed: 0,Product_Type,Market,RMSE,MAE,MAPE
0,Alsandikai|Alsandikai|FAQ,North Paravur,0.358751,0.331599,3.923165
1,Amaranthus|Amaranthus|FAQ,Aluva,0.263948,0.219611,2.644459
2,Amaranthus|Amaranthus|FAQ,Angamaly,0.105153,0.10422,1.301665
3,Amaranthus|Amaranthus|FAQ,Broadway market,0.048119,0.03938,0.487904
4,Amaranthus|Amaranthus|FAQ,Ernakulam,0.155013,0.110221,1.355356
