In [1]:
import pandas as pd
import numpy as np
import pickle
import optuna
from prophet import Prophet
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from prophet.diagnostics import cross_validation, performance_metrics
import io
from contextlib import redirect_stdout, redirect_stderr

from utils.load_data import load_data
from utils.data_splitter import DataSplitter

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
optuna.logging.set_verbosity(optuna.logging.WARNING)

df = load_data('data/commodity_prices.csv')

In [2]:
date_dict = {
    'train_start': "2023-06-01", 'train_end': "2025-06-30",
    'valid_start': "2025-07-01", 'valid_end': "2025-07-31",
    'test_start': "2025-08-01", 'test_end': "2025-08-18"
}

thresholds = {
    'train': 100,
    'valid': 10,
    'test': 5
}

splitter = DataSplitter(df, date_dict, thresholds)
train_df, valid_df, test_df = splitter.run()

In [None]:

train_df = train_df.rename(columns={
    'Arrival_Date': 'ds',
    'log_Modal_Price': 'y'
})

pairs = train_df[['Product_Type', 'Market']].drop_duplicates()
results = []

for _, row in pairs.iterrows():
    product = row['Product_Type']
    market = row['Market']

    subset = train_df[(train_df['Product_Type'] == product) & (train_df['Market'] == market)]
    prophet_df = subset[['ds', 'y']]

    # Define objective function
    def objective(trial):
        model = Prophet(
            changepoint_prior_scale=trial.suggest_float('changepoint_prior_scale', 0.001, 0.5, log=True),
            seasonality_prior_scale=trial.suggest_float('seasonality_prior_scale', 1, 20, log=True),
            seasonality_mode=trial.suggest_categorical("seasonality_mode", ["additive", "multiplicative"]),
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False
        )

        # Redirect stdout/stderr to suppress CmdStanPy logs
        f = io.StringIO()
        with redirect_stdout(f), redirect_stderr(f):
            model.fit(prophet_df)

            df_cv = cross_validation(
                model,
                initial='100 days',
                period='90 days',
                horizon='30 days',
                disable_tqdm=True  # disables Prophet CV progress bar
            )

        df_perf = performance_metrics(df_cv)
        return df_perf['rmse'].mean()

    # Run Optuna optimization silently
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=30, show_progress_bar=False, n_jobs=1)

    best_params = study.best_params
    best_rmse = study.best_value

    # Fit final model silently
    f = io.StringIO()
    with redirect_stdout(f), redirect_stderr(f):
        best_model = Prophet(
            **best_params,
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False
        )
        best_model.fit(prophet_df)

    # Save model
    filename = f"models/prophet/prophet_{product}_{market}.pkl"
    with open(filename, "wb") as f_model:
        pickle.dump(best_model, f_model)

    # Save results
    results.append({
        "Product_Type": product,
        "Market": market,
        "best_params": best_params,
        "best_rmse": best_rmse
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

In [None]:
valid_df = valid_df.rename(columns ={
    'Arrival_Date': 'ds',
    'log_Modal_Price': 'y'
})

val_results = []

pairs = valid_df[['Product_Type', 'Market']].drop_duplicates()

for _, row in pairs.iterrows():
    product = row['Product_Type']
    market = row['Market']

    filename = f"models/prophet/prophet_{product}_{market}.pkl"
    try:
        with open(filename, "rb") as f:
            model = pickle.load(f)
    except FileNotFoundError:
        print(f"Model not found for {product}, {market}, skipping...")
        continue

    subset = valid_df[(valid_df['Product_Type'] == product) & (valid_df['Market'] == market)]
    
    future = model.make_future_dataframe(periods=30)
    forecast = model.predict(future)

    forecast_val = forecast[forecast['ds'].isin(subset['ds'])]
    merged = forecast_val[["ds", "yhat"]].merge(subset[["ds", "y"]], on="ds")

    rmse = root_mean_squared_error(merged["y"], merged["yhat"])
    mae = mean_absolute_error(merged["y"], merged["yhat"])
    mape = np.mean(np.abs((merged["y"] - merged["yhat"]) / merged["y"])) * 100

    val_results.append({
        "Product_Type": product,
        "Market": market,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape
    })

val_results_df = pd.DataFrame(val_results)


In [3]:
train_df = train_df.rename(columns ={
    'Arrival_Date': 'ds',
    'log_Modal_Price': 'y'
})

pairs = train_df[['Product_Type', 'Market']].drop_duplicates()

results = []

for _, row in pairs.iterrows():
    product = row['Product_Type']
    market = row['Market']

    subset = train_df[(train_df['Product_Type'] == product) & (train_df['Market'] == market)]
    prophet_df = subset[['ds', 'y']]

    # Define objective function inside the loop
    def objective(trial):
        model = Prophet(
            changepoint_prior_scale=trial.suggest_float('changepoint_prior_scale', 0.001, 0.5, log=True),
            seasonality_prior_scale=trial.suggest_float('seasonality_prior_scale', 1, 20, log=True),
            seasonality_mode=trial.suggest_categorical("seasonality_mode", ["additive", "multiplicative"]),
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False
        )
        model.fit(prophet_df)

        df_cv = cross_validation(
            model,
            initial='365 days',
            period='90 days',
            horizon='30 days',
            disable_tqdm=True
        )
        df_perf = performance_metrics(df_cv)
        return df_perf['rmse'].mean()

    # Run optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20, show_progress_bar=False, n_jobs=2)  # fewer trials if you have 293 pairs

    best_params = study.best_params
    best_rmse = study.best_value

    # Fit final model with best params
    best_model = Prophet(**best_params, yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
    best_model.fit(prophet_df)

    # Save model
    filename = f"models/prophet/prophet_{product}_{market}.pkl"
    with open(filename, "wb") as f:
        pickle.dump(best_model, f)

    # Save results
    results.append({
        "Product_Type": product,
        "Market": market,
        "best_params": best_params,
        "best_rmse": best_rmse
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)


15:49:36 - cmdstanpy - INFO - Chain [1] start processing
15:49:36 - cmdstanpy - INFO - Chain [1] start processing
15:49:36 - cmdstanpy - INFO - Chain [1] done processing
Seasonality has period of 365.25 days which is larger than initial window. Consider increasing initial.
15:49:36 - cmdstanpy - INFO - Chain [1] done processing
Seasonality has period of 365.25 days which is larger than initial window. Consider increasing initial.
15:49:36 - cmdstanpy - INFO - Chain [1] start processing
15:49:36 - cmdstanpy - INFO - Chain [1] start processing
15:49:36 - cmdstanpy - INFO - Chain [1] done processing
15:49:36 - cmdstanpy - INFO - Chain [1] done processing
15:49:36 - cmdstanpy - INFO - Chain [1] start processing
15:49:36 - cmdstanpy - INFO - Chain [1] start processing
15:49:36 - cmdstanpy - INFO - Chain [1] done processing
15:49:36 - cmdstanpy - INFO - Chain [1] done processing
15:49:36 - cmdstanpy - INFO - Chain [1] start processing
15:49:36 - cmdstanpy - INFO - Chain [1] start processing


KeyboardInterrupt: 

In [None]:
valid_df = valid_df.rename(columns ={
    'Arrival_Date': 'ds',
    'log_Modal_Price': 'y'
})

val_results = []

pairs = valid_df[['Product_Type', 'Market']].drop_duplicates()

for _, row in pairs.iterrows():
    product = row['Product_Type']
    market = row['Market']

    filename = f"models/prophet/prophet_{product}_{market}.pkl"
    try:
        with open(filename, "rb") as f:
            model = pickle.load(f)
    except FileNotFoundError:
        print(f"Model not found for {product}, {market}, skipping...")
        continue

    subset = valid_df[(valid_df['Product_Type'] == product) & (valid_df['Market'] == market)]
    
    future = model.make_future_dataframe(periods=30)
    forecast = model.predict(future)

    forecast_val = forecast[forecast['ds'].isin(subset['ds'])]
    merged = forecast_val[["ds", "yhat"]].merge(subset[["ds", "y"]], on="ds")

    rmse = mean_squared_error(merged["y"], merged["yhat"], squared=False)
    mae = mean_absolute_error(merged["y"], merged["yhat"])
    mape = np.mean(np.abs((merged["y"] - merged["yhat"]) / merged["y"])) * 100

    val_results.append({
        "Product_Type": product,
        "Market": market,
        "RMSE": rmse,
        "MAE": mae,
        "MAPE": mape
    })

val_results_df = pd.DataFrame(val_results)


In [None]:
train_df = train_df.rename(columns ={
    'Arrival_Date': 'ds',
    'log_Modal_Price': 'y'
})
results = []

prophet_df = train_df[['ds', 'y']]


def objective(trial):
    changepoint_prior_scale = trial.suggest_float('changepoint_prior_scale', 0.001, 0.5, log=True)
    seasonality_prior_scale = trial.suggest_float("seasonality_prior_scale", 1, 20, log=True)
    seasonality_mode = trial.suggest_categorical("seasonality_mode", ["additive", "multiplicative"])
    
    model = Prophet(
        changepoint_prior_scale=changepoint_prior_scale,
        seasonality_prior_scale=seasonality_prior_scale,
        seasonality_mode=seasonality_mode,
        yearly_seasonality=True,   
        weekly_seasonality=False, 
        daily_seasonality=False    
    )

    model.fit(prophet_df)

    df_cv = cross_validation(
        model,
        initial='365 days',
        period='90 days',
        horizon='30 days'
        )
    df_perf = performance_metrics(df_cv)

    rmse = df_perf['rmse'].mean()
    return rmse

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30, show_progress_bar=False)   # try 30 different parameter sets

print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)


[I 2025-09-03 13:09:36,426] A new study created in memory with name: no-name-3be3d6c2-5b5f-4288-ba79-b579f0a06567
13:09:36 - cmdstanpy - INFO - Chain [1] start processing
13:09:36 - cmdstanpy - INFO - Chain [1] done processing
Seasonality has period of 365.25 days which is larger than initial window. Consider increasing initial.
  0%|          | 0/2 [00:00<?, ?it/s]13:09:36 - cmdstanpy - INFO - Chain [1] start processing
13:09:36 - cmdstanpy - INFO - Chain [1] done processing
13:09:36 - cmdstanpy - INFO - Chain [1] start processing
13:09:36 - cmdstanpy - INFO - Chain [1] done processing
100%|██████████| 2/2 [00:00<00:00, 19.39it/s]
[I 2025-09-03 13:09:36,630] Trial 0 finished with value: 0.22273783778795864 and parameters: {'changepoint_prior_scale': 0.0025191400197472342, 'seasonality_prior_scale': 1.3199369038308713, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 0.22273783778795864.
13:09:36 - cmdstanpy - INFO - Chain [1] start processing
13:09:36 - cmdstanpy - INFO - 

Best parameters: {'changepoint_prior_scale': 0.012645498677468493, 'seasonality_prior_scale': 3.3708425689393136, 'seasonality_mode': 'multiplicative'}
Best RMSE: 0.09056481207493536


0.0903839449495678

In [7]:
best_params = study.best_params

best_model = Prophet(
    changepoint_prior_scale=best_params["changepoint_prior_scale"],
    seasonality_prior_scale=best_params["seasonality_prior_scale"],
    seasonality_mode=best_params["seasonality_mode"],
    yearly_seasonality=True,
    weekly_seasonality=False,
    daily_seasonality=False
)

best_model.fit(prophet_df)

12:19:18 - cmdstanpy - INFO - Chain [1] start processing
12:19:18 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x10cc0f790>

In [4]:
train_df.shape, valid_df.shape, test_df.shape, 

((140521, 9), (6639, 9), (3677, 9))

In [None]:
train_df.groupby(['Product_Type', 'Market']).size().reset_index().shape

(293, 3)

In [5]:
d = train_df.groupby(['Product_Type', 'Market']).size().reset_index()

In [7]:
d[d['Product_Type'] == 'Bitter gourd|Other|FAQ']

Unnamed: 0,Product_Type,Market,0
88,Bitter gourd|Other|FAQ,Kothamangalam,561
89,Bitter gourd|Other|FAQ,Piravam,525


In [9]:
d.iloc[146]

Product_Type    Coriander(Leaves)|Other|FAQ
Market                              Piravam
0                                       115
Name: 146, dtype: object

In [12]:
d[0].min()

np.int64(107)

In [5]:
train_df.head()

Unnamed: 0,Product_Type,Commodity,Variety_Type,Arrival_Date,Market,Is_VFPCK,Season,Year,log_Modal_Price
0,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-13,North Paravur,False,Winter,2023,8.556606
1,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-14,North Paravur,False,Winter,2023,8.732466
2,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-16,North Paravur,False,Winter,2023,8.47658
3,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-18,North Paravur,False,Winter,2023,8.160804
4,Alsandikai|Alsandikai|FAQ,Alsandikai,Alsandikai|Alsandikai,2023-12-19,North Paravur,False,Winter,2023,8.612685
