In [10]:
# forecasting_sales_prophet.py

import os
import pickle
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_error

# === CONFIGURATION ===
INPUT_PATH = "../outputs/cleaned_sales.csv"
MODEL_DIR = "output/models"
FORECAST_OUTPUT = "output/sept_nov_forecast.csv"
ACCURACY_OUTPUT = "output/validation_accuracy_Jun_Jul_Aug.csv"
FORECAST_HORIZON_WEEKS = 13
VALIDATION_START = pd.to_datetime("2024-06-01")
VALIDATION_END = pd.to_datetime("2024-08-31")

# === PREPARE OUTPUT DIRECTORIES ===
os.makedirs("output", exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# === LOAD AND PREPARE DATA ===
df = pd.read_csv(INPUT_PATH)
df = df.rename(columns={"SerailNum": "SerialNum"})
df['weekend_date'] = pd.to_datetime(df['weekend_date'], format='mixed', dayfirst=True)
serials = df['SerialNum'].unique()

forecast_results = []
accuracy_results = []

def train_and_forecast_for_serial(serial, data):
    print(f"\nProcessing SerialNum {serial}...")

    data = data[['weekend_date', 'quantity']].rename(columns={'weekend_date': 'ds', 'quantity': 'y'}).dropna()
    data = data.sort_values('ds')

    if len(data) < 2:
        print(f"Skipping SerialNum {serial}: Not enough total data.")
        return

    train = data[data['ds'] < VALIDATION_START]
    valid = data[(data['ds'] >= VALIDATION_START) & (data['ds'] <= VALIDATION_END)]

    if len(train) < 2:
        print(f"Skipping SerialNum {serial}: Not enough training data.")
        return

    # === TRAINING ===
    model = Prophet(weekly_seasonality=True, yearly_seasonality=True, growth='linear')
    train['floor'] = 0
    model.fit(train)

    # === SAVE MODEL ===
    model_filename = os.path.join(MODEL_DIR, f"prophet_model_serial_{serial}.pkl")
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to {model_filename}")

    # === VALIDATION FORECAST ===
    future_valid = model.make_future_dataframe(periods=len(valid), freq='W')
    future_valid['floor'] = 0
    forecast_valid = model.predict(future_valid)[['ds', 'yhat']].tail(len(valid)).reset_index(drop=True)
    forecast_valid['yhat'] = forecast_valid['yhat'].clip(lower=0)
    forecast_valid['actual'] = valid['y'].values
    forecast_valid['SerialNum'] = serial
    forecast_valid['month'] = forecast_valid['ds'].dt.month

    monthly_accuracy = forecast_valid.groupby('month').apply(
        lambda g: 1 - (abs(g['yhat'] - g['actual']).sum() / g['actual'].sum())
    ).reset_index()
    monthly_accuracy.columns = ['month', 'accuracy']
    monthly_accuracy['SerialNum'] = serial
    accuracy_results.append(monthly_accuracy)

    # === FUTURE FORECAST ===
    future = model.make_future_dataframe(periods=FORECAST_HORIZON_WEEKS, freq='W')
    future['floor'] = 0
    forecast_future = model.predict(future)[['ds', 'yhat']].tail(FORECAST_HORIZON_WEEKS)
    forecast_future['yhat'] = forecast_future['yhat'].clip(lower=0)
    forecast_future['SerialNum'] = serial
    forecast_results.append(forecast_future)

# === PROCESS EACH SERIAL NUMBER ===
for serial in serials:
    serial_df = df[df['SerialNum'] == serial]
    train_and_forecast_for_serial(serial, serial_df)

# === SAVE OUTPUTS ===
pd.concat(forecast_results).to_csv(FORECAST_OUTPUT, index=False)
pd.concat(accuracy_results).to_csv(ACCURACY_OUTPUT, index=False)

# === SUMMARY ===
print("\n✅ Forecasting complete.")
print(f"- Forecasts saved to: {FORECAST_OUTPUT}")
print(f"- Accuracy results saved to: {ACCURACY_OUTPUT}")
print(f"- Trained models saved in: {MODEL_DIR}")
print(f"- Total SerialNums processed: {len(serials)}")
print(f"- Successfully trained models: {len(forecast_results)}")
if len(forecast_results) < len(serials):
    print(f"- Skipped SerialNums due to insufficient data: {len(serials) - len(forecast_results)}")

# === OPTIONAL: LOADER FUNCTION ===
def load_prophet_model(serial_num):
    """Load a saved Prophet model for a specific SerialNum."""
    model_path = os.path.join(MODEL_DIR, f"prophet_model_serial_{serial_num}.pkl")
    with open(model_path, 'rb') as f:
        return pickle.load(f)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['floor'] = 0
14:13:11 - cmdstanpy - INFO - Chain [1] start processing
14:13:11 - cmdstanpy - INFO - Chain [1] done processing
14:13:11 - cmdstanpy - INFO - Chain [1] done processing



Processing SerialNum 1...
Model saved to output/models\prophet_model_serial_1.pkl


  monthly_accuracy = forecast_valid.groupby('month').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['floor'] = 0
14:13:11 - cmdstanpy - INFO - Chain [1] start processing
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['floor'] = 0
14:13:11 - cmdstanpy - INFO - Chain [1] start processing



Processing SerialNum 2...


14:13:11 - cmdstanpy - INFO - Chain [1] done processing
  monthly_accuracy = forecast_valid.groupby('month').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['floor'] = 0
14:13:11 - cmdstanpy - INFO - Chain [1] start processing
  monthly_accuracy = forecast_valid.groupby('month').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['floor'] = 0
14:13:11 - cmdstanpy - INFO - Chain [1] start processing


Model saved to output/models\prophet_model_serial_2.pkl

Processing SerialNum 3...
Skipping SerialNum 3: Not enough training data.

Processing SerialNum 4...


14:13:11 - cmdstanpy - INFO - Chain [1] done processing
  monthly_accuracy = forecast_valid.groupby('month').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['floor'] = 0
14:13:11 - cmdstanpy - INFO - Chain [1] start processing
  monthly_accuracy = forecast_valid.groupby('month').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['floor'] = 0
14:13:11 - cmdstanpy - INFO - Chain [1] start processing


Model saved to output/models\prophet_model_serial_4.pkl

Processing SerialNum 5...


14:13:11 - cmdstanpy - INFO - Chain [1] done processing


Model saved to output/models\prophet_model_serial_5.pkl

✅ Forecasting complete.
- Forecasts saved to: output/sept_nov_forecast.csv
- Accuracy results saved to: output/validation_accuracy_Jun_Jul_Aug.csv
- Trained models saved in: output/models
- Total SerialNums processed: 5
- Successfully trained models: 4
- Skipped SerialNums due to insufficient data: 1


  monthly_accuracy = forecast_valid.groupby('month').apply(
