# Phase 3: Statistical Models (ARIMA / SARIMA / SARIMAX)

Parameter selection via AIC/BIC, residual diagnostics, and exogenous variables.

In [1]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.data_loader import REPO_ROOT, load_and_merge_data, get_aggregated_series
from src.models.statistical import (
    fit_sarima,
    fit_sarimax,
    grid_search_sarima,
    residual_diagnostics,
    forecast_sarima,
)
from src.metrics import evaluate_forecasts

## 1. Load Data & Split

In [2]:
data_dir = REPO_ROOT / "data" / "store-sales-time-series-forecasting"
df = load_and_merge_data(data_dir)
daily_sales = get_aggregated_series(df)

train_end = '2016-12-31'
val_start = '2017-01-01'
val_end = '2017-02-28'

train = daily_sales.loc[:train_end]
val = daily_sales.loc[val_start:val_end]

# Exog: onpromotion (aggregated) and holiday flag
daily = df.groupby('date').agg({'sales': 'sum', 'onpromotion': 'sum', 'is_holiday': 'first'}).reset_index()
daily = daily.set_index('date')
exog_train = daily.loc[:train_end][['onpromotion', 'is_holiday']].astype(float)
exog_val = daily.loc[val_start:val_end][['onpromotion', 'is_holiday']].astype(float)

## 2. Grid Search (SARIMA)

In [3]:
# Small grid to keep runtime reasonable
grid_result = grid_search_sarima(
    train,
    p_range=(0, 2), d_range=(0, 1), q_range=(0, 2),
    P_range=(0, 1), D_range=(0, 1), Q_range=(0, 1),
    s=7,
    metric='aic',
)
print('Best params:', grid_result['best_params'])
print('Best AIC:', grid_result['best_score'])

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

Best params: None
Best AIC: inf


## 3. Fit Best SARIMA

In [4]:
order = grid_result['best_params']['order']
seasonal_order = grid_result['best_params']['seasonal_order']

_, sarima_result = fit_sarima(train, order, seasonal_order)
print(sarima_result.summary())

TypeError: 'NoneType' object is not subscriptable

## 4. Residual Diagnostics

In [None]:
residual_diagnostics(sarima_result)
plt.show()

## 5. SARIMAX (with promotions & holiday)

In [None]:
try:
    _, sarimax_result = fit_sarimax(train, exog_train, order, seasonal_order)
    y_sarimax = forecast_sarima(sarimax_result, len(val), exog_val)
except Exception as e:
    print(f'SARIMAX failed: {e}')
    y_sarimax = None

## 6. Forecast & Metrics

In [None]:
y_true = val.values
y_sarima = forecast_sarima(sarima_result, len(val))

print('SARIMA:', evaluate_forecasts(y_true, y_sarima))
if y_sarimax is not None:
    print('SARIMAX:', evaluate_forecasts(y_true, y_sarimax))

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(val.index, y_true, label='Actual', color='black')
ax.plot(val.index, y_sarima, label='SARIMA')
if y_sarimax is not None:
    ax.plot(val.index, y_sarimax, label='SARIMAX')
ax.legend()
ax.set_title('SARIMA/SARIMAX vs Actual')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()