# 03 — ARIMA Model Selection (AIC/BIC)
Fit ARIMA(p,d,q) per correlation series and pick orders using AIC (BIC tie-break).

In [None]:
import itertools
import logging

import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the correlation sample from the processed-data folder; the trailing
# bare expression shows its (rows, columns) shape as the cell output.
corr_df = pd.read_parquet(path="../data/processed/rolling_corr_sample.parquet")
corr_df.shape

In [None]:
# ARIMA grid + selection
def select_arima_order(y, p_vals=(0,1,2,3), d_vals=(0,1), q_vals=(0,1,2,3), max_iter=200):
    """Grid-search ARIMA(p, d, q) orders for one series and return the best by AIC.

    Every combination of ``p_vals`` x ``d_vals`` x ``q_vals`` is fitted with
    ``sm.tsa.ARIMA``. The candidate with the lowest AIC wins; AIC values within
    1e-6 of each other are treated as tied and the lower BIC decides.

    Parameters
    ----------
    y : array-like
        Observations of a single series. Missing values are dropped before
        fitting.
    p_vals, d_vals, q_vals : iterable of int
        Candidate AR, differencing and MA orders.
    max_iter : int
        Forwarded to the optimizer as ``maxiter`` through ``method_kwargs``.

    Returns
    -------
    dict or None
        ``{"p", "d", "q", "aic", "bic"}`` for the selected order, or ``None``
        when no candidate produces a finite AIC (this includes empty or
        all-missing input).
    """
    y = pd.Series(y).dropna()
    if y.empty:
        # Nothing to fit: skip the whole grid instead of failing on each order.
        return None

    log = logging.getLogger(__name__)
    best = None

    # product() walks the grid in the same order as nested p/d/q loops.
    for p, d, q in itertools.product(p_vals, d_vals, q_vals):
        try:
            res = sm.tsa.ARIMA(y, order=(p, d, q)).fit(
                method_kwargs={"maxiter": max_iter}
            )
            cand = {
                "p": p, "d": d, "q": q,
                "aic": float(res.aic),
                "bic": float(res.bic),
            }
        except Exception as exc:
            # Deliberate best-effort: one order failing to fit must not abort
            # the search, but leave a trace so failures can be inspected.
            log.debug("ARIMA(%s, %s, %s) skipped: %r", p, d, q, exc)
            continue

        if not np.isfinite(cand["aic"]):
            # A NaN/inf AIC cannot be ranked. Accepting it as `best` would make
            # every later comparison False and lock in an unusable candidate.
            log.debug("ARIMA(%s, %s, %s) skipped: non-finite AIC", p, d, q)
            continue

        if best is None:
            best = cand
        # Primary: AIC. Tie-breaker: BIC.
        elif cand["aic"] < best["aic"] - 1e-6:
            best = cand
        elif abs(cand["aic"] - best["aic"]) <= 1e-6 and cand["bic"] < best["bic"]:
            best = cand
    return best

In [None]:
# Run selection for a subset
series_cols = corr_df.columns[:50]  # change to more if you want
orders = []

for col in series_cols:
    # A successful search returns a non-empty dict; None means no order could
    # be selected, so fall back to a placeholder row and keep the series listed.
    best = select_arima_order(corr_df[col].values) or {
        "p": None, "d": None, "q": None, "aic": np.nan, "bic": np.nan
    }
    best["series"] = col
    orders.append(best)

# One row per series, lowest AIC first.
orders_df = pd.DataFrame(orders)
orders_df = orders_df.sort_values(by="aic")
orders_df.head(10)

In [None]:
# Persist the selected orders (without the DataFrame index) for later use.
orders_df.to_csv(
    path_or_buf="../data/processed/arima_orders_sample.csv",
    index=False,
)