In [30]:
# Step 1: pull in the project's preprocessing entry point
from preprocessing import load_and_preprocess_data

# Step 2: load and preprocess the CSV.
# The result is unpacked into three values. The first one is bound to `df`
# only briefly, because `df` is re-assigned in step 3 below.
loaded = load_and_preprocess_data(path="../data/dataset.csv")
df, monthly_data, annual_data = loaded

# Step 3: continue with an independent copy of the monthly data, so later
# column additions on `df` do not modify `monthly_data` itself.
df = monthly_data.copy()


In [31]:
import numpy as np

# Store log(1 + WERT) in a new column; unlike a plain log, log1p is defined
# when the value is exactly zero.
wert_values = df['WERT']
df['WERT_log'] = np.log1p(wert_values)


In [32]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the log-transformed series, with missing
# values removed before the series is handed to the test.
log_series = df['WERT_log'].dropna()
result = adfuller(log_series)

# The first two entries of the result are the test statistic and the p-value.
adf_stat, adf_pvalue = result[0], result[1]
print(f'ADF Statistic: {adf_stat}')
print(f'p-value: {adf_pvalue}')


ADF Statistic: -0.501687922518181
p-value: 0.8916866886816828


In [33]:
# First-difference the log series. The first row is NaN by construction
# (there is no earlier value to subtract). No .dropna() is applied before the
# assignment: assigning a Series to a column aligns it on df's index, so a
# dropped row would simply come back as NaN.
df['WERT_log_diff'] = df['WERT_log'].diff()

# Repeat the Augmented Dickey-Fuller test on the differenced series; the NaN
# row is removed here, right before the series is passed to the test.
result_diff = adfuller(df['WERT_log_diff'].dropna())
print(f'Differenced ADF Statistic: {result_diff[0]}')
print(f'Differenced p-value: {result_diff[1]}')


Differenced ADF Statistic: -5.067996287177161
Differenced p-value: 1.6252088690216988e-05


In [34]:
# Grid search for SARIMA orders: fit every (p, d, q) x (P, D, Q, S) combination
# on the log series and keep the one with the lowest AIC
# (Akaike Information Criterion).

import itertools
import warnings
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Non-seasonal orders: p, d and q each take the values 0, 1, 2
p = d = q = range(0, 3)
# Seasonal orders: P, D and Q each take the values 0, 1
P = D = Q = range(0, 2)
# Seasonal period(s) to try
S = [12]

pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(P, D, Q, S))

# Installs a process-wide "ignore" filter, so warnings stay silenced for
# every cell executed after this one as well.
warnings.filterwarnings("ignore")

best_aic = float("inf")
best_params = None

# NOTE(review): d and D are part of the grid, so AIC values of models with
# different differencing orders are compared against each other - confirm
# this is intended, or fix d/D from the stationarity tests instead.
for param in pdq:
    for seasonal_param in seasonal_pdq:
        try:
            model = SARIMAX(df['WERT_log'],
                            order=param,
                            seasonal_order=seasonal_param,
                            enforce_stationarity=False,
                            enforce_invertibility=False)
            results = model.fit(disp=False)
        except Exception:
            # Best effort: skip combinations that cannot be built or fitted.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt
            # working, so the search can still be stopped from the kernel.
            continue
        else:
            if results.aic < best_aic:
                best_aic = results.aic
                best_params = (param, seasonal_param)

# Fail with a clear message instead of a TypeError on None when nothing fitted.
if best_params is None:
    raise RuntimeError("SARIMA grid search: no parameter combination could be fitted")

print(f'Best ARIMA{best_params[0]} x {best_params[1]} - AIC:{best_aic}')



Best ARIMA(0, 1, 2) x (1, 0, 1, 12) - AIC:108.25568631169956


In [35]:
# Second, wider grid search: the non-seasonal orders now go up to 3.
# Uses `itertools` and `SARIMAX` as imported by the earlier grid-search cell.
p = d = q = range(0, 4)   # non-seasonal orders: 0 to 3
P = D = Q = range(0, 2)   # seasonal orders: 0 or 1
S = [12]  # monthly seasonality

pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(P, D, Q, S))

best_aic = float("inf")
best_params = None

# NOTE(review): d and D are part of the grid, so AIC values of models with
# different differencing orders are compared against each other - confirm
# this is intended.
for param in pdq:
    for seasonal_param in seasonal_pdq:
        try:
            model = SARIMAX(df['WERT_log'],
                            order=param,
                            seasonal_order=seasonal_param,
                            enforce_stationarity=False,
                            enforce_invertibility=False)
            results = model.fit(disp=False)
        except Exception:
            # Best effort: skip combinations that cannot be built or fitted.
            # Catching Exception (not a bare except) keeps KeyboardInterrupt
            # working, so the search can still be stopped from the kernel.
            continue
        else:
            if results.aic < best_aic:
                best_aic = results.aic
                best_params = (param, seasonal_param)

# Fail with a clear message instead of a TypeError on None when nothing fitted.
if best_params is None:
    raise RuntimeError("SARIMA grid search: no parameter combination could be fitted")

# The seasonal tuple already ends with the period, so no extra "12" suffix
# is appended after it.
print(f'Best SARIMA{best_params[0]} x {best_params[1]} - AIC:{best_aic}')


Best SARIMA(3, 1, 3) x (1, 0, 1, 12)12 - AIC:105.66588443909608


In [36]:
# Re-fit the configuration selected by the grid search so that its full
# estimation report can be inspected.
best_order, best_seasonal_order = best_params[0], best_params[1]

# Same estimation settings as used during the search.
final_spec = dict(
    order=best_order,
    seasonal_order=best_seasonal_order,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model = SARIMAX(df['WERT_log'], **final_spec)
results = model.fit()

# Print the fit report
print(results.summary())


                                      SARIMAX Results                                       
Dep. Variable:                             WERT_log   No. Observations:                  300
Model:             SARIMAX(3, 1, 3)x(1, 0, [1], 12)   Log Likelihood                 -43.833
Date:                              Wed, 09 Jul 2025   AIC                            105.666
Time:                                      15:18:54   BIC                            138.475
Sample:                                  01-01-2000   HQIC                           118.821
                                       - 12-01-2024                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.6595      0.128      5.134      0.000       0.408       0.911
ar.L2         -0.96