In [2]:
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the preprocessed series; the rendered frame shows a 'Date' and a 'Value' column.
df = pd.read_csv('../preprocessed-data.csv')
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,Date,Value
0,2010-01-01,388.91
1,2010-02-01,390.41
2,2010-03-01,391.37
3,2010-04-01,392.67
4,2010-05-01,393.21
...,...,...
179,2024-12-01,425.40
180,2025-01-01,426.65
181,2025-02-01,427.09
182,2025-03-01,428.15


In [4]:
def findOptimalArima(timeSeries, maxP=5, maxD=2, maxQ=5):
    """Grid-search ARIMA orders and return the one with the lowest AIC.

    Every (p, d, q) with 0 <= p <= maxP, 0 <= d <= maxD and 0 <= q <= maxQ
    is fitted to ``timeSeries``. The order whose fit reports the smallest
    finite AIC wins; ties keep the first order encountered (p varies slowest,
    q fastest). Progress, the number of skipped orders (if any), and the
    winning order/AIC are printed.

    Args:
        timeSeries: Observations handed unchanged to ``ARIMA`` for fitting.
        maxP: Largest autoregressive order to try (inclusive).
        maxD: Largest differencing order to try (inclusive).
        maxQ: Largest moving-average order to try (inclusive).

    Returns:
        The best ``(p, d, q)`` tuple, or ``None`` when no candidate produced
        a fit with a finite AIC.
    """
    # Function-scope import keeps this cell runnable on its own.
    from itertools import product

    bestAic = np.inf
    bestOrder = None
    failedOrders = 0

    print("Finding optimal ARIMA parameters...")

    candidates = product(range(maxP + 1), range(maxD + 1), range(maxQ + 1))
    for order in candidates:
        try:
            aic = ARIMA(timeSeries, order=order).fit().aic
        except Exception:
            # Best-effort search: an order that cannot be fitted is skipped.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit stop a long search.
            failedOrders += 1
            continue

        # A NaN or infinite AIC marks a degenerate fit, never a real winner.
        if np.isfinite(aic) and aic < bestAic:
            bestAic = aic
            bestOrder = order

    if failedOrders:
        print(f"Skipped {failedOrders} order(s) that failed to fit")
    print(f"Best ARIMA order: {bestOrder}")
    print(f"Best AIC: {bestAic}")
    return bestOrder

In [5]:
def evaluateModel(df, predCol):
    """Summarise the prediction error of one column against ``'Value'``.

    Rows where either ``'Value'`` or ``predCol`` is NaN are left out before
    any metric is computed.

    Args:
        df: DataFrame holding the observed ``'Value'`` column and ``predCol``.
        predCol: Name of the column with the predictions to score.

    Returns:
        A one-line string with MAE, MSE, RMSE and MAPE (percent), each to
        four decimals, or ``"No valid predictions to evaluate"`` when no row
        has both an observation and a prediction. MAPE is averaged over rows
        whose observed value is non-zero and is ``nan`` when there are none.
    """
    # remove rows with NaN predictions for evaluation
    validDf = df.dropna(subset=[predCol, 'Value'])

    if len(validDf) == 0:
        return "No valid predictions to evaluate"

    actual = validDf['Value']
    errors = actual - validDf[predCol]
    absErrors = errors.abs()

    mae = absErrors.mean()
    mse = (errors ** 2).mean()
    rmse = np.sqrt(mse)

    # A percentage error is undefined for a zero observation; dividing anyway
    # would turn the whole average into inf/nan, so those rows are excluded.
    nonZero = actual != 0
    if nonZero.any():
        mape = (absErrors[nonZero] / actual[nonZero].abs()).mean() * 100
    else:
        mape = np.nan

    return f"MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAPE: {mape:.4f}%"

In [6]:
# ...existing code...

# fixed ARIMA execution function
def execArima(steps=12, data=None, order=(5, 1, 5)):
    """Fit an ARIMA model to the 'Value' series and append a forecast.

    The input frame is copied, its 'Date' column parsed into a datetime
    index, and an ARIMA model of the given order fitted to 'Value'. The
    fitted (in-sample) values and ``steps`` forecast values are collected in
    an 'ArimaPred' column. The model summary, the evaluation line from
    ``evaluateModel`` and the full result table are printed.

    Args:
        steps: Number of future points to forecast; each forecast date is
            the last observed date plus 1..steps calendar months.
        data: DataFrame with 'Date' and 'Value' columns. When omitted, the
            notebook-level ``df`` is used.
        order: ARIMA ``(p, d, q)`` order. The default (5, 1, 5) was selected
            earlier with ``findOptimalArima``; pass
            ``order=findOptimalArima(series)`` to redo that search.

    Returns:
        DataFrame with 'Date' (formatted '%Y-%m-%d'), the input columns and
        'ArimaPred'. Historical rows come first; forecast rows follow with
        'Value' set to NaN.
    """
    source = df if data is None else data

    # prepare data with proper datetime index (on a copy: the input is untouched)
    dfCopy = source.copy()
    dfCopy['Date'] = pd.to_datetime(dfCopy['Date'])
    dfCopy = dfCopy.set_index('Date')

    timeSeries = dfCopy['Value']

    modelFit = ARIMA(timeSeries, order=order).fit()
    print(modelFit.summary())

    # Copy before editing so the series held by the fit result is not
    # modified. The first fitted value is noted as always 0 for this model,
    # so it is blanked to keep it out of plots and error metrics.
    inSamplePred = modelFit.fittedvalues.copy()
    inSamplePred.iloc[0] = np.nan

    resultDf = dfCopy.assign(ArimaPred=inSamplePred)

    # np.asarray accepts the forecast whether it comes back as a Series or an array.
    forecastValues = np.asarray(modelFit.forecast(steps=steps), dtype=float)

    # Month offsets are taken from the last observed date (not chained), so a
    # month-end start clips per month instead of drifting.
    lastDate = dfCopy.index[-1]
    futureDates = pd.DatetimeIndex(
        [lastDate + pd.DateOffset(months=offset) for offset in range(1, steps + 1)]
    )

    # Future rows have no observation yet, hence NaN in 'Value'.
    futureDf = pd.DataFrame(
        {'Value': np.nan, 'ArimaPred': forecastValues},
        index=futureDates,
    )

    # combine historical and future data
    extendedDf = pd.concat([resultDf, futureDf])

    # evaluate model on the historical rows only
    print("\nModel Evaluation:")
    evaluation = evaluateModel(resultDf.reset_index(), "ArimaPred")
    print(evaluation, "\n")

    # Name the index explicitly so reset_index always yields a 'Date' column,
    # whatever name the concatenated index ended up with.
    extendedDf = extendedDf.rename_axis('Date').reset_index()
    extendedDf['Date'] = extendedDf['Date'].dt.strftime('%Y-%m-%d')

    print(extendedDf.to_string())
    return extendedDf

# ...existing code...

In [7]:
# execute ARIMA model: fit on the loaded data and forecast 12 steps ahead
arimaResults = execArima(12)

# save results (historical rows plus forecast rows) without the positional index
outPath = "../arima-results.csv"
arimaResults.to_csv(outPath, index=False)

# bare expression so the combined table is rendered as this cell's output
arimaResults

                               SARIMAX Results                                
Dep. Variable:                  Value   No. Observations:                  184
Model:                 ARIMA(5, 1, 5)   Log Likelihood                -121.987
Date:                Thu, 29 May 2025   AIC                            265.975
Time:                        16:15:05   BIC                            301.279
Sample:                    01-01-2010   HQIC                           280.285
                         - 04-01-2025                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.7311      9.656      0.179      0.858     -17.195      20.657
ar.L2         -0.9991     26.372     -0.038      0.970     -52.687      50.689
ar.L3         -0.9991     36.018     -0.028      0.9

Unnamed: 0,Date,Value,ArimaPred
0,2010-01-01,388.91,
1,2010-02-01,390.41,388.910322
2,2010-03-01,391.37,391.396034
3,2010-04-01,392.67,391.674916
4,2010-05-01,393.21,393.125008
...,...,...,...
191,2025-12-01,,426.236910
192,2026-01-01,,427.130526
193,2026-02-01,,427.756195
194,2026-03-01,,428.593612
