In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import root_mean_squared_error
import joblib
import os

In [4]:
# Discover the datasets to model from the files in ../data/out.
# Each dataset is expected as a pair "<name>_train.csv" / "<name>_test.csv".
files = os.listdir('../data/out')
train_suffix, test_suffix = '_train.csv', '_test.csv'
train_names = {f[:-len(train_suffix)] for f in files if f.endswith(train_suffix)}
test_names = {f[:-len(test_suffix)] for f in files if f.endswith(test_suffix)}
# Keep only names that have BOTH splits; this also ignores stray entries
# (hidden files, sub-directories, other extensions) that would otherwise
# produce bogus model names. Sorting makes the processing order reproducible
# across kernel restarts (plain set iteration order over strings is not).
model_names = sorted(train_names & test_names)

# Per-model (p, d, q, P, D, Q) orders, one row per model name.
params = pd.read_csv('../models/hyperparameters.csv').set_index('model_name')
season = 52 # annual seasonality on weekly data

In [3]:
# Fit one SARIMAX model per dataset, persist the fitted model, and score it
# on the held-out test split.
# Create the output directories up front so a missing folder cannot abort the
# run after a (slow) model fit has already completed.
os.makedirs('../models', exist_ok=True)
os.makedirs('../data/predictions', exist_ok=True)

for name in model_names:
    # Load both splits indexed by the parsed week_start date.
    train_df = (
        pd.read_csv(f'../data/out/{name}_train.csv')
        .assign(week_start = lambda x: pd.to_datetime(x['week_start']))
        .set_index('week_start')
    )
    test_df = (
        pd.read_csv(f'../data/out/{name}_test.csv')
        .assign(week_start = lambda x: pd.to_datetime(x['week_start']))
        .set_index('week_start')
    )

    # params.loc[name] returns the whole row as a single Series, so pandas may
    # upcast the integer orders to float if the row also holds non-integer
    # columns; cast explicitly so SARIMAX always receives plain ints.
    model_params = params.loc[name]
    order = tuple(int(v) for v in model_params[['p', 'd', 'q']])
    seasonal_order = tuple(int(v) for v in model_params[['P', 'D', 'Q']]) + (season,)

    # Create and train SARIMAX model
    print(f"training {name} model")
    model = SARIMAX(train_df["count"], order=order, seasonal_order=seasonal_order)
    model_fit = model.fit()
    joblib.dump(model_fit, f'../models/sarimax_{name}.joblib')
    # Print model summary
    print(model_fit.summary())

    # Make predictions over the date span covered by the test split
    print(f"Predicting {name} model")
    test_predictions = model_fit.predict(start=test_df.index[0], end=test_df.index[-1], dynamic=False)
    # Column assignment aligns the predictions to test_df on the date index.
    test_df['y_pred'] = test_predictions
    test_df.to_csv(f'../data/predictions/sarimax_{name}.csv')
    rmse = root_mean_squared_error(test_df["count"], test_predictions)
    print(f"{name} RMSE: {rmse}")

training boston_Inspectional Services model


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                       SARIMAX Results                                       
Dep. Variable:                                 count   No. Observations:                  470
Model:             SARIMAX(2, 0, 13)x(1, 1, [1], 52)   Log Likelihood               -2345.567
Date:                               Sun, 12 May 2024   AIC                           4727.134
Time:                                       13:55:50   BIC                           4799.772
Sample:                                   12-30-2013   HQIC                          4755.849
                                        - 12-26-2022                                         
Covariance Type:                                 opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0325      0.538      0.061      0.952      -1.021       1.087
ar.L2      

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(0, 1, 0)x(1, 1, [1], 52)   Log Likelihood               -1645.544
Date:                              Sun, 12 May 2024   AIC                           3297.089
Time:                                      13:57:17   BIC                           3309.188
Sample:                                  12-30-2013   HQIC                          3301.872
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L52       0.0893      0.075      1.185      0.236      -0.058       0.237
ma.S.L52      -0.77

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(0, 1, 0)x(1, 1, [1], 52)   Log Likelihood               -2429.700
Date:                              Sun, 12 May 2024   AIC                           4865.399
Time:                                      13:58:31   BIC                           4877.499
Sample:                                  12-30-2013   HQIC                          4870.183
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L52       0.1196      0.096      1.242      0.214      -0.069       0.308
ma.S.L52      -0.72

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(0, 1, 0)x(1, 1, [1], 52)   Log Likelihood               -3125.320
Date:                              Sun, 12 May 2024   AIC                           6256.641
Time:                                      13:59:26   BIC                           6268.740
Sample:                                  12-30-2013   HQIC                          6261.424
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L52       0.2352      0.044      5.377      0.000       0.149       0.321
ma.S.L52      -1.00

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                       SARIMAX Results                                       
Dep. Variable:                                 count   No. Observations:                  470
Model:             SARIMAX(2, 0, 10)x(1, 1, [1], 52)   Log Likelihood               -2565.947
Date:                               Sun, 12 May 2024   AIC                           5161.894
Time:                                       14:16:13   BIC                           5222.426
Sample:                                   12-30-2013   HQIC                          5185.823
                                        - 12-26-2022                                         
Covariance Type:                                 opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.4373      1.068     -0.409      0.682      -2.530       1.656
ar.L2      

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(2, 0, 6)x(1, 1, [1], 52)   Log Likelihood               -2977.548
Date:                              Sun, 12 May 2024   AIC                           5977.097
Time:                                      14:28:55   BIC                           6021.487
Sample:                                  12-30-2013   HQIC                          5994.645
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0884      0.146     -0.605      0.545      -0.375       0.198
ar.L2          0.84

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                       SARIMAX Results                                       
Dep. Variable:                                 count   No. Observations:                  470
Model:             SARIMAX(2, 0, 10)x(1, 1, [1], 52)   Log Likelihood               -2277.830
Date:                               Sun, 12 May 2024   AIC                           4585.660
Time:                                       14:34:29   BIC                           4646.192
Sample:                                   12-30-2013   HQIC                          4609.590
                                        - 12-26-2022                                         
Covariance Type:                                 opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.4408      1.796      0.245      0.806      -3.079       3.961
ar.L2      

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(0, 1, 0)x(1, 1, [1], 52)   Log Likelihood               -2688.319
Date:                              Sun, 12 May 2024   AIC                           5382.638
Time:                                      14:35:00   BIC                           5394.737
Sample:                                  12-30-2013   HQIC                          5387.421
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L52      -0.3990      0.108     -3.706      0.000      -0.610      -0.188
ma.S.L52       0.00

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                       SARIMAX Results                                       
Dep. Variable:                                 count   No. Observations:                  470
Model:             SARIMAX(1, 0, 19)x(1, 1, [1], 52)   Log Likelihood               -3382.262
Date:                               Sun, 12 May 2024   AIC                           6810.524
Time:                                       14:39:38   BIC                           6903.340
Sample:                                   12-30-2013   HQIC                          6847.216
                                        - 12-26-2022                                         
Covariance Type:                                 opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9960      0.004    281.522      0.000       0.989       1.003
ma.L1      

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                       SARIMAX Results                                       
Dep. Variable:                                 count   No. Observations:                  470
Model:             SARIMAX(2, 0, 13)x(1, 1, [1], 52)   Log Likelihood               -1997.985
Date:                               Sun, 12 May 2024   AIC                           4031.971
Time:                                       14:42:46   BIC                           4104.610
Sample:                                   12-30-2013   HQIC                          4060.687
                                        - 12-26-2022                                         
Covariance Type:                                 opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.2375      1.521      0.813      0.416      -1.744       4.219
ar.L2      

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                       SARIMAX Results                                       
Dep. Variable:                                 count   No. Observations:                  470
Model:             SARIMAX(2, 0, 12)x(1, 1, [1], 52)   Log Likelihood               -1888.753
Date:                               Sun, 12 May 2024   AIC                           3811.506
Time:                                       14:46:30   BIC                           3880.109
Sample:                                   12-30-2013   HQIC                          3838.627
                                        - 12-26-2022                                         
Covariance Type:                                 opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0590      0.191      0.309      0.757      -0.315       0.433
ar.L2      

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                       SARIMAX Results                                       
Dep. Variable:                                 count   No. Observations:                  470
Model:             SARIMAX(3, 0, 28)x(1, 1, [1], 52)   Log Likelihood               -2392.801
Date:                               Sun, 12 May 2024   AIC                           4853.602
Time:                                       14:54:47   BIC                           4990.808
Sample:                                   12-30-2013   HQIC                          4907.842
                                        - 12-26-2022                                         
Covariance Type:                                 opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.8884      0.020    -44.011      0.000      -0.928      -0.849
ar.L2      

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(2, 0, 7)x(1, 1, [1], 52)   Log Likelihood               -3065.775
Date:                              Sun, 12 May 2024   AIC                           6155.549
Time:                                      14:57:13   BIC                           6203.975
Sample:                                  12-30-2013   HQIC                          6174.693
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0042      0.010     -0.408      0.683      -0.024       0.016
ar.L2          0.99

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(1, 0, 4)x(1, 1, [1], 52)   Log Likelihood               -3319.779
Date:                              Sun, 12 May 2024   AIC                           6655.557
Time:                                      14:58:39   BIC                           6687.841
Sample:                                  12-30-2013   HQIC                          6668.320
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.9929      0.006    176.386      0.000       0.982       1.004
ma.L1         -0.21

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(0, 1, 0)x(1, 1, [1], 52)   Log Likelihood               -2754.100
Date:                              Sun, 12 May 2024   AIC                           5514.200
Time:                                      14:59:04   BIC                           5526.299
Sample:                                  12-30-2013   HQIC                          5518.984
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L52       0.1330      0.098      1.363      0.173      -0.058       0.324
ma.S.L52      -0.61

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                      SARIMAX Results                                       
Dep. Variable:                                count   No. Observations:                  470
Model:             SARIMAX(0, 1, 0)x(1, 1, [1], 52)   Log Likelihood               -2834.590
Date:                              Sun, 12 May 2024   AIC                           5675.181
Time:                                      14:59:26   BIC                           5687.280
Sample:                                  12-30-2013   HQIC                          5679.964
                                       - 12-26-2022                                         
Covariance Type:                                opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.S.L52       0.3050      0.041      7.400      0.000       0.224       0.386
ma.S.L52      -0.99

In [10]:
# Getting confidence intervals from already trained models, updating prediction data.
# The prediction CSV is rewritten in place, so this cell must tolerate being
# run more than once: on a re-run the file already contains the interval
# columns, and joining them again would raise on the overlapping column names.

for name in model_names:
    model = joblib.load(f"../models/sarimax_{name}.joblib")
    pred_path = f"../data/predictions/sarimax_{name}.csv"
    pred_data = (
        pd.read_csv(pred_path)
        .assign(week_start = lambda x: pd.to_datetime(x['week_start']))
        .set_index('week_start')
    )
    # One forecast step per row of the stored predictions.
    conf_int = model.get_forecast(steps=len(pred_data)).conf_int()
    # Drop interval columns left by a previous run so the join is idempotent.
    pred_data = pred_data.drop(columns=conf_int.columns, errors='ignore')
    pred_data.join(conf_int).to_csv(pred_path)