In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import *
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import root_mean_squared_error
import joblib
import os

In [19]:
# Discover which datasets to model. The training loop reads
# `<name>_train.csv` and `<name>_test.csv` from this directory, so names are
# taken only from the `*_train.csv` files. Any other directory entry (hidden
# files, checkpoints, unrelated exports) is ignored instead of being turned
# into a bogus model name that would later fail to load.
TRAIN_SUFFIX = '_train.csv'
files = os.listdir('../data/out')
model_names = {f[:-len(TRAIN_SUFFIX)] for f in files if f.endswith(TRAIN_SUFFIX)}
# One row of hyperparameters per model, looked up by model name.
params = pd.read_csv('../models/hyperparameters.csv').set_index('model_name')
season = 52 # annual seasonality on weekly data

In [None]:
def _load_split(name, split):
    """Read `../data/out/{name}_{split}.csv` with `week_start` parsed to datetimes and set as the index."""
    return (
        pd.read_csv(f'../data/out/{name}_{split}.csv')
        .assign(week_start=lambda x: pd.to_datetime(x['week_start']))
        .set_index('week_start')
    )


# Create the output folder up front so a missing directory cannot make the
# run fail only after a model has already been fitted.
os.makedirs('../data/predictions', exist_ok=True)

# Fit one SARIMAX model per dataset, persist it, and score it on the test split.
# Sorted iteration keeps the printed output in the same order on every run.
for name in sorted(model_names):
    train_df = _load_split(name, 'train')
    test_df = _load_split(name, 'test')
    model_params = params.loc[name]
    # A row selected with .loc is upcast to one common dtype, so the orders may
    # come back as floats (e.g. 1.0) if the CSV has any non-integer column.
    # Cast them so SARIMAX always receives plain ints.
    order = tuple(int(v) for v in model_params[['p', 'd', 'q']])
    seasonal_order = tuple(int(v) for v in model_params[['P', 'D', 'Q']]) + (season,)
    # Create and train SARIMAX model
    print(f"training {name} model")
    model = SARIMAX(train_df["count"], order=order, seasonal_order=seasonal_order)
    model_fit = model.fit()
    joblib.dump(model_fit, f'../models/sarimax_{name}.joblib')
    # Print model summary
    print(model_fit.summary())
    # Make predictions
    print(f"Predicting {name} model")
    test_predictions = model_fit.predict(start=test_df.index[0], end=test_df.index[-1], dynamic=False)
    # Column assignment aligns the predictions to the test index by date.
    test_df['y_pred'] = test_predictions
    test_df.to_csv(f'../data/predictions/sarimax_{name}.csv')
    # Score the aligned column: the metric compares values positionally, so
    # passing the raw prediction Series would ignore any index mismatch.
    rmse = root_mean_squared_error(test_df["count"], test_df["y_pred"])
    print(f"{name} RMSE: {rmse}")