In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from capstone.utils import read_file, get_sectors
from capstone.model_selection import overunder_error

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline

from tqdm.auto import tqdm

from warnings import filterwarnings
filterwarnings("ignore")

# Apply the seaborn "whitegrid" theme first, then override individual
# matplotlib defaults on top of it (thin lines, black axes borders).
sns.set_style("whitegrid")
plt.rcParams.update({
    "lines.linewidth": 1,
    "axes.edgecolor": "k",
})

In [2]:
# Sector identifiers from the project helper; they are used later in the
# notebook as column labels of `df` to separate targets from features.
sectors = get_sectors()
# Load the "master_df" dataset with its first column as the index.
# NOTE(review): index_col is presumably forwarded to a pandas reader — confirm
# in capstone.utils.read_file.
df = read_file("master_df", index_col=0)

In [16]:
# Walk-forward SARIMAX evaluation, one sector at a time.
#
# The data is cut into consecutive `window`-row slices. For every adjacent
# pair, the earlier slice is the training set and the later one the test set.
# The exogenous features are standardised and PCA-reduced using a pipeline
# fitted on the training slice only. Each training slice is also scored with a
# 2-fold TimeSeriesSplit via `overunder_error`; the mean CV loss and the mean
# of the test-slice forecast are stored per (test-period start, sector).

# Targets are the sector columns; every other column is an exogenous feature.
y_all = df[sectors]
X = df[df.columns[~df.columns.isin(sectors)]]

# A fractional n_components asks PCA to keep enough components to explain
# that share (80%) of the variance.
pca_pipe = make_pipeline(StandardScaler(), PCA(n_components=.8, random_state=42))
tscv = TimeSeriesSplit(n_splits=2)

order = (1, 0, 1)
# All seasonal orders are zero, so no seasonal terms are estimated and the
# period of 252 is inert.
s_order = (0, 0, 0, 252)

# Number of rows in each training slice and in each (full) test slice.
window = 126

# Both frames are filled cell by cell: rows are keyed by the smallest index
# label of the test slice, columns by sector.
sarimax_preds = pd.DataFrame()
sarimax_ouls = pd.DataFrame()

for sector in tqdm(sectors):
    y = y_all[sector]

    # The final test slice may be shorter than `window`.
    for i in range(window, len(y), window):
        X_train, X_test = X.iloc[i-window:i], X.iloc[i:i+window]
        # Slice by position with .iloc, exactly like X above. Plain `y[a:b]`
        # only happens to be positional, and relying on that is fragile.
        y_train, y_test = y.iloc[i-window:i], y.iloc[i:i+window]

        # Fit scaler + PCA on the training slice only, then project the test
        # slice with the fitted pipeline.
        # NOTE(review): these fits depend on X and i but not on the sector, so
        # they are recomputed identically for every sector and could be cached.
        X_train_pca = pca_pipe.fit_transform(X_train)
        X_test_pca = pca_pipe.transform(X_test)

        cv_ouls = []

        for train_idx, test_idx in tscv.split(y_train):
            # train_idx / test_idx are integer positions within the training
            # slice, so they must go through .iloc. `y_train[train_idx]` used
            # the deprecated positional fallback of Series.__getitem__ (and
            # would be a label lookup on an integer index).
            cvx_train, cvy_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
            cvx_test, cvy_test = X_train.iloc[test_idx], y_train.iloc[test_idx]

            # Refitting the shared pipeline here is safe: X_train_pca and
            # X_test_pca above are already materialised arrays.
            cvx_train_pca = pca_pipe.fit_transform(cvx_train)
            cvx_test_pca = pca_pipe.transform(cvx_test)

            cv_model = SARIMAX(cvy_train.values, cvx_train_pca, order=order, seasonal_order=s_order).fit()
            # The model was fitted on a plain array, so start/end are integer
            # offsets from the start of the fold's training data.
            cvy_hat = cv_model.predict(start=test_idx[0], end=test_idx[-1], exog=cvx_test_pca)

            # NOTE(review): the penalty keywords presumably weight
            # over-prediction only — confirm in capstone.model_selection.
            cv_ouls.append(overunder_error(cvy_test, cvy_hat, underpred_penalty=0, overpred_penalty=2))

        mean_oul = np.mean(cv_ouls)
        sarimax_ouls.loc[X_test.index.min(), sector] = mean_oul

        # Refit on the whole training slice and forecast the full test slice.
        model = SARIMAX(y_train.values, X_train_pca, order=order, seasonal_order=s_order).fit()

        forecast = model.get_forecast(steps=len(X_test_pca), exog=X_test_pca)
        y_hat = forecast.predicted_mean

        # Store the average forecast over the test slice as the sector's
        # prediction for that period.
        sarimax_preds.loc[X_test.index.min(), sector] = np.mean(y_hat)

  0%|          | 0/11 [00:00<?, ?it/s]

In [29]:
# Row-wise mean of the CV loss across sectors, as a one-column frame.
sarimax_mean_ouls = sarimax_ouls.mean(axis=1).to_frame(name="SARIMAX")
# For every test period, the sector (column label) with the largest mean forecast.
sarimax_best_sectors = sarimax_preds.idxmax(axis=1).to_frame(name="SARIMAX")

In [33]:
# Persist both summary frames as CSV files under data/.
for _frame, _csv_path in (
    (sarimax_mean_ouls, "data/sarimax_mean_ouls.csv"),
    (sarimax_best_sectors, "data/sarimax_best_sectors.csv"),
):
    _frame.to_csv(_csv_path)

In [32]:
# Display the per-period mean CV loss (last expression of the cell is rendered).
sarimax_mean_ouls

Unnamed: 0,SARIMAX
2007-05-23,0.046584
2007-11-20,0.089489
2008-05-22,0.099027
2008-11-19,0.108878
2009-05-22,0.156493
2009-11-19,0.112607
2010-05-24,0.104928
2010-11-19,0.123629
2011-05-23,0.093582
2011-11-18,0.096298
