In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from capstone.utils import read_file, get_sectors
from capstone.model_selection import overunder_error, sarimax_exog_cross_val_score

from statsmodels.tsa.statespace.sarimax import SARIMAX

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from tqdm.auto import tqdm

from warnings import filterwarnings
# Silence every warning category for the remainder of the session
filterwarnings(action="ignore")

# Plot styling: seaborn "whitegrid" theme, thin lines, black axes borders
sns.set_style(style="whitegrid")
plt.rcParams.update({"lines.linewidth": 1, "axes.edgecolor": "k"})

In [6]:
# Load the sector names and the master dataset through the project helpers
# (index_col=0 is forwarded to read_file — presumably the first column becomes the index)
sectors = get_sectors()
df = read_file("master_df", index_col=0)

# Sector columns are the targets; every other column is treated as a feature
is_sector_col = df.columns.isin(sectors)
targets = df[sectors]
features = df[df.columns[~is_sector_col]]

In [7]:
# Define the forecast horizon in terms of trading days per year
trading_days = 252
forecast = trading_days // 2  # half a trading year

# Shift the features down by the forecast horizon so each row holds the feature
# values from `forecast` rows earlier, then drop rows containing missing values
X_shifted = features.shift(forecast).dropna()

# Align the target data with the shifted features
y_all = targets.reindex(X_shifted.index)

# Create a pipeline for standardizing and applying PCA
# (a float n_components keeps enough components to explain 80% of the variance)
pca_pipe = make_pipeline(StandardScaler(), PCA(n_components=.8, random_state=42))

# Define the SARIMAX orders for ARIMA and seasonal components
order = (1, 0, 1)  # Returns are usually stationary
s_order = (0, 0, 0, 252)  # Returns do not usually exhibit seasonality (P=D=Q=0)

# The feature windows do not depend on the sector, so fit the PCA pipeline once
# per window here instead of once per (sector, window) pair inside the loop below.
# Each window trains on the previous `forecast` rows and tests on the next
# `forecast` rows (the last test window may be shorter).
windows = []
for i in range(forecast, len(X_shifted), forecast):
    X_train = X_shifted.iloc[i - forecast:i]
    X_test = X_shifted.iloc[i:i + forecast]

    # Fit on the training window only, then project the test window
    X_train_pca = pca_pipe.fit_transform(X_train)
    X_test_pca = pca_pipe.transform(X_test)

    windows.append((i, X_train, X_test.index.min(), X_train_pca, X_test_pca))

# Collect results as {sector: {window_start: value}} and build each DataFrame
# once at the end, rather than enlarging an empty DataFrame one cell at a time
oul_records = {}
pred_records = {}

# Loop through each sector
for sector in tqdm(sectors):
    # Extract the target variable for the current sector
    y = y_all[sector]

    sector_ouls = {}
    sector_preds = {}

    for i, X_train, test_start, X_train_pca, X_test_pca in windows:
        # Explicit positional slice, consistent with the feature windows above
        y_train = y.iloc[i - forecast:i]

        # Perform time-series cross-validation on the training window and
        # calculate the mean over-under loss
        mean_oul = np.mean(
            sarimax_exog_cross_val_score(
                X_train,
                y_train,
                order=order,
                seasonal_order=s_order,
                pca=pca_pipe,
                cv=2,
                scorer=overunder_error,
                overpred_penalty=2,
                underpred_penalty=0
            )
        )

        # Store the mean over-under loss score, keyed by the first test label
        sector_ouls[test_start] = mean_oul

        # Fit the SARIMAX model to the training data (disp=False silences the
        # optimizer's convergence printout; it does not change the estimates)
        model = SARIMAX(
            y_train.values, X_train_pca, order=order, seasonal_order=s_order
        ).fit(disp=False)

        # Generate forecasts for the testing data
        forecast_results = model.get_forecast(steps=len(X_test_pca), exog=X_test_pca)
        y_hat = forecast_results.predicted_mean

        # Store the mean forecasted value
        sector_preds[test_start] = np.mean(y_hat)

    oul_records[sector] = sector_ouls
    pred_records[sector] = sector_preds

# Rows are forecast-window start labels, columns are sectors
sarimax_ouls = pd.DataFrame(oul_records)
sarimax_preds = pd.DataFrame(pred_records)

  0%|          | 0/11 [00:00<?, ?it/s]

In [8]:
# Average the over-under loss across sectors for each forecast window
sarimax_mean_ouls = sarimax_ouls.mean(axis=1).to_frame("SARIMAX")

# For each forecast window, record the sector with the highest mean prediction
sarimax_best_sectors = sarimax_preds.idxmax(axis=1).to_frame("SARIMAX")

In [9]:
# Write both result tables to CSV files under data/
for result_frame, csv_path in (
    (sarimax_mean_ouls, "data/sarimax_mean_ouls.csv"),
    (sarimax_best_sectors, "data/sarimax_best_sectors.csv"),
):
    result_frame.to_csv(csv_path)

In [10]:
# Print the signature and docstring of the project's cross-validation helper for reference
help(sarimax_exog_cross_val_score)

Help on function sarimax_exog_cross_val_score in module capstone.model_selection:

sarimax_exog_cross_val_score(X: pandas.core.frame.DataFrame, y: pandas.core.series.Series, order: Tuple[int, int, int], seasonal_order: Tuple[int, int, int, int], pca: sklearn.pipeline.Pipeline, cv: int, scorer: Callable, **scorer_kwargs: Optional[Any]) -> List[float]
    Perform time-series cross-validation using a SARIMAX model with exogenous variables and PCA.
    
    Parameters:
        - X (DataFrame): Feature matrix.
        - y (Series): Target variable.
        - order (Tuple[int, int, int]): The (p,d,q) order of the model for the number of AR, differences, and MA parameters.
        - seasonal_order (Tuple[int, int, int, int]): The (P,D,Q,S) order of the seasonal component.
        - pca (Pipeline): PCA pipeline for dimensionality reduction.
        - cv (int): Number of folds in TimeSeriesSplit.
        - scorer (Callable): Custom scoring function.
        - **scorer_kwargs (Optional[Any]): Ad