In [5]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Custom modules and functions
from capstone.model_selection import overunder_error, arimax_cross_val_score
from capstone.utils import read_file, get_sectors, set_plot_style

# SARIMAX model from statsmodels
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Machine learning and modeling tools
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Progress bar for loops
from tqdm.auto import tqdm

# Silence warnings for the whole session. Note that this is a blanket filter:
# it hides every warning category, not only the model-convergence warnings
# that motivated it.
from warnings import filterwarnings
filterwarnings("ignore")

# Plot appearance: seaborn's whitegrid theme first, then override two
# matplotlib defaults (thin lines, black axes borders) on top of it.
sns.set_style("whitegrid")
plt.rcParams.update({
    "lines.linewidth": 1,
    "axes.edgecolor": "k",
})

In [6]:
# Read the sector names and the master table (first column becomes the index)
sectors = get_sectors()
df = read_file("master_df", index_col=0)

# The sector columns are the prediction targets ...
targets = df[sectors]

# ... and every remaining column of the master table is a candidate feature
is_sector = df.columns.isin(sectors)
feature_cols = df.columns[~is_sector]
features = df[feature_cols]

In [7]:
# Forecast horizon: half of a 252-trading-day year, expressed in rows
trading_days = 252
forecast = trading_days // 2

# Lag the features by the horizon so that the row at a given date carries the
# feature values from `forecast` rows earlier, then drop every row that
# contains a missing value (this includes the first `forecast` rows).
X_shifted = features.shift(forecast).dropna()

# Keep only the target rows that still have lagged features
y_all = targets.reindex(X_shifted.index)

# Standardize the features, then keep as many principal components as are
# needed to explain 80% of the variance
pca_pipe = make_pipeline(StandardScaler(), PCA(n_components=.8, random_state=42))

# Non-seasonal ARIMA (p, d, q) order used for every ARIMAX fit; no seasonal
# order is specified. d = 0 because returns are usually stationary, so no
# differencing is applied.
order = (1, 0, 1)

# Empty frames that the walk-forward loop fills cell by cell:
# one row per test-window start, one column per sector
arimax_preds = pd.DataFrame()
arimax_ouls = pd.DataFrame()

# Walk-forward evaluation. For each sector, step through the sample in
# non-overlapping windows of `forecast` rows: fit on one window, forecast the
# next, and record both a cross-validated loss and the mean forecast.
for sector in tqdm(sectors):

    # Target series for the current sector (same index as X_shifted)
    y = y_all[sector]

    # i is the position of the first test row of each window
    for i in range(forecast, len(y), forecast):

        # Positional split: train on the `forecast` rows before i, test on the
        # rows from i onward (the last window may hold fewer than `forecast`).
        # .iloc is used for the targets as well so both splits are explicitly
        # positional.
        X_train, X_test = X_shifted.iloc[i-forecast:i], X_shifted.iloc[i:i+forecast]
        # y_test is not used below; it is kept so the held-out targets can be
        # inspected after the cell has run.
        y_train, y_test = y.iloc[i-forecast:i], y.iloc[i:i+forecast]

        # Label under which this window's results are stored
        window_start = X_test.index.min()

        # Fit scaler + PCA on the training window only, then project both
        # windows. These arrays are computed before the cross-validation call
        # below, so they do not depend on anything that call does to pca_pipe.
        X_train_pca = pca_pipe.fit_transform(X_train)
        X_test_pca = pca_pipe.transform(X_test)

        # Cross-validated over-under loss on the training window, averaged
        # across folds. NOTE(review): the helper receives the raw features and
        # the pipeline, so it presumably refits pca_pipe per fold - confirm in
        # capstone.model_selection.
        mean_oul = np.mean(
            arimax_cross_val_score(
                X_train,
                y_train,
                order=order,
                pca=pca_pipe,
                cv=2,
                scorer=overunder_error,
                overpred_penalty=2,
                underpred_penalty=0
            )
        )

        # Store the mean over-under loss score
        arimax_ouls.loc[window_start, sector] = mean_oul

        # Fit the ARIMAX model on the training window. disp=False only
        # silences the optimizer's convergence printout for each fit.
        model = SARIMAX(y_train.values, X_train_pca, order=order).fit(disp=False)

        # Forecast one step per test row, driven by the projected test features
        forecast_results = model.get_forecast(steps=len(X_test_pca), exog=X_test_pca)
        y_hat = forecast_results.predicted_mean

        # Store the average forecast over the test window
        arimax_preds.loc[window_start, sector] = np.mean(y_hat)

  0%|          | 0/11 [00:00<?, ?it/s]

In [10]:
# Average the over-under loss across sectors for each test window
arimax_mean_ouls = arimax_ouls.mean(axis=1).to_frame("ARIMAX")

# For each test window, the sector with the highest mean forecast
arimax_best_sectors = arimax_preds.idxmax(axis=1).to_frame("ARIMAX")

In [11]:
# Write both summary tables to CSV files under data/
for frame, path in (
    (arimax_mean_ouls, "data/arimax_mean_ouls.csv"),
    (arimax_best_sectors, "data/arimax_best_sectors.csv"),
):
    frame.to_csv(path)