In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and performance warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Data Loading

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Fetch version 1 of the dataset's CSV as a DataFrame and immediately discard
# "prev_day", a column this notebook does not use as provided.
# NOTE(review): newer kagglehub releases appear to expose this loader as
# `dataset_load`; confirm against the installed version before renaming.
df: pd.DataFrame = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "shiveshprakash/34-year-daily-stock-data/versions/1",
    "stock_data.csv",
).drop(columns=["prev_day"])

# Preview the first rows
df.head()

# Data Cleaning and Preprocessing

In [None]:
# Count the missing entries in each column
df.isna().sum()

In [None]:
# Parse the ISO-formatted (YYYY-MM-DD) strings in "dt" into datetime values
df = df.assign(dt=pd.to_datetime(df["dt"], format="%Y-%m-%d"))

# Show the resulting dtype of every column
df.dtypes

# Data Analysis

### General Plots

In [None]:
# Line chart of the S&P 500 index against the date column
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df, x="dt", y="sp500", label="S&P 500", ax=ax)
ax.set_title("S&P 500 Index Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("S&P 500 Index")
ax.legend()
plt.show()

In [None]:
# Scatter of the two indices against each other, one point per row of df
fig, ax = plt.subplots()
sns.scatterplot(data=df, x="sp500", y="djia", ax=ax)
ax.set_title("S&P 500 vs DJIA")
ax.set_xlabel("S&P 500 Index")
ax.set_ylabel("DJIA Index")
plt.show()

### Correlation Analysis

In [None]:
# Restrict to numeric columns; the datetime column cannot take part in .corr()
numeric_df = df.select_dtypes(include="number")

# Pairwise correlation between all numeric columns
corr_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Create new features

### Aggregate Rolling Features

In [None]:
# Rolling window sizes. An integer window counts rows of df, not calendar days.
windows = [7, 14, 30, 90, 365]

cols_to_aggregate = ["sp500", "sp500_volume", "djia", "djia_volume", "hsi", "vix"]
less_correlated_features = ["us3m", "joblessness", "epu"]
cols_to_aggregate_all = cols_to_aggregate + less_correlated_features

# Compute every rolling mean/std from the base columns first, keyed by the name of
# the column it will become. min_periods=1 means early rows hold statistics over a
# partial window instead of NaN.
rolling_features = {}
for window in windows:
    for col in cols_to_aggregate_all:
        roller = df[col].rolling(window=window, min_periods=1)
        rolling_features[f"{col}_mean_{window}"] = roller.mean()
        rolling_features[f"{col}_std_{window}"] = roller.std()
nr_window_features = len(rolling_features)

# Attach the new columns, then drop incomplete rows: at least the first row (the
# std of a single observation is NaN) plus any row with a missing source value.
df = df.assign(**rolling_features).dropna()

print(f"Nr of window features: {nr_window_features}")

# Preview the extended DataFrame
df.head()

### Autoencoders

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Longest look-back (in rows) for the strongly correlated columns
lag_days = 365

# Collect every shifted series in a dict and concatenate once. Inserting ~2,200
# columns one at a time into a DataFrame fragments its internal blocks and is
# dramatically slower for the same result.
# NOTE(review): shift() moves values by row position, so this presumably relies on
# df being sorted by "dt" in ascending order — confirm.
lagged_columns = {}
for col in cols_to_aggregate:
    for lag in range(1, lag_days + 1):
        lagged_columns[f"{col}_lag_{lag}"] = df[col].shift(lag)
for col in less_correlated_features:
    # The weaker features only get a handful of representative lags
    for lag in [7, 14, 30, 90]:
        lagged_columns[f"{col}_lag_{lag}"] = df[col].shift(lag)

# Temporary frame: one column per (feature, lag) pair, in insertion order
lagged_df = pd.concat(lagged_columns, axis=1)
del lagged_columns

# Drop the leading rows that lack a full lag history (shift() fills them with NaN)
lagged_df = lagged_df.dropna()

# Scale every lagged feature to [0, 1] and persist the fitted scaler
scaler = MinMaxScaler()
X_lagged = scaler.fit_transform(lagged_df.values)
joblib.dump(scaler, "lagged_scaler.pkl")  # Save the scaler

# The wide temporary frame is no longer needed once the matrix exists
del lagged_df

# Number of lagged features = autoencoder input width
input_dim = X_lagged.shape[1]

print(f"Nr of lagged features: {input_dim}")

In [None]:
import optuna
from backend.autoencoders import objective

# Search budget for the autoencoder hyperparameters
n_trials = 100


def _autoencoder_objective(trial):
    """Forward one Optuna trial to the autoencoder objective with the lagged matrix."""
    return objective(trial, X_lagged)


# Lower objective values are better for this study
study_ae = optuna.create_study(direction="minimize")
study_ae.optimize(_autoencoder_objective, n_trials=n_trials)

# Report the winning configuration
print("\nBest Autoencoder Parameters:", study_ae.best_params, sep="\n")

In [None]:
from backend.autoencoders import Autoencoder, train_predict_autoencoder

# Winning hyperparameters from the Optuna study
best_params = study_ae.best_params

# Split the tuned values into those that shape the network and those that
# control the training loop.
architecture_keys = ("encoding_dim", "hidden_dim", "dropout_rate")
training_keys = ("batch_size", "lr", "l1_penalty", "weight_decay")
architecture_args = {key: best_params[key] for key in architecture_keys}
training_args = {key: best_params[key] for key in training_keys}

# Build the network; its input width is the number of lagged features
autoencoder = Autoencoder(input_dim=X_lagged.shape[1], **architecture_args)

# Train for a fixed 150 epochs and collect the model, embeddings and final loss
trained_autoencoder, embeddings, last_mse_loss = train_predict_autoencoder(
    autoencoder,
    X_lagged,
    epochs=150,
    **training_args,
)

In [None]:
# Wrap the embeddings in a DataFrame with 1-based column names embed_1 .. embed_k
embedding_columns = [f"embed_{position}" for position in range(1, embeddings.shape[1] + 1)]
embedding_df = pd.DataFrame(embeddings, columns=embedding_columns)

### Prepare the final df to train

##### Take the right data for training

In [None]:
# Keep only the numeric columns as model inputs
training_df = df.select_dtypes(include="number")

# Re-encode "joblessness" as an ordered categorical with levels 1 < 2 < 3 < 4;
# any value outside these levels becomes NaN.
if training_df["joblessness"].dtype != "category":
    joblessness_levels = pd.Categorical(
        training_df["joblessness"], categories=[1, 2, 3, 4], ordered=True
    )
    training_df["joblessness"] = joblessness_levels

##### Scale the df

In [None]:
# Hold the categorical column aside; it is not min-max scaled
joblessness = training_df["joblessness"]

# Every remaining column gets scaled
columns_to_scale = training_df.columns.drop("joblessness")

# Fit the scaler on those columns and persist it for later reuse.
# NOTE(review): the scaler is fitted on all rows before prepare_data() builds the
# train/test sets — confirm that is intended.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(training_df[columns_to_scale])
joblib.dump(scaler, "training_df_scaler.pkl")  # Save the scaler

# Rebuild the frame from the scaled values (same column names and row index) and
# append the untouched "joblessness" column as the last column.
training_df = pd.DataFrame(
    scaled_data, columns=columns_to_scale, index=training_df.index
).assign(joblessness=joblessness)

##### Attach embeddings to the training DataFrame

In [None]:
# Attach the embeddings to the rows they were computed from.
#
# The lagged matrix lost its leading rows (those without a full lag history), so
# embedding_df is shorter than training_df and its first row belongs to a later
# row of training_df. Joining both on a fresh 0..n index would pair each row with
# an embedding built from later observations and leave the most recent rows
# without any embedding. Align on the tail instead.
# NOTE(review): assumes train_predict_autoencoder returns one embedding per row of
# X_lagged, in the same order — confirm.
n_embedded = len(embedding_df)
if n_embedded > len(training_df):
    raise ValueError(
        f"Got {n_embedded} embeddings for only {len(training_df)} training rows"
    )

# Explicit start offset (rather than iloc[-n:]) so that n_embedded == 0 selects nothing
first_embedded_row = len(training_df) - n_embedded
training_df = pd.concat(
    [
        training_df.iloc[first_embedded_row:].reset_index(drop=True),
        embedding_df.reset_index(drop=True),
    ],
    axis=1,
)

# Display the final DataFrame with embeddings
training_df.head()

In [None]:
# List every column that goes into training
print(training_df.columns.tolist())

# Train the models

In [None]:
from backend.models import train_model, evaluate_model, plot_predictions, prepare_data, objective
import optuna

# Forecast horizons handed to the data-preparation and evaluation helpers
days_to_predict = [1, 7, 14, 21, 28]

# Keep Optuna quiet: only messages at ERROR level are logged
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [None]:
# Build the train/test feature and target sets for all prediction horizons
data_splits = prepare_data(training_df, days_to_predict)
X_train, X_test, y_train, y_test = data_splits

In [None]:
from tqdm.auto import tqdm

# Bind the model objective under its own name. The autoencoder cell imports a
# different `objective` from backend.autoencoders, so the bare name would depend
# on which cell happened to run last.
from backend.models import objective as model_objective

# Number of trials
n_trials = 1000
n_warmup_steps = 25
# Fixed sampler seed so the hyperparameter search is repeatable between runs
sampler_seed = 42


def _progress_description(model_name, best_r2):
    """Build the progress-bar label; shows -∞ until a trial has completed."""
    best_text = "-∞" if best_r2 == float("-inf") else f"{best_r2:.4f}"
    return f"Optimizing {model_name} (Best R²: {best_text})"


def _best_value_so_far(study, previous_best):
    """Return the larger of `previous_best` and the study's best value.

    Optuna raises ValueError from `best_value` while no trial has completed (for
    example when the first trials were pruned or failed); in that case the
    previous best is kept instead of aborting the optimisation from a callback.
    """
    try:
        return max(previous_best, study.best_value)
    except ValueError:
        return previous_best


def _create_study():
    """Create a maximising study with a seeded TPE sampler and a median pruner."""
    return optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=sampler_seed),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=n_warmup_steps),
    )


# Initialize progress bars
tqdm_ridge = tqdm(total=n_trials, desc=_progress_description("Ridge", float("-inf")))
tqdm_svr = tqdm(total=n_trials, desc=_progress_description("SVR", float("-inf")))

# Track best R² scores
best_ridge_r2 = float("-inf")
best_svr_r2 = float("-inf")


def tqdm_callback_ridge(study, trial):
    """Update tqdm progress bar and show best R² score for Ridge."""
    global best_ridge_r2
    best_ridge_r2 = _best_value_so_far(study, best_ridge_r2)
    tqdm_ridge.set_description(_progress_description("Ridge", best_ridge_r2))
    tqdm_ridge.update(1)


def tqdm_callback_svr(study, trial):
    """Update tqdm progress bar and show best R² score for SVR."""
    global best_svr_r2
    best_svr_r2 = _best_value_so_far(study, best_svr_r2)
    tqdm_svr.set_description(_progress_description("SVR", best_svr_r2))
    tqdm_svr.update(1)


# NOTE(review): the objective receives X_test/y_test and the same split is used
# for the final evaluation in the next cell, so the reported scores are presumably
# optimistic — confirm how backend.models.objective uses them.
try:
    # Optimize Ridge
    study_ridge = _create_study()
    study_ridge.optimize(
        lambda trial: model_objective(trial, "Ridge", X_train, y_train, X_test, y_test),
        n_trials=n_trials,
        callbacks=[tqdm_callback_ridge],
    )

    # Optimize SVR
    study_svr = _create_study()
    study_svr.optimize(
        lambda trial: model_objective(trial, "SVR", X_train, y_train, X_test, y_test),
        n_trials=n_trials,
        callbacks=[tqdm_callback_svr],
    )
finally:
    # Close the progress bars even when an optimisation run is interrupted
    tqdm_ridge.close()
    tqdm_svr.close()

# Print best parameters
print("\nBest Ridge Parameters:")
print(study_ridge.best_params)

print("\nBest SVR Parameters:")
print(study_svr.best_params)

In [None]:
# Tuned hyperparameters from the two Optuna studies
best_ridge_params = study_ridge.best_params
best_svr_params = study_svr.best_params

# Model name -> keyword arguments for train_model; plain linear regression
# is trained without any extra arguments.
models = dict(
    LinearRegression={},
    Ridge=best_ridge_params,
    SVR=best_svr_params,
)

# Evaluation output per model name
all_results = {}
for model_name in models:
    params = models[model_name]
    print(f"\nTraining {model_name} with Optimized Parameters...\n{'=' * 50}")

    # Fit on the training set and predict the test set
    model, y_pred = train_model(model_name, X_train, y_train, X_test, **params)

    # Score the predictions for every horizon and keep the result
    results = evaluate_model(model_name, y_test, y_pred, days_to_predict)
    all_results[model_name] = results

    # Visual comparison of predictions and actual values
    plot_predictions(y_test, y_pred, days_to_predict, results, model_name)

# Conclusion and Future Work
...