In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.fft import rfft, rfftfreq

from backend.autoencoders import Autoencoder, train_predict_autoencoder

warnings.filterwarnings("ignore")

# Data Loading

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Fetch "stock_data.csv" from version 1 of the Kaggle dataset through the
# pandas adapter, and immediately discard "prev_day": it is either not needed
# or will be created by this notebook itself.
df: pd.DataFrame = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "shiveshprakash/34-year-daily-stock-data/versions/1",
    "stock_data.csv",
).drop(columns=["prev_day"])

# Preview the first rows of the loaded frame
df.head()

# Data Cleaning and Preprocessing

In [None]:
# Count the missing entries in every column
df.isna().sum()

In [None]:
# Parse the year-month-day values in "dt" into datetimes; assign() replaces
# the column in place, so its position in the frame is unchanged
df = df.assign(dt=pd.to_datetime(df["dt"], format="%Y-%m-%d"))

# Inspect the resulting dtype of every column
df.dtypes

# Data Analysis

### General Plots

In [None]:
# Line chart of the S&P 500 level against the date column
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df, x="dt", y="sp500", label="S&P 500", ax=ax)
ax.set_title("S&P 500 Index Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("S&P 500 Index")
ax.legend()
plt.show()

In [None]:
# Scatter plot showing how the DJIA level moves with the S&P 500 level
fig, ax = plt.subplots()
sns.scatterplot(data=df, x="sp500", y="djia", ax=ax)
ax.set(title="S&P 500 vs DJIA", xlabel="S&P 500 Index", ylabel="DJIA Index")
plt.show()

### Correlation Analysis

In [None]:
# Restrict to numeric columns (correlation is only computed on those), then
# build the pairwise correlation matrix
numeric_df = df.select_dtypes(include="number")
corr_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Create new features

### Aggregate Rolling Features

In [None]:
windows = [7, 14, 30, 90]  # Rolling window sizes (days)

for window in windows:
    # Every statistic below is computed from the base columns only, so the
    # whole window's worth of features can be attached in a single assign()
    sp500_rolling = df["sp500"].rolling(window=window)
    rolled = {
        f"sp500_mean_{window}": sp500_rolling.mean(),
        f"sp500_std_{window}": sp500_rolling.std(),
        f"vix_mean_{window}": df["vix"].rolling(window=window).mean(),
        f"sp500_volume_mean_{window}": df["sp500_volume"].rolling(window=window).mean(),
        f"djia_mean_{window}": df["djia"].rolling(window=window).mean(),
    }
    df = df.assign(**rolled)

# Rows whose rolling window is still incomplete hold NaN; remove them
# (this also removes any row that was already missing a value)
df = df.dropna()

# Preview the frame with the new rolling columns
df.head()

### Fourier Transform

In [None]:
def fft_band_features(spectrum, freqs, bands, prefix):
    """Summarise the FFT magnitude inside each frequency band.

    Parameters
    ----------
    spectrum : array-like of complex
        Real-input FFT coefficients of one series (output of ``rfft``).
    freqs : array-like of float
        Frequency of each coefficient, element for element with ``spectrum``
        (output of ``rfftfreq``).
    bands : dict[str, tuple[float, float]]
        Band name -> inclusive ``(low, high)`` frequency bounds.
    prefix : str
        Leading part of every generated feature name.

    Returns
    -------
    dict[str, float]
        ``{prefix}_fft_{band}_mean``, ``_max`` and ``_sum`` for every band,
        in the iteration order of ``bands``.

    Raises
    ------
    ValueError
        If a band contains no frequency bin, so the failure names the band
        instead of surfacing as a bare ``np.max`` error on an empty array.
    """
    magnitudes = np.abs(spectrum)
    features = {}
    for band, (low, high) in bands.items():
        band_magnitudes = magnitudes[(freqs >= low) & (freqs <= high)]
        if band_magnitudes.size == 0:
            raise ValueError(
                f"No FFT bins fall inside band {band!r} ({low}, {high})"
            )
        features[f"{prefix}_fft_{band}_mean"] = np.mean(band_magnitudes)
        features[f"{prefix}_fft_{band}_max"] = np.max(band_magnitudes)
        features[f"{prefix}_fft_{band}_sum"] = np.sum(band_magnitudes)
    return features


# Fourier transforms of the S&P 500 and VIX series
N = len(df["sp500"])
T = 1  # Time step (1 day)
xf = rfftfreq(N, T)  # Frequencies (cycles per day), shared by both series
yf = rfft(df["sp500"])
yf_vix = rfft(df["vix"])

# Frequency bands, expressed as (low, high) in cycles per day.
# With a 1-day time step rfftfreq never exceeds the Nyquist frequency of
# 0.5 cycles/day, so the first band effectively covers periods of 2 to 7 days
# even though its upper bound is written as 1 / 1.
freq_ranges = {
    "daily_to_weekly": (1 / 7, 1 / 1),  # 2 to 7 day periods (Nyquist-limited)
    "weekly_to_monthly": (1 / 30, 1 / 7),  # 7 to 30 day periods
    "monthly_to_quarterly": (1 / 90, 1 / 30),  # 30 to 90 day periods
}

# NOTE(review): each feature is one scalar computed from the whole series and
# broadcast to every row, so these columns are constant -- confirm this is the
# intended design before relying on them as model inputs.
df = df.assign(
    **fft_band_features(yf, xf, freq_ranges, "sp500"),
    **fft_band_features(yf_vix, xf, freq_ranges, "vix"),
)

# Drop any row that ended up with a missing value
df = df.dropna()

# Display the updated DataFrame
df.head()

### Autoencoders

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Step 1: Build the lagged features in a temporary DataFrame.
# The columns are collected in a dict and joined with a single concat rather
# than inserted one by one into an empty frame; the column order (sp500, vix,
# sp500_volume for lag 1, then lag 2, ...) is the order the scaler and the
# autoencoder see, so it is kept exactly.
lag_days = 30
lagged_columns = {}
for lag in range(1, lag_days + 1):
    lagged_columns[f"sp500_lag_{lag}"] = df["sp500"].shift(lag)
    lagged_columns[f"vix_lag_{lag}"] = df["vix"].shift(lag)
    lagged_columns[f"sp500_volume_lag_{lag}"] = df["sp500_volume"].shift(lag)

# Step 2: Drop rows without a full lag history, and keep exactly the same rows
# in df (selected by index label, not by a hard-coded positional offset)
lagged_df = pd.concat(lagged_columns, axis=1).dropna()
df = df.loc[lagged_df.index].reset_index(drop=True)
# The embeddings are later concatenated to df by position, so the two frames
# must have the same number of rows
assert len(df) == len(lagged_df), "df and lagged_df are out of step"

# Step 3: Scale every lagged feature to [0, 1] and persist the fitted scaler
scaler = MinMaxScaler()
X_lagged = scaler.fit_transform(lagged_df.values)
joblib.dump(scaler, "scaler.pkl")  # Save the scaler

# Step 4: Number of lagged features = autoencoder input width
input_dim = X_lagged.shape[1]

# Display the number of lagged features
print(f"Nr of lagged features: {input_dim}")

In [None]:
# Size of the compressed representation produced by the autoencoder
encoding_dim = 10

# Training hyperparameters, forwarded unchanged as keyword arguments
training_options = {
    "epochs": 50,
    "batch_size": 256,
    "lr": 0.0005,
    "l1_penalty": 0.001,
    "weight_decay": 1e-5,
}

# Build the model, then train it on the scaled lagged features; the helper
# returns the trained model together with the embeddings
autoencoder = Autoencoder(input_dim, encoding_dim)
trained_autoencoder, embeddings = train_predict_autoencoder(
    autoencoder, X_lagged, **training_options
)

In [None]:
# Wrap the embeddings in a DataFrame with one "embed_<k>" column per
# component, numbered from 1
n_components = embeddings.shape[1]
embedding_columns = [f"embed_{i + 1}" for i in range(n_components)]
embedding_df = pd.DataFrame(embeddings, columns=embedding_columns)

# Place the embedding columns next to the existing ones; both frames carry a
# fresh 0..n-1 index, so rows are matched by position
frames = [df.reset_index(drop=True), embedding_df]
df = pd.concat(frames, axis=1)

# The temporary lagged-feature frame is no longer needed
del lagged_df

# Preview the final frame including the embeddings
df.head()

# Train the models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define features and target from the *current* df.
# The earlier `numeric_df` snapshot was taken before the rolling, FFT and
# embedding features were added (and before the NaN rows were dropped), so
# training on it would silently ignore every engineered feature.
model_df = df.select_dtypes(include=[np.number])
X = model_df.drop(columns=["sp500"])
y = model_df["sp500"]

# Chronological split: the first 80% of rows train the model and the last 20%
# test it. A shuffled split would let the model see rows from the future of
# its test period. This assumes df is in date order, which the rolling and
# lagged features above already rely on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)

# Evaluate: mean squared error and coefficient of determination
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

# Conclusion and Future Work
...