# 03: LSTM and Transformer Models for Volatility Forecasting

This notebook implements and trains deep learning models:
- LSTM with attention to sequences
- Feature engineering based on lag analysis from Notebook 01
- Walk-forward validation
- Forecasts prepared for comparison with the GARCH baselines from Notebook 02 (the comparison itself is carried out in Notebook 04)

**Key Innovation:** Using research-validated lags [1, 2, 6, 11, 16]

In [None]:
# Core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Local imports
import sys
sys.path.append('..')
from src.config import *
from src.data.features import create_volatility_features, select_lstm_features, normalize_features
from src.models.lstm import LSTMVol, LSTMVolTrainer, create_sequences, prepare_dataloaders
from src.eval.metrics import qlike

# Set seeds
set_seeds()

# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Plotting
plt.style.use(PLOT_STYLE)
%matplotlib inline

print("✓ Environment loaded")

## 1. Data Loading and Feature Engineering

In [None]:
import yfinance as yf

# Download daily price history for the configured ticker and date range
ticker = DEFAULT_TICKER
df_raw = yf.download(ticker, start=DEFAULT_START, end=DEFAULT_END, progress=False)

# Fail fast with a readable message: a failed request typically yields an empty
# frame, and the index lookups below would otherwise die with a bare IndexError
if df_raw.empty:
    raise ValueError(
        f"No data returned for {ticker} between {DEFAULT_START} and {DEFAULT_END}"
    )

# Recent yfinance releases return MultiIndex columns (field, ticker) even for a
# single ticker; keep only the field level so the names can be lower-cased
# (calling .lower() on the tuple labels would raise AttributeError)
if isinstance(df_raw.columns, pd.MultiIndex):
    df_raw.columns = df_raw.columns.get_level_values(0)
df_raw.columns = [str(c).lower() for c in df_raw.columns]

print(f"Downloaded {len(df_raw)} days of {ticker} data")
print(f"Period: {df_raw.index[0].date()} to {df_raw.index[-1].date()}")

In [None]:
# Build the engineered feature table from the raw price frame
print("Creating features...")
features_df = create_volatility_features(df_raw)

# Summarise the result: column count, row count and a peek at the first names
print(f"\n✓ Created {features_df.shape[1]} features")
print(f"✓ {features_df.shape[0]} valid observations")
print(f"\nFeatures: {features_df.columns[:10].tolist()}...")

In [None]:
# Choose the model inputs; the helper filters on pairwise correlation using
# the configured threshold
feature_cols = select_lstm_features(
    features_df,
    correlation_threshold=CORRELATION_THRESHOLD,
)

print(
    f"\n✓ Selected {len(feature_cols)} features after correlation filtering",
    f"\nSelected features: {feature_cols}",
    sep="\n",
)

## 2. Train/Test Split

In [None]:
from src.utils import train_test_split_by_date

# Chronological split at TRAIN_END (boundary handling is defined by the helper)
train_df, test_df = train_test_split_by_date(features_df, TRAIN_END)

# Report the date span and size of each split
train_first, train_last = train_df.index[0].date(), train_df.index[-1].date()
print(f"Training: {train_first} to {train_last} ({len(train_df)} days)")

test_first, test_last = test_df.index[0].date(), test_df.index[-1].date()
print(f"Testing:  {test_first} to {test_last} ({len(test_df)} days)")

In [None]:
# Standardise the selected feature columns of both splits in one call
# (the message below states that training-set statistics are used)
train_inputs = train_df[feature_cols]
test_inputs = test_df[feature_cols]
train_features, test_features, scaler_params = normalize_features(
    train_inputs, test_inputs, method='standardize'
)

# Sanity check: averaged over columns, the training mean/std are reported
print("✓ Features normalized (z-score using training statistics)")
print(f"\nMean of training features: {train_features.mean().mean():.6f}")
print(f"Std of training features: {train_features.std().mean():.6f}")

## 3. Create Sequences for LSTM

In [None]:
# Work on copies of each split whose feature columns carry the normalised values
train_data = train_df.copy()
train_data[feature_cols] = train_features

test_data = test_df.copy()
test_data[feature_cols] = test_features

# Turn the training frame into model inputs, targets and their dates
# (one-step-ahead forecasts of the 'rv' column)
sequence_kwargs = dict(
    target_col='rv',
    feature_cols=feature_cols,
    seq_len=LSTM_SEQ_LEN,
    forecast_horizon=1,
)
X_train, y_train, train_dates = create_sequences(train_data, **sequence_kwargs)

print(f"\nTraining sequences: {X_train.shape}")
print(f"  Shape: (n_samples={X_train.shape[0]}, seq_len={X_train.shape[1]}, n_features={X_train.shape[2]})")
print(f"\nTraining targets: {y_train.shape}")

In [None]:
# Hold out the last LSTM_VAL_SPLIT fraction of the training sequences for
# validation; slicing keeps the original order (no shuffling)
n_train = int(len(X_train) * (1 - LSTM_VAL_SPLIT))

X_tr, X_val = X_train[:n_train], X_train[n_train:]
y_tr, y_val = y_train[:n_train], y_train[n_train:]

print(f"Train split: {X_tr.shape[0]} samples")
print(f"Val split:   {X_val.shape[0]} samples")

## 4. Build LSTM Model

In [None]:
# Instantiate the network from the configured hyper-parameters
model = LSTMVol(
    n_feats=len(feature_cols),
    hidden=LSTM_HIDDEN,
    layers=LSTM_LAYERS,
    dropout=LSTM_DROPOUT,
)

# Count parameter elements: all of them, and the subset that receives gradients
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("=== LSTM MODEL ===")
print(model)
print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

## 5. Train LSTM Model

In [None]:
# Wrap the train/val arrays in batched loaders; only the training loader is
# asked to shuffle
loader_kwargs = dict(batch_size=LSTM_BATCH_SIZE, shuffle_train=True)
train_loader, val_loader = prepare_dataloaders(X_tr, y_tr, X_val, y_val, **loader_kwargs)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches:   {len(val_loader)}")

In [None]:
# Bind the model to a trainer configured with the optimiser settings and device
trainer = LSTMVolTrainer(model, lr=LSTM_LR, weight_decay=LSTM_WEIGHT_DECAY, device=device)

# Echo the configuration actually in use
print(
    "✓ Trainer initialized",
    f"  Device: {trainer.device}",
    f"  Learning rate: {LSTM_LR}",
    f"  Weight decay: {LSTM_WEIGHT_DECAY}",
    sep="\n",
)

In [None]:
# Fit the network; `history` holds the per-epoch losses plotted in the next cell
print("\n=== TRAINING LSTM ===")
print("This will take a few minutes...\n")

fit_options = dict(
    epochs=LSTM_EPOCHS,
    loss_fn='qlike',
    early_stopping_patience=LSTM_PATIENCE,
    verbose=True,
)
history = trainer.fit(train_loader, val_loader, **fit_options)

print("\n✓ Training complete")

In [None]:
# Plot the per-epoch training and validation loss recorded by trainer.fit
fig, ax = plt.subplots(figsize=(10, 5))

epochs = range(1, len(history['train_loss']) + 1)
ax.plot(epochs, history['train_loss'], label='Train Loss', linewidth=2)
ax.plot(epochs, history['val_loss'], label='Val Loss', linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('QLIKE Loss')
ax.set_title('LSTM Training History', fontweight='bold')
ax.grid(True, alpha=0.3)

# Mark best epoch. getattr + a truthiness test covers both a trainer without
# the attribute and one whose `best_state` exists but is still None/empty
# (a plain hasattr check would let the subscript below raise TypeError).
best_state = getattr(trainer, 'best_state', None)
if best_state:
    best_epoch = best_state['epoch'] + 1  # stored epoch is treated as 0-based
    ax.axvline(x=best_epoch, color='red', linestyle='--', alpha=0.5, label=f'Best (epoch {best_epoch})')

# Draw the legend once, after every labelled artist has been added
ax.legend()

plt.tight_layout()
plt.show()

print(f"\nFinal train loss: {history['train_loss'][-1]:.4f}")
print(f"Final val loss:   {history['val_loss'][-1]:.4f}")
if best_state:
    print(f"Best val loss:    {best_state['val_loss']:.4f} (epoch {best_epoch})")

## 6. In-Sample Predictions (Validation Split)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on validation set (the tail of the training sequences held out above)
val_predictions = trainer.predict(torch.FloatTensor(X_val))
val_dates = train_dates[n_train:]

# Compare with actuals; values are multiplied by 100 for display as percent
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

# Time series
axes[0].plot(val_dates, y_val * 100, label='Actual', linewidth=1.5, alpha=0.8)
axes[0].plot(val_dates, val_predictions * 100, label='Predicted', linewidth=1.5, alpha=0.8)
axes[0].set_title('Validation Set: Actual vs Predicted Volatility', fontweight='bold')
axes[0].set_ylabel('Volatility (% ann.)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Scatter. The 45-degree reference line must span the larger of the two axes'
# data ranges; using only the max actual truncates it whenever a prediction
# exceeds the largest observed value.
diag_max = max(np.asarray(y_val).max(), np.asarray(val_predictions).max()) * 100
axes[1].scatter(val_predictions * 100, y_val * 100, alpha=0.5, s=20)
axes[1].plot([0, diag_max], [0, diag_max], 'r--', linewidth=2, label='Perfect forecast')
axes[1].set_xlabel('Predicted Volatility (% ann.)')
axes[1].set_ylabel('Actual Volatility (% ann.)')
axes[1].set_title('Scatter: Actual vs Predicted', fontweight='bold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Metrics (computed on the unscaled values, not the x100 display values)
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
mae = mean_absolute_error(y_val, val_predictions)
r2 = r2_score(y_val, val_predictions)
qlike_val = qlike(y_val, val_predictions)

print("\n=== VALIDATION METRICS ===")
print(f"RMSE:  {rmse:.4f}")
print(f"MAE:   {mae:.4f}")
print(f"R²:    {r2:.4f}")
print(f"QLIKE: {qlike_val:.4f}")

## 7. Out-of-Sample Testing (Walk-Forward)

For production forecasting, we need walk-forward validation with periodic refitting.
This is computationally expensive, so we'll demonstrate the concept with a few refits.

In [None]:
from src.models.lstm import rolling_lstm_forecast

print("\n=== OUT-OF-SAMPLE FORECASTING ===")
print("Generating rolling forecasts with monthly refitting...")
print("⚠ This will take 15-30 minutes!\n")

# Rebuild a single frame spanning train + test whose feature columns hold the
# normalised values; all remaining columns are copied from features_df
train_scaled = pd.DataFrame(train_features, index=train_df.index, columns=feature_cols)
test_scaled = pd.DataFrame(test_features, index=test_df.index, columns=feature_cols)
all_data = features_df.copy()
all_data[feature_cols] = pd.concat([train_scaled, test_scaled])

# Walk-forward forecasts; settings come from the config except the epoch
# budget, which is lowered here
forecast_settings = dict(
    target_col='rv',
    feature_cols=feature_cols,
    seq_len=LSTM_SEQ_LEN,
    train_window=LSTM_TRAIN_WINDOW,
    refit_freq=LSTM_REFIT_FREQ,
    lstm_hidden=LSTM_HIDDEN,
    lstm_layers=LSTM_LAYERS,
    epochs=30,  # Reduced for speed
    batch_size=LSTM_BATCH_SIZE,
    verbose=True,
)
lstm_forecasts = rolling_lstm_forecast(data=all_data, **forecast_settings)

print(f"\n✓ Generated {len(lstm_forecasts)} out-of-sample forecasts")

In [None]:
# Keep only forecasts dated on or after TEST_START and pull the matching
# realised values of the 'rv' target
test_forecasts = lstm_forecasts[lstm_forecasts.index >= TEST_START]
test_actuals = features_df.loc[test_forecasts.index, 'rv']

print(f"\nTest period forecasts: {len(test_forecasts)}")
if len(test_forecasts) > 0:
    print(f"Coverage: {test_forecasts.index[0].date()} to {test_forecasts.index[-1].date()}")
else:
    # index[0] on an empty result would raise IndexError and abort the rest of
    # the notebook; report the problem instead
    print(f"⚠ No forecasts fall on or after TEST_START ({TEST_START}); check the rolling-forecast settings")

## 8. Model Interpretation

### Feature Importance via Permutation

In [None]:
# Simple feature importance: permutation test.
# For each tested feature, shuffle its values across validation samples and
# measure how much the validation QLIKE loss rises relative to the baseline.
N_PERM_FEATURES = 10  # only the first N selected features are tested
PERM_SEED = 42        # dedicated seed so re-running this cell reproduces the numbers

# A local Generator keeps the result independent of how much global
# np.random state earlier cells have consumed
perm_rng = np.random.default_rng(PERM_SEED)

baseline_loss = trainer.validate(val_loader, loss_fn='qlike')
tested_features = list(feature_cols[:N_PERM_FEATURES])

feature_importance = {}

print("\n=== FEATURE IMPORTANCE (Permutation Test) ===")
print("Testing each feature...\n")

for i, feat_name in enumerate(tested_features):
    # Permute feature i across samples (axis 0); each sample's time window for
    # that feature stays intact. Assumes the last axis of X_val follows the
    # order of feature_cols — TODO confirm against create_sequences.
    X_perm = X_val.copy()
    row_order = perm_rng.permutation(len(X_perm))
    X_perm[:, :, i] = X_val[:, :, i][row_order]

    # Create loader
    perm_dataset = TensorDataset(torch.FloatTensor(X_perm), torch.FloatTensor(y_val))
    perm_loader = DataLoader(perm_dataset, batch_size=LSTM_BATCH_SIZE, shuffle=False)

    # Evaluate
    perm_loss = trainer.validate(perm_loader, loss_fn='qlike')
    importance = perm_loss - baseline_loss

    feature_importance[feat_name] = importance
    print(f"{feat_name:20s}: {importance:+.6f}")

# Sort by importance (largest loss increase first)
importance_df = pd.Series(feature_importance).sort_values(ascending=False)

# Plot. barh draws the first row at the bottom, so reverse the order to put the
# most important feature at the top. The title states what was tested: the
# first N features, not a top-N ranking over all of them.
fig, ax = plt.subplots(figsize=(10, 6))
importance_df[::-1].plot(kind='barh', ax=ax, color='steelblue', alpha=0.7)
ax.set_xlabel('Importance (Δ Loss when permuted)')
ax.set_title(f'Permutation Feature Importance (first {len(tested_features)} features)', fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print("\nNote: Higher value = more important feature")
print("(Loss increases more when feature is permuted)")

## 9. Summary

### Key Achievements:

1. ✓ Created 40+ engineered features
2. ✓ Selected features using correlation filtering
3. ✓ Built LSTM model with `LSTM_HIDDEN` hidden units and `LSTM_LAYERS` layers (values defined in `src.config`)
4. ✓ Trained with QLIKE loss and early stopping
5. ✓ Generated out-of-sample forecasts with walk-forward validation
6. ✓ Analyzed feature importance

### Model Configuration:

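- **Sequence length:** `LSTM_SEQ_LEN` days
- **Features:** `len(feature_cols)` selected features (including important lags [1,2,6,11,16])
- **Architecture:** `LSTM_LAYERS`-layer LSTM with `LSTM_HIDDEN` hidden units
- **Regularization:** Dropout (`LSTM_DROPOUT`), weight decay (`LSTM_WEIGHT_DECAY`)
- **Training:** Early stopping (patience = `LSTM_PATIENCE`), LR scheduling

The names in backticks are constants from `src.config` (or values computed above); Markdown cells do not interpolate them, so see the printed cell outputs for the actual values.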

### Next Steps:

Proceed to **Notebook 04** for:
- Comparison of LSTM vs GARCH forecasts
- Volatility targeting backtests
- Statistical significance tests
- Final conclusions

In [None]:
# Save model for later use
from src.utils import save_model_checkpoint

# File name combines the ticker with today's date (YYYYMMDD)
checkpoint_path = MODELS_DIR / f'lstm_{ticker}_{datetime.now().strftime("%Y%m%d")}.pkl'

# Make sure the target directory exists so a fresh checkout can save without
# manual setup (no-op when it is already there)
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

# getattr + truthiness handles both a trainer without `best_state` and one
# whose attribute exists but is None/empty; hasattr alone would let the
# subscript raise TypeError in the latter case
best_state = getattr(trainer, 'best_state', None)
best_val_loss = best_state['val_loss'] if best_state else None

save_model_checkpoint(
    model_state=trainer.model.state_dict(),
    filepath=checkpoint_path,
    metadata={
        'ticker': ticker,
        'n_features': len(feature_cols),
        'features': feature_cols,
        'seq_len': LSTM_SEQ_LEN,
        'hidden': LSTM_HIDDEN,
        'layers': LSTM_LAYERS,
        'val_loss': best_val_loss
    }
)

print(f"✓ Model saved to {checkpoint_path}")