# Leg 2: News Model Training

This notebook trains the **Leg 2 News** component of the multimodal pipeline.

**Architecture**:
1. **FinbertHAN**: Hierarchical Attention Network using frozen DistilRoBERTa embeddings -> Bi-GRU -> Time-Aware Attention
2. **Inputs**: Tokenized headlines, time gaps, and auxiliary features (Velocity, Novelty, Event Flags)
3. **Target**: Next-day Open-to-Close Excess Return
4. **Calibration**: Isotonic Regression on out-of-fold (OOF) predictions

**Process**:
1. Load the preprocessed news data, which contains the sentences and time gaps
2. Wrap the PyTorch model in a scikit-learn compatible class (`Leg2HANWrapper`)
3. Run walk-forward (expanding-window) cross-validation using `src.utils.cv`
4. Calibrate raw scores
5. Evaluate performance (MSE, IC) and visualize results

In [1]:
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr

# Put the sibling `src` directory on the import path (once) so the
# project-local imports that follow can be resolved
current_dir = Path.cwd()
src_path = current_dir.parent.joinpath('src')
src_dir_str = str(src_path)
if src_dir_str not in sys.path:
    sys.path.append(src_dir_str)

from models.HAN_l2 import FinbertHAN
from data.loaders import NewsDataset, han_collate_fn
from utils.cv import generate_yearly_oof
from models.calibrators import IsotonicCalibrator

# Run configuration: hyperparameters first, then the compute device
SEED = 42
BATCH_SIZE = 32
EPOCHS = 5  # kept small for quick test runs; raise it for better results, but stay at or below 512
LR = 1e-4
MAX_GRAD_NORM = 1.0  # max norm passed to gradient clipping in the training loop

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Seed the NumPy and PyTorch random number generators with the same value
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Running on {DEVICE}")

  from .autonotebook import tqdm as notebook_tqdm


Running on cpu


## 1. Define Model Wrapper

To utilize the `generate_yearly_oof` function from `src.utils.cv` (which expects a standard `.fit()` / `.predict()` interface), we wrap the PyTorch training logic into a class.

In [None]:
class Leg2HANWrapper:
    """
    Scikit-learn compatible wrapper for the FinbertHAN PyTorch model
    Handles DataLoaders, Training Loops, and device movement internally

    batch_size: samples per DataLoader batch (used for training and inference)
    epochs: number of full passes over the training data in fit()
    lr: learning rate handed to the AdamW optimizer
    device: torch device the model and the batch tensors are moved to

    After fit(), `model` holds the trained FinbertHAN and `train_loss_history`
    holds one sample-weighted mean MSE value per epoch
    """
    def __init__(self, 
                 batch_size=BATCH_SIZE, 
                 epochs=EPOCHS, 
                 lr=LR, 
                 device=DEVICE):
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.device = device
        self.model = None  # created in fit(); None means "not fitted yet"
        self.train_loss_history = []

    def _make_loader(self, df: pd.DataFrame, shuffle: bool) -> DataLoader:
        """
        Wraps df in a NewsDataset and returns a DataLoader over it
        shuffle: True for training; False for inference so batches come in dataset order
        """
        return DataLoader(
            NewsDataset(df),
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=han_collate_fn,
            num_workers=0,  # load in the main process; raise where worker processes are practical
        )

    def _forward(self, batch):
        """
        Moves the tensors of one collated batch to self.device and runs the model
        Returns whatever the model returns (a 3-tuple whose first item is the predictions)
        """
        return self.model(
            batch['input_ids'].to(self.device),
            batch['attention_mask'].to(self.device),
            # Passed through without a device move, exactly as before
            # NOTE(review): presumably consumed as host-side lengths - confirm against FinbertHAN
            batch['doc_lengths'],
            batch['time_gaps'].to(self.device),
            batch['aux_features'].to(self.device),
            batch['news_mask'].to(self.device),
        )

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Trains a fresh FinbertHAN model and returns self
        X: DataFrame consumed by NewsDataset (expected to contain 'sentences', 'gaps', 'novelty')
        y: Target series ie the excess return. If None, X must already contain a 'target' column
        Raises ValueError when no target is available
        """
        # Prepare data (copy so the caller's frame is never mutated)
        train_df = X.copy()
        if y is not None:
            # Positional assignment: ignore y's index; also accepts plain arrays/lists
            train_df['target'] = getattr(y, 'values', y)
        elif 'target' not in train_df.columns:
            raise ValueError("fit() needs a target: pass `y` or include a 'target' column in X")

        loader = self._make_loader(train_df, shuffle=True)

        # Initialize Model
        # NOTE(review): aux_dim=4 presumably matches the width of batch['aux_features'] - confirm
        self.model = FinbertHAN(aux_dim=4).to(self.device)
        # Only parameters that require gradients are optimized (frozen ones are skipped)
        trainable_params = [p for p in self.model.parameters() if p.requires_grad]
        optimizer = optim.AdamW(trainable_params, lr=self.lr)
        criterion = nn.MSELoss()

        self.model.train()
        self.train_loss_history = []

        # Training loop
        for epoch in range(self.epochs):
            epoch_loss = 0.0
            count = 0
            for batch in loader:
                targets = batch['targets'].to(self.device)

                optimizer.zero_grad()

                # Forward pass (the two extra model outputs are not needed for training)
                preds, _, _ = self._forward(batch)

                # Flatten both sides so a (batch, 1) output can never be silently
                # broadcast against (batch,) targets inside MSELoss
                loss = criterion(preds.reshape(-1), targets.reshape(-1))
                loss.backward()

                # Gradient Clipping for stability; frozen parameters have no gradient,
                # so clipping the trainable ones covers every gradient that exists
                torch.nn.utils.clip_grad_norm_(trainable_params, MAX_GRAD_NORM)

                optimizer.step()

                # Weight by batch size so a smaller last batch does not skew the epoch mean
                epoch_loss += loss.item() * targets.size(0)
                count += targets.size(0)

            # max(1, count) keeps an empty training set from dividing by zero
            avg_loss = epoch_loss / max(1, count)
            self.train_loss_history.append(avg_loss)
            # Print progress
            print(f"Epoch {epoch+1}/{self.epochs} - Loss: {avg_loss:.6f}")

        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Generates predictions for X as a flat 1-D array, one value per row in row order
        Raises NotFittedError if called before fit()
        Returns an empty array when X has no rows
        """
        if self.model is None:
            # Function-scope import: only needed on this error path. NotFittedError
            # subclasses both ValueError and AttributeError, so callers that caught
            # the previous AttributeError keep working
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("Leg2HANWrapper is not fitted yet; call fit() before predict()")

        self.model.eval()

        # Prepare Data
        test_df = X.copy()
        if 'target' not in test_df.columns:
            # Placeholder only: predict() never reads the targets from the batch
            # NOTE(review): presumably NewsDataset requires the column to exist - confirm
            test_df['target'] = 0.0

        loader = self._make_loader(test_df, shuffle=False)

        all_preds = []
        with torch.no_grad():
            for batch in loader:
                preds, _, _ = self._forward(batch)
                all_preds.append(preds.cpu().numpy())

        if not all_preds:
            # No rows -> no batches; np.concatenate would raise on an empty list
            return np.empty(0, dtype=np.float32)

        # Flatten results
        return np.concatenate(all_preds).flatten()

## 2. Load Data

We load the processed news dataset. We check multiple potential locations, as this notebook might run in a Kaggle kernel or a local environment.

In [None]:
# Candidate locations for the preprocessed news pickle, tried in order
# Adjust this list to match your own local / Kaggle setup
SEARCH_PATHS = [
    "../data/processed/news_ready.pkl",
    "/kaggle/input/multimodal-leg2-data/news_ready.pkl",
    "/kaggle/working/news_ready.pkl",
    "news_ready.pkl"
]

# Keep the first candidate that exists on disk
data_found = False
for path in SEARCH_PATHS:
    if not os.path.exists(path):
        continue
    DATA_PATH = path
    data_found = True
    break

# Fail fast with the full list of locations that were tried
if not data_found:
    print(f"ERROR: Data file not found...Checked paths: {SEARCH_PATHS}")
    raise FileNotFoundError(f"news_ready.pkl not found in {SEARCH_PATHS}")

print(f"Loading data from {DATA_PATH}")
# NOTE: unpickling can execute arbitrary code, so only load files you trust
df_full = pd.read_pickle(DATA_PATH)

# Parse dates as timezone-aware UTC and put the rows in chronological order
df_full['Date'] = pd.to_datetime(df_full['Date'], utc=True)
df_full = df_full.sort_values('Date').reset_index(drop=True)

# Rows without a target cannot be used for training or evaluation
target_col = 'excess_return'
df_full = df_full.dropna(subset=[target_col])

# The CV utility takes features, target and dates separately; every
# non-target column (including the list-valued ones) stays in X
y = df_full[target_col]
X = df_full.drop(columns=[target_col])
dates = df_full['Date']

print(f"Data Loaded. Shape: {df_full.shape}")
print(f"Date Range: {dates.min()} to {dates.max()}")

ERROR: Data file not found...Checked paths: ['../data/processed/news_ready.pkl', '/kaggle/input/multimodal-leg2-data/news_ready.pkl', '/kaggle/working/news_ready.pkl', 'news_ready.pkl']


FileNotFoundError: news_ready.pkl not found in ['../data/processed/news_ready.pkl', '/kaggle/input/multimodal-leg2-data/news_ready.pkl', '/kaggle/working/news_ready.pkl', 'news_ready.pkl']

## 3. Walk-Forward Cross Validation (OOF Generation)

We use `generate_yearly_oof` to simulate a realistic production environment.
- **Expanding Window**: Train on years `[Start, Y-1]`, predict on `Y`
- **Min Train Years**: 2, which ensures we have enough history for the first fold

In [None]:
def model_factory():
    """
    Builds a new, untrained Leg2HANWrapper from the notebook-level settings
    Each call returns a separate instance
    """
    wrapper_kwargs = {
        'batch_size': BATCH_SIZE,
        'epochs': EPOCHS,
        'lr': LR,
        'device': DEVICE,
    }
    return Leg2HANWrapper(**wrapper_kwargs)

# Walk-forward OOF generation; slow, since the network is trained for every fold
# Skipped entirely when the data-loading cell has not defined `X`
if 'X' in locals():
    oof_settings = dict(
        model_factory=model_factory,
        X=X,
        y=y,
        dates=dates,
        min_train_years=2,
        n_jobs=1,  # keep neural-net training in a single job
    )
    oof_preds_raw, oof_targets, fold_stats = generate_yearly_oof(**oof_settings)

    print(f"OOF Prediction Complete. Generated {len(oof_preds_raw)} predictions.")

    # Per-fold statistics, shown as a table
    fold_table = pd.DataFrame(fold_stats)
    print(fold_table)

## 4. Calibration

The raw output of the Neural Network (MSE regression) might not align perfectly with probabilities or expected returns in a linear fashion. We use Isotonic Regression to map the raw scores to calibrated Expected Excess Returns. We are still using the IsotonicCalibrator class from the previous notebook.

In [None]:
# Calibrate only when the OOF cell actually produced predictions
if 'oof_preds_raw' in locals() and len(oof_preds_raw) > 0:
    # Initialize and fit calibrator on (raw score, realized target) pairs
    # Isotonic regression does not need the relationship to be monotonic already;
    # it fits the best monotone mapping to whatever relationship is there
    # NOTE(review): presumably out_of_bounds='clip' clamps scores outside the
    # fitted range to its edges - confirm against IsotonicCalibrator
    calibrator = IsotonicCalibrator(out_of_bounds='clip')
    calibrator.fit(oof_preds_raw, oof_targets)

    # Generate calibrated scores
    # NOTE: these are in-sample for the calibrator (fitted on the same OOF scores),
    # so metrics computed on them are optimistic
    oof_preds_calib = calibrator.predict(oof_preds_raw)

    # Save calibrator for inference; create the target directory first so a
    # fresh checkout without ../models does not fail at save time
    save_path = Path('../models/leg2_calibrator.pkl')
    save_path.parent.mkdir(parents=True, exist_ok=True)
    calibrator.save(save_path)

## 5. Metrics & Visualization

We evaluate the model using:
1. **MSE**: Mean Squared Error (Lower is better)
2. **IC (Information Coefficient)**: Pearson correlation between prediction and target (Higher is better)
3. **Visuals**: Scatter plot of Raw vs Target and Calibrated vs Target

In [None]:
# Evaluate only when both the raw OOF predictions and their calibrated
# counterparts exist; the body needs both, so checking `oof_preds_raw`
# alone would end in a NameError if the calibration cell was skipped
if ('oof_preds_raw' in locals() and 'oof_preds_calib' in locals()
        and len(oof_preds_raw) > 0):
    # Work on plain NumPy arrays so the positional sampling below behaves the
    # same whether the upstream objects are ndarrays or pandas Series
    raw_scores = np.asarray(oof_preds_raw)
    calib_scores = np.asarray(oof_preds_calib)
    true_returns = np.asarray(oof_targets)

    # 1. Metrics
    mse_raw = mean_squared_error(true_returns, raw_scores)
    mse_calib = mean_squared_error(true_returns, calib_scores)

    # IC here is the Pearson correlation between prediction and target
    ic_raw, _ = pearsonr(raw_scores, true_returns)
    ic_calib, _ = pearsonr(calib_scores, true_returns)

    print("Leg 2 Results")
    print(f"MSE (Raw): {mse_raw:.6f}")
    print(f"MSE (Calib): {mse_calib:.6f}")
    print(f"IC (Raw): {ic_raw:.4f}")
    print(f"IC (Calib): {ic_calib:.4f}")

    # 2. Visualization
    fig, ax = plt.subplots(1, 2, figsize=(16, 6))

    # Plot at most 5000 randomly chosen points (without replacement) to keep
    # rendering fast; the same sample is used for both panels
    indices = np.random.choice(len(true_returns), min(5000, len(true_returns)), replace=False)

    # Plot 1: Raw predictions vs target, with a fitted regression line
    sns.regplot(x=raw_scores[indices], y=true_returns[indices], ax=ax[0], 
                scatter_kws={'alpha':0.3, 's': 10}, line_kws={'color':'red'})
    ax[0].set_title(f"Raw Scores vs Excess Return (IC={ic_raw:.3f})")
    ax[0].set_xlabel("Raw NN Output")
    ax[0].set_ylabel("True Excess Return")

    # Plot 2: Calibrated predictions vs target
    sns.regplot(x=calib_scores[indices], y=true_returns[indices], ax=ax[1], 
                scatter_kws={'alpha':0.3, 's': 10}, line_kws={'color':'green'})
    ax[1].set_title(f"Calibrated Scores vs Excess Return (IC={ic_calib:.3f})")
    ax[1].set_xlabel("Calibrated Probability/Return")
    ax[1].set_ylabel("True Excess Return")

    plt.tight_layout()
    plt.show()