In [None]:
!pip install rtdl_num_embeddings -q --no-index --find-links=/kaggle/input/jane-street-import/rtdl_num_embeddings

In [None]:
import os, sys, gc
import pickle
import dill
import numpy as np
import pandas as pd
import polars as pl

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer)

from sklearn.metrics import r2_score

import torch.optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import math
from tqdm import tqdm
from collections import OrderedDict
from tabm_reference import Model, make_parameter_groups

import warnings
import joblib
from pytorch_lightning.callbacks import Callback
import gc

import lightgbm as lgb
from lightgbm import LGBMRegressor, Booster
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Do not truncate columns when pandas renders a DataFrame.
pd.options.display.max_columns = None

# Put the competition dataset directory on the import path.
# NOTE(review): presumably this is what makes the later
# `import kaggle_evaluation...` cell resolvable - confirm.
sys.path.append("/kaggle/input/jane-street-real-time-market-data-forecasting")

In [None]:
class CONFIG:
    """Static settings shared by all cells: column lists and artifact locations."""

    # Random seed for reproducibility
    seed = 42
    # Name of the target column
    target_col = "responder_6"

    # 79 base features followed by the 9 lag-1 responder features (88 columns)
    feature_cols = (
        [f"feature_{idx:02d}" for idx in range(79)]
        + [f"responder_{idx}_lag_1" for idx in range(9)]
    )

    # Pre-trained model artifacts, addressed by position elsewhere in the notebook:
    #   [0]    directory holding the neural-network fold checkpoints
    #   [1]    single XGBoost pickle
    #   [2..6] XGBoost pickles result0.pkl .. result4.pkl
    model_paths = [
        "/kaggle/input/js-xs-nn-trained-model",
        "/kaggle/input/js-with-lags-trained-xgb/result.pkl",
    ] + [
        f"/kaggle/input/als-e-106-pp0-40-xgb-5fold/result{fold}.pkl"
        for fold in range(5)
    ]

    # AE-MLP input columns: the 88 feature columns plus three id columns (91 total)
    ae_mlp_cols = feature_cols + ['symbol_id', 'time_id', 'date_id']

    # AE-MLP weights, Ridge model directory and fitted scaler
    ae_model_path = '/kaggle/input/ae_mlp_js24_v2/pytorch/ae_mlp_js24_v2/1/ae_mlp_model_06_01_2025.pth'
    ridge_model_path = "/kaggle/input/jsridgev01011635"
    scaler_path = '/kaggle/input/scaler_new/other/scaler_new/1/robust_scaler_07_01.pkl'

In [None]:
# Load the validation split into pandas.
# NOTE(review): nothing in the visible notebook reads `valid` before it is
# deleted again further down - confirm whether this load is still needed.
valid = pl.scan_parquet("/kaggle/input/js24-preprocessing-create-lags/validation.parquet/").collect().to_pandas()

# Column order handed to every XGBoost model at prediction time.
xgb_feature_cols = ["symbol_id", "time_id"] + CONFIG.feature_cols

# Unpickle the four XGBoost models used by the ensemble. Each pickle holds a
# dict whose "model" entry is the fitted estimator. Entries 2, 4 and 6 of
# CONFIG.model_paths are fold files; entry 1 is the stand-alone model.
# NOTE: pickle.load executes code embedded in the file - only load trusted artifacts.
_xgb_models = []
for _path_idx in (2, 4, 6, 1):
    with open(CONFIG.model_paths[_path_idx], "rb") as fp:
        result = pickle.load(fp)
    _xgb_models.append(result["model"])

xgb_model, xgb_model2, xgb_model4, xgb_model5 = _xgb_models

def r2_val(y_true, y_pred, sample_weight):
    """
    Weighted, zero-mean R² score.

    Computes 1 - avg_w((y_pred - y_true)^2) / avg_w(y_true^2), i.e. the
    denominator uses the raw second moment of ``y_true`` rather than its
    variance around the mean.

    Args:
        y_true: True values
        y_pred: Predicted values
        sample_weight: Weights for each sample
    Returns:
        Weighted R² score
    """
    squared_error = (y_pred - y_true) ** 2
    weighted_error = np.average(squared_error, weights=sample_weight)
    weighted_energy = np.average((y_true) ** 2, weights=sample_weight)
    # Tiny constant keeps the division defined when y_true is all zeros.
    return 1 - weighted_error / (weighted_energy + 1e-38)

class NN(LightningModule):
    """Fully connected regressor (PyTorch Lightning) with output bounded to [-5, 5]."""

    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        """
        Build the layer stack and remember the optimiser settings.

        Args:
            input_dim: Input feature dimension
            hidden_dims: List of hidden layer dimensions
            dropouts: List of dropout rates (one per hidden block; may be shorter)
            lr: Learning rate
            weight_decay: Weight decay for regularization
        """
        super().__init__()
        # Must run before any other local is created so that only the
        # constructor arguments are recorded in the checkpoint.
        self.save_hyperparameters()

        # Each hidden block is BatchNorm -> [SiLU] -> [Dropout] -> Linear.
        # The first block has no activation; dropout is added only while
        # `dropouts` still has an entry for the block.
        stack = []
        width = input_dim
        for depth, next_width in enumerate(hidden_dims):
            stack.append(nn.BatchNorm1d(width))
            if depth > 0:
                stack.append(nn.SiLU())
            if depth < len(dropouts):
                stack.append(nn.Dropout(dropouts[depth]))
            stack.append(nn.Linear(width, next_width))
            width = next_width

        # Scalar head squashed by tanh; forward() rescales it.
        stack.append(nn.Linear(width, 1))
        stack.append(nn.Tanh())

        self.model = nn.Sequential(*stack)
        self.lr = lr
        self.weight_decay = weight_decay
        # (prediction, target, weight) triples collected during validation
        self.validation_step_outputs = []

    @staticmethod
    def _weighted_mse(y_hat, y, w):
        """Mean over the batch of the per-sample squared error times its weight."""
        per_sample = F.mse_loss(y_hat, y, reduction='none') * w
        return per_sample.mean()

    def forward(self, x):
        """Run the network and stretch the tanh output from [-1, 1] to [-5, 5]."""
        squashed = self.model(x).squeeze(-1)
        return 5 * squashed

    def training_step(self, batch):
        """Compute and log the weighted MSE for one training batch."""
        x, y, w = batch
        loss = self._weighted_mse(self(x), y, w)
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        return loss

    def validation_step(self, batch):
        """Compute and log the weighted MSE for one validation batch; keep its outputs."""
        x, y, w = batch
        y_hat = self(x)
        loss = self._weighted_mse(y_hat, y, w)
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    def on_validation_epoch_end(self):
        """Log the weighted R² over all buffered validation batches, then reset the buffer."""
        if not self.trainer.sanity_checking:
            collected = self.validation_step_outputs
            # Triples are stored as (prediction, target, weight).
            prob, y, weights = (
                torch.cat([step[slot] for step in collected]).cpu().numpy()
                for slot in range(3)
            )
            val_r_square = r2_val(y, prob, weights)
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        # Cleared even during the sanity check so stale batches never leak in.
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler driven by `val_loss`."""
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        # NOTE(review): `verbose` is reported as deprecated in recent PyTorch
        # releases - confirm against the installed version before training.
        plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=5, verbose=True
        )
        scheduler_config = {'scheduler': plateau, 'monitor': 'val_loss'}
        return {'optimizer': optimizer, 'lr_scheduler': scheduler_config}

    def on_train_epoch_end(self):
        """Print every logged metric, formatted to five decimals, after each epoch."""
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        formatted_metrics = {}
        for key, value in self.trainer.logged_metrics.items():
            scalar = value.item() if isinstance(value, torch.Tensor) else value
            formatted_metrics[key] = f"{scalar:.5f}"
        print(f"Epoch {epoch}: {formatted_metrics}")

# Load ensemble of models (5-fold cross-validation)
# One Lightning checkpoint per fold (nn_0.model .. nn_4.model) is read from the
# directory in CONFIG.model_paths[0]; each restored model is moved to the first
# GPU and collected in `models`, which predict_nn_xgb iterates over.
N_folds = 5
models = []
for fold in range(N_folds):
    checkpoint_path = f"{CONFIG.model_paths[0]}/nn_{fold}.model"
    model = NN.load_from_checkpoint(checkpoint_path)
    models.append(model.to("cuda:0"))

In [None]:
# Clear validation data from memory to free up space
# (requires that the cell which loads `valid` has already run; a second run of
# this cell on the same kernel raises NameError because the name is gone).
del valid
gc.collect()

# Global variable to store lagged features
# Holds the most recent non-None `lags` frame passed to the predict_* functions,
# which write it via `global lags_` and reuse it on calls where `lags` is None.
lags_: pl.DataFrame | None = None

def predict_nn_xgb(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """
    Predict `responder_6` with the blended XGBoost + neural-network ensemble.

    The four XGBoost models are averaged with equal weight (0.25 each), the
    neural-network fold models are averaged with equal weight, and the two
    averages are blended 0.55 (XGBoost) / 0.45 (neural network). The result is
    clipped to [-5, 5].

    Args:
        test: DataFrame containing test data (must include `row_id`,
            `date_id`, `symbol_id`, `time_id` and the base feature columns)
        lags: DataFrame containing lagged features. When None, the frame
            remembered from the most recent call that supplied one is reused.

    Returns:
        DataFrame with columns `row_id` and `responder_6`

    Raises:
        RuntimeError: if no lags frame has been supplied yet
    """
    global lags_

    # Remember the newest lags frame so calls without one can still be served.
    if lags is not None:
        lags_ = lags
    if lags_ is None:
        raise RuntimeError(
            "predict_nn_xgb was called before any lags frame was provided"
        )

    # Keep the last lag record per (date_id, symbol_id) and attach it to every
    # test row of that pair. group_by returns a new frame, so no copy is needed.
    latest_lags = lags_.group_by(["date_id", "symbol_id"], maintain_order=True).last()
    test = test.join(latest_lags, on=["date_id", "symbol_id"], how="left")

    n_rows = test.shape[0]
    preds_xgb = np.zeros((n_rows,))  # XGBoost average
    preds_nn = np.zeros((n_rows,))   # Neural-network average

    # XGBoost: convert the feature frame once and reuse it for all four models.
    xgb_input = test[xgb_feature_cols].to_pandas()
    for booster in (xgb_model, xgb_model2, xgb_model4, xgb_model5):
        preds_xgb += booster.predict(xgb_input) * 0.25

    # Neural networks: forward-fill missing values down the batch, then zero-fill
    # whatever is still missing (e.g. gaps in the first row).
    nn_input = test[CONFIG.feature_cols].to_pandas()
    nn_input = nn_input.ffill().fillna(0)
    nn_input = torch.FloatTensor(nn_input.values).to("cuda:0")

    with torch.no_grad():  # inference only, no gradients needed
        for nn_model in models:
            nn_model.eval()  # disable dropout / use running batch-norm statistics
            preds_nn += nn_model(nn_input).cpu().numpy() / len(models)

    # Blend: 55% XGBoost, 45% neural network.
    preds = 0.55 * preds_xgb + 0.45 * preds_nn

    # Final frame in the format expected by the caller, clipped to [-5, 5].
    return test.select('row_id').with_columns(
        pl.Series(
            name='responder_6',
            values=np.clip(preds, a_min=-5, a_max=5),
            dtype=pl.Float64,
        )
    )

## AE MLP

In [None]:
from joblib import load

In [None]:
# AE-MLP with dropout (any L2 regularization has to come from the optimizer;
# this class only defines the layers).
class AE_MLP(nn.Module):
    """Encoder/decoder shaped MLP that maps `input_dim` features to `output_dim` outputs.

    The encoder narrows input_dim -> hidden_dim -> hidden_dim // 2; the decoder
    widens hidden_dim // 2 -> hidden_dim and ends in a linear layer of size
    output_dim. Every ReLU is followed by dropout to limit overfitting.
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=1, dropout_rate=0.3):
        super().__init__()
        bottleneck_dim = hidden_dim // 2

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
        )
        self.decoder = nn.Sequential(
            nn.Linear(bottleneck_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Pass `x` through the encoder and then the decoder."""
        return self.decoder(self.encoder(x))

# Weighted Huber loss
def weighted_loss(predictions, targets, weights, delta=1.0):
    """Mean of the per-sample Smooth-L1 loss multiplied by per-sample weights.

    Args:
        predictions: Model outputs
        targets: Ground-truth values, same shape as `predictions`
        weights: Per-sample weights, broadcastable against the loss tensor
        delta: Passed as `beta` to the Smooth-L1 loss (threshold between the
            quadratic and the linear regime). NOTE(review): per the PyTorch
            docs, Smooth-L1 equals the Huber loss only for delta == 1.

    Returns:
        Scalar tensor with the weighted mean loss
    """
    elementwise = F.smooth_l1_loss(predictions, targets, reduction='none', beta=delta)
    return (elementwise * weights).mean()

    
# Build the AE-MLP with one input per column listed in CONFIG.ae_mlp_cols
# (91 columns) so the network width cannot drift from the column list.
ae_model = AE_MLP(input_dim=len(CONFIG.ae_mlp_cols), hidden_dim=128)

# Restore the trained weights (tensors only) and move the model to the first GPU.
ae_model.load_state_dict(torch.load(CONFIG.ae_model_path, weights_only=True))
ae_model.to('cuda:0')

# The model is only used for inference in this notebook: switch dropout off now.
ae_model.eval()

# Fitted scaler applied to the AE-MLP inputs before prediction (joblib file).
scaler = load(CONFIG.scaler_path)

In [None]:
# Global variable to store lagged features
# Same module-level name that predict_nn_xgb uses; running this cell resets the
# cached lags frame to None until the next call that supplies `lags`.
lags_: pl.DataFrame | None = None

def predict_nn_ae(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """
    Predict `responder_6` with the single AE-MLP model.

    Inputs are the columns in CONFIG.ae_mlp_cols, forward-filled / zero-filled,
    transformed with the pre-fitted `scaler`, and fed to `ae_model`. The
    output is clipped to [-5, 5].

    Args:
        test: DataFrame containing test data (must include `row_id`,
            `date_id`, `symbol_id`, `time_id` and the base feature columns)
        lags: DataFrame containing lagged features. When None, the frame
            remembered from the most recent call that supplied one is reused.

    Returns:
        DataFrame with columns `row_id` and `responder_6`

    Raises:
        RuntimeError: if no lags frame has been supplied yet
    """
    global lags_

    # Remember the newest lags frame so calls without one can still be served.
    if lags is not None:
        lags_ = lags
    if lags_ is None:
        raise RuntimeError(
            "predict_nn_ae was called before any lags frame was provided"
        )

    # Keep the last lag record per (date_id, symbol_id) and attach it to every
    # test row of that pair. group_by returns a new frame, so no copy is needed.
    latest_lags = lags_.group_by(["date_id", "symbol_id"], maintain_order=True).last()
    test = test.join(latest_lags, on=["date_id", "symbol_id"], how="left")

    # Forward-fill missing values down the batch, zero-fill the remainder,
    # then apply the scaler.
    model_input = test[CONFIG.ae_mlp_cols].to_pandas()
    model_input = model_input.ffill().fillna(0)
    model_input = scaler.transform(model_input.values)
    model_input = torch.FloatTensor(model_input).to("cuda:0")

    ae_model.eval()  # disable dropout for inference
    with torch.no_grad():  # no gradients needed
        # Model output is (rows, 1); flatten to one value per row.
        preds_nn = ae_model(model_input).cpu().numpy().flatten()

    # Final frame in the format expected by the caller, clipped to [-5, 5].
    return test.select('row_id').with_columns(
        pl.Series(
            name='responder_6',
            values=np.clip(preds_nn, a_min=-5, a_max=5),
            dtype=pl.Float64,
        )
    )

In [None]:
def load_from_dill(model_name, model_path=None, file_ext='.dill'):
    """
    Load a model from a dill file

    Args:
        model_name: Name of the model file (without extension)
        model_path: Directory path containing the model file. When None the
            current working directory is used.
        file_ext: File extension (default: '.dill')

    Returns:
        Loaded model object

    Raises:
        FileNotFoundError: if the file does not exist
    """
    # Without this, the default None would be formatted into the path as the
    # literal directory name "None".
    directory = "." if model_path is None else model_path

    # NOTE: dill.load can execute arbitrary code stored in the file - only
    # load artifacts from a trusted source.
    with open(f"{directory}/{model_name}{file_ext}", "rb") as file_handle:
        return dill.load(file_handle)

# Load pre-trained Ridge Regression model.
# The directory lives on CONFIG; the bare name `ridge_model_path` is not
# defined anywhere in the notebook and raised NameError on a fresh kernel.
rdg = load_from_dill(
    model_name='Ridge',
    model_path=CONFIG.ridge_model_path,
)

def predict_ridge(test, lags):
    """
    Predict `responder_6` with the pre-trained Ridge model `rdg`.

    Args:
        test: DataFrame containing test data
        lags: DataFrame containing lagged features (accepted for a uniform
            signature; not used here)

    Returns:
        DataFrame with columns `row_id` and `responder_6`
    """
    # Only the 79 base features go into the Ridge model.
    feature_names = [f'feature_{i:02}' for i in range(79)]

    # polars -> pandas, missing values replaced by the constant 3, then a plain
    # numpy matrix for the estimator.
    design_matrix = test[feature_names].to_pandas().fillna(3).values
    ridge_output = rdg.predict(design_matrix)

    # Flatten in case the estimator returns a column vector.
    return test.select('row_id').with_columns(
        pl.Series('responder_6', ridge_output.ravel())
    )

# Ensemble notebook

In [None]:
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """
    Blend the three sub-model predictions into the final `responder_6`.

    Args:
        test: DataFrame containing test data
        lags: DataFrame containing lagged features

    Returns:
        DataFrame with final ensemble predictions (`row_id`, `responder_6`)
    """
    # Run each sub-model; every call returns a (row_id, responder_6) frame.
    frame_ae_mlp = predict_nn_ae(test, lags).to_pandas()    # AE-MLP
    frame_nn_xgb = predict_nn_xgb(test, lags).to_pandas()   # Neural Network + XGBoost ensemble
    frame_ridge = predict_ridge(test, lags).to_pandas()     # Ridge Regression

    # Give every prediction column its own name and line the frames up on row_id.
    blended = (
        frame_nn_xgb.rename(columns={'responder_6': 'col_nn_xgb'})
        .merge(frame_ridge.rename(columns={'responder_6': 'col_ridge'}), on=['row_id'])
        .merge(frame_ae_mlp.rename(columns={'responder_6': 'col_ae_mlp'}), on=['row_id'])
    )

    # Positions: 0 = NN+XGB, 1 = Ridge, 2 = TabM (model currently disabled, so
    # this entry is unused), 3 = AE-MLP.
    # NOTE(review): the three active weights add up to 1.2, not 1.0 - confirm
    # that this scaling is intended.
    e_weights = [0.60, 0.10, 0.30, 0.5]
    blended['responder_6'] = (
        blended['col_nn_xgb'] * e_weights[0]
        + blended['col_ridge'] * e_weights[1]
        + blended['col_ae_mlp'] * e_weights[3]
    )

    # Hand the result back in the required format, in the row order of `test`.
    final_values = blended['responder_6'].to_numpy()
    return test.select('row_id').with_columns(
        pl.Series('responder_6', final_values.ravel())
    )

In [None]:
# NOTE(review): this import sits here rather than in the top import cell,
# presumably because the package only becomes importable after the competition
# directory has been appended to sys.path - confirm.
import kaggle_evaluation.jane_street_inference_server
# Wrap the ensemble `predict` function in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set, serve requests; otherwise run the
# local gateway against the test and lags parquet files shipped with the
# competition data.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )