In [1]:
import os
import gc
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
import polars as pl  # Ensure Polars is imported
from tqdm import tqdm
from typing import Tuple, List, Union
import kaggle_evaluation.jane_street_inference_server


# ---- Run configuration ----
SEED = 42                 # seed handed to seed_everything() and the model param dicts
N_ESTIMATORS = 10_000     # boosting-round budget (early stopping may end training sooner)
EARLY_STOP = 200          # rounds without validation improvement before stopping
N_SPLITS = 5              # cross-validation split count (not referenced in this section)
NUM_VALID_DATES = 100     # trailing distinct date_ids held out for validation
NUM_ROWS_NOT_TO_BE_USED = 25023058  # leading rows of train.parquet dropped on load
count = 0                 # number of predict() calls served so far
MODEL_NAMES = ['lgb']     # models requested from train_model()


# ---- Locations and column groups ----
INPUT_DIR = '/kaggle/input/jane-street-real-time-market-data-forecasting'
OUTPUT_DIR = '/kaggle/output'
TARGET = 'responder_6'
TIME_COLS = ['date_id', 'time_id']
LEAD_COLS = ['symbol_id', 'weight']
RESPONDER_COLS = [f"responder_{idx}" for idx in range(9)]   # responder_0 .. responder_8
FEAT_COLS = [f"feature_{idx:02d}" for idx in range(79)]     # feature_00 .. feature_78

# Utility functions
def seed_everything(seed=SEED):
    """Seed the random number sources used by this script.

    Seeds the ``random`` module and NumPy's global generator, exports the
    value as the ``PYTHONHASHSEED`` environment variable, and logs the seed.

    Args:
        seed: Seed value to apply; defaults to the module-level ``SEED``.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"[INFO] Seed set to {seed}")


# Apply the default seed as soon as the helper exists.
seed_everything()

# Define objective functions for model training
def r2_gbt(y_pred, dtrain: Union[lgb.Dataset, xgb.DMatrix]):
    """Weighted zero-mean R² evaluation metric for LightGBM and XGBoost.

    The score is ``1 - avg_w((y_pred - y_true)^2) / avg_w(y_true^2)`` where
    ``avg_w`` is the average weighted by the sample weights stored on
    ``dtrain``; a tiny epsilon keeps the denominator non-zero.

    Args:
        y_pred: Predictions for the rows of ``dtrain``.
        dtrain: Dataset the predictions belong to. Labels and weights are
            read from it with ``get_label()`` and ``get_weight()``.

    Returns:
        For a ``lgb.Dataset``: ``('r2', r2, True)`` — metric name, value and
        the higher-is-better flag that LightGBM's ``feval`` expects.
        Otherwise: ``('r2', -r2)`` — a (name, value) pair with the score
        negated so that a minimising consumer prefers a larger R².
    """
    y_true = dtrain.get_label()
    weight = dtrain.get_weight()
    # An empty weight container cannot be broadcast against the labels;
    # treat it as "unweighted".
    # NOTE(review): presumably what xgb.DMatrix returns when no weights were
    # set — confirm against the installed XGBoost version.
    if weight is not None and len(weight) == 0:
        weight = None

    weighted_sq_err = np.average((y_pred - y_true) ** 2, weights=weight)
    weighted_sq_target = np.average(y_true ** 2, weights=weight)
    r2 = 1 - weighted_sq_err / (weighted_sq_target + 1e-38)

    # The original one-line conditional applied only to the third tuple
    # element, so the non-LightGBM branch returned the 3-tuple
    # ('r2', r2, -r2) instead of a (name, value) pair.
    if isinstance(dtrain, lgb.Dataset):
        return 'r2', r2, True
    return 'r2', -r2

class r2_cbt:
    """Custom evaluation-metric object computing a weighted zero-mean R².

    ``evaluate`` accumulates two running sums (weighted squared error and
    weighted squared target) and ``get_final_error`` folds them into
    ``1 - error / weight``. An instance is passed as ``eval_metric`` in
    ``CAT_PARAMS``.
    """

    def is_max_optimal(self):
        """Report that larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Accumulate the numerator and denominator of the R² ratio.

        Args:
            approxes: Indexable container whose first element holds the
                predictions.
            target: Indexable true values, aligned with the predictions.
            weight: Indexable per-sample weights, or ``None`` to weight
                every sample as 1.0.

        Returns:
            ``(error_sum, weight_sum)`` where ``error_sum`` is the sum of
            ``w * (pred - y)^2`` and ``weight_sum`` is the sum of
            ``w * y^2``.
        """
        preds = approxes[0]
        sq_err_total = 0.0
        sq_target_total = 0.0
        # Explicit index loop: only len() and [] are required of the inputs.
        for idx in range(len(preds)):
            w = weight[idx] if weight is not None else 1.0
            y = target[idx]
            sq_target_total += w * (y ** 2)
            sq_err_total += w * ((preds[idx] - y) ** 2)
        return sq_err_total, sq_target_total

    def get_final_error(self, error, weight):
        """Combine the sums from ``evaluate`` into the final score.

        The tiny epsilon keeps the division defined when ``weight`` is 0.
        """
        return 1 - error / (weight + 1e-38)

# Model parameters
# LightGBM settings (passed to lgb.train in train_model)
LGB_PARAMS = dict(
    objective='l2',
    boosting_type='gbdt',
    learning_rate=0.01,   # small step size
    num_leaves=255,       # large leaf budget per tree
    verbose=-1,
    random_state=SEED,
    max_depth=16,         # cap on tree depth
)

# XGBoost settings (no XGBoost training is implemented in this section)
XGB_PARAMS = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    learning_rate=0.01,
    max_depth=16,
    random_state=SEED,
)

# CatBoost settings (no CatBoost training is implemented in this section);
# the evaluation metric is an instance of the custom r2_cbt class.
CAT_PARAMS = dict(
    loss_function='RMSE',
    eval_metric=r2_cbt(),
    n_estimators=N_ESTIMATORS,
    learning_rate=0.01,
    verbose=0,
    random_state=SEED,
    early_stopping_rounds=EARLY_STOP,
)

# Data loading
def get_df():
    """Assemble the training frame from ``train.parquet``.

    The file is read one column group at a time (time keys, lead columns,
    target, then the features in ten chunks). Every read drops the first
    ``NUM_ROWS_NOT_TO_BE_USED`` rows and casts to a compact dtype before the
    pieces are joined column-wise.

    Returns:
        A ``pd.DataFrame`` whose columns are ordered TIME_COLS, LEAD_COLS,
        FEAT_COLS, then TARGET.
    """
    train_path = f"{INPUT_DIR}/train.parquet"

    def _tail(columns):
        # Read only the requested columns, then discard the leading rows.
        return pd.read_parquet(train_path, columns=columns)[NUM_ROWS_NOT_TO_BE_USED:]

    print('[INFO] Loading time, lead, target, and feature data...')

    time_df = _tail(TIME_COLS).astype(np.uint16)
    print(f"[INFO] Time data loaded with shape {time_df.shape}")

    lead_df = _tail(LEAD_COLS)
    for col, dtype in (('symbol_id', np.uint32), ('weight', np.float32)):
        lead_df[col] = lead_df[col].astype(dtype)
    print(f"[INFO] Lead data loaded with shape {lead_df.shape}")

    responder_6_df = _tail([TARGET]).astype(np.float32)
    print(f"[INFO] Target data loaded with shape {responder_6_df.shape}")

    # Features are pulled in ten column slices rather than in a single read.
    width = (len(FEAT_COLS) // 10) + 1
    feat_dfs = [
        _tail(FEAT_COLS[k * width:(k + 1) * width]).astype(np.float32)
        for k in tqdm(range(10), desc='Loading feature chunks')
    ]
    print("[INFO] Feature data chunks loaded")

    pieces = [time_df, lead_df, *feat_dfs, responder_6_df]
    return pd.concat(pieces, axis=1)

# Model training
def train_model(df, model_names: List[str]):
    """Fit the requested models with a date-based hold-out split.

    The last ``NUM_VALID_DATES`` entries of ``df['date_id'].unique()`` form
    the validation set and all earlier entries form the training set. Only
    the ``'lgb'`` name is acted on here.

    Args:
        df: Frame containing ``date_id``, ``weight``, ``FEAT_COLS`` and
            ``TARGET`` columns.
        model_names: Names of the models to train.

    Returns:
        A list with one slot per entry of ``model_names``. The LightGBM
        booster, when trained, is stored in slot 0; all other slots stay
        ``None``.
    """
    trained = [None] * len(model_names)

    unique_dates = df['date_id'].unique()
    is_train = df['date_id'].isin(unique_dates[:-NUM_VALID_DATES])
    is_valid = df['date_id'].isin(unique_dates[-NUM_VALID_DATES:])

    tr_X, val_X = df.loc[is_train, FEAT_COLS], df.loc[is_valid, FEAT_COLS]
    tr_y, val_y = df.loc[is_train, TARGET], df.loc[is_valid, TARGET]
    tr_weight, val_weight = df.loc[is_train, 'weight'], df.loc[is_valid, 'weight']

    # Only LightGBM training is implemented; nothing else to do without it.
    if 'lgb' not in model_names:
        return trained

    print('[INFO] Training LightGBM model...')
    train_set = lgb.Dataset(tr_X, label=tr_y, weight=tr_weight)
    valid_set = lgb.Dataset(val_X, label=val_y, weight=val_weight, reference=train_set)
    booster = lgb.train(
        LGB_PARAMS,
        train_set,
        num_boost_round=N_ESTIMATORS,
        valid_sets=[valid_set],
        feval=r2_gbt,
        callbacks=[lgb.early_stopping(EARLY_STOP)],
    )
    best_r2 = booster.best_score['valid_0']['r2']
    print(f"[INFO] LightGBM Best R²: {best_r2:.06f}")
    # The booster always lands in slot 0, wherever 'lgb' sits in model_names.
    trained[0] = booster

    # Drop the dataset wrappers before returning to release their memory.
    del train_set, valid_set
    gc.collect()

    return trained

# Inference function
def infer(data, models):
    """Average the predictions of every available model.

    Args:
        data: Input handed unchanged to each model's ``predict`` method.
        models: Iterable of fitted models; ``None`` entries are skipped.

    Returns:
        The element-wise mean (``axis=0``) of the individual predictions.
    """
    available = [m for m in models if m is not None]
    per_model = [m.predict(data) for m in available]
    return np.mean(per_model, axis=0)

# Predict function for Kaggle inference server
def predict(test: pl.DataFrame, lags: Union[pl.DataFrame, None]) -> Union[pl.DataFrame, pd.DataFrame]:
    """Callback handed to the inference server: score one batch of rows.

    On the first call (module-level ``count`` is 0) the training data is
    loaded and the models are trained and stored in the global ``models``.
    Every call then predicts ``responder_6`` with ``models[0]``.

    Args:
        test: Batch to score; must contain ``row_id`` and ``FEAT_COLS``.
        lags: Accepted as part of the callback signature; not used here.

    Returns:
        A polars frame with exactly the columns ``row_id`` and
        ``responder_6`` and one row per row of ``test``.
    """
    global models, count

    first_call = count == 0
    if first_call:
        print('[INFO] First prediction call, training models...')
        df = get_df()
        print(f"[INFO] Data shape: {df.shape}")
        models = train_model(df, MODEL_NAMES)
        # The training frame is no longer needed once the models exist.
        del df
        gc.collect()

    count += 1

    features = test[FEAT_COLS].to_pandas()
    # Start from a zero-filled placeholder, then overwrite it with the scores.
    predictions = test.select('row_id', pl.lit(0.0).alias('responder_6'))
    scores = models[0].predict(features)

    predictions = predictions.with_columns(pl.Series('responder_6', scores))

    # Sanity checks on the shape of the returned frame.
    assert isinstance(predictions, (pl.DataFrame, pd.DataFrame))
    assert predictions.columns == ['row_id', 'responder_6']
    assert len(predictions) == len(test)

    return predictions

# Kaggle inference server
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Environment variable unset or empty: run the local gateway against
    # the bundled test and lags parquet files.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    # Environment variable set: start serving predict().
    inference_server.serve()


[INFO] Seed set to 42
[INFO] First prediction call, training models...
[INFO] Loading time, lead, target, and feature data...
[INFO] Time data loaded with shape (22104280, 2)
[INFO] Lead data loaded with shape (22104280, 2)
[INFO] Target data loaded with shape (22104280, 1)


Loading feature chunks: 100%|██████████| 10/10 [00:29<00:00,  2.94s/it]


[INFO] Feature data chunks loaded
[INFO] Data shape: (22104280, 84)
[INFO] Training LightGBM model...
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[873]	valid_0's l2: 0.623882	valid_0's r2: 0.00805815
[INFO] LightGBM Best R²: 0.008058
