# FinSurvival Competition: Starter Notebook (XGBoost Cox Model Prediction Submission)

**Objective:** This notebook provides a workflow for creating a valid prediction submission using the XGBoost Cox survival model. The competition requires you to submit a `.zip` file containing 16 separate prediction files in CSV format.

This notebook will guide you through:
1.  Loading the training and test sets for each of the 16 tasks from a single directory.
2.  Training a model (using an XGBoost Cox model as an example).
3.  Generating predictions on the test set in the required format.
4.  Saving each set of predictions to a correctly named CSV file.
5.  Zipping all 16 prediction files for submission.

## Step 1: Setup and Imports

In [141]:
import os

# Restrict this kernel to GPU 1. A `!export ...` shell escape only changes the
# environment of a short-lived subshell, so the variable never reaches the
# Python process; set it in-process instead. This must run before any
# CUDA-backed library initialises the GPU for the setting to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Install required packages
# pip install -q pandas xgboost scikit-learn numpy

# Import libraries
import pandas as pd
import numpy as np
import os
import shutil
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from typing import Tuple, Optional
import pickle as pkl
from itertools import chain

In [142]:
# Where the raw task data lives and where this notebook caches its artifacts.
DATA_PATH = "./data/"
CACHE_DIR = "./cache/"
MODEL_CACHE_DIR = os.path.join(CACHE_DIR, "models")  # trained model files
DATA_CACHE_DIR = os.path.join(CACHE_DIR, "data")  # pickled preprocessing state

# Create the cache folders up front so later writes cannot fail on a missing
# directory; exist_ok makes re-running the cell harmless.
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

seed = 42

## Step 2: Define a Preprocessing Function

Even though you are not submitting this code, you will still need a preprocessing pipeline to train your models effectively. You can use the one below as a starting point.

In [143]:
def preprocess(
    train_df_with_labels: Optional[pd.DataFrame] = None,
    test_features_df: Optional[pd.DataFrame] = None,
    model_date: Optional[int] = None,
) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """Build scaled, one-hot-encoded feature matrices for one task.

    When ``train_df_with_labels`` is given, the encoding (top-10 categories
    per categorical column), the ``StandardScaler`` and the resulting column
    lists are fitted on it and pickled under ``DATA_CACHE_DIR``. When only
    ``test_features_df`` is given, those pickles are loaded instead, so test
    data can be transformed in a later call than the one that fitted them.

    Args:
        train_df_with_labels: Training rows including the ``timeDiff`` and
            ``status`` target columns. Optional.
        test_features_df: Rows to transform with the fitted preprocessing.
            Optional.
        model_date: If set, training rows are kept only when
            ``timestamp + timeDiff <= model_date``; it is also part of the
            cache file prefix.

    Returns:
        ``(train_features, train_targets, test_features)``. The first two are
        ``None`` when no training frame was passed; the last is ``None`` when
        no test frame was passed.

    Raises:
        ValueError: If neither frame is provided.
        FileNotFoundError: In test-only mode, if the cached preprocessing
            objects for this task/date have not been written yet.
    """
    if train_df_with_labels is None and test_features_df is None:
        raise ValueError(
            "preprocess() needs train_df_with_labels, test_features_df, or both"
        )

    # The cache prefix is derived from the first row of whichever frame exists.
    reference_df = (
        train_df_with_labels if train_df_with_labels is not None else test_features_df
    )
    unique_prefix = (
        reference_df["Index Event"].iloc[0]
        + "_"
        + reference_df["Outcome Event"].iloc[0]
        + (f"_{model_date}_" if model_date is not None else "_")
    )

    # Paths for saving/loading preprocessing objects
    scaler_path = os.path.join(DATA_CACHE_DIR, unique_prefix + "scaler.pkl")
    train_cols_path = os.path.join(DATA_CACHE_DIR, unique_prefix + "train_cols.pkl")
    top_categories_dict_path = os.path.join(
        DATA_CACHE_DIR, unique_prefix + "top_categories_dict.pkl"
    )
    feature_cols_path = os.path.join(
        DATA_CACHE_DIR, unique_prefix + "feature_cols.pkl"
    )

    # Identifier/bookkeeping columns that are never used as features.
    cols_to_drop = [
        "id",
        "user",
        "pool",
        "Index Event",
        "Outcome Event",
        "type",
        "timestamp",
    ]

    train_features_final = None
    train_targets = None

    if train_df_with_labels is not None:
        if model_date is not None:
            train_df_with_labels = train_df_with_labels[
                (train_df_with_labels["timestamp"] + train_df_with_labels["timeDiff"])
                <= model_date
            ]

        # Separate features and targets (and drop unneeded columns from features)
        target_columns = ["timeDiff", "status"]
        train_targets = train_df_with_labels[target_columns]
        train_features = train_df_with_labels.drop(
            columns=target_columns + cols_to_drop, errors="ignore"
        )

        # Collapse uncommon categories into "Other", then one-hot encode.
        categorical_cols = list(
            train_features.select_dtypes(include=["object", "category"]).columns
        )
        top_categories_dict = {}
        for col in categorical_cols:
            top_categories_dict[col] = (
                train_features[col].value_counts().nlargest(10).index
            )
            train_features[col] = train_features[col].where(
                train_features[col].isin(top_categories_dict[col]), "Other"
            )
        # dtype=float: pandas >= 2 emits bool dummies by default, which the
        # np.number selection below would silently discard.
        train_features_encoded = pd.get_dummies(
            train_features,
            columns=categorical_cols,
            dummy_na=True,
            drop_first=True,
            dtype=float,
        )
        encoded_cols = train_features_encoded.columns

        # Standardize numerical features
        numerical_cols = train_features_encoded.select_dtypes(include=np.number).columns
        scaler = StandardScaler()
        train_features_scaled = scaler.fit_transform(
            train_features_encoded[numerical_cols]
        )
        train_features_final = pd.DataFrame(
            train_features_scaled,
            index=train_features_encoded.index,
            columns=numerical_cols,
        ).fillna(0)

        # Remove zero-variance columns
        cols_to_keep = train_features_final.columns[train_features_final.var() != 0]
        train_features_final = train_features_final[cols_to_keep]

        # Persist everything a later test-only call needs.
        with open(scaler_path, "wb") as f:
            pkl.dump(scaler, f)
        with open(train_cols_path, "wb") as f:
            pkl.dump(encoded_cols, f)
        with open(top_categories_dict_path, "wb") as f:
            pkl.dump(top_categories_dict, f)
        with open(feature_cols_path, "wb") as f:
            pkl.dump(
                {"numerical_cols": numerical_cols, "cols_to_keep": cols_to_keep}, f
            )
    else:
        # Test-only mode: restore the state fitted by an earlier training call.
        # NOTE: pickle is only safe here because these files are written by
        # this function itself; never point DATA_CACHE_DIR at untrusted data.
        with open(scaler_path, "rb") as f:
            scaler = pkl.load(f)
        with open(train_cols_path, "rb") as f:
            encoded_cols = pkl.load(f)
        with open(top_categories_dict_path, "rb") as f:
            top_categories_dict = pkl.load(f)
        with open(feature_cols_path, "rb") as f:
            feature_cols = pkl.load(f)
        numerical_cols = feature_cols["numerical_cols"]
        cols_to_keep = feature_cols["cols_to_keep"]
        # Dict insertion order matches the order used during training.
        categorical_cols = list(top_categories_dict)

    # Process test features if provided
    test_processed_features = None
    if test_features_df is not None:
        test_features = test_features_df.drop(columns=cols_to_drop, errors="ignore")
        for col in categorical_cols:
            top_categories = top_categories_dict[col]
            test_features[col] = test_features[col].where(
                test_features[col].isin(top_categories), "Other"
            )
        # No drop_first here: the level that is "first" in the test data may
        # differ from the one dropped during training. Encoding every level
        # and reindexing to the training columns reproduces the training
        # layout exactly (extra levels are discarded, missing ones become 0).
        test_features_encoded = pd.get_dummies(
            test_features, columns=categorical_cols, dummy_na=True, dtype=float
        )
        test_features_aligned = test_features_encoded.reindex(
            columns=encoded_cols, fill_value=0
        )
        test_features_scaled = scaler.transform(test_features_aligned[numerical_cols])
        test_features_final = pd.DataFrame(
            test_features_scaled,
            index=test_features_aligned.index,
            columns=numerical_cols,
        ).fillna(0)
        test_processed_features = test_features_final[cols_to_keep]
    return train_features_final, train_targets, test_processed_features

## Step 3: Loop, Train, and Save Predictions

This is the main part of the notebook. We will loop through all 16 tasks. For each task, we will:
1. Load the training data and the test features.
2. Preprocess both.
3. Train a model on the training data.
4. Generate predictions on the processed test features.
5. Save the predictions to a CSV file with the correct name.

In [144]:
def get_model_for_pair_and_date(
    index_event: str, outcome_event: str, model_date: int | None = None, verbose: bool = False
):
    """Return an XGBoost Cox model for one (index event, outcome event) task.

    A model file under ``MODEL_CACHE_DIR`` named after the event pair and
    ``model_date`` is loaded if it exists and can be read. Otherwise the
    task's ``data.csv`` is loaded from ``DATA_PATH``, preprocessed, a new
    model is trained and a best-effort attempt is made to save it.

    NOTE: cached model files are reused as-is; delete them after changing
    the preprocessing or the training target, otherwise stale models load.

    Args:
        index_event: Name of the index event (first directory level).
        outcome_event: Name of the outcome event (second directory level).
        model_date: Optional cutoff forwarded to ``preprocess``; ``None`` is
            written as ``"latest"`` in the model file name.
        verbose: Print progress messages.

    Returns:
        The loaded or freshly trained ``XGBRegressor``.

    Raises:
        Exception: Whatever ``model.fit`` raises is re-raised after logging.
    """
    # normalize model_date for filename
    model_date_str = str(model_date) if model_date is not None else "latest"
    model_filename = f"xgboost_cox_{index_event}_{outcome_event}_{model_date_str}.ubj"
    model_path = os.path.join(MODEL_CACHE_DIR, model_filename)

    # Cox objective on GPU. `device="cuda"` alone selects GPU training and
    # prediction; the legacy `predictor="gpu_predictor"` option is obsolete
    # in XGBoost releases that understand `device`.
    model = XGBRegressor(
        objective="survival:cox",
        eval_metric="cox-nloglik",
        tree_method="hist",
        device="cuda",
        random_state=42,
        verbosity=0,
        max_bin=64,
        learning_rate=0.04,
        max_depth=5,
        subsample=0.85,
        colsample_bytree=0.8,
        min_child_weight=5,
        reg_lambda=1.0,
        reg_alpha=0.1,
    )

    # Reuse a cached model when possible; fall through to retraining on failure.
    if os.path.exists(model_path):
        if verbose:
            print(f"Loading existing model from {model_path}")
        try:
            model.load_model(model_path)
            if verbose:
                print(f"model loaded from {model_path}")
            return model
        except Exception as e:
            print(f"Warning: failed to load model from {model_path}: {e}. Will retrain.")

    dataset_path = os.path.join(index_event, outcome_event)

    # --- Load and Preprocess ---
    if verbose:
        print(f"Loading data from {os.path.join(DATA_PATH, dataset_path, 'data.csv')}")
    train_df = pd.read_csv(os.path.join(DATA_PATH, dataset_path, "data.csv"))

    X_train, y_train, _ = preprocess(train_df, model_date=model_date)

    # --- Train Model ---
    # XGBoost's survival:cox objective takes the survival time as the label
    # and marks right-censored rows with a NEGATIVE time. It does not take
    # the event flag as label or durations as sample weights.
    # NOTE(review): assumes status is truthy when the outcome event was
    # observed and falsy when the row is censored — confirm against the data
    # description. Censored rows with timeDiff == 0 cannot be marked this way.
    durations = y_train["timeDiff"].to_numpy(dtype=float)
    event_observed = y_train["status"].to_numpy().astype(bool)
    cox_labels = np.where(event_observed, durations, -durations)

    if verbose:
        print("Training model...")
    try:
        model.fit(X_train, cox_labels)
    except Exception as e:
        print(f"ERROR: Model training failed for {dataset_path}: {e}")
        raise

    # Saving is best-effort: try the estimator API first, then the raw Booster.
    try:
        model.save_model(model_path)
    except Exception as estimator_error:
        if verbose:
            print(
                f"Estimator save_model failed ({estimator_error}); "
                "falling back to Booster.save_model"
            )
        try:
            model.get_booster().save_model(model_path)
        except Exception as e:
            print(f"Warning: Failed to save model to {model_path}: {e}")
        else:
            if verbose:
                print(f"Model booster saved to {model_path}")
    else:
        if verbose:
            print(f"Model saved to {model_path}")

    return model

In [145]:
def train_models_for_all_event_pairs(
    model_date: int | None = None, verbose: bool = False
):
    """Train (or load from cache) one model per index/outcome event pair.

    Every ordered pair drawn from the five event types is processed except
    Liquidated -> Liquidated, i.e. 5 * 5 - 1 = 24 pairs. Models are produced
    by ``get_model_for_pair_and_date``; nothing is returned and no prediction
    files are written here.

    Args:
        model_date: Optional cutoff forwarded to each model build.
        verbose: Print a banner per pair and a final completion message.
    """
    index_events = ["Liquidated", "Borrow", "Deposit", "Repay", "Withdraw"]
    outcome_events = index_events
    event_pairs = [
        (index_event, outcome_event)
        for index_event in index_events
        for outcome_event in outcome_events
        if (index_event, outcome_event) != ("Liquidated", "Liquidated")
    ]

    for index_event, outcome_event in event_pairs:
        if verbose:
            print(f"\n{'='*50}")
            print(f"Training for: {index_event} -> {outcome_event}")
            print(f"{'='*50}")

        get_model_for_pair_and_date(index_event, outcome_event, model_date=model_date, verbose=verbose)

    if verbose:
        # This function only trains/caches models, so say exactly that.
        print("\n\nAll models have been trained.")

In [None]:
# Train (or load cached) models for every event pair with a fixed cutoff.
# Read as Unix seconds, 1751328000 is 2025-07-01 00:00:00 UTC — presumably the
# same unit as the `timestamp` column it is compared against; verify.
train_models_for_all_event_pairs(model_date=1751328000, verbose=True)


Training for: Liquidated -> Borrow
Loading data from ./data/Liquidated/Borrow/data.csv


Training model...
Model booster saved to ./cache/models/xgboost_cox_Liquidated_Borrow_1751328000.ubj

Training for: Liquidated -> Deposit
Loading data from ./data/Liquidated/Deposit/data.csv
Training model...
Model booster saved to ./cache/models/xgboost_cox_Liquidated_Deposit_1751328000.ubj

Training for: Liquidated -> Repay
Loading data from ./data/Liquidated/Repay/data.csv
Training model...
Model booster saved to ./cache/models/xgboost_cox_Liquidated_Repay_1751328000.ubj

Training for: Liquidated -> Withdraw
Loading data from ./data/Liquidated/Withdraw/data.csv
Training model...
Model booster saved to ./cache/models/xgboost_cox_Liquidated_Withdraw_1751328000.ubj

Training for: Borrow -> Liquidated
Loading data from ./data/Borrow/Liquidated/data.csv
Training model...
Model booster saved to ./cache/models/xgboost_cox_Borrow_Liquidated_1751328000.ubj

Training for: Borrow -> Borrow
Loading data from ./data/Borrow/Borrow/data.csv
Training model...
Model booster saved to ./cache/models/x

In [None]:
def get_date_ranges():
    """Return ``(train_dates, test_dates)`` as bi-weekly ``DatetimeIndex`` objects.

    The observed time span is taken from the ``timestamp`` column of the
    Withdraw/Withdraw task. Training cutoffs cover 40%-80% of that span and
    test cutoffs cover 80%-100%, each sampled with ``freq="2W"``. The result
    is pickled to ``CACHE_DIR/date_ranges.pkl`` and reused on later calls.

    NOTE: an existing cache file is returned unchanged; delete it to force
    the ranges to be recomputed.
    """
    cache_path = os.path.join(CACHE_DIR, "date_ranges.pkl")
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pkl.load(f)

    date_df = pd.read_csv(
        os.path.join(DATA_PATH, "Withdraw", "Withdraw", "data.csv"),
        usecols=["timestamp"],
    )
    min_date = date_df["timestamp"].min()
    max_date = date_df["timestamp"].max()
    span = max_date - min_date

    # The timestamps are numeric epoch values compared elsewhere against
    # second-resolution cutoffs. pandas reads a bare number as NANOSECONDS,
    # which would place every date in January 1970, so state the unit.
    train_start_date = pd.to_datetime(min_date + 0.4 * span, unit="s")
    test_start_date = pd.to_datetime(min_date + 0.8 * span, unit="s")
    end_date = pd.to_datetime(max_date, unit="s")

    train_dates = pd.date_range(start=train_start_date, end=test_start_date, freq="2W")
    test_dates = pd.date_range(start=test_start_date, end=end_date, freq="2W")
    with open(cache_path, "wb") as f:
        pkl.dump((train_dates, test_dates), f)
    return train_dates, test_dates

In [None]:
# get_date_ranges() returns a (train_dates, test_dates) pair. Flatten it so
# the loop sees individual timestamps rather than the two DatetimeIndex
# objects themselves.
for date in chain.from_iterable(get_date_ranges()):
    # Timestamp.value is nanoseconds since the epoch; model_date is in seconds.
    train_models_for_all_event_pairs(model_date=date.value // 10**9, verbose=True)