# FinSurvival Competition: Starter Notebook (AFT Model Prediction Submission)

**Objective:** This notebook provides a workflow for creating a valid prediction submission using the `WeibullAFTFitter` model. The competition requires you to submit a `.zip` file containing 16 separate prediction files in CSV format.

This notebook will guide you through:
1.  Loading the training and test sets for each of the 16 tasks from a single directory.
2.  Training a model (the original starter used `WeibullAFTFitter`; the code below trains an XGBoost Cox model, `XGBRegressor` with `objective="survival:cox"`).
3.  Generating predictions on the test set in the required format.
4.  Saving each set of predictions to a correctly named CSV file.
5.  Zipping all 16 prediction files for submission.

## Step 1: Setup and Imports

In [7]:
# Install required packages
# pip install -q pandas lifelines==0.27.8 scikit-learn==1.2.2 scikit-survival==0.21.0 numpy xgboost

# Import libraries
import pandas as pd
import numpy as np
import os
import shutil
from lifelines import WeibullAFTFitter
from lifelines.exceptions import ConvergenceError
from sklearn.preprocessing import StandardScaler
from typing import Tuple, Optional
from utils.constants import *
import pickle as pkl
from xgboost import XGBRegressor
from sksurv.metrics import concordance_index_censored

# Module-level cache for loaded/trained models to reuse across calls.
# NOTE(review): nothing in the visible code reads or writes this dict —
# confirm it is still used elsewhere before relying on it.
MODELS_CACHE: dict = {}

# Module-level cache for preprocessing artifacts (scaler, columns, categories).
# preprocess() keys it by the task prefix string it builds from the
# "Index Event" / "Outcome Event" values (plus the optional model_date), and
# stores a dict with the keys "scaler", "train_cols", "top_categories_dict",
# "categorical_cols", "numerical_cols" and "cols_to_keep".
PREPROCESS_CACHE: dict = {}

def get_concordance_index(
    test_df: pd.DataFrame,
    predictions: np.ndarray
) -> float:
    """
    Calculates the concordance index for survival models using scikit-survival.

    Args:
        test_df: Frame with a 'status' column (converted to bool and used as
            the event indicator) and a 'timeDiff' column (used as the
            observed time).
        predictions: One score per row of ``test_df`` where a LARGER value
            means LONGER predicted survival. The scores are negated before
            being handed to ``concordance_index_censored`` as the risk
            estimate. NaN entries are treated as -1.

    Returns:
        The concordance index, or 0.5 (a neutral score) when the event
        indicator has fewer than two distinct values (all censored, all
        events, or an empty test set).
    """
    # Work on a float copy so the caller's array is never modified in place.
    scores = np.array(predictions, dtype=float)

    # Replace NaN predictions with a value representing the worst possible score (shortest survival)
    # Using -1 is a robust way to handle failed predictions without causing numerical errors.
    scores[np.isnan(scores)] = -1

    event_indicator = test_df['status'].astype(bool)
    event_time = test_df['timeDiff']

    # Handle cases where all events are censored or all are non-censored in
    # the test set (or the test set is empty): no meaningful ranking exists.
    if len(np.unique(event_indicator)) < 2:
        return 0.5  # Return a neutral score

    c_index, _, _, _, _ = concordance_index_censored(
        event_indicator, event_time, -scores
    )

    return c_index

## Step 2: Define a Preprocessing Function

Even though you are not submitting this code, you will still need a preprocessing pipeline to train your models effectively. You can use the one below as a starting point.

In [8]:
def preprocess(
    train_df_with_labels: Optional[pd.DataFrame] = None,
    test_features_df: Optional[pd.DataFrame] = None,
    model_date: Optional[int] = None,
) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """Turn raw task data into scaled, one-hot encoded feature frames.

    When training data is given, the preprocessing artifacts (scaler, column
    lists, top categories) are fitted on it, pickled under ``DATA_CACHE_DIR``
    and stored in ``PREPROCESS_CACHE``. When test features are given, the
    artifacts for the same task are taken from ``PREPROCESS_CACHE`` (or loaded
    from the pickles) and applied to the test rows.

    Args:
        train_df_with_labels: Training rows including the "timeDiff" and
            "status" target columns, or None to skip fitting.
        test_features_df: Test rows to transform with previously fitted
            artifacts, or None.
        model_date: Optional cutoff. Training rows whose
            ``timestamp + timeDiff`` exceeds it are dropped. A
            ``pd.Timestamp`` is converted with ``.timestamp()``; other values
            are converted with ``float()`` when possible. It is also embedded
            in the artifact file names.

    Returns:
        ``(train_features, train_targets, test_features)``. The first two are
        None when no training data was given; the last is None when no test
        features were given.

    Raises:
        ValueError: If both ``train_df_with_labels`` and ``test_features_df``
            are None.
        FileNotFoundError: If only test features are given and no artifacts
            for the task exist in memory or on disk.
    """
    present_dataframe = (
        train_df_with_labels if train_df_with_labels is not None else test_features_df
    )
    if present_dataframe is None:
        raise ValueError(
            "preprocess() requires train_df_with_labels and/or test_features_df."
        )

    # Create unique prefix for saving/loading preprocessing objects
    unique_prefix = (
        present_dataframe["Index Event"].iloc[0]
        + "_"
        + present_dataframe["Outcome Event"].iloc[0]
        + (f"_{model_date}_" if model_date is not None else "_")
    )

    # Paths for saving/loading preprocessing objects, keyed by artifact name.
    # File names are "<prefix><name>.pkl", unchanged from earlier versions so
    # existing caches stay loadable.
    artifact_names = (
        "scaler",
        "train_cols",
        "top_categories_dict",
        "categorical_cols",
        "numerical_cols",
        "cols_to_keep",
    )
    artifact_paths = {
        name: os.path.join(DATA_CACHE_DIR, f"{unique_prefix}{name}.pkl")
        for name in artifact_names
    }

    target_columns = ["timeDiff", "status"]
    cols_to_drop = [
        "id",
        "user",
        "pool",
        "Index Event",
        "Outcome Event",
        "type",
        "timestamp",
    ]

    train_features_final = None
    train_targets = None
    if train_df_with_labels is not None:
        if model_date is not None:
            # model_date may be a pandas.Timestamp while dataframe timestamps are numeric.
            # Convert model_date to numeric epoch seconds for a safe comparison.
            if isinstance(model_date, pd.Timestamp):
                model_date_value = model_date.timestamp()
            else:
                try:
                    model_date_value = float(model_date)
                except (TypeError, ValueError):
                    # Not convertible: compare against the raw value.
                    model_date_value = model_date

            observation_end = (
                train_df_with_labels["timestamp"] + train_df_with_labels["timeDiff"]
            )
            train_df_with_labels = train_df_with_labels[
                observation_end <= model_date_value
            ]

        # Separate features and targets (and drop unneeded columns from features)
        train_targets = train_df_with_labels[target_columns]
        train_features = train_df_with_labels.drop(
            columns=target_columns + cols_to_drop, errors="ignore"
        )

        # Make uncommon categories "Other" and one-hot encode categorical features
        categorical_cols = train_features.select_dtypes(
            include=["object", "category"]
        ).columns
        top_categories_dict = {}
        for col in categorical_cols:
            top_categories_dict[col] = (
                train_features[col].value_counts().nlargest(10).index
            )
            train_features[col] = train_features[col].where(
                train_features[col].isin(top_categories_dict[col]), "Other"
            )
        # dtype=float: pandas >= 2.0 emits bool dummies by default, which the
        # numeric select_dtypes below would silently discard.
        train_features_encoded = pd.get_dummies(
            train_features,
            columns=categorical_cols,
            dummy_na=True,
            drop_first=True,
            dtype=float,
        )

        # Standardize numerical features
        numerical_cols = train_features_encoded.select_dtypes(include=np.number).columns
        scaler = StandardScaler()
        train_features_scaled = scaler.fit_transform(
            train_features_encoded[numerical_cols]
        )
        train_features_final = pd.DataFrame(
            train_features_scaled,
            index=train_features_encoded.index,
            columns=numerical_cols,
        ).fillna(0)

        # Remove zero-variance columns
        cols_to_keep = train_features_final.columns[train_features_final.var() != 0]
        train_features_final = train_features_final[cols_to_keep]

        fitted_artifacts = {
            "scaler": scaler,
            "train_cols": train_features_encoded.columns,
            "top_categories_dict": top_categories_dict,
            "categorical_cols": categorical_cols,
            "numerical_cols": numerical_cols,
            "cols_to_keep": cols_to_keep,
        }

        # Save preprocessing objects (create the cache directory on first use)
        if DATA_CACHE_DIR:
            os.makedirs(DATA_CACHE_DIR, exist_ok=True)
        for name, path in artifact_paths.items():
            with open(path, "wb") as f:
                pkl.dump(fitted_artifacts[name], f)

        # Populate in-memory preprocess cache for fast reuse
        PREPROCESS_CACHE[unique_prefix] = fitted_artifacts

    # Process test features if provided
    test_processed_features = None
    if test_features_df is not None:
        artifacts = PREPROCESS_CACHE.get(unique_prefix)
        if artifacts is None or any(name not in artifacts for name in artifact_names):
            # Load everything into a local dict first so a missing/corrupt
            # file cannot leave a half-filled entry in PREPROCESS_CACHE.
            # NOTE: pickle executes code on load; DATA_CACHE_DIR must only
            # contain files written by this function.
            loaded_artifacts = {}
            for name, path in artifact_paths.items():
                with open(path, "rb") as f:
                    loaded_artifacts[name] = pkl.load(f)
            PREPROCESS_CACHE[unique_prefix] = loaded_artifacts
            artifacts = loaded_artifacts

        test_features = test_features_df.drop(columns=cols_to_drop, errors="ignore")
        categorical_cols = artifacts["categorical_cols"]
        top_categories_dict = artifacts["top_categories_dict"]
        for col in categorical_cols:
            top_categories = top_categories_dict[col]
            test_features[col] = test_features[col].where(
                test_features[col].isin(top_categories), "Other"
            )
        # Encode every level present in the test data. The reference level
        # that training dropped is removed by the reindex below; using
        # drop_first here would drop the first level *seen in the test set*,
        # which can differ from the training one and zero out a real category.
        test_features_encoded = pd.get_dummies(
            test_features,
            columns=categorical_cols,
            dummy_na=True,
            drop_first=False,
            dtype=float,
        )
        test_features_aligned = test_features_encoded.reindex(
            columns=artifacts["train_cols"], fill_value=0
        )
        numerical_cols = artifacts["numerical_cols"]
        test_features_scaled = artifacts["scaler"].transform(
            test_features_aligned[numerical_cols]
        )
        test_features_final = pd.DataFrame(
            test_features_scaled,
            index=test_features_aligned.index,
            columns=numerical_cols,
        ).fillna(0)
        test_processed_features = test_features_final[artifacts["cols_to_keep"]]

    return train_features_final, train_targets, test_processed_features

## Step 3: Loop, Train, and Save Predictions

This is the main part of the notebook. We will loop through all 16 tasks. For each task, we will:
1. Load the training data and the test features.
2. Preprocess both.
3. Train a model on the training data.
4. Generate predictions on the processed test features.
5. Save the predictions to a CSV file with the correct name.

In [None]:
# Define path to the single participant data folder.
PARTICIPANT_DATA_PATH = "./data/"

# Candidate (index event, outcome event) pairs: the full 5 x 5 product.
# Pairs that have no data.csv on disk are skipped inside the loop.
index_events = ["Borrow", "Deposit", "Repay", "Withdraw", "Liquidated"]
outcome_events = index_events
event_pairs = [
    (index_event, outcome_event)
    for index_event in index_events
    for outcome_event in outcome_events
]

# Time-based split: rows in the last 30 days before the reference timestamp
# form the local test set. These values do not depend on the task.
buffer_duration = 30 * 60 * 60 * 24
train_cutoff = 1722526142 - buffer_duration
reference_cols = ["timeDiff", "status"]

for index_event, outcome_event in event_pairs:
    print(f"\n{'='*50}")
    print(f"Processing and Predicting for: {index_event} -> {outcome_event}")
    print(f"{'='*50}")

    dataset_path = os.path.join(index_event, outcome_event)

    # --- Load and Preprocess ---
    try:
        data_df = pd.read_csv(
            os.path.join(PARTICIPANT_DATA_PATH, dataset_path, "data.csv")
        )
    except FileNotFoundError:
        print(f"Data not found for {dataset_path}. Skipping.")
        continue

    train_df = data_df[data_df["timestamp"] <= train_cutoff]
    test_df = data_df[data_df["timestamp"] > train_cutoff]
    if train_df.empty or test_df.empty:
        # preprocess() reads the first row of each frame, so an empty split
        # cannot be processed.
        print(f"Empty train or test split for {dataset_path}. Skipping.")
        continue

    feature_cols = [col for col in train_df.columns if col not in reference_cols]
    test_features_df = test_df[feature_cols]
    test_references_df = test_df[reference_cols]

    X_train, y_train, X_test_processed = preprocess(train_df, test_features_df)

    # --- Train Model ---
    try:
        model = XGBRegressor(
            objective="survival:cox",
            eval_metric="cox-nloglik",
            tree_method="hist",
            predictor="gpu_predictor",
            device="cuda",
            seed=42,
            verbosity=0,
            max_bin=64,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=5,
            reg_lambda=1.0,
            reg_alpha=0.1,
        )

        # survival:cox expects the label to be the survival time, with a
        # NEGATIVE sign marking right-censored rows. Rows with a non-positive
        # duration cannot carry that sign convention and are left out.
        y_train_duration = y_train["timeDiff"].to_numpy(dtype=float)
        y_train_event = y_train["status"].to_numpy().astype(bool)
        usable_rows = y_train_duration > 0
        cox_labels = np.where(y_train_event, y_train_duration, -y_train_duration)
        model.fit(X_train[usable_rows], cox_labels[usable_rows])

        # --- Generate and Save Predictions ---
        print(f"Generating predictions for {dataset_path}...")
        # Use the processed test features to make predictions
        predictions = model.predict(X_test_processed, output_margin=True)

        # The cox margin is a log-hazard (higher = shorter survival), while
        # get_concordance_index expects higher = longer survival, so negate.
        print(f"Concordance index: {get_concordance_index(test_references_df, -predictions)}")

    except (ConvergenceError, ValueError) as e:
        print(
            f"\nERROR: The model for {dataset_path} failed to train. No prediction file will be created."
        )
        print(f"Details: {e}")

print("\n\nAll prediction files have been generated.")


Processing and Predicting for: Borrow -> Borrow
Generating predictions for Borrow/Borrow...
