In [None]:
import numpy as np
import pandas as pd

import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split

from pycox.datasets import metabric

from xgbse.converters import convert_to_structured
from xgbse import XGBSEDebiasedBCE
from xgbse.metrics import concordance_index

# Prepare data

In [None]:
# Load the dataset from a parquet file; the path is relative to the
# notebook's working directory.
data = pd.read_parquet("data/train_val_data.parquet.gzip")

In [None]:
# Every column whose name contains "target", followed by the 'term' column.
target_cols = [name for name in data.columns if "target" in name]
target_cols.append('term')
target_cols

In [None]:
# Hand-picked base features.
features = [
    "annual_inc",
    "loan_amnt",
    "fico_average",
    "emp_length",
    "acc_open_past_24mths",
    "avg_cur_bal",
    "sub_grade",
    "int_rate",
    "revol_util",
]

# Add every column whose name contains the "home_ownership_ohe_" marker.
features.extend(
    name for name in data.columns if "home_ownership_ohe_" in name
)

# Keep only the feature and target columns, discarding rows with any missing
# value among them.
# NOTE: a check that no rows are lost
# (data.shape[0] == data_used_cols.shape[0]) is intentionally left disabled,
# so dropna() may shrink the dataset here.
data_used_cols = data[features + target_cols].dropna()

features

In [None]:
# Separate the cleaned frame into the X / T / E pieces used for survival
# modelling: T is taken from 'target_month' and E from 'target'.
E = data_used_cols['target']
T = data_used_cols['target_month']
y = data_used_cols[['target', 'target_month']]

# Only the three target columns are removed; every other column of
# data_used_cols (including 'term') stays in X.
X = data_used_cols.drop(columns=['target', 'target_month', 'target_xgb'])

# Structured representation of (T, E) built by xgbse's converter.
y_structured = convert_to_structured(T, E)

In [None]:
def train_xgbse_model(X_train, y_train, X_val, y_val, xgb_params, lr_params,
                      time_bins=None, num_boost_round=3):
    """Fit an ``XGBSEDebiasedBCE`` model and predict on a validation set.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame or array-like
        Training / validation features. DataFrames are converted to NumPy
        arrays; anything else is passed through unchanged. Both sets are
        handled the same way so the model is fit and queried on the same
        kind of input.
    y_train
        Training target, forwarded as-is to ``fit``.
    y_val
        Unused; kept so existing callers keep working.
    xgb_params : dict
        Forwarded to ``XGBSEDebiasedBCE(xgb_params=...)``.
    lr_params : dict
        Forwarded to ``XGBSEDebiasedBCE(lr_params=...)``.
    time_bins : array-like, optional
        Time grid passed to ``fit``. Defaults to ``np.arange(1, 61, 1)``.
    num_boost_round : int, default 3
        Number of boosting rounds passed to ``fit``.

    Returns
    -------
    tuple
        ``(fitted_model, y_pred)`` where ``y_pred`` is whatever
        ``fitted_model.predict`` returns for the validation features.
    """
    if time_bins is None:
        time_bins = np.arange(1, 61, 1)

    def _as_array(features):
        # Strip the pandas wrapper so fit and predict see the same input type.
        return features.values if isinstance(features, pd.DataFrame) else features

    xgbse_model = XGBSEDebiasedBCE(
        xgb_params=xgb_params,
        lr_params=lr_params,
    )
    xgbse_model.fit(
        _as_array(X_train),
        y_train,
        time_bins=time_bins,
        num_boost_round=num_boost_round,
    )
    y_pred = xgbse_model.predict(_as_array(X_val))

    return xgbse_model, y_pred

In [None]:
def objective(trial, n_splits=3, random_state=42):
    """Optuna objective: mean cross-validated C-index of an XGBSE model.

    Draws XGBoost and logistic-regression hyperparameters from ``trial``,
    trains one model per stratified fold of the module-level ``X`` /
    ``y_structured`` data via ``train_xgbse_model``, and scores each fold
    with ``concordance_index``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    n_splits : int, default 3
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the fold shuffling. A fixed seed makes every trial use the
        same folds, so trial scores are comparable and repeatable.

    Returns
    -------
    float
        Mean C-index over the folds (the value the study maximizes).
    """
    # Without a seed the shuffle would produce different folds for every
    # trial, mixing split noise into the hyperparameter comparison.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    xgb_params = {
        "objective": "survival:aft",  # for survival analysis
        "eval_metric": "aft-nloglik",  # for survival analysis
        "aft_loss_distribution": trial.suggest_categorical("aft_loss_distribution", ["normal", "logistic", "extreme"]),
        "aft_loss_distribution_scale": trial.suggest_float('aft_loss_distribution_scale', 0.5, 1.5, log=True),
        "tree_method": "auto",  # shouldn't change
        "learning_rate": trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int('max_depth', 3, 10, log=True),
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "subsample": trial.suggest_float('subsample', 0.5, 1, log=True),
        "min_child_weight": 50,  # fixed value chosen beforehand, not tuned
        "colsample_bynode": trial.suggest_float('colsample_bynode', 1e-3, 1, log=True),
    }
    lr_params = {
        'C': trial.suggest_float('C', 1e-5, 1e-1, log=True),
        'max_iter': 100,  # doesn't really change the outcome
    }

    # Train and test a model on each fold.
    # NOTE(review): folds are stratified on 'target_month' rather than on the
    # 'target' column -- confirm this is the intended stratification.
    metrics = []
    for train_index, val_index in skf.split(X, y['target_month']):
        # Integer-array indexing already yields fresh objects, so no explicit
        # copies are needed.
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y_structured[train_index], y_structured[val_index]

        print("started_train")
        model, prob_for_month = train_xgbse_model(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            xgb_params=xgb_params,
            lr_params=lr_params,
        )

        print("started_calculate")
        c_ind = concordance_index(y_val, prob_for_month)

        metrics.append(c_ind)
        print("C_INDEX", round(c_ind, 5))

    return np.mean(metrics)

In [None]:
# Maximize the objective's return value (mean C-index). The TPE sampler is
# given an explicit seed so its random draws are repeatable between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

Best C-index: 0.6843114772562473

In [None]:
# Display the hyperparameters of the best trial found by the study.
study.best_params