In [None]:
import pandas as pd
import numpy as np

# Read the pivoted training and evaluation tables. Both are plain
# comma-separated files, which is the delimiter pandas assumes by default.
train_enhanced, eval_enhanced = map(
    pd.read_csv,
    ('../data/train_pivot4.csv', '../data/eval_pivot4.csv'),
)

In [None]:
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sksurv.metrics import concordance_index_censored
import xgboost as xgb
import optuna

# Column roles in the training table.
time_col, event_col = "OS_YEARS", "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}

# Every column that is neither a survival target nor the identifier is used
# as a model feature; the original column order is preserved.
feature_cols = [name for name in train_enhanced.columns if name not in exclude_cols]

# Feature matrix: cast to float and turn +/-inf into NaN so that the array
# only holds finite values or NaN.
X_df = (
    train_enhanced.loc[:, feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)
X = X_df.to_numpy(dtype=float)

# Survival targets as plain NumPy arrays.
event_vals = train_enhanced[event_col].to_numpy(dtype=int)  # 1 = event, 0 = censored
time_vals = train_enhanced[time_col].to_numpy(dtype=float)

In [None]:
# C-index custom metric (XGBoost >= 2.x prefers `custom_metric`)
def cindex_custom_metric(preds, dmatrix):
    """Concordance index of ``preds`` against the survival data stored in ``dmatrix``.

    The DMatrix is read with the following layout:

    * label  -> observed time,
    * weight -> event indicator (non-zero means the event was observed).

    When the DMatrix carries no weights, every row is treated as an event.

    Parameters
    ----------
    preds : numpy.ndarray
        Predicted risk scores, one per row of ``dmatrix``.
    dmatrix : xgboost.DMatrix
        Matrix holding the time / event information as described above.

    Returns
    -------
    tuple[str, float]
        ``("cindex", value)`` in the form expected from a custom metric.
    """
    times = dmatrix.get_label().astype(float)
    weights = dmatrix.get_weight()

    has_weights = weights is not None and len(weights) > 0
    indicator = weights if has_weights else np.ones_like(times)
    observed = indicator.astype(bool)

    # concordance_index_censored returns a tuple; the first entry is the C-index.
    score, *_ = concordance_index_censored(observed, times, preds.astype(float))
    return "cindex", float(score)

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold validation C-index of an XGBoost Cox model.

    Samples the booster hyperparameters and the boosting budget
    (``num_boost_round``, ``early_stopping_rounds``) from ``trial``, then
    trains one ``survival:cox`` model per fold of the module-level ``X``,
    ``time_vals`` and ``event_vals`` arrays. Each fold uses early stopping on
    its validation C-index and is scored at its best iteration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean C-index over the 5 validation folds (higher is better).
    """
    params = {
        "objective": "survival:cox",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        "tree_method": "hist",
        "seed": 42,
        # no eval_metric here -> the custom metric is used instead
    }

    num_boost_round = trial.suggest_int("num_boost_round", 50, 2000)
    early_stopping_rounds = trial.suggest_int("early_stopping_rounds", 20, 200)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    c_indices = []

    for train_idx, val_idx in kf.split(X):
        X_tr, X_va = X[train_idx], X[val_idx]
        t_tr, t_va = time_vals[train_idx], time_vals[val_idx]
        e_tr, e_va = event_vals[train_idx], event_vals[val_idx]

        # `survival:cox` takes the censoring status from the SIGN of the label:
        # positive = event time, negative = right-censored time. Passing the
        # event flag as `weight` instead would make the booster treat every
        # row as an event and merely zero out the censored rows' gradients.
        signed_t_tr = np.where(e_tr.astype(bool), t_tr, -t_tr)
        dtrain = xgb.DMatrix(X_tr, label=signed_t_tr)

        # The validation matrix is only consumed by `cindex_custom_metric`,
        # which reads the time from the label and the event flag from the
        # weight, so it keeps that layout.
        dvalid = xgb.DMatrix(X_va, label=t_va, weight=e_va.astype(float))

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, "valid")],
            custom_metric=cindex_custom_metric,
            maximize=True,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=False,
        )

        # Score the fold with the trees up to (and including) the best iteration.
        pred_va = model.predict(dvalid, iteration_range=(0, model.best_iteration + 1))
        c = concordance_index_censored(e_va.astype(bool), t_va, pred_va)[0]
        c_indices.append(c)

    return float(np.mean(c_indices))

In [None]:
# Seed the sampler so the hyperparameter search is reproducible, like the
# other random components of this notebook (KFold, XGBoost, hold-out split),
# which all use seed 42. TPE is the sampler Optuna uses by default for
# single-objective studies, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb_cindex_tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

print("Meilleurs hyperparamètres:")
print(study.best_params)
print(f"\nMeilleur C-index moyen: {study.best_value:.4f}")

In [None]:
# The boosting budget is passed to `xgb.train` as separate arguments, so it is
# pulled out of the tuned values instead of staying in the booster parameters.
best_num_boost_round = study.best_params["num_boost_round"]
best_early_stopping_rounds = study.best_params["early_stopping_rounds"]

# Booster parameters: every tuned value except the two budget settings ...
best_params = {
    name: value
    for name, value in study.best_params.items()
    if name not in ("num_boost_round", "early_stopping_rounds")
}
# ... plus the fixed settings that were not part of the search space.
best_params["objective"] = "survival:cox"
best_params["tree_method"] = "hist"
best_params["seed"] = 42

In [None]:
# Final fit with early stopping on an internal hold-out split (80 % / 20 %).
idx = np.arange(X.shape[0])
tr_idx, va_idx = train_test_split(idx, test_size=0.2, random_state=42, shuffle=True)

# `survival:cox` takes the censoring status from the SIGN of the label:
# positive = event time, negative = right-censored time. The training matrix
# therefore carries sign-encoded times and no weights; passing the event flag
# as `weight` would make the booster treat every row as an event and merely
# zero out the censored rows' gradients.
dtrain_full = xgb.DMatrix(
    X[tr_idx],
    label=np.where(event_vals[tr_idx].astype(bool), time_vals[tr_idx], -time_vals[tr_idx]),
)
# The validation matrix is only consumed by `cindex_custom_metric`, which reads
# the time from the label and the event flag from the weight.
dvalid_full = xgb.DMatrix(X[va_idx], label=time_vals[va_idx], weight=event_vals[va_idx].astype(float))

model_tuned = xgb.train(
    best_params,
    dtrain_full,
    num_boost_round=best_num_boost_round,
    evals=[(dvalid_full, "valid")],
    custom_metric=cindex_custom_metric,
    maximize=True,
    early_stopping_rounds=best_early_stopping_rounds,
    verbose_eval=False,
)

In [None]:
# Predict on the evaluation set.
# The evaluation features go through the same preprocessing as the training
# matrix: cast to float and map +/-inf to NaN. NaN is deliberately NOT replaced
# by 0.0, because the model was trained with NaN marking missing values, and
# filling with zeros here would feed it a value it never saw in that role.
X_eval_final = (
    eval_enhanced[feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
    .to_numpy(dtype=float)
)
deval_final = xgb.DMatrix(X_eval_final)

# Use only the trees up to (and including) the early-stopped best iteration.
risk_score_tuned = model_tuned.predict(deval_final, iteration_range=(0, model_tuned.best_iteration + 1))

submission_tuned = pd.DataFrame({
    "ID": eval_enhanced["ID"],
    "risk_score": risk_score_tuned
})

submission_tuned.to_csv('../submissions/submission_xgb_cindex_tuned.csv', index=False)
print("Prédictions sauvegardées dans '../submissions/submission_xgb_cindex_tuned.csv'")