In [None]:
import pandas as pd
import numpy as np

# Load the pre-computed "pivot4" feature tables (comma-separated CSV):
# the first path is the training table, the second the evaluation table.
train_enhanced, eval_enhanced = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('../data/train_pivot4.csv', '../data/eval_pivot4.csv')
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb  # <--- important

# Column roles: survival time, event indicator, and the identifier column.
# Every remaining column of the training table is used as a model feature.
time_col = "OS_YEARS"
event_col = "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [name for name in train_enhanced.columns if name not in exclude_cols]

# Feature frame cast to float, with +/-inf mapped to NaN.
X_df = (
    train_enhanced[feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)

# Plain NumPy view of the features for index-based fold slicing.
X = X_df.to_numpy(dtype=float)

# Raw observed times and the integer event flag (1 = event, 0 = censored).
time_vals = train_enhanced[time_col].to_numpy(dtype=float)
event_vals = train_enhanced[event_col].to_numpy(dtype=int)

In [None]:
# ## Tuning des hyperparamètres XGBoost COX avec Optuna
import optuna
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
from sksurv.metrics import concordance_index_censored

def objective(trial):
    """Optuna objective: mean 5-fold cross-validated C-index of an XGBoost Cox model.

    Reads the module-level arrays ``X`` (features), ``time_vals`` (observed
    times) and ``event_vals`` (1 = event, 0 = censored).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the booster hyperparameters and the number of
        boosting rounds.

    Returns
    -------
    float
        Mean concordance index over the validation folds (higher is better).
    """
    params = {
        "objective": "survival:cox",
        "eval_metric": "cox-nloglik",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        "tree_method": "hist",
        "seed": 42,
    }

    num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

    # XGBoost's survival:cox objective reads censoring from the SIGN of the
    # label: positive = observed event time, negative = right-censored time.
    # Passing the event flag as `weight` does not encode censoring; it only
    # zeroes the gradient of censored rows while they are still counted as
    # events in the partial likelihood.
    is_event = event_vals.astype(bool)
    signed_time = np.where(is_event, time_vals, -time_vals)

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    c_indices = []

    for train_idx, val_idx in kf.split(X):
        dtrain_fold = xgb.DMatrix(X[train_idx], label=signed_time[train_idx])
        # Prediction only needs the features; no label is required here.
        dval_fold = xgb.DMatrix(X[val_idx])

        # No `evals=`: the per-round validation metric was never consumed
        # (no early stopping, no logging), so computing it was wasted work.
        model_fold = xgb.train(
            params,
            dtrain_fold,
            num_boost_round=num_boost_round,
            verbose_eval=False,
        )

        # For survival:cox, predict() returns values on the hazard-ratio scale
        # (exp of the margin): larger = higher risk. The C-index depends only
        # on the ranking, so no transformation is needed.
        pred_val = model_fold.predict(dval_fold)

        c_index = concordance_index_censored(
            is_event[val_idx],
            time_vals[val_idx],
            pred_val,
        )[0]

        c_indices.append(c_index)

    return float(np.mean(c_indices))

In [None]:
# Seed the sampler explicitly: the CV folds and the booster are already seeded,
# but an unseeded sampler makes every run explore different hyperparameters.
# TPESampler is Optuna's default sampler, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb_cox_tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

In [None]:
# Report the best trial: its hyperparameters, a blank line, then the mean CV C-index.
print(
    "Meilleurs hyperparamètres:",
    study.best_params,
    f"\nMeilleur C-index moyen: {study.best_value:.4f}",
    sep="\n",
)

In [None]:
# Hard-coded hyperparameter values (presumably copied from an earlier study
# run; confirm they are still current before relying on them).
best_params = {
    'learning_rate': 0.01426075160919655,
    'max_depth': 3,
    'subsample': 0.5784653179987965,
    'colsample_bytree': 0.5780335399700407,
    'min_child_weight': 8,
    'gamma': 3.613478865761625,
    'reg_alpha': 0.10045067420915038,
    'reg_lambda': 3.2166983232864625,
    'num_boost_round': 446,
}

# The number of rounds is an argument of xgb.train, not a booster parameter,
# so it is taken out of the dict before the fixed settings are appended.
best_num_boost_round = best_params.pop("num_boost_round")

best_params.update(
    objective="survival:cox",
    eval_metric="cox-nloglik",
    tree_method="hist",
    seed=42,
)

In [None]:
# Final fit on the full training set, prediction on the evaluation set, then
# write the submission file.

# Same preprocessing as the cross-validation matrix `X`: cast to float and map
# +/-inf to NaN. NaN is left as-is (XGBoost treats it as missing), so the final
# model sees the data the way the tuned hyperparameters were evaluated, and
# infinite values no longer reach DMatrix.
X_full = (
    train_enhanced[feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
    .to_numpy(dtype=float)
)
t_full = time_vals
e_full = event_vals.astype(float)

# survival:cox reads censoring from the SIGN of the label (negative = right-
# censored). Passing the event flag as `weight` does not encode censoring.
y_full = np.where(e_full > 0, t_full, -t_full)

dfull_final = xgb.DMatrix(X_full, label=y_full)

model_tuned = xgb.train(
    best_params,
    dfull_final,
    num_boost_round=best_num_boost_round,
    verbose_eval=False
)

# Evaluation features go through exactly the same preprocessing as training.
X_eval_final = (
    eval_enhanced[feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
    .to_numpy(dtype=float)
)
deval_final = xgb.DMatrix(X_eval_final)

# For survival:cox, predict() returns the hazard-ratio scale (exp of the
# margin): larger = higher risk. Ranking-based metrics are unaffected.
risk_score_tuned = model_tuned.predict(deval_final)

submission_tuned = pd.DataFrame({
    "ID": eval_enhanced["ID"],
    "risk_score": risk_score_tuned
})

submission_tuned.to_csv('../submissions/submission_xgb_cox_tuned.csv', index=False)
print("Prédictions avec hyperparamètres optimisés sauvegardées dans '../submissions/submission_xgb_cox_tuned.csv'")