In [84]:
import pandas as pd
import numpy as np
# Load the pivoted training and evaluation feature tables.
# Both files are comma-separated, which is pandas' default delimiter.
train_enhanced = pd.read_csv('../data/train_pivot4.csv')
eval_enhanced = pd.read_csv('../data/eval_pivot4.csv')


In [85]:
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb  # <--- ajout important

# Column roles: survival time, event indicator, and the columns that are not features.
time_col = "OS_YEARS"
event_col = "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [name for name in train_enhanced.columns if name not in exclude_cols]

# Feature matrix as floats; +/-inf is mapped to NaN and NaN is left in place.
X_df = train_enhanced[feature_cols].astype(float).replace([np.inf, -np.inf], np.nan)
X = X_df.to_numpy(dtype=float)

# Raw survival times and the event flag (per the original note: 1 = event, 0 = censored).
time_vals = train_enhanced[time_col].to_numpy(dtype=float)
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)

# Target used as the AFT label bounds: log(1 + time).
# NOTE(review): the XGBoost AFT tutorial passes label bounds on the raw time scale;
# confirm that pre-applying log1p here is intentional.
y_log_time = np.log1p(time_vals)


## Tuning des hyperparamètres XGBoost AFT avec Optuna

In [87]:
import optuna
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
from sksurv.metrics import concordance_index_censored

In [88]:
def _make_aft_dmatrix(features, log_times, events):
    """Build a DMatrix carrying right-censored AFT label bounds.

    Rows with an observed event get lower == upper == ``log_times``; censored
    rows (``events == 0``) keep ``log_times`` as lower bound and get ``+inf``
    as upper bound.
    """
    upper = log_times.copy()
    upper[events == 0] = np.inf

    dmat = xgb.DMatrix(features)
    dmat.set_float_info("label_lower_bound", log_times)
    dmat.set_float_info("label_upper_bound", upper)
    return dmat


def objective(trial):
    """Optuna objective: mean 5-fold C-index of an XGBoost AFT model.

    Reads the module-level arrays ``X``, ``y_log_time``, ``event_vals`` and
    ``time_vals``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the booster hyperparameters and the number of
        boosting rounds.

    Returns
    -------
    float
        Mean concordance index over the validation folds (higher is better).
    """
    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "aft_loss_distribution": trial.suggest_categorical("aft_loss_distribution", ["normal", "logistic", "extreme"]),
        "aft_loss_distribution_scale": trial.suggest_float("aft_loss_distribution_scale", 0.5, 2.0),
        "tree_method": "hist",
        "seed": 42,
    }

    num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

    # Fixed seed: every trial is scored on the same five splits.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    c_indices = []
    for train_idx, val_idx in kf.split(X):
        dtrain_fold = _make_aft_dmatrix(
            X[train_idx], y_log_time[train_idx], event_vals[train_idx]
        )
        # The validation matrix is only used for prediction, so it needs no labels.
        dval_fold = xgb.DMatrix(X[val_idx])

        # No `evals`: there is no early stopping, so a per-round validation
        # metric would be computed and discarded on every boosting round.
        model_fold = xgb.train(
            params,
            dtrain_fold,
            num_boost_round=num_boost_round,
            verbose_eval=False,
        )

        y_pred_log_fold = model_fold.predict(dval_fold)

        # A longer predicted time means lower risk, hence the sign flip.
        c_index = concordance_index_censored(
            event_vals[val_idx].astype(bool),
            time_vals[val_idx],
            -y_pred_log_fold,
        )[0]
        c_indices.append(c_index)

    return np.mean(c_indices)

In [89]:
# Seed the TPE sampler so that re-running the search proposes the same sequence of
# trials; with the default unseeded sampler every run explores different points.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb_aft_tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# n_trials is an upper bound on the search length.
study.optimize(objective, n_trials=1000, show_progress_bar=True)

[I 2025-12-09 11:10:03,144] A new study created in memory with name: xgb_aft_tuning


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2025-12-09 11:10:05,186] Trial 0 finished with value: 0.7413274946711315 and parameters: {'learning_rate': 0.019499447786243088, 'max_depth': 10, 'subsample': 0.5032452619412264, 'colsample_bytree': 0.5113262560886567, 'min_child_weight': 10, 'gamma': 2.4586256360010648, 'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 1.1951919586569284, 'num_boost_round': 437}. Best is trial 0 with value: 0.7413274946711315.
[I 2025-12-09 11:10:05,584] Trial 1 finished with value: 0.7282549375592631 and parameters: {'learning_rate': 0.015322670257256275, 'max_depth': 3, 'subsample': 0.8186496287992275, 'colsample_bytree': 0.5964763340860882, 'min_child_weight': 4, 'gamma': 4.190063771973546, 'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 1.8765066494113685, 'num_boost_round': 63}. Best is trial 0 with value: 0.7413274946711315.
[I 2025-12-09 11:10:05,584] Trial 1 finished with value: 0.7282549375592631 and parameters: {'learning_rate': 0.015322670257256275, 'm

KeyboardInterrupt: 

In [90]:
# Report the best hyperparameters found so far and their mean cross-validated C-index.
print(
    "Meilleurs hyperparamètres:",
    study.best_params,
    f"\nMeilleur C-index moyen: {study.best_value:.4f}",
    sep="\n",
)

Meilleurs hyperparamètres:
{'learning_rate': 0.011870438652519421, 'max_depth': 9, 'subsample': 0.5949698347663898, 'colsample_bytree': 0.5452475647691543, 'min_child_weight': 5, 'gamma': 4.2161833883837625, 'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 0.544997726146525, 'num_boost_round': 416}

Meilleur C-index moyen: 0.7453


In [93]:
# Hyperparameters used for the final fit, with the fixed AFT settings included.
# NOTE(review): these values are pasted by hand and differ from the study output
# recorded in this notebook -- presumably from a separate run; confirm their origin.
best_params = {
    'learning_rate': 0.012380227718107034,
    'max_depth': 10,
    'subsample': 0.5264787658184986,
    'colsample_bytree': 0.7044114064671851,
    'min_child_weight': 10,
    'gamma': 3.7157564777650958,
    'aft_loss_distribution': 'normal',
    'aft_loss_distribution_scale': 1.1746647891548192,
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "tree_method": "hist",
    "seed": 42,
}

# The number of boosting rounds is an argument of xgb.train, not a booster parameter,
# so it is kept out of the params dict.
best_num_boost_round = 402

In [94]:
# --- Final fit on the full training set and submission export ---

# Prepare the features the same way as for the cross-validation matrix `X`:
# +/-inf becomes NaN and NaN is left in place rather than being filled with 0,
# so the final model sees data prepared like the data the search was scored on.
X_full = (
    train_enhanced[feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
    .to_numpy(dtype=float)
)
y_full_log_time = np.log1p(time_vals)

# AFT interval labels: lower == upper for observed events, upper = +inf when censored.
lb_full_final = y_full_log_time.copy()
ub_full_final = y_full_log_time.copy()
ub_full_final[event_vals == 0] = np.inf

dfull_final = xgb.DMatrix(X_full)
dfull_final.set_float_info("label_lower_bound", lb_full_final)
dfull_final.set_float_info("label_upper_bound", ub_full_final)

model_tuned = xgb.train(
    best_params,
    dfull_final,
    num_boost_round=best_num_boost_round,
    verbose_eval=False
)

# Evaluation features go through the identical preprocessing as the training features.
X_eval_final = (
    eval_enhanced[feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
    .to_numpy(dtype=float)
)
deval_final = xgb.DMatrix(X_eval_final)

# A longer predicted time means lower risk, hence the sign flip.
y_pred_log_eval_tuned = model_tuned.predict(deval_final)
risk_score_tuned = -y_pred_log_eval_tuned


submission_tuned = pd.DataFrame({
    'ID': eval_enhanced['ID'],
    'risk_score': risk_score_tuned
})

# Single source of truth for the output path so the log message cannot drift from
# the file that is actually written.
submission_path = '../submissions/submission_xgb_aft_tuned.csv'
submission_tuned.to_csv(submission_path, index=False)
print(f"Prédictions avec hyperparamètres optimisés sauvegardées dans '{submission_path}'")

Prédictions avec hyperparamètres optimisés sauvegardées dans '../submissions/submission_xgb_aft_tuned2.csv'
