In [5]:
import pandas as pd
import numpy as np
# Load the training / evaluation tables and the molecular tables from ../data.
# A comma is already read_csv's default delimiter, so `sep` is left implicit.
train_enhanced = pd.read_csv('../data/train_pivot2.csv')
eval_enhanced = pd.read_csv('../data/eval_pivot2.csv')

# Note: the evaluation-side molecular table is read from molecular_val.csv.
molecular_train = pd.read_csv('../data/molecular_train.csv')
molecular_eval = pd.read_csv('../data/molecular_val.csv')

In [6]:
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb  # <--- ajout important

# The two survival-target columns and the identifier are kept out of the feature matrix.
time_col, event_col = "OS_YEARS", "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [name for name in train_enhanced.columns if name not in exclude_cols]

# Cast every feature to float and map +/-inf to NaN in a single chain.
X_df = (
    train_enhanced.loc[:, feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)
X = X_df.to_numpy(dtype=float)

# Raw survival times and the event flag (True = event observed, False = censored).
# NOTE(review): casting to bool turns a NaN status into True — confirm the
# status column has no missing values.
time_vals = train_enhanced[time_col].to_numpy(dtype=float)
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)

# Label used for the XGBoost AFT model: log(1 + time).
# NOTE(review): per the XGBoost AFT tutorial the objective already models the
# log of the label, so this looks like a double log transform — confirm intended.
y_log_time = np.log1p(time_vals)


In [7]:
# Baseline AFT configuration, scored with shuffled 5-fold cross-validation.
params = dict(
    objective="survival:aft",
    eval_metric="aft-nloglik",
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    gamma=0.0,
    aft_loss_distribution="normal",
    aft_loss_distribution_scale=1.0,
    tree_method="hist",
    seed=42,
)

num_boost_round = 200
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


def _aft_dmatrix(features, log_times, observed):
    """Wrap one split in a DMatrix carrying AFT interval labels.

    Bounds are expressed on the log-time scale:
      - observed event:  [log(t), log(t)]
      - right-censored:  [log(t), +inf]
    """
    upper_bound = np.where(observed, log_times, np.inf)
    matrix = xgb.DMatrix(features, missing=np.nan)
    matrix.set_float_info("label_lower_bound", log_times)
    matrix.set_float_info("label_upper_bound", upper_bound)
    return matrix


c_indices = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    dtrain = _aft_dmatrix(X[train_idx], y_log_time[train_idx], event_vals[train_idx])
    dval = _aft_dmatrix(X[val_idx], y_log_time[val_idx], event_vals[val_idx])

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "valid")],
        verbose_eval=False,
    )

    # Predictions on the validation split, on the (log-)time scale of the labels.
    y_pred_log = model.predict(dval)

    # A longer predicted time means lower risk, so the negated prediction
    # serves as the risk score for the concordance index.
    c_index = concordance_index_censored(
        event_vals[val_idx],   # event indicator (already boolean)
        time_vals[val_idx],    # raw event / censoring time
        -y_pred_log,
    )[0]

    c_indices.append(c_index)
    print(f"Fold {fold} — C-index: {c_index:.4f}")

print("\nC-index moyen :", np.mean(c_indices))


Fold 1 — C-index: 0.7511
Fold 2 — C-index: 0.7143
Fold 3 — C-index: 0.7444
Fold 4 — C-index: 0.7314
Fold 5 — C-index: 0.7314

C-index moyen : 0.734531922677729


## Tuning des hyperparamètres XGBoost AFT avec Optuna

In [8]:
import optuna
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
from sksurv.metrics import concordance_index_censored

In [9]:
def objective(trial):
    """Optuna objective: mean 5-fold C-index of an XGBoost AFT model.

    Hyperparameters (including the number of boosting rounds) are drawn from
    ``trial``. The feature matrix ``X``, the labels ``y_log_time``, the event
    flags ``event_vals`` and the raw times ``time_vals`` are read from the
    notebook globals.

    Returns the mean concordance index over the validation folds; the study
    is expected to maximise it.
    """
    # Dict entries are evaluated top to bottom, so the suggest_* calls are
    # issued in exactly the order written here.
    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "aft_loss_distribution": trial.suggest_categorical(
            "aft_loss_distribution", ["normal", "logistic", "extreme"]
        ),
        "aft_loss_distribution_scale": trial.suggest_float(
            "aft_loss_distribution_scale", 0.5, 2.0
        ),
        "tree_method": "hist",
        "seed": 42,
    }
    num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

    def fold_matrix(rows):
        """DMatrix for ``rows`` with AFT bounds: [log t, log t] or [log t, +inf] if censored."""
        log_t = y_log_time[rows]
        upper = np.where(event_vals[rows], log_t, np.inf)
        dmat = xgb.DMatrix(X[rows])
        dmat.set_float_info("label_lower_bound", log_t)
        dmat.set_float_info("label_upper_bound", upper)
        return dmat

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for train_idx, val_idx in splitter.split(X):
        dtrain_fold = fold_matrix(train_idx)
        dval_fold = fold_matrix(val_idx)

        booster = xgb.train(
            params,
            dtrain_fold,
            num_boost_round=num_boost_round,
            evals=[(dval_fold, "valid")],
            verbose_eval=False,
        )

        # Negated predicted (log-)time is used as the risk score.
        risk = -booster.predict(dval_fold)
        score = concordance_index_censored(
            event_vals[val_idx],
            time_vals[val_idx],
            risk,
        )[0]
        fold_scores.append(score)

    return np.mean(fold_scores)

In [10]:
# Seed the sampler so the search proposes the same sequence of trials on every
# Restart & Run All; the CV folds and XGBoost are already seeded with 42.
# TPESampler is Optuna's default single-objective sampler, so only the seed
# differs from the unseeded default.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb_aft_tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

[I 2025-12-07 18:05:16,526] A new study created in memory with name: xgb_aft_tuning


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2025-12-07 18:05:17,823] Trial 0 finished with value: 0.7352262199152506 and parameters: {'learning_rate': 0.09686890964451904, 'max_depth': 10, 'subsample': 0.5189633881864015, 'colsample_bytree': 0.601053919246853, 'min_child_weight': 1, 'gamma': 3.3328006098233587, 'aft_loss_distribution': 'extreme', 'aft_loss_distribution_scale': 1.79937461603429, 'num_boost_round': 346}. Best is trial 0 with value: 0.7352262199152506.
[I 2025-12-07 18:05:20,519] Trial 1 finished with value: 0.7377955995451175 and parameters: {'learning_rate': 0.023291838381897426, 'max_depth': 7, 'subsample': 0.9929754943734367, 'colsample_bytree': 0.614798016544017, 'min_child_weight': 4, 'gamma': 3.180037661646641, 'aft_loss_distribution': 'extreme', 'aft_loss_distribution_scale': 0.5780304250686013, 'num_boost_round': 415}. Best is trial 1 with value: 0.7377955995451175.

In [11]:
# Report the best trial: its hyperparameters, then its mean cross-validated C-index.
print("Meilleurs hyperparamètres:", study.best_params, sep="\n")
print(f"\nMeilleur C-index moyen: {study.best_value:.4f}")

Meilleurs hyperparamètres:
{'learning_rate': 0.014638913223416235, 'max_depth': 6, 'subsample': 0.5201235534964794, 'colsample_bytree': 0.6013067573963837, 'min_child_weight': 10, 'gamma': 3.997367708061503, 'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 1.0360201374878424, 'num_boost_round': 450}

Meilleur C-index moyen: 0.7447


In [12]:
# Split the tuned values into the boosting-round count (an xgb.train argument)
# and the booster parameters, then add back the fixed settings used during tuning.
tuned_values = dict(study.best_params)
best_num_boost_round = tuned_values.pop("num_boost_round")

best_params = {
    **tuned_values,
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "tree_method": "hist",
    "seed": 42,
}

In [14]:
# Refit on the whole training set with the tuned hyperparameters.
# NOTE(review): missing features are zero-filled here, whereas the
# cross-validation folds left NaN for XGBoost to handle as missing — confirm
# which treatment is intended, and keep it identical at prediction time.
X_full = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
y_full_log_time = np.log1p(time_vals)

# AFT interval labels on the log-time scale:
#   observed event -> [log(1+t), log(1+t)], right-censored -> [log(1+t), +inf]
lb_full = y_full_log_time.copy()
ub_full = y_full_log_time.copy()
ub_full[event_vals == 0] = np.inf

# Build the labelled DMatrix once; a second, identical copy of the bounds and
# of the matrix served no purpose and doubled the memory footprint.
dfull_final = xgb.DMatrix(X_full)
dfull_final.set_float_info("label_lower_bound", lb_full)
dfull_final.set_float_info("label_upper_bound", ub_full)

model_tuned = xgb.train(
    best_params,
    dfull_final,
    num_boost_round=best_num_boost_round,
    verbose_eval=False
)

In [15]:
# Display the evaluation table as a visual sanity check before predicting on it.
eval_enhanced

Unnamed: 0,ANC,BM_BLAST,CHR_10_count,CHR_11_count,CHR_12_count,CHR_13_count,CHR_14_count,CHR_15_count,CHR_16_count,CHR_17_count,...,n_splicing_mut,n_signaling_mut,TP53_VAF_interaction,ANC_WBC_ratio,BLAST_WBC_ratio,major_clone_VAF,subclonality,karyo_score_clinical,vaf_entropy,PLT_HB_ratio
0,-0.526541,9.489051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,-0.000000,-0.695632,13.086583,-0.171875,-1.606796,2.0,0.204085,-2.415017
1,-0.283032,4.671533,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,1.0,-0.000000,-0.406703,7.007401,1.113281,-107.637397,2.0,-0.251553,-0.517147
2,2.488359,1.069229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,-0.000000,0.895365,0.401619,-0.394531,-0.520674,1.0,-0.051060,-0.316623
3,0.019929,8.467153,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.000000,0.016184,7.177764,0.000000,0.128167,0.0,0.121087,-1.419494
4,-0.470069,-0.145985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,-0.000000,-1.874263,-0.607624,-0.082031,-4.304717,7.0,-0.030057,-1.039061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,-0.263454,0.374182,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.000000,-0.392811,0.582395,0.183594,-0.496069,0.0,-0.138957,0.055340
1189,-0.263454,0.374182,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,1.0,0.000000,-0.392811,0.582395,0.859375,0.749892,0.0,0.030148,0.055340
1190,-0.263454,0.374182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.0,0.0,-0.671280,-0.392811,0.582395,-1.195312,0.301222,0.0,-0.640841,0.055340
1191,-0.263454,0.374182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.000000,-0.392811,0.582395,0.308594,2.395448,0.0,0.083659,0.055340


In [16]:
# Build the evaluation matrix with the same columns and the same zero-fill
# that were used to fit `model_tuned`.
eval_features = eval_enhanced.loc[:, feature_cols].fillna(0.0)
X_eval_final = eval_features.to_numpy(dtype=float)
deval_final = xgb.DMatrix(X_eval_final)

# Negate the predicted (log-)time so that a higher score means higher risk.
y_pred_log_eval_tuned = model_tuned.predict(deval_final)
risk_score_tuned = np.negative(y_pred_log_eval_tuned)


In [17]:

# Two-column submission: the evaluation IDs next to their predicted risk scores.
submission_tuned = eval_enhanced[['ID']].assign(risk_score=risk_score_tuned)

# Written without the index column.
submission_tuned.to_csv('../submissions/submission_xgb_aft_tuned.csv', index=False)
print("Prédictions avec hyperparamètres optimisés sauvegardées dans '../submissions/submission_xgb_aft_tuned.csv'")

Prédictions avec hyperparamètres optimisés sauvegardées dans '../submissions/submission_xgb_aft_tuned.csv'


In [18]:
import plotly.express as px

# Render Optuna's optimization-history figure for the finished study.
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [19]:
# Render Optuna's hyperparameter-importance figure for the finished study.
fig = optuna.visualization.plot_param_importances(study)
fig.show()
