In [27]:
import pandas as pd
train_enhanced = pd.read_csv('../../data/train_pivot.csv', sep = ',')
eval_enhanced = pd.read_csv('../../data/eval_pivot.csv', sep = ',')


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.util import Surv

time_col = "OS_YEARS"
event_col = "OS_STATUS"

# Build feature matrix X by dropping time/event/ID from train_enhanced
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [c for c in train_enhanced.columns if c not in exclude_cols]




X = train_enhanced[feature_cols].to_numpy(dtype=float, na_value=0.0)

time_vals = train_enhanced[time_col].to_numpy(dtype=float)
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)

df = pd.DataFrame({
    'time': time_vals,
    'event': event_vals
})

y = Surv.from_dataframe("event", "time", df)



Résumé du plan:
- Préparer les données à partir de train_enhanced et y (temps et statut).
- Entraîner un XGB AFT avec K-fold cross-validation (K=5 par défaut) et évaluer les scores via concordance (C-index).
- Former le modèle sur l’ensemble des données et faire des prédictions sur le jeu d’évaluation, en générant un CSV avec les colonnes "ID" et "risk_score" (risque = opposé du temps de survie).
- Sauvegarder le CSV final dans le répertoire approprié.

In [29]:
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb  # <--- ajout important

# Chargement des données existantes
train_enhanced = pd.read_csv('../../data/train_pivot.csv', sep = ',')
eval_enhanced = pd.read_csv('../../data/eval_pivot.csv', sep = ',')

time_col = "OS_YEARS"
event_col = "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [c for c in train_enhanced.columns if c not in exclude_cols]

X = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
time_vals = train_enhanced[time_col].to_numpy(dtype=float)     # temps bruts
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)    # 1 = event, 0 = censuré

# Préparer y pour XGBoost AFT : log(1 + temps)
y_log_time = np.log1p(time_vals)


In [33]:
for gene in ['Gene_EP300', 'Gene_KMT2C', 'Gene_KMT2D', 'Gene_ZBTB33']:
    eval_enhanced[gene] = 0

In [30]:
# Paramètres AFT
params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 1,
    "gamma": 0.0,
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 1.0,
    "tree_method": "hist",
    "seed": 42,
}

num_boost_round = 200

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

c_indices = []
fold = 1

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train_log, y_val_log = y_log_time[train_idx], y_log_time[val_idx]
    e_train, e_val = event_vals[train_idx], event_vals[val_idx]

    # Bornes inf / sup en log-temps :
    # - non censuré : [log(t), log(t)]
    # - censuré à droite : [log(t), +inf]
    lb_train = y_train_log.copy()
    ub_train = y_train_log.copy()
    ub_train[e_train == 0] = np.inf

    lb_val = y_val_log.copy()
    ub_val = y_val_log.copy()
    ub_val[e_val == 0] = np.inf

    dtrain = xgb.DMatrix(X_train)
    dtrain.set_float_info("label_lower_bound", lb_train)
    dtrain.set_float_info("label_upper_bound", ub_train)

    dval = xgb.DMatrix(X_val)
    dval.set_float_info("label_lower_bound", lb_val)
    dval.set_float_info("label_upper_bound", ub_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "valid")],
        verbose_eval=False,
    )

    # Prédictions de log-temps sur la validation
    y_pred_log = model.predict(dval)

    # C-index : on prend -y_pred_log comme "risque"
    c_index = concordance_index_censored(
    e_val.astype(bool),      # event_indicator
    time_vals[val_idx],      # event_time
    -y_pred_log)[0]


    c_indices.append(c_index)
    print(f"Fold {fold} — C-index: {c_index:.4f}")
    fold += 1

print("\nC-index moyen :", np.mean(c_indices))


Fold 1 — C-index: 0.7447
Fold 2 — C-index: 0.7090
Fold 3 — C-index: 0.7428
Fold 4 — C-index: 0.7243
Fold 5 — C-index: 0.7225

C-index moyen : 0.7286490227992678


## Entraînement sur l'ensemble complet et prédiction

In [18]:
X_full = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
y_full_log_time = np.log1p(time_vals)

lb_full = y_full_log_time.copy()
ub_full = y_full_log_time.copy()
ub_full[event_vals == 0] = np.inf

dfull = xgb.DMatrix(X_full)
dfull.set_float_info("label_lower_bound", lb_full)
dfull.set_float_info("label_upper_bound", ub_full)

model_full = xgb.train(
    params,
    dfull,
    num_boost_round=num_boost_round,
    verbose_eval=False
)

## Tuning des hyperparamètres XGBoost AFT avec Optuna

In [None]:
!pip install optuna

In [19]:
import optuna
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
from sksurv.metrics import concordance_index_censored

In [20]:
def objective(trial):
    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "aft_loss_distribution": trial.suggest_categorical("aft_loss_distribution", ["normal", "logistic", "extreme"]),
        "aft_loss_distribution_scale": trial.suggest_float("aft_loss_distribution_scale", 0.5, 2.0),
        "tree_method": "hist",
        "seed": 42,
    }

    num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    c_indices = []

    for train_idx, val_idx in kf.split(X):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_log_fold, y_val_log_fold = y_log_time[train_idx], y_log_time[val_idx]
        e_train_fold, e_val_fold = event_vals[train_idx], event_vals[val_idx]

        lb_train_fold = y_train_log_fold.copy()
        ub_train_fold = y_train_log_fold.copy()
        ub_train_fold[e_train_fold == 0] = np.inf

        lb_val_fold = y_val_log_fold.copy()
        ub_val_fold = y_val_log_fold.copy()
        ub_val_fold[e_val_fold == 0] = np.inf

        dtrain_fold = xgb.DMatrix(X_train_fold)
        dtrain_fold.set_float_info("label_lower_bound", lb_train_fold)
        dtrain_fold.set_float_info("label_upper_bound", ub_train_fold)

        dval_fold = xgb.DMatrix(X_val_fold)
        dval_fold.set_float_info("label_lower_bound", lb_val_fold)
        dval_fold.set_float_info("label_upper_bound", ub_val_fold)

        model_fold = xgb.train(
            params,
            dtrain_fold,
            num_boost_round=num_boost_round,
            evals=[(dval_fold, "valid")],
            verbose_eval=False,
        )

        y_pred_log_fold = model_fold.predict(dval_fold)

        c_index = concordance_index_censored(
            e_val_fold.astype(bool),
            time_vals[val_idx],
            -y_pred_log_fold
        )[0]

        c_indices.append(c_index)

    return np.mean(c_indices)

In [21]:
study = optuna.create_study(direction="maximize", study_name="xgb_aft_tuning")
study.optimize(objective, n_trials=300, show_progress_bar=True)

[I 2025-11-24 17:10:59,647] A new study created in memory with name: xgb_aft_tuning


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2025-11-24 17:11:01,899] Trial 0 finished with value: 0.7228901437080701 and parameters: {'learning_rate': 0.06684253851047778, 'max_depth': 8, 'subsample': 0.7191637417159951, 'colsample_bytree': 0.6871405827938297, 'min_child_weight': 1, 'gamma': 0.7929523904783164, 'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 0.7416834986901565, 'num_boost_round': 353}. Best is trial 0 with value: 0.7228901437080701.
[I 2025-11-24 17:11:02,647] Trial 1 finished with value: 0.7226005505820919 and parameters: {'learning_rate': 0.20430005465369513, 'max_depth': 7, 'subsample': 0.9343667944796191, 'colsample_bytree': 0.9564666468924323, 'min_child_weight': 5, 'gamma': 1.8832948214869805, 'aft_loss_distribution': 'extreme', 'aft_loss_distribution_scale': 0.7480330233604198, 'num_boost_round': 164}. Best is trial 0 with value: 0.7228901437080701.
[I 2025-11-24 17:11:04,059] Trial 2 finished with value: 0.7188858567142618 and parameters: {'learning_rate': 0.17776548850021057, 'max_d

In [22]:
print("Meilleurs hyperparamètres:")
print(study.best_params)
print(f"\nMeilleur C-index moyen: {study.best_value:.4f}")

Meilleurs hyperparamètres:
{'learning_rate': 0.02404406172779825, 'max_depth': 5, 'subsample': 0.7859952955851598, 'colsample_bytree': 0.5277007919390605, 'min_child_weight': 3, 'gamma': 4.557664036425712, 'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 0.5341149197604264, 'num_boost_round': 415}

Meilleur C-index moyen: 0.7413


In [23]:
best_params = study.best_params
best_params["objective"] = "survival:aft"
best_params["eval_metric"] = "aft-nloglik"
best_params["tree_method"] = "hist"
best_params["seed"] = 42

best_num_boost_round = best_params.pop("num_boost_round")

In [35]:
best_num_boost_round

415

In [24]:
lb_full_final = y_full_log_time.copy()
ub_full_final = y_full_log_time.copy()
ub_full_final[event_vals == 0] = np.inf

dfull_final = xgb.DMatrix(X_full)
dfull_final.set_float_info("label_lower_bound", lb_full_final)
dfull_final.set_float_info("label_upper_bound", ub_full_final)

model_tuned = xgb.train(
    best_params,
    dfull_final,
    num_boost_round=best_num_boost_round,
    verbose_eval=False
)

In [34]:
n_models = 10
all_predictions = []

for seed in range(n_models):
    # Set seed for this model
    params_with_seed = best_params.copy()
    params_with_seed["seed"] = seed

    # Prepare data
    lb_full_seed = y_full_log_time.copy()
    ub_full_seed = y_full_log_time.copy()
    ub_full_seed[event_vals == 0] = np.inf

    dfull_seed = xgb.DMatrix(X_full)
    dfull_seed.set_float_info("label_lower_bound", lb_full_seed)
    dfull_seed.set_float_info("label_upper_bound", ub_full_seed)

    # Train model with current seed
    model_seed = xgb.train(
        params_with_seed,
        dfull_seed,
        num_boost_round=best_num_boost_round,
        verbose_eval=False
    )

    # Predict on evaluation set
    X_eval_seed = eval_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
    deval_seed = xgb.DMatrix(X_eval_seed)

    y_pred_log_seed = model_seed.predict(deval_seed)
    all_predictions.append(-y_pred_log_seed)

    print(f"Model {seed + 1}/{n_models} trained")

# Average predictions from all models
risk_score_averaged = np.mean(all_predictions, axis=0)

submission_averaged = pd.DataFrame({
    'ID': eval_enhanced['ID'],
    'risk_score': risk_score_averaged
})

submission_averaged.to_csv('../../submissions/submission_xgb_aft_tuned_avg10.csv', index=False)
print(f"\nAveraged predictions from {n_models} models saved to 'submission_xgb_aft_tuned_avg10.csv'")


Model 1/10 trained
Model 2/10 trained
Model 3/10 trained
Model 4/10 trained
Model 5/10 trained
Model 6/10 trained
Model 7/10 trained
Model 8/10 trained
Model 9/10 trained
Model 10/10 trained

Averaged predictions from 10 models saved to 'submission_xgb_aft_tuned_avg10.csv'


In [None]:
import plotly.express as px

fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [None]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()
