In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.util import Surv

time_col = "OS_YEARS"
event_col = "OS_STATUS"

# Build feature matrix X by dropping time/event/ID from train_enhanced
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [c for c in train_enhanced.columns if c not in exclude_cols]




X = train_enhanced[feature_cols].to_numpy(dtype=float, na_value=0.0)

time_vals = train_enhanced[time_col].to_numpy(dtype=float)
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)

df = pd.DataFrame({
    'time': time_vals,
    'event': event_vals
})

y = Surv.from_dataframe("event", "time", df)



Résumé du plan:
- Préparer les données à partir de train_enhanced et y (temps et statut).
- Entraîner un XGB AFT avec K-fold cross-validation (K=5 par défaut) et évaluer les scores via concordance (C-index).
- Former le modèle sur l’ensemble des données et faire des prédictions sur le jeu d’évaluation, en générant un CSV avec les colonnes "ID" et "risk_score" (risque = opposé du temps de survie).
- Sauvegarder le CSV final dans le répertoire approprié.

In [6]:
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb  # <--- ajout important

# Chargement des données existantes
train_enhanced = pd.read_csv('../../data/train_enhanced_V2.csv', sep = ',')
eval_enhanced = pd.read_csv('../../data/eval_enhanced_V2.csv', sep = ',')

time_col = "OS_YEARS"
event_col = "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [c for c in train_enhanced.columns if c not in exclude_cols]

X = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
time_vals = train_enhanced[time_col].to_numpy(dtype=float)     # temps bruts
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)    # 1 = event, 0 = censuré

# Préparer y pour XGBoost AFT : log(1 + temps)
y_log_time = np.log1p(time_vals)


In [7]:
# Paramètres AFT
params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 1,
    "gamma": 0.0,
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 1.0,
    "tree_method": "hist",
    "seed": 42,
}

num_boost_round = 200

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

c_indices = []
fold = 1

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train_log, y_val_log = y_log_time[train_idx], y_log_time[val_idx]
    e_train, e_val = event_vals[train_idx], event_vals[val_idx]

    # Bornes inf / sup en log-temps :
    # - non censuré : [log(t), log(t)]
    # - censuré à droite : [log(t), +inf]
    lb_train = y_train_log.copy()
    ub_train = y_train_log.copy()
    ub_train[e_train == 0] = np.inf

    lb_val = y_val_log.copy()
    ub_val = y_val_log.copy()
    ub_val[e_val == 0] = np.inf

    dtrain = xgb.DMatrix(X_train)
    dtrain.set_float_info("label_lower_bound", lb_train)
    dtrain.set_float_info("label_upper_bound", ub_train)

    dval = xgb.DMatrix(X_val)
    dval.set_float_info("label_lower_bound", lb_val)
    dval.set_float_info("label_upper_bound", ub_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "valid")],
        verbose_eval=False,
    )

    # Prédictions de log-temps sur la validation
    y_pred_log = model.predict(dval)

    # C-index : on prend -y_pred_log comme "risque"
    c_index = concordance_index_censored(
    e_val.astype(bool),      # event_indicator
    time_vals[val_idx],      # event_time
    -y_pred_log)[0]


    c_indices.append(c_index)
    print(f"Fold {fold} — C-index: {c_index:.4f}")
    fold += 1

print("\nC-index moyen :", np.mean(c_indices))


Fold 1 — C-index: 0.7531
Fold 2 — C-index: 0.7108
Fold 3 — C-index: 0.7426
Fold 4 — C-index: 0.7266
Fold 5 — C-index: 0.7233

C-index moyen : 0.7312803131959719


## Entraînement sur l'ensemble complet et prédiction

In [8]:
X_full = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
y_full_log_time = np.log1p(time_vals)

lb_full = y_full_log_time.copy()
ub_full = y_full_log_time.copy()
ub_full[event_vals == 0] = np.inf

dfull = xgb.DMatrix(X_full)
dfull.set_float_info("label_lower_bound", lb_full)
dfull.set_float_info("label_upper_bound", ub_full)

model_full = xgb.train(
    params,
    dfull,
    num_boost_round=num_boost_round,
    verbose_eval=False
)

## Tuning des hyperparamètres XGBoost AFT avec Optuna

In [6]:
!pip install optuna

'pip' is not recognized as an internal or external command,
operable program or batch file.


In [10]:
import optuna
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
from sksurv.metrics import concordance_index_censored

In [11]:
def objective(trial):
    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "aft_loss_distribution": trial.suggest_categorical("aft_loss_distribution", ["normal", "logistic", "extreme"]),
        "aft_loss_distribution_scale": trial.suggest_float("aft_loss_distribution_scale", 0.3, 2.0),
        "tree_method": "hist",
        "seed": 42,
    }

    num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

    n_splits = 7
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    c_indices = []

    for train_idx, val_idx in kf.split(X):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_log_fold, y_val_log_fold = y_log_time[train_idx], y_log_time[val_idx]
        e_train_fold, e_val_fold = event_vals[train_idx], event_vals[val_idx]

        lb_train_fold = y_train_log_fold.copy()
        ub_train_fold = y_train_log_fold.copy()
        ub_train_fold[e_train_fold == 0] = np.inf

        lb_val_fold = y_val_log_fold.copy()
        ub_val_fold = y_val_log_fold.copy()
        ub_val_fold[e_val_fold == 0] = np.inf

        dtrain_fold = xgb.DMatrix(X_train_fold)
        dtrain_fold.set_float_info("label_lower_bound", lb_train_fold)
        dtrain_fold.set_float_info("label_upper_bound", ub_train_fold)

        dval_fold = xgb.DMatrix(X_val_fold)
        dval_fold.set_float_info("label_lower_bound", lb_val_fold)
        dval_fold.set_float_info("label_upper_bound", ub_val_fold)

        model_fold = xgb.train(
            params,
            dtrain_fold,
            num_boost_round=num_boost_round,
            evals=[(dval_fold, "valid")],
            verbose_eval=False,
        )

        y_pred_log_fold = model_fold.predict(dval_fold)

        c_index = concordance_index_censored(
            e_val_fold.astype(bool),
            time_vals[val_idx],
            -y_pred_log_fold
        )[0]

        c_indices.append(c_index)

    return np.mean(c_indices)

In [12]:
study = optuna.create_study(direction="maximize", study_name="xgb_aft_tuning")
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

[I 2025-12-06 11:35:42,740] A new study created in memory with name: xgb_aft_tuning


  0%|          | 0/1000 [00:00<?, ?it/s]

In [13]:
print("Meilleurs hyperparamètres:")
print(study.best_params)
print(f"\nMeilleur C-index moyen: {study.best_value:.4f}")

Meilleurs hyperparamètres:
{'learning_rate': 0.022561607522321357, 'max_depth': 4, 'subsample': 0.5878453201230895, 'colsample_bytree': 0.5707522886919048, 'min_child_weight': 10, 'gamma': 0.2809539949765667, 'aft_loss_distribution': 'extreme', 'aft_loss_distribution_scale': 0.40342091989236506, 'num_boost_round': 239}

Meilleur C-index moyen: 0.7428


## Save current results and retrain with different seeds

Saving the current best model results (seed=42) and retraining with 4 additional seeds (36, 500, 4321, 137).

In [16]:
import json
import os

results_dir = '../../results'
os.makedirs(results_dir, exist_ok=True)
best_params = study.best_params
best_num_boost_round = best_params.pop('num_boost_round')

results_seed_42 = {
    'seed': 42,
    'best_params': best_params.copy(),
    'best_num_boost_round': best_num_boost_round,
    'best_c_index': study.best_value,
    'submission_file': 'submission_xgb_aft_tuned_avg10.csv'
}

with open(f'{results_dir}/xgb_aft_seed_42.json', 'w') as f:
    json.dump(results_seed_42, f, indent=2)

print("Results for seed 42 saved to 'results/xgb_aft_seed_42.json'")

Results for seed 42 saved to 'results/xgb_aft_seed_42.json'


In [17]:
additional_seeds = [36, 500, 4321, 137]
all_seed_results = [results_seed_42]

for seed_value in additional_seeds:
    print(f"\n{'=' * 60}")
    print(f"Starting tuning with seed={seed_value}")
    print(f"{'=' * 60}\n")


    def objective_seed(trial):
        params = {
            "objective": "survival:aft",
            "eval_metric": "aft-nloglik",
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 7),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "aft_loss_distribution": trial.suggest_categorical("aft_loss_distribution",
                                                               ["normal", "logistic", "extreme"]),
            "aft_loss_distribution_scale": trial.suggest_float("aft_loss_distribution_scale", 0.3, 2.0),
            "tree_method": "hist",
            "seed": seed_value,
        }

        num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

        n_splits = 7
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed_value)

        c_indices = []

        for train_idx, val_idx in kf.split(X):
            X_train_fold, X_val_fold = X[train_idx], X[val_idx]
            y_train_log_fold, y_val_log_fold = y_log_time[train_idx], y_log_time[val_idx]
            e_train_fold, e_val_fold = event_vals[train_idx], event_vals[val_idx]

            lb_train_fold = y_train_log_fold.copy()
            ub_train_fold = y_train_log_fold.copy()
            ub_train_fold[e_train_fold == 0] = np.inf

            lb_val_fold = y_val_log_fold.copy()
            ub_val_fold = y_val_log_fold.copy()
            ub_val_fold[e_val_fold == 0] = np.inf

            dtrain_fold = xgb.DMatrix(X_train_fold)
            dtrain_fold.set_float_info("label_lower_bound", lb_train_fold)
            dtrain_fold.set_float_info("label_upper_bound", ub_train_fold)

            dval_fold = xgb.DMatrix(X_val_fold)
            dval_fold.set_float_info("label_lower_bound", lb_val_fold)
            dval_fold.set_float_info("label_upper_bound", ub_val_fold)

            model_fold = xgb.train(
                params,
                dtrain_fold,
                num_boost_round=num_boost_round,
                evals=[(dval_fold, "valid")],
                verbose_eval=False,
            )

            y_pred_log_fold = model_fold.predict(dval_fold)

            c_index = concordance_index_censored(
                e_val_fold.astype(bool),
                time_vals[val_idx],
                -y_pred_log_fold
            )[0]

            c_indices.append(c_index)

        return np.mean(c_indices)


    study_seed = optuna.create_study(direction="maximize", study_name=f"xgb_aft_seed_{seed_value}")
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study_seed.optimize(objective_seed, n_trials=1000, show_progress_bar=True)

    print(f"\nBest C-index for seed {seed_value}: {study_seed.best_value:.4f}")

    best_params_seed = study_seed.best_params.copy()
    best_params_seed["objective"] = "survival:aft"
    best_params_seed["eval_metric"] = "aft-nloglik"
    best_params_seed["tree_method"] = "hist"
    best_params_seed["seed"] = seed_value

    best_num_boost_round_seed = best_params_seed.pop("num_boost_round")

    lb_full_seed = y_full_log_time.copy()
    ub_full_seed = y_full_log_time.copy()
    ub_full_seed[event_vals == 0] = np.inf

    dfull_seed = xgb.DMatrix(X_full)
    dfull_seed.set_float_info("label_lower_bound", lb_full_seed)
    dfull_seed.set_float_info("label_upper_bound", ub_full_seed)

    model_tuned_seed = xgb.train(
        best_params_seed,
        dfull_seed,
        num_boost_round=best_num_boost_round_seed,
        verbose_eval=False
    )

    X_eval_seed = eval_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
    deval_seed = xgb.DMatrix(X_eval_seed)

    y_pred_log_seed = model_tuned_seed.predict(deval_seed)
    risk_score_seed = -y_pred_log_seed

    submission_seed = pd.DataFrame({
        'ID': eval_enhanced['ID'],
        'risk_score': risk_score_seed
    })

    results_seed = {
        'seed': seed_value,
        'best_params': best_params_seed.copy(),
        'best_num_boost_round': best_num_boost_round_seed,
        'best_c_index': study_seed.best_value,
    }

    with open(f'{results_dir}/xgb_aft_seed_{seed_value}.json', 'w') as f:
        json.dump(results_seed, f, indent=2)

    all_seed_results.append(results_seed)

    print(f"Results saved to 'results/xgb_aft_seed_{seed_value}.json'")


Starting tuning with seed=36



  0%|          | 0/1000 [00:00<?, ?it/s]

[W 2025-12-06 12:43:06,724] Trial 397 failed with parameters: {'learning_rate': 0.006821405447168617, 'max_depth': 3, 'subsample': 0.5186227438823673, 'colsample_bytree': 0.7233758229915569, 'min_child_weight': 9, 'gamma': 4.571704486657374, 'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 0.390837155015343, 'num_boost_round': 461} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\arthr\Desktop\ENSAE\QRT-Challenge-2025\.venv\Lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\arthr\AppData\Local\Temp\ipykernel_15496\2388980546.py", line 55, in objective_seed
    model_fold = xgb.train(
                 ^^^^^^^^^^
  File "c:\Users\arthr\Desktop\ENSAE\QRT-Challenge-2025\.venv\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\arthr\Desktop\

KeyboardInterrupt: 

In [82]:
#TODO: Simplifier tt ça et déseeder le kfold

In [18]:
print("\n" + "=" * 60)
print("SUMMARY OF ALL SEEDS")
print("=" * 60)

summary_df = pd.DataFrame([
    {
        'Seed': r['seed'],
        'Best C-Index': r['best_c_index'],
        'Num Boost Rounds': r['best_num_boost_round'],
    }
    for r in all_seed_results
])

summary_df


SUMMARY OF ALL SEEDS


Unnamed: 0,Seed,Best C-Index,Num Boost Rounds
0,42,0.742321,439
1,36,0.744425,468
2,500,0.743527,474
3,4321,0.742554,256
4,137,0.744798,361


## Smart Weighted Ensemble of 5 Seeded Models

Creating a weighted ensemble that:
1. Normalizes predictions by dividing by absolute mean
2. Weights models by their best C-index
3. Evaluates in K-Fold CV
4. Generates final submission

In [46]:
seeds_to_use = [42, 36, 500, 4321, 137]

best_params_per_seed = {}
for seed_val in seeds_to_use:
    json_path = f'{results_dir}/xgb_aft_seed_{seed_val}.json'
    with open(json_path, 'r') as f:
        seed_data = json.load(f)
        best_params_per_seed[seed_val] = seed_data

print("Loaded best parameters for all seeds")
for seed_val in seeds_to_use:
    print(f"Seed {seed_val}: C-Index = {best_params_per_seed[seed_val]['best_c_index']:.4f}")

Loaded best parameters for all seeds
Seed 42: C-Index = 0.7423
Seed 36: C-Index = 0.7444
Seed 500: C-Index = 0.7435
Seed 4321: C-Index = 0.7426
Seed 137: C-Index = 0.7448


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb

def train_and_predict_seed_model(seed_val, dtrain, dval):
    best_params = best_params_per_seed[seed_val]['best_params'].copy()
    
    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": best_params['learning_rate'],
        "max_depth": best_params['max_depth'],
        "subsample": best_params['subsample'],
        "colsample_bytree": best_params['colsample_bytree'],
        "min_child_weight": best_params['min_child_weight'],
        "gamma": best_params['gamma'],
        "aft_loss_distribution": best_params['aft_loss_distribution'],
        "aft_loss_distribution_scale": best_params['aft_loss_distribution_scale'],
        "tree_method": "hist",
        "seed": seed_val,
    }
    
    num_boost_round = best_params_per_seed[seed_val]['best_num_boost_round']
    best_c = best_params_per_seed[seed_val]['best_c_index']
    
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "valid")],
        verbose_eval=False,
    )
    
    y_pred_log = model.predict(dval)
    
    return y_pred_log, best_c
    
    #c_index = concordance_index_censored(
    #    e_val.astype(bool),
    #    time_vals[val_idx],
    #    -y_pred_log
    #)[0]
   # 
   #     c_indices.append(c_index)
   #     print(f"Fold {fold} — C-index: {c_index:.4f}")
   #     fold += 1
   # 
   #print(f"\nC-index moyen : {np.mean(c_indices):.4f}")

In [67]:
print("Evaluating Weighted Ensemble with K-Fold Cross-Validation\n")

kf_ensemble = KFold(n_splits=7, shuffle=True)
cv_scores_ensemble = []

fold = 1
    
seeds_score = {}
for seed_val in seeds_to_use:
    seeds_score[seed_val] = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train_log, y_val_log = y_log_time[train_idx], y_log_time[val_idx]
    e_train, e_val = event_vals[train_idx], event_vals[val_idx]
    
    lb_train = y_train_log.copy()
    ub_train = y_train_log.copy()
    ub_train[e_train == 0] = np.inf
    
    lb_val = y_val_log.copy()
    ub_val = y_val_log.copy()
    ub_val[e_val == 0] = np.inf
    
    dtrain = xgb.DMatrix(X_train)
    dtrain.set_float_info("label_lower_bound", lb_train)
    dtrain.set_float_info("label_upper_bound", ub_train)
    
    dval = xgb.DMatrix(X_val)
    dval.set_float_info("label_lower_bound", lb_val)
    dval.set_float_info("label_upper_bound", ub_val)

    
    for seed_val in seeds_to_use:
        y_pred_log, best_c = train_and_predict_seed_model(seed_val, dtrain, dval)
        c_index = concordance_index_censored(
            e_val.astype(bool),
            time_vals[val_idx],
            -y_pred_log
        )[0]
        
        print(f"Fold {fold} >> Model seed {seed_val} - Best C-Index: {best_c:.4f}. ON FOLD: {c_index}")
        seeds_score[seed_val].append(c_index)

    fold += 1
    
scores_final = [np.mean(seeds_score[seed]) for seed in seeds_to_use]
print(scores_final)
print(np.mean(scores_final))

Evaluating Weighted Ensemble with K-Fold Cross-Validation

Fold 1 >> Model seed 42 - Best C-Index: 0.7423. ON FOLD: 0.7590540364945889
Fold 1 >> Model seed 36 - Best C-Index: 0.7444. ON FOLD: 0.7590911622208609
Fold 1 >> Model seed 500 - Best C-Index: 0.7435. ON FOLD: 0.7600749939670695
Fold 1 >> Model seed 4321 - Best C-Index: 0.7426. ON FOLD: 0.759258227989085
Fold 1 >> Model seed 137 - Best C-Index: 0.7448. ON FOLD: 0.7508492509884724
Fold 2 >> Model seed 42 - Best C-Index: 0.7423. ON FOLD: 0.7351578106272473
Fold 2 >> Model seed 36 - Best C-Index: 0.7444. ON FOLD: 0.7318617658809429
Fold 2 >> Model seed 500 - Best C-Index: 0.7435. ON FOLD: 0.7298242109468638
Fold 2 >> Model seed 4321 - Best C-Index: 0.7426. ON FOLD: 0.7302836596084699
Fold 2 >> Model seed 137 - Best C-Index: 0.7448. ON FOLD: 0.7302636835797044
Fold 3 >> Model seed 42 - Best C-Index: 0.7423. ON FOLD: 0.7164540056669494
Fold 3 >> Model seed 36 - Best C-Index: 0.7444. ON FOLD: 0.721850240935588
Fold 3 >> Model seed 50

In [79]:
print("Evaluating Weighted Ensemble with K-Fold Cross-Validation\n")

kf_ensemble = KFold(n_splits=7, shuffle=True)
cv_scores_ensemble = []

fold = 1
    
fold_score = []
for seed_val in seeds_to_use:
    seeds_score[seed_val] = []
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train_log, y_val_log = y_log_time[train_idx], y_log_time[val_idx]
    e_train, e_val = event_vals[train_idx], event_vals[val_idx]
    
    lb_train = y_train_log.copy()
    ub_train = y_train_log.copy()
    ub_train[e_train == 0] = np.inf
    
    lb_val = y_val_log.copy()
    ub_val = y_val_log.copy()
    ub_val[e_val == 0] = np.inf
    
    dtrain = xgb.DMatrix(X_train)
    dtrain.set_float_info("label_lower_bound", lb_train)
    dtrain.set_float_info("label_upper_bound", ub_train)
    
    dval = xgb.DMatrix(X_val)
    dval.set_float_info("label_lower_bound", lb_val)
    dval.set_float_info("label_upper_bound", ub_val)

    y_preds = []
    for seed_val in seeds_to_use:
        y_pred_log, best_c = train_and_predict_seed_model(seed_val, dtrain, dval)
        best_cs.append(best_c)
        
        y_preds.append(best_c * y_pred_log / np.mean(np.abs(y_pred_log)))
    
    y_preds_comb = np.array(y_preds).sum(axis=0)
    c_index = concordance_index_censored(
        e_val.astype(bool),
        time_vals[val_idx],
        -y_preds_comb
    )[0]
        
    print(f"Fold {fold} >> {c_index}")
    fold_score.append(c_index)

    fold += 1
    
print(np.mean(fold_score))

Evaluating Weighted Ensemble with K-Fold Cross-Validation

Fold 1 >> 0.7593881680310371
Fold 2 >> 0.7328006392329205
Fold 3 >> 0.720514717825624
Fold 4 >> 0.7804842286614981
Fold 5 >> 0.7414059405940594
Fold 6 >> 0.7031669955831219
Fold 7 >> 0.7536402990948445
0.7416287127175866


In [80]:
best_cs = []
for seed_val in seeds_to_use:
    best_params = best_params_per_seed[seed_val]['best_params'].copy()

    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": best_params['learning_rate'],
        "max_depth": best_params['max_depth'],
        "subsample": best_params['subsample'],
        "colsample_bytree": best_params['colsample_bytree'],
        "min_child_weight": best_params['min_child_weight'],
        "gamma": best_params['gamma'],
        "aft_loss_distribution": best_params['aft_loss_distribution'],
        "aft_loss_distribution_scale": best_params['aft_loss_distribution_scale'],
        "tree_method": "hist",
        "seed": seed_val,
    }

    num_boost_round = best_params_per_seed[seed_val]['best_num_boost_round']

    lb_full = y_full_log_time.copy()
    ub_full = y_full_log_time.copy()
    ub_full[event_vals == 0] = np.inf

    dfull = xgb.DMatrix(X_full)
    dfull.set_float_info("label_lower_bound", lb_full)
    dfull.set_float_info("label_upper_bound", ub_full)

    model = xgb.train(
        params,
        dfull,
        num_boost_round=num_boost_round,
        verbose_eval=False
    )

    X_eval = eval_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
    deval = xgb.DMatrix(X_eval)

    y_pred_log = model.predict(deval)

    best_c = best_params_per_seed[seed_val]['best_c_index']
    best_cs.append(best_c)

    pred_normalized = y_pred_log / np.mean(np.abs(y_pred_log))

    if seed_val == seeds_to_use[0]:
        weighted_pred = best_c * pred_normalized
    else:
        weighted_pred += best_c * pred_normalized

    print(f"Seed {seed_val} trained on full data, best_c={best_c:.4f}")

Seed 42 trained on full data, best_c=0.7423
Seed 36 trained on full data, best_c=0.7444
Seed 500 trained on full data, best_c=0.7435
Seed 4321 trained on full data, best_c=0.7426
Seed 137 trained on full data, best_c=0.7448


In [81]:
risk_scores = -weighted_pred

submission = pd.DataFrame({
    'ID': eval_enhanced['ID'],
    'risk_score': risk_scores
})

submission.to_csv('../../submissions/submission_xgb_aft_weighted_ensemble.csv', index=False)

print("Submission saved to: ../../submissions/submission_xgb_aft_weighted_ensemble.csv")


Submission saved to: ../../submissions/submission_xgb_aft_weighted_ensemble.csv
