In [301]:
import pandas as pd
import numpy as np
# Read the patient-level feature tables (training and evaluation sets).
train_enhanced, eval_enhanced = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('../data/train_pivot2.csv', '../data/eval_pivot2.csv')
)

# Read the molecular tables; they are later grouped by 'ID' and their 'VAF'
# column is used to compute a per-patient entropy.
molecular_train, molecular_eval = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('../data/molecular_train.csv', '../data/molecular_val.csv')
)

In [302]:
# Derive the cytogenetic features with the exact same formulas on the training
# and evaluation frames (columns are added in place, in the same order).
for frame in (train_enhanced, eval_enhanced):
    # Weighted sum of karyotype flags (weights 3 / 3 / 2 / 2 / 1).
    frame['cyto_risk_score'] = (
        3 * frame['is_monosomal_karyotype']
        + 3 * frame['is_complex_karyotype']
        + 2 * frame['has_minus7_or_del7q']
        + 2 * frame['has_minus5_or_del5q']
        + 1 * frame['has_plus8']
    )

    # Gene x karyotype interaction terms (plain products of the two columns).
    frame['TP53_complex_interaction'] = (
        frame['Gene_TP53'] * frame['is_complex_karyotype']
    )
    frame['ASXL1_minus7_interaction'] = (
        frame['Gene_ASXL1'] * frame['has_minus7_or_del7q']
    )
    # NPM1 weighted by (1 - prop_any_abnormal).
    frame['NPM1_normal_interaction'] = (
        frame['Gene_NPM1'] * (1 - frame['prop_any_abnormal'])
    )

    # Total alteration count over chromosomes 5, 7 and 17.
    frame['high_risk_chr_load'] = (
        frame['CHR_5_count'] + frame['CHR_7_count'] + frame['CHR_17_count']
    )

In [303]:
# Gene groups summed row-wise below.
splicing_genes = ['Gene_U2AF1', 'Gene_SRSF2', 'Gene_SF3B1', 'Gene_ZRSR2']
signaling_genes = ['Gene_NRAS', 'Gene_KRAS', 'Gene_JAK2', 'Gene_CBL']

# Apply the same derived-feature formulas to both frames; the columns are
# created in the same order as before on each frame.
for frame in (train_enhanced, eval_enhanced):
    # Mutation-based scores: sums of gene indicator columns.
    frame['risk_score_high_genes'] = (
        frame['Gene_TP53'] + frame['Gene_ASXL1'] + frame['Gene_RUNX1']
    )
    frame['risk_score_favorable_genes'] = frame['Gene_NPM1'] + frame['Gene_CEBPA']

    # Per-pathway mutation counts.
    frame['n_splicing_mut'] = frame[splicing_genes].sum(axis=1)
    frame['n_signaling_mut'] = frame[signaling_genes].sum(axis=1)

    # TP53 indicator scaled by the average VAF.
    frame['TP53_VAF_interaction'] = frame['Gene_TP53'] * frame['VAF_avg']

    # Blood-count ratios; the +1 in the denominator avoids dividing by a zero WBC.
    wbc_plus_one = frame['WBC'] + 1
    frame['ANC_WBC_ratio'] = frame['ANC'] / wbc_plus_one
    frame['BLAST_WBC_ratio'] = frame['BM_BLAST'] / wbc_plus_one

    # Clonality descriptors built from the VAF summary columns; the 1e-6 term
    # keeps the denominator non-zero when VAF_avg is 0.
    frame['major_clone_VAF'] = frame['VAF_max']
    frame['subclonality'] = frame['VAF_std'] / (frame['VAF_avg'] + 1e-6)

    # Alternative karyotype score (weights 3 / 2 / 2 / 1, no chromosome-5 term).
    frame['karyo_score_clinical'] = (
        3 * frame['is_monosomal_karyotype']
        + 2 * frame['is_complex_karyotype']
        + 2 * frame['has_minus7_or_del7q']
        + 1 * frame['has_plus8']
    )



In [304]:
import numpy as np
import pandas as pd

def compute_vaf_entropy(df_mut):
    """Compute the Shannon entropy of each patient's VAF distribution.

    Parameters
    ----------
    df_mut : pandas.DataFrame
        Mutation-level table containing at least:
            - 'ID'  : patient identifier
            - 'VAF' : variant allele fraction of the mutation

    Returns
    -------
    pandas.DataFrame
        One row per patient with the columns ``ID`` and ``vaf_entropy``.

    Notes
    -----
    Within a patient, the usable VAFs are normalised into proportions
    ``p_i = VAF_i / sum(VAF)`` and the entropy is ``-sum(p_i * log(p_i))``
    (natural logarithm). Missing, non-finite and non-positive VAFs are
    ignored; a patient with no usable VAF gets an entropy of 0.0.
    """

    def entropy_from_vaf(vaf_list):
        """Shannon entropy (in nats) of one patient's VAF values."""
        vaf_arr = np.asarray(vaf_list, dtype=float)

        # Drop NaN / inf / non-positive entries *before* normalising: a single
        # NaN would otherwise turn the sum (and hence every p_i) into NaN and
        # silently collapse the patient's entropy to 0.
        vaf_arr = vaf_arr[np.isfinite(vaf_arr) & (vaf_arr > 0)]
        if vaf_arr.size == 0:
            return 0.0

        # Normalise to proportions p_i; every p_i is > 0 so log is defined.
        p = vaf_arr / vaf_arr.sum()
        return float(-np.sum(p * np.log(p)))

    entropy_per_patient = (
        df_mut.groupby('ID')['VAF']
              .apply(entropy_from_vaf)
              .reset_index()
              .rename(columns={'VAF': 'vaf_entropy'})
    )

    return entropy_per_patient


# Per-patient VAF entropy computed from the molecular tables.
entropy_train = compute_vaf_entropy(molecular_train)
entropy_eval = compute_vaf_entropy(molecular_eval)

# Left-join on 'ID' so every patient row is kept; patients without an entry in
# the entropy table get 0 instead of NaN.
train_enhanced = (
    train_enhanced
    .merge(entropy_train, on='ID', how='left')
    .fillna({"vaf_entropy": 0})
)
eval_enhanced = (
    eval_enhanced
    .merge(entropy_eval, on='ID', how='left')
    .fillna({"vaf_entropy": 0})
)


Résumé du plan:
- Préparer les données à partir de train_enhanced et y (temps et statut).
- Entraîner un XGB AFT avec K-fold cross-validation (K=5 par défaut) et évaluer les scores via concordance (C-index).
- Entraîner le modèle sur l’ensemble des données et faire des prédictions sur le jeu d’évaluation, en générant un CSV avec les colonnes "ID" et "risk_score" (risque = opposé du log-temps de survie prédit par le modèle).
- Sauvegarder le CSV final dans le répertoire approprié.

In [305]:
def safe_ratio(num, den):
    """Element-wise ``num / den`` in floating point, with non-finite results as NaN.

    Both inputs are cast to float before dividing. Any element of the quotient
    that is +inf, -inf or NaN (e.g. after a division by zero) is set to NaN.
    Returns the quotient object produced by the division.
    """
    quotient = num.astype(float) / den.astype(float)
    not_finite = ~np.isfinite(quotient)
    quotient[not_finite] = np.nan  # inf, -inf and nan all become NaN
    return quotient

# 1) HB values equal to -1 are treated as missing on both frames.
for df in (train_enhanced, eval_enhanced):
    hb_is_sentinel = df["HB"].eq(-1)
    df.loc[hb_is_sentinel, "HB"] = np.nan

# 2) The imputation value is the HB median of the training set only.
hb_median = train_enhanced["HB"].median()

# 3) + 4) Fill missing HB with that single training median (same value for
# train and eval), then derive the platelet / haemoglobin ratio. The +1 in the
# denominator matches the other ratio features.
for df in (train_enhanced, eval_enhanced):
    df["HB"] = df["HB"].fillna(hb_median)
    df["PLT_HB_ratio"] = safe_ratio(df["PLT"], df["HB"] + 1)


In [306]:
from sklearn.preprocessing import RobustScaler

# Columns left untouched by the scaler: the identifier and the survival targets.
exclude_cols = {'ID', 'OS_STATUS', 'OS_YEARS'}
is_feature = ~train_enhanced.columns.isin(exclude_cols)
feature_cols_scaling = train_enhanced.columns[is_feature].tolist()

# Fit the RobustScaler on the training frame only, then apply the fitted
# transform to both the training and the evaluation frames.
scaler = RobustScaler().fit(train_enhanced[feature_cols_scaling])
for df in (train_enhanced, eval_enhanced):
    df[feature_cols_scaling] = scaler.transform(df[feature_cols_scaling])

In [307]:
# Persist the engineered (and scaled) frames, without the pandas index.
for frame, csv_path in (
    (train_enhanced, '../data/train_enhanced_V2.csv'),
    (eval_enhanced, '../data/eval_enhanced_V2.csv'),
):
    frame.to_csv(csv_path, index=False)

In [308]:
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb  # <--- ajout important

# Target columns and the resulting list of model features.
time_col, event_col = "OS_YEARS", "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [col for col in train_enhanced.columns if col not in exclude_cols]

# Feature matrix as floats; +/-inf are turned into NaN so that XGBoost can
# treat them as missing values.
X_df = (
    train_enhanced[feature_cols]
    .astype(float)
    .replace([np.inf, -np.inf], np.nan)
)
X = X_df.to_numpy(dtype=float)

# Raw survival times and the event indicator (True = event, False = censored).
time_vals = train_enhanced[time_col].to_numpy(dtype=float)
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)

# XGBoost AFT target: log(1 + time).
y_log_time = np.log1p(time_vals)


In [309]:
# Baseline hyper-parameters of the XGBoost AFT model.
params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 1,
    "gamma": 0.0,
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 1.0,
    "tree_method": "hist",
    "seed": 42,
}
num_boost_round = 200

# Shuffled K-fold split with a fixed seed.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

c_indices = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train_log, y_val_log = y_log_time[train_idx], y_log_time[val_idx]
    e_train, e_val = event_vals[train_idx], event_vals[val_idx]

    # AFT interval labels in log-time:
    #   observed event  -> [log t, log t]
    #   right-censored  -> [log t, +inf]
    lb_train = y_train_log.copy()
    ub_train = np.where(e_train, y_train_log, np.inf)
    lb_val = y_val_log.copy()
    ub_val = np.where(e_val, y_val_log, np.inf)

    dtrain = xgb.DMatrix(X_train, missing=np.nan)
    dtrain.set_float_info("label_lower_bound", lb_train)
    dtrain.set_float_info("label_upper_bound", ub_train)

    dval = xgb.DMatrix(X_val, missing=np.nan)
    dval.set_float_info("label_lower_bound", lb_val)
    dval.set_float_info("label_upper_bound", ub_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "valid")],
        verbose_eval=False,
    )

    # Predicted log-time on the validation fold; its negation is used as the
    # risk score passed to the concordance index.
    y_pred_log = model.predict(dval)
    c_index = concordance_index_censored(
        e_val.astype(bool),   # event indicator
        time_vals[val_idx],   # observed time
        -y_pred_log,          # risk estimate
    )[0]

    c_indices.append(c_index)
    print(f"Fold {fold} — C-index: {c_index:.4f}")

print("\nC-index moyen :", np.mean(c_indices))


Fold 1 — C-index: 0.7480
Fold 2 — C-index: 0.7115
Fold 3 — C-index: 0.7447
Fold 4 — C-index: 0.7290
Fold 5 — C-index: 0.7272

C-index moyen : 0.7320713328450112


## Entraînement sur l'ensemble complet et prédiction

In [310]:
# Full training matrix: missing values are replaced by 0 here.
X_full = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
y_full_log_time = np.log1p(time_vals)

# AFT interval labels: [log t, log t] for events, [log t, +inf] when censored.
lb_full = y_full_log_time.copy()
ub_full = np.where(event_vals, y_full_log_time, np.inf)

dfull = xgb.DMatrix(X_full)
dfull.set_float_info("label_lower_bound", lb_full)
dfull.set_float_info("label_upper_bound", ub_full)

# Fit the baseline model on every training row (no evaluation set).
model_full = xgb.train(
    params=params,
    dtrain=dfull,
    num_boost_round=num_boost_round,
    verbose_eval=False,
)

## Tuning des hyperparamètres XGBoost AFT avec Optuna

## Sélection de features avec LASSO K-Fold

In [None]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
import numpy as np

# Upper bound on the number of features kept by the LASSO selection.
MAX_SELECTED_FEATURES = 100

# Design matrix and target for the selection step; missing values are filled
# with 0 before standardisation.
# NOTE(review): the target is the raw survival time and the censoring flag is
# not used in this regression — confirm this simplification is intended.
X_for_selection = train_enhanced[feature_cols].fillna(0.0)
y_for_selection = time_vals

print(f"Shape initiale des features: {X_for_selection.shape}")

# LASSO penalises coefficient magnitudes, so features are standardised first.
scaler_lasso = StandardScaler()
X_scaled_lasso = scaler_lasso.fit_transform(X_for_selection)

# LassoCV picks the regularisation strength alpha by 5-fold cross-validation.
print("Sélection des features avec LASSO et validation croisée K-Fold...")
lasso_cv = LassoCV(
    cv=5,
    random_state=42,
    max_iter=2000,
    n_alphas=50,  # number of alpha values tried along the path
    fit_intercept=True
)
lasso_cv.fit(X_scaled_lasso, y_for_selection)

print(f"Alpha optimal trouvé: {lasso_cv.alpha_:.6f}")
# `score` is evaluated on the very data the model was fitted on, so this is a
# training R², not a cross-validated one; the label says so explicitly.
print(f"Score R² sur les données d'entraînement: {lasso_cv.score(X_scaled_lasso, y_for_selection):.4f}")

# Rank features by absolute coefficient. Only features with a non-zero
# coefficient are eligible: LASSO sets the others exactly to 0, and padding the
# selection with them would add arbitrary, unranked features.
feature_importance = np.abs(lasso_cv.coef_)
n_nonzero = int(np.count_nonzero(feature_importance))
if n_nonzero > 0:
    n_features_to_select = min(MAX_SELECTED_FEATURES, n_nonzero)
else:
    # Degenerate fit (every coefficient shrunk to 0): fall back to the plain
    # cap so that the downstream feature matrix is never empty.
    n_features_to_select = min(MAX_SELECTED_FEATURES, len(feature_cols))
top_indices = np.argsort(feature_importance)[-n_features_to_select:][::-1]

# Boolean mask over the current feature list marking the selected features.
selected_mask = np.zeros(len(feature_cols), dtype=bool)
selected_mask[top_indices] = True

# Selected feature names, ordered by decreasing |coefficient|.
selected_features = [feature_cols[i] for i in top_indices]

print(f"Nombre de features sélectionnées: {len(selected_features)} / {len(feature_cols)}")
print(f"Pourcentage de features conservées: {len(selected_features)/len(feature_cols)*100:.1f}%")

# Show the 20 strongest selected features with their signed coefficients.
print(f"\nTop 20 des {n_features_to_select} features sélectionnées (coefficients LASSO):")
for i in range(min(20, len(selected_features))):
    idx = top_indices[i]
    print(f"{i+1:2d}. {feature_cols[idx]:30s} : {lasso_cv.coef_[idx]:8.4f}")

# From here on, `feature_cols` and `X` refer to the selected features only.
feature_cols = selected_features
X = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)

print(f"\nNouvelle shape des données après sélection: {X.shape}")

Shape initiale des features: (3103, 199)
Sélection des features avec LASSO et validation croisée K-Fold...






Alpha optimal trouvé: 0.114989
Score R² sur validation croisée: 0.1201
Nombre de features sélectionnées: 16 / 199
Pourcentage de features conservées: 8.0%

Top 20 des features les plus importantes (coefficients LASSO):
 1. CHR_nunique                    :  -0.2405
 2. HB                             :   0.2232
 3. PLT                            :   0.2209
 4. karyo_score_clinical           :  -0.1769
 5. risk_score_high_genes          :  -0.1591
 6. Gene_MYC                       :   0.0944
 7. TP53_VAF_interaction           :  -0.0796
 8. BM_BLAST                       :  -0.0670
 9. n_chromosomes_altered          :  -0.0641
10. Gene_NFE2                      :   0.0641
11. Gene_SF3B1                     :   0.0498
12. Gene_MLL                       :  -0.0491
13. EFFECT_NS_count                :  -0.0393
14. Gene_PRPF40A                   :   0.0161
15. CHR_17_count                   :  -0.0103
16. TP53_complex_interaction       :  -0.0102

Nouvelle shape des données après sélection: 

In [104]:
!pip install optuna --q


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [321]:
import optuna
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
from sksurv.metrics import concordance_index_censored

In [322]:
def objective(trial):
    """Optuna objective: mean 5-fold C-index of an XGBoost AFT model.

    Hyper-parameters (including the number of boosting rounds) are sampled
    from ``trial``. For each fold of a shuffled, seeded 5-fold split of the
    module-level ``X``, a model is trained on log-time interval labels and
    scored on the held-out fold with ``concordance_index_censored``, using the
    negated predicted log-time as the risk. Returns the mean C-index.
    """

    def build_aft_dmatrix(features, log_times, events):
        # Interval labels: [log t, log t] for events, [log t, +inf] if censored.
        dmatrix = xgb.DMatrix(features)
        dmatrix.set_float_info("label_lower_bound", log_times)
        dmatrix.set_float_info("label_upper_bound", np.where(events, log_times, np.inf))
        return dmatrix

    # NOTE: the order of the suggest_* calls is kept as-is.
    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "aft_loss_distribution": trial.suggest_categorical("aft_loss_distribution", ["normal", "logistic", "extreme"]),
        "aft_loss_distribution_scale": trial.suggest_float("aft_loss_distribution_scale", 0.5, 2.0),
        "tree_method": "hist",
        "seed": 42,
    }
    num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for train_idx, val_idx in splitter.split(X):
        dtrain_fold = build_aft_dmatrix(
            X[train_idx], y_log_time[train_idx], event_vals[train_idx]
        )
        dval_fold = build_aft_dmatrix(
            X[val_idx], y_log_time[val_idx], event_vals[val_idx]
        )

        model_fold = xgb.train(
            params,
            dtrain_fold,
            num_boost_round=num_boost_round,
            evals=[(dval_fold, "valid")],
            verbose_eval=False,
        )

        # Higher predicted log-time means lower risk, hence the negation.
        predicted_log_time = model_fold.predict(dval_fold)
        fold_score = concordance_index_censored(
            event_vals[val_idx].astype(bool),
            time_vals[val_idx],
            -predicted_log_time,
        )[0]
        fold_scores.append(fold_score)

    return np.mean(fold_scores)

In [323]:
# Seed the sampler so that the sequence of sampled hyper-parameters (and hence
# the selected best trial) is reproducible from one run to the next; without an
# explicit seed every run explores a different sequence of trials.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction="maximize",  # the objective returns a mean C-index
    study_name="xgb_aft_tuning",
    sampler=sampler,
)
study.optimize(objective, n_trials=1000, show_progress_bar=True)

[I 2025-12-05 23:53:27,127] A new study created in memory with name: xgb_aft_tuning


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2025-12-05 23:53:28,673] Trial 0 finished with value: 0.66229310316454 and parameters: {'learning_rate': 0.27750124976596247, 'max_depth': 4, 'subsample': 0.6164828619732514, 'colsample_bytree': 0.953423340220206, 'min_child_weight': 5, 'gamma': 0.31010900754622983, 'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 0.5170937325765422, 'num_boost_round': 443}. Best is trial 0 with value: 0.66229310316454.
[I 2025-12-05 23:53:30,017] Trial 1 finished with value: 0.7285966917261383 and parameters: {'learning_rate': 0.029327335699845084, 'max_depth': 4, 'subsample': 0.6722822600528094, 'colsample_bytree': 0.6737280754715467, 'min_child_weight': 6, 'gamma': 1.1038165503527186, 'aft_loss_distribution': 'extreme', 'aft_loss_distribution_scale': 0.8949607264573558, 'num_boost_round': 495}. Best is trial 1 with value: 0.7285966917261383.
[I 2025-12-05 23:53:30,017] Trial 1 finished with value: 0.7285966917261383 and parameters: {'learning_rate': 0.029327335699845084, 'max_dep

In [324]:
# Report the best trial: its hyper-parameters, then its mean C-index.
print("Meilleurs hyperparamètres:", study.best_params, sep="\n")
print(f"\nMeilleur C-index moyen: {study.best_value:.4f}")

Meilleurs hyperparamètres:
{'learning_rate': 0.029680901871602702, 'max_depth': 9, 'subsample': 0.5828953482373245, 'colsample_bytree': 0.6190964992892509, 'min_child_weight': 10, 'gamma': 3.086282410551341, 'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 1.3930640547936635, 'num_boost_round': 299}

Meilleur C-index moyen: 0.7364


In [325]:
# Start from the tuned values and add back the fixed settings that were not
# part of the search space.
best_params = {
    **study.best_params,
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "tree_method": "hist",
    "seed": 42,
}

# The number of boosting rounds is an argument of xgb.train, not a booster
# parameter, so it is taken out of the dict.
best_num_boost_round = best_params.pop("num_boost_round")

In [326]:
# Rebuild the training matrix from the *current* `feature_cols` rather than
# reusing a matrix built in an earlier cell: the feature-selection cell rebinds
# `feature_cols`, and the prediction cell indexes the evaluation frame with
# that same list. Building the matrix here guarantees that the model is trained
# and applied on identical columns, with the same 0-fill for missing values.
X_full_final = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
y_full_log_time_final = np.log1p(time_vals)

# AFT interval labels: [log t, log t] for events, [log t, +inf] when censored.
lb_full_final = y_full_log_time_final.copy()
ub_full_final = y_full_log_time_final.copy()
ub_full_final[event_vals == 0] = np.inf

dfull_final = xgb.DMatrix(X_full_final)
dfull_final.set_float_info("label_lower_bound", lb_full_final)
dfull_final.set_float_info("label_upper_bound", ub_full_final)

# Final model: tuned hyper-parameters, fitted on every training row.
model_tuned = xgb.train(
    best_params,
    dfull_final,
    num_boost_round=best_num_boost_round,
    verbose_eval=False
)

In [257]:
# Display the evaluation frame (notebook rich repr of the last expression).
# NOTE(review): this cell's execution count is older than the surrounding
# cells, so the output recorded below may not reflect the current frame.
eval_enhanced

Unnamed: 0,LEN_avg,has_plus8,CHR_16_count,total_metaphases,CHR_20_count,DEPTH_min,ID,prop_any_abnormal,VAF_avg,CHR_5_count,...,n_splicing_mut,n_signaling_mut,TP53_VAF_interaction,ANC_WBC_ratio,BLAST_WBC_ratio,major_clone_VAF,subclonality,karyo_score_clinical,vaf_entropy,PLT_HB_ratio
0,-0.250000,0.0,0.0,0.0,0.0,-0.656352,KYW1,3.00,-0.153460,1.0,...,-1.0,0.0,-0.000000,-0.694589,13.026763,-0.171875,-1.606796,2.0,0.204085,-2.415558
1,-0.250000,0.0,0.0,0.0,0.0,-0.366450,KYW2,3.00,-0.008504,0.0,...,-1.0,1.0,-0.000000,-0.406093,6.975369,1.113281,-107.637397,2.0,-0.251553,-0.516334
2,0.083333,1.0,0.0,-1.0,2.0,0.684039,KYW3,0.00,-0.501765,0.0,...,-1.0,0.0,-0.000000,0.894022,0.290279,-0.394531,-0.520674,1.0,-0.051060,-0.315667
3,-0.250000,0.0,0.0,-1.0,0.0,0.913681,KYW4,0.00,0.284813,0.0,...,-1.0,0.0,0.000000,0.016160,7.144954,0.000000,0.128167,0.0,0.121087,-1.419324
4,-0.250000,0.0,0.0,-0.2,1.0,0.105863,KYW5,2.25,-0.080756,0.0,...,0.0,0.0,-0.000000,-1.871453,-0.604846,-0.082031,-4.304717,7.0,-0.030057,-1.038620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,-0.250000,0.0,0.0,-1.0,0.0,-0.382736,KYW1189,0.00,0.677199,0.0,...,-1.0,0.0,0.000000,-0.432026,0.583807,0.183594,-0.496069,0.0,-0.138957,0.047259
1189,4.083333,0.0,0.0,-1.0,0.0,-0.895765,KYW1190,0.00,0.648715,0.0,...,-1.0,1.0,0.000000,-0.432026,0.583807,0.859375,0.749892,0.0,0.030148,0.047259
1190,-0.250000,0.0,0.0,-1.0,0.0,0.451140,KYW1191,0.00,-0.671280,0.0,...,-1.0,0.0,-0.671280,-0.432026,0.583807,-1.195312,0.301222,0.0,-0.640841,0.047259
1191,-0.250000,0.0,0.0,-1.0,0.0,-0.416938,KYW1192,0.00,0.094317,2.0,...,-1.0,0.0,0.000000,-0.432026,0.583807,0.308594,2.395448,0.0,0.083659,0.047259


In [172]:
# Display the submission frame (columns ID and risk_score).
# NOTE(review): in the visible notebook `submission_tuned` is only assigned in
# a later cell, so on a fresh "Restart & Run All" this cell would raise a
# NameError — TODO move it below the cell that builds the submission.
submission_tuned

Unnamed: 0,ID,risk_score
0,KYW1,-0.682423
1,KYW2,-0.669761
2,KYW3,-1.287214
3,KYW4,-0.789166
4,KYW5,-0.698329
...,...,...
1188,KYW1189,-1.954821
1189,KYW1190,-1.720673
1190,KYW1191,-1.653615
1191,KYW1192,-1.810687


In [327]:
# Evaluation matrix: same feature list as the model, missing values set to 0.
X_eval_final = (
    eval_enhanced[feature_cols]
    .fillna(0.0)
    .to_numpy(dtype=float)
)
deval_final = xgb.DMatrix(X_eval_final)

# The model predicts a log survival time; the risk score is its negation, so a
# shorter predicted survival gives a higher risk.
y_pred_log_eval_tuned = model_tuned.predict(deval_final)
risk_score_tuned = np.negative(y_pred_log_eval_tuned)


In [328]:

# Two-column submission: patient ID and the risk score predicted above.
submission_tuned = eval_enhanced[['ID']].copy()
submission_tuned['risk_score'] = risk_score_tuned

# Write the file without the pandas index, then confirm where it was saved.
submission_tuned.to_csv('../submissions/submission_xgb_aft_tuned.csv', index=False)
print("Prédictions avec hyperparamètres optimisés sauvegardées dans '../submissions/submission_xgb_aft_tuned.csv'")

Prédictions avec hyperparamètres optimisés sauvegardées dans '../submissions/submission_xgb_aft_tuned.csv'


In [224]:
# Sanity check: the new submission must list the same IDs, in the same order,
# as the previously saved MTLR submission.
df1, df2 = submission_tuned, pd.read_csv('../submissions/mtlr_tuned.csv')

ids_match = df1['ID'].equals(df2['ID'])
assert ids_match, "Les IDs des deux dataframes ne correspondent pas."

In [200]:
import plotly.express as px

# Plot the optimisation history of the Optuna study (objective value of the
# trials over the course of the search) and render the figure.
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [80]:
# Plot Optuna's estimate of each hyper-parameter's importance for the
# objective and render the figure (reuses the module-level name `fig`).
fig = optuna.visualization.plot_param_importances(study)
fig.show()
