In [58]:
import pandas as pd
import numpy as np
train_enhanced = pd.read_csv('../data/train_pivot.csv', sep = ',')
eval_enhanced = pd.read_csv('../data/eval_pivot.csv', sep = ',')

molecular_train  =  pd.read_csv('../data/molecular_train.csv', sep = ',')
molecular_eval  =  pd.read_csv('../data/molecular_val.csv', sep = ',')

In [59]:
train_enhanced['cyto_risk_score'] = (
    3 * train_enhanced['is_monosomal_karyotype'] +
    3 * train_enhanced['is_complex_karyotype'] +
    2 * train_enhanced['has_minus7_or_del7q'] +
    2 * train_enhanced['has_minus5_or_del5q'] +
    1 * train_enhanced['has_plus8']
)

eval_enhanced['cyto_risk_score'] = (
    3 * eval_enhanced['is_monosomal_karyotype'] +
    3 * eval_enhanced['is_complex_karyotype'] +
    2 * eval_enhanced['has_minus7_or_del7q'] +
    2 * eval_enhanced['has_minus5_or_del5q'] +
    1 * eval_enhanced['has_plus8']
)

train_enhanced['TP53_complex_interaction'] = train_enhanced['Gene_TP53'] * train_enhanced['is_complex_karyotype']
train_enhanced['ASXL1_minus7_interaction'] = train_enhanced['Gene_ASXL1'] * train_enhanced['has_minus7_or_del7q']
train_enhanced['NPM1_normal_interaction'] = train_enhanced['Gene_NPM1'] * (1 - train_enhanced['prop_any_abnormal'])

eval_enhanced['TP53_complex_interaction'] = eval_enhanced['Gene_TP53'] * eval_enhanced['is_complex_karyotype']
eval_enhanced['ASXL1_minus7_interaction'] = eval_enhanced['Gene_ASXL1'] * eval_enhanced['has_minus7_or_del7q']
eval_enhanced['NPM1_normal_interaction'] = eval_enhanced['Gene_NPM1'] * (1 - eval_enhanced['prop_any_abnormal'])

train_enhanced['high_risk_chr_load'] = (
    train_enhanced['CHR_5_count'] +
    train_enhanced['CHR_7_count'] +
    train_enhanced['CHR_17_count']
)

eval_enhanced['high_risk_chr_load'] = (
    eval_enhanced['CHR_5_count'] +
    eval_enhanced['CHR_7_count'] +
    eval_enhanced['CHR_17_count']
)

In [60]:
train_enhanced['risk_score_high_genes'] = (
    train_enhanced['Gene_TP53'] +
    train_enhanced['Gene_ASXL1'] +
    train_enhanced['Gene_RUNX1']
)

train_enhanced['risk_score_favorable_genes'] = train_enhanced['Gene_NPM1'] + train_enhanced['Gene_CEBPA']

eval_enhanced['risk_score_high_genes'] = (
    eval_enhanced['Gene_TP53'] +    
    eval_enhanced['Gene_ASXL1'] +
    eval_enhanced['Gene_RUNX1']
)

eval_enhanced['risk_score_favorable_genes'] = eval_enhanced['Gene_NPM1'] + eval_enhanced['Gene_CEBPA']


train_enhanced['n_splicing_mut'] = train_enhanced[['Gene_U2AF1','Gene_SRSF2','Gene_SF3B1','Gene_ZRSR2']].sum(axis=1)
train_enhanced['n_signaling_mut'] = train_enhanced[['Gene_NRAS','Gene_KRAS','Gene_JAK2','Gene_CBL']].sum(axis=1)

eval_enhanced['n_splicing_mut'] = eval_enhanced[['Gene_U2AF1','Gene_SRSF2','Gene_SF3B1','Gene_ZRSR2']].sum(axis=1)
eval_enhanced['n_signaling_mut'] = eval_enhanced[['Gene_NRAS','Gene_KRAS','Gene_JAK2','Gene_CBL']].sum(axis=1)

train_enhanced['TP53_VAF_interaction'] = train_enhanced['Gene_TP53'] * train_enhanced['VAF_avg']
eval_enhanced['TP53_VAF_interaction'] = eval_enhanced['Gene_TP53'] * eval_enhanced['VAF_avg']


train_enhanced['ANC_WBC_ratio'] = train_enhanced['ANC'] / (train_enhanced['WBC']+1)
train_enhanced['BLAST_WBC_ratio'] = train_enhanced['BM_BLAST'] / (train_enhanced['WBC']+1)

eval_enhanced['ANC_WBC_ratio'] = eval_enhanced['ANC'] / (eval_enhanced['WBC']+1)
eval_enhanced['BLAST_WBC_ratio'] = eval_enhanced['BM_BLAST'] / (eval_enhanced['WBC']+1)



def safe_ratio(num, den):
    num = num.astype(float)
    den = den.astype(float)

    res = num / den
    res[~np.isfinite(res)] = np.nan   # remplace inf, -inf, nan par NaN
    return res

for df in [train_enhanced, eval_enhanced]:
    # remplace les "valeurs sentinelles" de HB
    df.loc[df["HB"] <= -0.99, "HB"] = np.nan
    
    df["PLT_HB_ratio"] = safe_ratio(df["PLT"], df["HB"] + 1)



train_enhanced['major_clone_VAF'] = train_enhanced['VAF_max']
train_enhanced['subclonality'] = train_enhanced['VAF_std'] / (train_enhanced['VAF_avg']+1e-6)

eval_enhanced['major_clone_VAF'] = eval_enhanced['VAF_max']
eval_enhanced['subclonality'] = eval_enhanced['VAF_std'] / (eval_enhanced['VAF_avg']+1e-6)


train_enhanced['karyo_score_clinical'] = (
    3 * train_enhanced['is_monosomal_karyotype'] +
    2 * train_enhanced['is_complex_karyotype'] +
    2 * train_enhanced['has_minus7_or_del7q'] +
    1 * train_enhanced['has_plus8']
)

eval_enhanced['karyo_score_clinical'] = (
    3 * eval_enhanced['is_monosomal_karyotype'] +
    2 * eval_enhanced['is_complex_karyotype'] +
    2 * eval_enhanced['has_minus7_or_del7q'] +
    1 * eval_enhanced['has_plus8']
)

In [81]:
import numpy as np
import pandas as pd

def compute_vaf_entropy(df_mut):
    """
    df_mut doit contenir au minimum :
        - 'ID'  : identifiant patient
        - 'VAF' : fraction variant allele
    Retourne un dataframe avec une seule ligne par patient :
        ID | vaf_entropy
    """

    # Fonction locale de Shannon entropy
    def entropy_from_vaf(vaf_list):
        vaf_arr = np.array(vaf_list)
        
        # Normalisation ‚Üí proportions p_i
        p = vaf_arr / vaf_arr.sum()

        # Somme seulement sur p_i > 0 sinon log pose pb.
        p = p[p > 0]

        return -np.sum(p * np.log(p))

    entropy_per_patient = (
        df_mut.groupby('ID')['VAF']
              .apply(entropy_from_vaf)
              .reset_index()
              .rename(columns={'VAF': 'vaf_entropy'})
    )

    return entropy_per_patient


entropy_train = compute_vaf_entropy(molecular_train)
entropy_eval = compute_vaf_entropy(molecular_eval)
    
train_enhanced = train_enhanced.merge(entropy_train, on='ID', how='left')
eval_enhanced = eval_enhanced.merge(entropy_eval, on='ID', how='left')

R√©sum√© du plan:
- Pr√©parer les donn√©es √† partir de train_enhanced et y (temps et statut).
- Entra√Æner un XGB AFT avec K-fold cross-validation (K=5 par d√©faut) et √©valuer les scores via concordance (C-index).
- Former le mod√®le sur l‚Äôensemble des donn√©es et faire des pr√©dictions sur le jeu d‚Äô√©valuation, en g√©n√©rant un CSV avec les colonnes "ID" et "risk_score" (risque = oppos√© du temps de survie).
- Sauvegarder le CSV final dans le r√©pertoire appropri√©.

In [62]:
for df in [train_enhanced, eval_enhanced]:
    df["vaf_entropy"] = df["vaf_entropy"].fillna(0.0)


# On calcule la m√©diane sur le TRAIN uniquement
hb_median = train_enhanced["HB"].median()

# On applique la m√™me valeur aux deux jeux
train_enhanced["HB"]  = train_enhanced["HB"].fillna(hb_median)
eval_enhanced["HB"]   = eval_enhanced["HB"].fillna(hb_median)

In [63]:
from sklearn.preprocessing import RobustScaler
import pandas as pd

# Identifier les variables num√©riques (excluant OS_YEARS et OS_STATUS)
numeric_cols = train_enhanced.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in ['OS_YEARS', 'OS_STATUS']]

# Initialiser le RobustScaler
scaler = RobustScaler()

# Appliquer le scaler √† train_enhanced
train_enhanced[numeric_cols] = scaler.fit_transform(train_enhanced[numeric_cols])

# Appliquer le scaler √† eval_enhanced (avec les param√®tres de train)
eval_enhanced[numeric_cols] = scaler.transform(eval_enhanced[numeric_cols])

print("RobustScaler appliqu√© avec succ√®s!")
print(f"Variables normalis√©es: {numeric_cols}")
print(f"\nStatistiques apr√®s normalisation (train_enhanced):")
print(train_enhanced[numeric_cols].describe())

RobustScaler appliqu√© avec succ√®s!
Variables normalis√©es: ['has_any_abnormality', 'LEN_max', 'Gene_CEBPA', 'Gene_NFE2', 'DEPTH_min', 'Gene_RUNX1', 'EFFECT_nunique', 'Gene_STAG2', 'has_plus8', 'Gene_ASXL2', 'CHR_19_count', 'Gene_ARID2', 'Gene_CUX1', 'EFFECT_FV_count', 'Gene_SF3B1', 'Gene_U2AF2', 'Gene_FLT3', 'Gene_SH2B3', 'Gene_ZRSR2', 'Gene_JAK2', 'CHR_17_count', 'MONOCYTES', 'BM_BLAST', 'Gene_BCORL1', 'CHR_1_count', 'CHR_15_count', 'Gene_EZH2', 'CHR_3_count', 'Gene_SRSF2', 'PLT', 'CHR_9_count', 'EFFECT_NS_count', 'total_metaphases', 'WBC', 'has_minus7_or_del7q', 'CHR_7_count', 'n_events', 'is_complex_karyotype', 'CHR_4_count', 'Gene_PHF6', 'Gene_IDH1', 'VAF_avg', 'Gene_DDX41', 'Nmut', 'Gene_BRCC3', 'Gene_KRAS', 'Gene_NPM1', 'LEN_avg', 'prop_plus8', 'Gene_IDH2', 'Gene_PPM1D', 'Gene_MLL', 'Gene_CTCF', 'prop_any_abnormal', 'CHR_11_count', 'Gene_PRPF8', 'Gene_U2AF1', 'is_monosomal_karyotype', 'DEPTH_avg', 'CHR_2_count', 'Gene_GNB1', 'Gene_PTPN11', 'Gene_TET2', 'ANC', 'DELLEN_sum', 'EFF

In [64]:
train_enhanced.to_csv('../data/train_enhanced_V2.csv', index=False)
eval_enhanced.to_csv('../data/eval_enhanced_V2.csv', index=False)

In [65]:
# %%
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sksurv.metrics import concordance_index_censored
import xgboost as xgb  # <--- ajout important

time_col = "OS_YEARS"
event_col = "OS_STATUS"
exclude_cols = {time_col, event_col, "ID"}
feature_cols = [c for c in train_enhanced.columns if c not in exclude_cols]

X_df = train_enhanced[feature_cols].astype(float)
X_df = X_df.replace([np.inf, -np.inf], np.nan)

X = X_df.to_numpy(dtype=float)

time_vals = train_enhanced[time_col].to_numpy(dtype=float)     # temps bruts
event_vals = train_enhanced[event_col].to_numpy(dtype=bool)    # 1 = event, 0 = censur√©

# Pr√©parer y pour XGBoost AFT : log(1 + temps)
y_log_time = np.log1p(time_vals)


In [66]:
# Param√®tres AFT
params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "min_child_weight": 1,
    "gamma": 0.0,
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 1.0,
    "tree_method": "hist",
    "seed": 42,
}

num_boost_round = 200

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

c_indices = []
fold = 1

for train_idx, val_idx in kf.split(X):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train_log, y_val_log = y_log_time[train_idx], y_log_time[val_idx]
    e_train, e_val = event_vals[train_idx], event_vals[val_idx]

    # Bornes inf / sup en log-temps :
    # - non censur√© : [log(t), log(t)]
    # - censur√© √† droite : [log(t), +inf]
    lb_train = y_train_log.copy()
    ub_train = y_train_log.copy()
    ub_train[e_train == 0] = np.inf

    lb_val = y_val_log.copy()
    ub_val = y_val_log.copy()
    ub_val[e_val == 0] = np.inf

    dtrain = xgb.DMatrix(X_train, missing=np.nan)


    dtrain.set_float_info("label_lower_bound", lb_train)
    dtrain.set_float_info("label_upper_bound", ub_train)

    dval = xgb.DMatrix(X_val, missing=np.nan)
    dval.set_float_info("label_lower_bound", lb_val)
    dval.set_float_info("label_upper_bound", ub_val)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, "valid")],
        verbose_eval=False,
    )

    # Pr√©dictions de log-temps sur la validation
    y_pred_log = model.predict(dval)

    # C-index : on prend -y_pred_log comme "risque"
    c_index = concordance_index_censored(
    e_val.astype(bool),      # event_indicator
    time_vals[val_idx],      # event_time
    -y_pred_log)[0]


    c_indices.append(c_index)
    print(f"Fold {fold} ‚Äî C-index: {c_index:.4f}")
    fold += 1

print("\nC-index moyen :", np.mean(c_indices))


Fold 1 ‚Äî C-index: 0.7447
Fold 2 ‚Äî C-index: 0.7139
Fold 3 ‚Äî C-index: 0.7388
Fold 4 ‚Äî C-index: 0.7288
Fold 5 ‚Äî C-index: 0.7218

C-index moyen : 0.7295902246326025


## Entra√Ænement sur l'ensemble complet et pr√©diction

In [67]:
X_full = train_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
y_full_log_time = np.log1p(time_vals)

lb_full = y_full_log_time.copy()
ub_full = y_full_log_time.copy()
ub_full[event_vals == 0] = np.inf

dfull = xgb.DMatrix(X_full)
dfull.set_float_info("label_lower_bound", lb_full)
dfull.set_float_info("label_upper_bound", ub_full)

model_full = xgb.train(
    params,
    dfull,
    num_boost_round=num_boost_round,
    verbose_eval=False
)

## Tuning des hyperparam√®tres XGBoost AFT avec Optuna

In [68]:
!pip install optuna




[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [69]:
import optuna
from sklearn.model_selection import KFold
import numpy as np
import xgboost as xgb
from sksurv.metrics import concordance_index_censored

In [70]:
def objective(trial):
    params = {
        "objective": "survival:aft",
        "eval_metric": "aft-nloglik",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "aft_loss_distribution": trial.suggest_categorical("aft_loss_distribution", ["normal", "logistic", "extreme"]),
        "aft_loss_distribution_scale": trial.suggest_float("aft_loss_distribution_scale", 0.5, 2.0),
        "tree_method": "hist",
        "seed": 42,
    }

    num_boost_round = trial.suggest_int("num_boost_round", 50, 500)

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    c_indices = []

    for train_idx, val_idx in kf.split(X):
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_log_fold, y_val_log_fold = y_log_time[train_idx], y_log_time[val_idx]
        e_train_fold, e_val_fold = event_vals[train_idx], event_vals[val_idx]

        lb_train_fold = y_train_log_fold.copy()
        ub_train_fold = y_train_log_fold.copy()
        ub_train_fold[e_train_fold == 0] = np.inf

        lb_val_fold = y_val_log_fold.copy()
        ub_val_fold = y_val_log_fold.copy()
        ub_val_fold[e_val_fold == 0] = np.inf

        dtrain_fold = xgb.DMatrix(X_train_fold)
        dtrain_fold.set_float_info("label_lower_bound", lb_train_fold)
        dtrain_fold.set_float_info("label_upper_bound", ub_train_fold)

        dval_fold = xgb.DMatrix(X_val_fold)
        dval_fold.set_float_info("label_lower_bound", lb_val_fold)
        dval_fold.set_float_info("label_upper_bound", ub_val_fold)

        model_fold = xgb.train(
            params,
            dtrain_fold,
            num_boost_round=num_boost_round,
            evals=[(dval_fold, "valid")],
            verbose_eval=False,
        )

        y_pred_log_fold = model_fold.predict(dval_fold)

        c_index = concordance_index_censored(
            e_val_fold.astype(bool),
            time_vals[val_idx],
            -y_pred_log_fold
        )[0]

        c_indices.append(c_index)

    return np.mean(c_indices)

In [71]:
study = optuna.create_study(direction="maximize", study_name="xgb_aft_tuning")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2025-12-05 15:37:20,043] A new study created in memory with name: xgb_aft_tuning


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-12-05 15:37:20,628] Trial 0 finished with value: 0.7317449272337548 and parameters: {'learning_rate': 0.02851288120192674, 'max_depth': 10, 'subsample': 0.8365205400350113, 'colsample_bytree': 0.6805876593796931, 'min_child_weight': 5, 'gamma': 1.2812466092048247, 'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 1.7681932946431616, 'num_boost_round': 51}. Best is trial 0 with value: 0.7317449272337548.
[I 2025-12-05 15:37:22,919] Trial 1 finished with value: 0.7168458311229591 and parameters: {'learning_rate': 0.08264666968058107, 'max_depth': 7, 'subsample': 0.9537066609307765, 'colsample_bytree': 0.9365284796816487, 'min_child_weight': 5, 'gamma': 0.19353680647901772, 'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 1.0492871580306267, 'num_boost_round': 450}. Best is trial 0 with value: 0.7317449272337548.
[I 2025-12-05 15:37:23,655] Trial 2 finished with value: 0.7326385429867969 and parameters: {'learning_rate': 0.015005865840181295, '

In [82]:
print("Meilleurs hyperparam√®tres:")
print(study.best_params)
print(f"\nMeilleur C-index moyen: {study.best_value:.4f}")

Meilleurs hyperparam√®tres:
{'learning_rate': 0.01087120676844201, 'max_depth': 4, 'subsample': 0.5861618401934464, 'colsample_bytree': 0.5908065443333089, 'min_child_weight': 5, 'gamma': 3.740396992063645, 'aft_loss_distribution': 'extreme', 'aft_loss_distribution_scale': 0.5370549038978948, 'num_boost_round': 482}

Meilleur C-index moyen: 0.7415


In [83]:
best_params = study.best_params
best_params["objective"] = "survival:aft"
best_params["eval_metric"] = "aft-nloglik"
best_params["tree_method"] = "hist"
best_params["seed"] = 42

best_num_boost_round = best_params.pop("num_boost_round")

In [84]:
lb_full_final = y_full_log_time.copy()
ub_full_final = y_full_log_time.copy()
ub_full_final[event_vals == 0] = np.inf

dfull_final = xgb.DMatrix(X_full)
dfull_final.set_float_info("label_lower_bound", lb_full_final)
dfull_final.set_float_info("label_upper_bound", ub_full_final)

model_tuned = xgb.train(
    best_params,
    dfull_final,
    num_boost_round=best_num_boost_round,
    verbose_eval=False
)

In [75]:
eval_enhanced

Unnamed: 0,LEN_avg,has_plus8,CHR_16_count,total_metaphases,CHR_20_count,DEPTH_min,ID,prop_any_abnormal,VAF_avg,CHR_5_count,...,n_splicing_mut,n_signaling_mut,TP53_VAF_interaction,ANC_WBC_ratio,BLAST_WBC_ratio,PLT_HB_ratio,major_clone_VAF,subclonality,karyo_score_clinical,vaf_entropy
0,-0.250000,0.0,0.0,0.0,0.0,-0.656352,KYW1,3.00,-0.153460,1.0,...,-1.0,0.0,-0.000000,-0.694589,13.026763,-2.638909,-0.171875,-1.606796,2.0,0.204085
1,-0.250000,0.0,0.0,0.0,0.0,-0.366450,KYW2,3.00,-0.008504,0.0,...,-1.0,1.0,-0.000000,-0.406093,6.975369,-0.560707,1.113281,-107.637397,2.0,-0.251553
2,0.083333,1.0,0.0,-1.0,2.0,0.684039,KYW3,0.00,-0.501765,0.0,...,-1.0,0.0,-0.000000,0.894022,0.290279,-0.341129,-0.394531,-0.520674,1.0,-0.051060
3,-0.250000,0.0,0.0,-1.0,0.0,0.913681,KYW4,0.00,0.284813,0.0,...,-1.0,0.0,0.000000,0.016160,7.144954,-1.548793,0.000000,0.128167,0.0,0.121087
4,-0.250000,0.0,0.0,-0.2,1.0,0.105863,KYW5,2.25,-0.080756,0.0,...,0.0,0.0,-0.000000,-1.871453,-0.604846,-1.132212,-0.082031,-4.304717,7.0,-0.030057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,-0.250000,0.0,0.0,-1.0,0.0,-0.382736,KYW1189,0.00,0.677199,0.0,...,-1.0,0.0,0.000000,-0.432026,0.583807,0.055997,0.183594,-0.496069,0.0,-0.138957
1189,4.083333,0.0,0.0,-1.0,0.0,-0.895765,KYW1190,0.00,0.648715,0.0,...,-1.0,1.0,0.000000,-0.432026,0.583807,0.055997,0.859375,0.749892,0.0,0.030148
1190,-0.250000,0.0,0.0,-1.0,0.0,0.451140,KYW1191,0.00,-0.671280,0.0,...,-1.0,0.0,-0.671280,-0.432026,0.583807,0.055997,-1.195312,0.301222,0.0,-0.640841
1191,-0.250000,0.0,0.0,-1.0,0.0,-0.416938,KYW1192,0.00,0.094317,2.0,...,-1.0,0.0,0.000000,-0.432026,0.583807,0.055997,0.308594,2.395448,0.0,0.083659


In [76]:
submission_tuned

Unnamed: 0,ID,risk_score
0,KYW1,-0.826686
1,KYW2,-0.831430
2,KYW3,-1.349373
3,KYW4,-0.882311
4,KYW5,-0.718750
...,...,...
1188,KYW1189,-1.849656
1189,KYW1190,-1.744017
1190,KYW1191,-1.560686
1191,KYW1192,-1.797131


In [77]:
assert submission_tuned.shape == (eval_enhanced.shape[0], 2)
assert all(submission_tuned['ID'] == eval_enhanced['ID'])

In [85]:
# Diagnostic du probl√®me d'index
print("=== DIAGNOSTIC DE L'INDEX ===")

print(f"Shape de eval_enhanced: {eval_enhanced.shape}")
print(f"Shape de submission_tuned: {submission_tuned.shape}")

print(f"\nPremiers IDs dans eval_enhanced:")
print(eval_enhanced['ID'].head(10).tolist())

print(f"\nPremiers IDs dans submission_tuned:")
print(submission_tuned['ID'].head(10).tolist())

print(f"\nDerniers IDs dans eval_enhanced:")
print(eval_enhanced['ID'].tail(5).tolist())

print(f"\nDerniers IDs dans submission_tuned:")
print(submission_tuned['ID'].tail(5).tolist())

# V√©rifier s'il y a des doublons
print(f"\nDoublons dans eval_enhanced['ID']: {eval_enhanced['ID'].duplicated().sum()}")
print(f"Doublons dans submission_tuned['ID']: {submission_tuned['ID'].duplicated().sum()}")

# V√©rifier l'ordre
ids_match = (submission_tuned['ID'] == eval_enhanced['ID']).all()
print(f"\nTous les IDs correspondent dans le m√™me ordre: {ids_match}")

if not ids_match:
    # Trouver les diff√©rences
    diff_positions = (submission_tuned['ID'] != eval_enhanced['ID'])
    if diff_positions.any():
        first_diff = diff_positions.idxmax()
        print(f"\nPremi√®re diff√©rence √† la position {first_diff}:")
        print(f"eval_enhanced: {eval_enhanced.loc[first_diff, 'ID']}")
        print(f"submission_tuned: {submission_tuned.loc[first_diff, 'ID']}")

# V√©rifier si submission_tuned a un index personnalis√©
print(f"\nIndex de submission_tuned: {submission_tuned.index.tolist()[:10]}")
print(f"Index de eval_enhanced: {eval_enhanced.index.tolist()[:10]}")

# V√©rifier le type de l'index
print(f"\nType d'index submission_tuned: {type(submission_tuned.index)}")
print(f"Type d'index eval_enhanced: {type(eval_enhanced.index)}")

# V√©rifier s'il y a des valeurs manquantes
print(f"\nNaN dans submission_tuned['ID']: {submission_tuned['ID'].isna().sum()}")
print(f"NaN dans eval_enhanced['ID']: {eval_enhanced['ID'].isna().sum()}")
print(f"NaN dans submission_tuned['risk_score']: {submission_tuned['risk_score'].isna().sum()}")

=== DIAGNOSTIC DE L'INDEX ===
Shape de eval_enhanced: (1193, 117)
Shape de submission_tuned: (1193, 2)

Premiers IDs dans eval_enhanced:
['KYW1', 'KYW2', 'KYW3', 'KYW4', 'KYW5', 'KYW6', 'KYW7', 'KYW8', 'KYW9', 'KYW10']

Premiers IDs dans submission_tuned:
['KYW1', 'KYW2', 'KYW3', 'KYW4', 'KYW5', 'KYW6', 'KYW7', 'KYW8', 'KYW9', 'KYW10']

Derniers IDs dans eval_enhanced:
['KYW1189', 'KYW1190', 'KYW1191', 'KYW1192', 'KYW1193']

Derniers IDs dans submission_tuned:
['KYW1189', 'KYW1190', 'KYW1191', 'KYW1192', 'KYW1193']

Doublons dans eval_enhanced['ID']: 0
Doublons dans submission_tuned['ID']: 0

Tous les IDs correspondent dans le m√™me ordre: True

Index de submission_tuned: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Index de eval_enhanced: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

Type d'index submission_tuned: <class 'pandas.core.indexes.range.RangeIndex'>
Type d'index eval_enhanced: <class 'pandas.core.indexes.range.RangeIndex'>

NaN dans submission_tuned['ID']: 0
NaN dans eval_enhanced['ID']: 0
NaN dans su

In [86]:
# CORRECTION: Cr√©er une nouvelle soumission avec l'index correct
print("=== CR√âATION D'UNE SOUMISSION CORRIG√âE ===")

# S'assurer que le dataframe de soumission a le bon format
submission_corrected = pd.DataFrame({
    'ID': eval_enhanced['ID'].values,  # Utiliser .values pour √©viter les probl√®mes d'index
    'risk_score': risk_score_tuned
})

# R√©initialiser l'index pour √™tre s√ªr
submission_corrected = submission_corrected.reset_index(drop=True)

print(f"Shape de la soumission corrig√©e: {submission_corrected.shape}")
print(f"Premiers IDs: {submission_corrected['ID'].head().tolist()}")
print(f"Derniers IDs: {submission_corrected['ID'].tail().tolist()}")

# V√©rifications finales
assert submission_corrected.shape[0] == eval_enhanced.shape[0], "Nombre de lignes diff√©rent"
assert submission_corrected.shape[1] == 2, "Nombre de colonnes incorrect"
assert submission_corrected.columns.tolist() == ['ID', 'risk_score'], "Noms de colonnes incorrects"
assert not submission_corrected['ID'].isna().any(), "IDs manquants"
assert not submission_corrected['risk_score'].isna().any(), "Risk scores manquants"
assert submission_corrected['ID'].nunique() == len(submission_corrected), "IDs dupliqu√©s"

print("‚úÖ Toutes les v√©rifications sont pass√©es!")

# Sauvegarder la version corrig√©e
submission_corrected.to_csv('../submissions/submission_xgb_aft_tuned_CORRECTED.csv', index=False)
print("üìÅ Soumission sauvegard√©e dans '../submissions/submission_xgb_aft_tuned_CORRECTED.csv'")

# Afficher un aper√ßu
print("\nüìã Aper√ßu de la soumission corrig√©e:")
print(submission_corrected.head(10))
print("...")
print(submission_corrected.tail(5))

=== CR√âATION D'UNE SOUMISSION CORRIG√âE ===
Shape de la soumission corrig√©e: (1193, 2)
Premiers IDs: ['KYW1', 'KYW2', 'KYW3', 'KYW4', 'KYW5']
Derniers IDs: ['KYW1189', 'KYW1190', 'KYW1191', 'KYW1192', 'KYW1193']
‚úÖ Toutes les v√©rifications sont pass√©es!
üìÅ Soumission sauvegard√©e dans '../submissions/submission_xgb_aft_tuned_CORRECTED.csv'

üìã Aper√ßu de la soumission corrig√©e:
      ID  risk_score
0   KYW1   -0.758699
1   KYW2   -0.874134
2   KYW3   -1.449184
3   KYW4   -0.984297
4   KYW5   -0.776960
5   KYW6   -1.127292
6   KYW7   -1.025532
7   KYW8   -1.031210
8   KYW9   -2.690758
9  KYW10   -2.074367
...
           ID  risk_score
1188  KYW1189   -1.914463
1189  KYW1190   -1.834337
1190  KYW1191   -1.621109
1191  KYW1192   -1.869528
1192  KYW1193   -1.150983


In [None]:
# Comparer avec une soumission existante qui fonctionne
print("=== COMPARAISON AVEC UNE SOUMISSION EXISTANTE ===")

try:
    # Charger une soumission existante pour comparaison
    existing_submission = pd.read_csv('../submissions/coxph_enhanced.csv')
    
    print(f"Shape soumission existante: {existing_submission.shape}")
    print(f"Colonnes soumission existante: {existing_submission.columns.tolist()}")
    print(f"Premiers IDs soumission existante: {existing_submission['ID'].head().tolist()}")
    
    # V√©rifier si les IDs correspondent
    ids_match_existing = (existing_submission['ID'] == eval_enhanced['ID']).all()
    print(f"IDs correspondent avec eval_enhanced: {ids_match_existing}")
    
    if ids_match_existing:
        print("‚úÖ L'ordre des IDs dans la soumission existante est correct")
        
        # Utiliser le m√™me ordre pour notre soumission
        submission_final = pd.DataFrame({
            'ID': existing_submission['ID'].values,
            'risk_score': risk_score_tuned
        })
        
        # Double v√©rification
        assert (submission_final['ID'] == existing_submission['ID']).all()
        print("‚úÖ Ordre des IDs confirm√© identique √† la soumission de r√©f√©rence")
        
    else:
        print("‚ö†Ô∏è Probl√®me avec l'ordre des IDs dans la soumission existante aussi")
        submission_final = submission_corrected.copy()
    
except FileNotFoundError:
    print("‚ö†Ô∏è Fichier de soumission de r√©f√©rence non trouv√©, utilisation de la version corrig√©e")
    submission_final = submission_corrected.copy()

# Sauvegarder la version finale
submission_final.to_csv('../submissions/submission_xgb_aft_FINAL.csv', index=False)
print("üíæ Soumission FINALE sauvegard√©e dans '../submissions/submission_xgb_aft_FINAL.csv'")

print(f"\nüìä Statistiques de la soumission finale:")
print(f"- Nombre de lignes: {len(submission_final)}")
print(f"- Risk score min: {submission_final['risk_score'].min():.4f}")
print(f"- Risk score max: {submission_final['risk_score'].max():.4f}")
print(f"- Risk score m√©dian: {submission_final['risk_score'].median():.4f}")

submission_final

In [78]:
X_eval_final = eval_enhanced[feature_cols].fillna(0.0).to_numpy(dtype=float)
deval_final = xgb.DMatrix(X_eval_final)

y_pred_log_eval_tuned = model_tuned.predict(deval_final)
risk_score_tuned = -y_pred_log_eval_tuned

submission_tuned = pd.DataFrame({
    'ID': eval_enhanced['ID'],
    'risk_score': risk_score_tuned
})

submission_tuned.to_csv('../submissions/submission_xgb_aft_tuned.csv', index=False)
print("Pr√©dictions avec hyperparam√®tres optimis√©s sauvegard√©es dans '../submissions/submission_xgb_aft_tuned.csv'")

Pr√©dictions avec hyperparam√®tres optimis√©s sauvegard√©es dans '../submissions/submission_xgb_aft_tuned.csv'


In [79]:
import plotly.express as px

fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [80]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()
