# Preprocessing

Preprocessing notebook

In [1783]:
import numpy as np
import pandas as pd

pd.options.plotting.backend = "plotly"

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv

df = pd.read_csv("../../data/clinical_train.csv")
df_eval = pd.read_csv("../../data/clinical_val.csv")
maf_df = pd.read_csv("../../data/molecular_train.csv")
maf_eval = pd.read_csv("../../data/molecular_val.csv")
target_df = pd.read_csv("../../data/target_train.csv")

In [1784]:
maf_df2 = pd.read_csv("../../data/molecular_train.csv")
maf_eval2 = pd.read_csv("../../data/molecular_val.csv")

In [1785]:
maf_df.isna().sum()

ID                  0
CHR               114
START             114
END               114
REF               114
ALT               114
GENE                0
PROTEIN_CHANGE     12
EFFECT              0
VAF                89
DEPTH             114
dtype: int64

In [1786]:
target = ['OS_YEARS', 'OS_STATUS']

# Drop rows where 'OS_YEARS' is NaN if conversion caused any issues
target_df.dropna(subset=target, inplace=True)
target_df['OS_YEARS'] = pd.to_numeric(target_df['OS_YEARS'], errors='coerce')
target_df['OS_STATUS'] = target_df['OS_STATUS'].astype(bool)

# Select features
features = ['ID', 'BM_BLAST', 'WBC', 'ANC', 'MONOCYTES', 'HB', 'PLT', 'CYTOGENETICS']

# Create the survival data format
X = df.loc[df['ID'].isin(target_df['ID']), features]
X_eval = df_eval.loc[:, features]
y = Surv.from_dataframe('OS_STATUS', 'OS_YEARS', target_df)


# Impute Missing values

In [None]:
import numpy as np
import pandas as pd
import optuna
from tqdm.notebook import tqdm
import xgboost as xgb

# -------------------------------------------------------------------
# 0. On suppose que X et X_eval existent déjà (DataFrames)
# -------------------------------------------------------------------

# ----------------------
# 1. Colonnes numériques + indicateurs manquants
# ----------------------
num_cols = X.select_dtypes("number").columns

# Indicateurs de NA AVANT imputation
for df_ in (X, X_eval):
    for col in num_cols:
        df_[f"{col}_missing"] = df_[col].isna().astype("int8")

# Matrice numpy (seulement les colonnes numériques originales)
X_num = X[num_cols].to_numpy(dtype=float)
n_rows, n_features = X_num.shape

# ----------------------
# 2. Tuning Optuna PAR FEATURE (1000 trials + barre de progression par feature)
# ----------------------
base_xgb_params = dict(
    objective="reg:squarederror",
    tree_method="hist",
    n_jobs=-1,
    random_state=0,
)

n_trials = 1000       # <---- 1000 TRIALS
mask_frac = 0.1
rng = np.random.RandomState(0)

best_params_per_feature = {}


def make_objective_for_feature(j):
    """Objective Optuna pour la feature j uniquement."""
    def objective(trial):
        # Hyperparamètres XGBoost à optimiser
        xgb_params_trial = {
            "max_depth": trial.suggest_int("max_depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 50, 500),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        }

        y = X_num[:, j]
        X_features = np.delete(X_num, j, axis=1)

        # on ne garde que les valeurs non-NA de la colonne
        not_nan_y = ~np.isnan(y)
        if not_nan_y.sum() < 20:
            # pas assez de données : on renvoie un score pourri
            return 1e6

        y_obs = y[not_nan_y]
        X_obs = X_features[not_nan_y]

        # masque aléatoire pour l'évaluation
        mask_eval = rng.rand(len(y_obs)) < mask_frac
        if mask_eval.sum() == 0 or (~mask_eval).sum() < 10:
            return 1e6

        X_train = X_obs[~mask_eval]
        y_train = y_obs[~mask_eval]
        X_eval_local = X_obs[mask_eval]
        y_eval_local = y_obs[mask_eval]

        model = xgb.XGBRegressor(
            **base_xgb_params,
            **xgb_params_trial
        )

        model.fit(X_train, y_train)

        y_pred = model.predict(X_eval_local)
        mse = np.mean((y_pred - y_eval_local) ** 2)
        return mse

    return objective


# Boucle Optuna sur chaque feature
for j, col in enumerate(num_cols):
    # Vérification rapide pour éviter de lancer Optuna si trop peu de données
    y_col = X_num[:, j]
    if (~np.isnan(y_col)).sum() < 20:
        continue

    print(f"Tuning feature {j+1}/{len(num_cols)} : {col}")

    study = optuna.create_study(direction="minimize")

    # Barre de progression POUR CETTE FEATURE
    with tqdm(total=n_trials, desc=f"{col}") as pbar:
        def callback(study_, trial_):
            # Affiche la meilleure MSE courante dans la description
            pbar.set_description(f"{col} | best MSE={study_.best_value:.5f}")
            pbar.update(1)

        study.optimize(
            make_objective_for_feature(j),
            n_trials=n_trials,
            callbacks=[callback],
        )

    # On stocke les meilleurs hyperparamètres pour cette feature
    best_params_per_feature[col] = {**base_xgb_params, **study.best_params}

print("Nombre de features tunées :", len(best_params_per_feature))

# ----------------------
# 3. Fit des imputers finaux (un modèle par feature)
# ----------------------
def fit_xgb_imputers_per_feature(X_df, num_cols, best_params_per_feature):
    models = {}
    X_values = X_df[num_cols].to_numpy(dtype=float)

    for j, col in enumerate(num_cols):
        y = X_values[:, j]
        missing_mask = np.isnan(y)
        not_missing = ~missing_mask

        if not_missing.sum() < 20:
            # Pas assez de données pour entraîner un modèle
            continue

        # Si on n'a pas de params Optuna pour cette colonne, on skip
        if col not in best_params_per_feature:
            continue

        X_features = np.delete(X_values, j, axis=1)

        X_train = X_features[not_missing]
        y_train = y[not_missing]

        params = best_params_per_feature[col]
        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train)

        models[col] = model

    return models


def transform_with_xgb_imputers(X_df, num_cols, models):
    X_values = X_df[num_cols].to_numpy(dtype=float)

    for j, col in enumerate(num_cols):
        missing_mask = np.isnan(X_values[:, j])

        if missing_mask.any():
            if col in models:
                X_features = np.delete(X_values, j, axis=1)
                X_missing = X_features[missing_mask]
                preds = models[col].predict(X_missing)
                X_values[missing_mask, j] = preds
            else:
                # fallback : médiane de la colonne
                X_values[missing_mask, j] = np.nanmedian(X_values[:, j])

    return pd.DataFrame(X_values, columns=num_cols, index=X_df.index)

# ----------------------
# 4. Imputation finale
# ----------------------
xgb_models = fit_xgb_imputers_per_feature(X, num_cols, best_params_per_feature)

X[num_cols] = transform_with_xgb_imputers(X, num_cols, xgb_models)
X_eval[num_cols] = transform_with_xgb_imputers(X_eval, num_cols, xgb_models)


[I 2025-12-09 12:06:11,492] A new study created in memory with name: no-name-23ed9c2f-384d-4e1e-9fa4-a790a4c4e3cd


Tuning feature 1/6 : BM_BLAST


BM_BLAST:   0%|          | 0/1000 [00:00<?, ?it/s]

[I 2025-12-09 12:06:12,088] Trial 0 finished with value: 46.69465195713341 and parameters: {'max_depth': 7, 'learning_rate': 0.01689159647526214, 'n_estimators': 291, 'subsample': 0.6620127324208509, 'colsample_bytree': 0.5588092341457913}. Best is trial 0 with value: 46.69465195713341.
[I 2025-12-09 12:06:12,224] Trial 1 finished with value: 37.748426145769514 and parameters: {'max_depth': 3, 'learning_rate': 0.11828402898336147, 'n_estimators': 318, 'subsample': 0.7465520005165277, 'colsample_bytree': 0.7252672803024504}. Best is trial 1 with value: 37.748426145769514.
[I 2025-12-09 12:06:12,362] Trial 2 finished with value: 55.75175307363229 and parameters: {'max_depth': 6, 'learning_rate': 0.2254210340812137, 'n_estimators': 164, 'subsample': 0.7122750845669046, 'colsample_bytree': 0.7606738711859645}. Best is trial 1 with value: 37.748426145769514.
[I 2025-12-09 12:06:12,606] Trial 3 finished with value: 52.16733656446621 and parameters: {'max_depth': 4, 'learning_rate': 0.0264482

In [None]:
# from sklearn.impute import SimpleImputer
# num_cols = X.select_dtypes("number").columns # Sauvegarde des indicateurs de manquant AVANT imputation 
# for df_ in [X, X_eval]: 
#     for col in num_cols: 
#         df_[f"{col}_missing"] = df_[col].isna().astype(int) 

# imputer = SimpleImputer(strategy="median") 
# X[num_cols] = imputer.fit_transform(X[num_cols]) 
# X_eval[num_cols] = imputer.transform(X_eval[num_cols])

## Enhanced Mutation Features

Computing mutation count (Nmut) along with VAF statistics (average, std, max) and length statistics (average, std, max) for both training and evaluation datasets.

Looking at the user request and the selected cell, I need to apply RobustScaler to the new mutation features (Nmut, VAF_avg, VAF_std, VAF_max, LEN_avg, LEN_std, LEN_max) that are computed in the `compute_mutation_features` function.

The task is to:
1. Apply RobustScaler to the newly computed mutation features after merging them with the dataframe
2. Fit the scaler on training data and transform both training and evaluation data



In [None]:
pd.get_dummies(maf_df, columns=["EFFECT"])
pd.get_dummies(maf_eval, columns=["EFFECT"])

Unnamed: 0,ID,CHR,START,END,REF,ALT,GENE,PROTEIN_CHANGE,VAF,DEPTH,EFFECT_ITD,EFFECT_PTD,EFFECT_frameshift_variant,EFFECT_inframe_codon_gain,EFFECT_inframe_codon_loss,EFFECT_non_synonymous_codon,EFFECT_stop_gained,EFFECT_stop_lost
0,KYW961,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,0.2620,485.0,False,False,False,False,False,True,False,False
1,KYW142,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,0.0280,527.0,False,False,False,False,False,True,False,False
2,KYW453,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,0.2920,277.0,False,False,False,False,False,True,False,False
3,KYW982,1,1747229.0,1747229.0,T,C,GNB1,p.K57E,0.0970,821.0,False,False,False,False,False,True,False,False
4,KYW845,1,36932209.0,36932209.0,G,A,CSF3R,p.Q754X,0.4300,358.0,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3084,KYW1077,,,,,,MLL,MLL_PTD,0.4231,,False,True,False,False,False,False,False,False
3085,KYW1084,,,,,,MLL,MLL_PTD,0.0176,,False,True,False,False,False,False,False,False
3086,KYW1082,,,,,,MLL,MLL_PTD,0.2273,,False,True,False,False,False,False,False,False
3087,KYW1085,,,,,,MLL,MLL_PTD,0.2941,,False,True,False,False,False,False,False,False


Looking at the user request, I need to add CHR count columns for the chromosomes specified in the list: ['4', '2', '17', 'X', '20', '21', '12', '7', '5', '1', '11', '15', '19', '18', '9', '3', '16'].

Currently, the code only has `CHR_X_count`. I need to add similar count columns for all the other chromosomes in the list.

Here's the modified cell code:



In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler


def compute_mutation_features(maf_df, X_df, top_k_chr=10):
    maf_df = maf_df.copy()

    # --- 0) Vérif de la colonne CHR ---
    if 'CHR' not in maf_df.columns:
        raise ValueError(
            "La colonne 'CHR' est absente de maf_df. "
            "Ne fais pas pd.get_dummies(maf_df, columns=['CHR']) "
            "avant d'appeler compute_mutation_features, ou passe la version brute."
        )

    # --- 1) Longueur de la mutation & 'deletion length' ---
    maf_df['LEN'] = maf_df['END'] - maf_df['START'] + 1
    maf_df['DELLEN'] = maf_df['LEN'] - maf_df['REF'].apply(lambda x: len(str(x)))

    # --- 1bis) CHR : ne garder que les top_k_chr pour les features de comptage ---
    # (mais CHR_nunique sera quand même calculé sur tous les CHR)
    top_chr = maf_df['CHR'].value_counts().nlargest(top_k_chr).index
    unique_chr = sorted(top_chr)

    # Colonnes one-hot d'EFFECT déjà présentes
    effect_dummy_cols = [c for c in maf_df.columns if c.startswith('EFFECT_')]

    # --- 2) Dictionnaire d'agrégation de base ---
    agg_dict = {
        'Nmut': ('ID', 'size'),
        'VAF_avg': ('VAF', 'mean'),
        'VAF_std': ('VAF', 'std'),
        'VAF_max': ('VAF', 'max'),
        'LEN_avg': ('LEN', 'mean'),
        'LEN_max': ('LEN', 'max'),
        'DELLEN_sum': ('DELLEN', 'sum'),
        'DEPTH_avg': ('DEPTH', 'mean'),
        'DEPTH_std': ('DEPTH', 'std'),
        'DEPTH_max': ('DEPTH', 'max'),
        'DEPTH_min': ('DEPTH', 'min'),
        'CHR_nunique': ('CHR', 'nunique'),
        'EFFECT_nunique': ('EFFECT', 'nunique'),
        'EFFECT_FV_count': ('EFFECT', lambda x: (x == 'frameshift_variant').sum()),
        'EFFECT_SG_count': ('EFFECT', lambda x: (x == 'stop_gained').sum()),
        'EFFECT_NS_count': ('EFFECT', lambda x: (x == 'non_synonymous_codon').sum()),
    }

    # --- 3) Colonnes CHR_*_count seulement pour les top_k_chr ---
    for ch in unique_chr:
        col_name = f'CHR_{ch}_count'
        agg_dict[col_name] = ('CHR', lambda x, val=ch: (x == val).sum())

    # --- 4) Agrégation des dummies EFFECT_* au niveau patient ---
    # -> EFFECT_*_count = nombre de mutations de ce type par patient
    for col in effect_dummy_cols:
        new_name = f'{col}_count'
        agg_dict[new_name] = (col, 'sum')

    # --- 5) Agrégation globale par ID ---
    tmp = maf_df.groupby('ID').agg(**agg_dict).reset_index()

    # Remplir NaN des std (quand une seule mutation par patient)
    for std_col in ['VAF_std', 'DEPTH_std']:
        if std_col in tmp.columns:
            tmp[std_col] = tmp[std_col].fillna(0)

    # --- 6) Loss-of-function : EFFECT_LOF_count & EFFECT_LOF_ratio ---
    # On repère les colonnes *_count associées à frameshift / stop_gained
    lof_effect_cols = [
        c for c in tmp.columns
        if c.startswith('EFFECT_')
        and c.endswith('_count')
        and (
            'frameshift_variant' in c
            or 'stop_gained' in c
            # tu peux ajouter d'autres patterns ici si besoin : 'splice', etc.
        )
    ]

    if len(lof_effect_cols) > 0:
        tmp['EFFECT_LOF_count'] = tmp[lof_effect_cols].sum(axis=1)
    else:
        tmp['EFFECT_LOF_count'] = 0

    tmp['EFFECT_LOF_ratio'] = np.where(
        tmp['Nmut'] > 0,
        tmp['EFFECT_LOF_count'] / tmp['Nmut'],
        0.0
    )

    # --- 7) Merge avec X_df ---
    X_w_mutation = X_df.merge(tmp, on='ID', how='left')

    # Colonnes ajoutées par l'agg (à remplir avec 0 pour les patients sans mutation)
    new_cols = [c for c in tmp.columns if c != 'ID']
    X_w_mutation[new_cols] = X_w_mutation[new_cols].fillna(0)

    return X_w_mutation


# ============================================================
# ================ UTILISATION TRAIN / EVAL ==================
# ============================================================

# On suppose que maf_df, maf_eval, X, X_eval existent déjà.

# 1) Ajout des features mutationnelles au train et à l'eval
X_w_mutation = compute_mutation_features(maf_df, X)
X_eval_w_mutation = compute_mutation_features(maf_eval, X_eval)

# 2) Construction de la liste des features de mutation (à partir du train)
base_mutation_features = [
    'Nmut', 'VAF_avg', 'VAF_std', 'VAF_max',
    'LEN_avg', 'LEN_max', 'DELLEN_sum',
    'DEPTH_avg', 'DEPTH_std', 'DEPTH_max', 'DEPTH_min',
    'CHR_nunique',
    'EFFECT_nunique', 'EFFECT_FV_count', 'EFFECT_SG_count', 'EFFECT_NS_count',
    'EFFECT_LOF_count', 'EFFECT_LOF_ratio',  # nouveau
]

# Colonnes CHR_*_count dans le TRAIN
chr_count_cols = [
    c for c in X_w_mutation.columns
    if c.startswith('CHR_') and c.endswith('_count')
]

# Colonnes EFFECT_*_count générées automatiquement depuis les dummies
effect_count_cols = [
    c for c in X_w_mutation.columns
    if c.startswith('EFFECT_') and c.endswith('_count')
    and c not in ['EFFECT_FV_count', 'EFFECT_SG_count', 'EFFECT_NS_count', 'EFFECT_LOF_count']
]

mutation_features = base_mutation_features + chr_count_cols + effect_count_cols

# 3) Harmoniser les colonnes entre train et eval
for col in mutation_features:
    if col not in X_w_mutation.columns:
        X_w_mutation[col] = 0
    if col not in X_eval_w_mutation.columns:
        X_eval_w_mutation[col] = 0

# (Optionnel) s'assurer que les colonnes sont bien dans le même ordre
X_w_mutation = X_w_mutation.copy()
X_eval_w_mutation = X_eval_w_mutation.copy()
X_w_mutation[mutation_features] = X_w_mutation[mutation_features]
X_eval_w_mutation[mutation_features] = X_eval_w_mutation[mutation_features]

# 4) RobustScaler sur ces features
mutation_scaler = RobustScaler()
print(
    f"Fitting RobustScaler for mutation features ({len(mutation_features)} features) "
    "on training data, transforming training and evaluation data."
)
X_w_mutation[mutation_features] = mutation_scaler.fit_transform(X_w_mutation[mutation_features])
X_eval_w_mutation[mutation_features] = mutation_scaler.transform(X_eval_w_mutation[mutation_features])


Fitting RobustScaler for mutation features (28 features) on training data, transforming training and evaluation data.


# Processing Cytogenetics

sorte de "rules engine" basé sur la littérature clinique AML pour extraire des features pronostiques clés depuis l’ISCN. On détecte

* **Monosomal karyotype (MK)**: au moins 2 monosomies autosomiques, ou 1 monosomie autosomique + au moins une anomalie structurale. C’est franchement mauvais pronostic. ([PubMed Central][1])
* **Complex karyotype**: typiquement ≥ 3 anomalies cytogénétiques indépendantes, pronostic défavorable. ([Cancer Network][2])
* **Chromosomes 5 et 7**: perte de 5 ou 7, ou del(5q)/del(7q), classés défavorables dans ELN 2022. ([ASH Publications][3])
* **+8 (trisomie 8)**: très fréquente, plutôt risque intermédiaire en AML. ([MDPI][4])
* **Translocations favorables classiques**: t(8;21), inv(16)/t(16;16); APL t(15;17) est à part. On les isole comme features car elles portent un signal fort. ([ASH Publications][3])
* **Proportions clonales** via les crochets `[n]` pour calculer la part de métaphases portant une anomalie donnée. La sémantique des notations vient de l’ISCN. ([PubMed][5])

Pour chaque karyotype, on construit:

* `has_any_abnormality`, `n_events`, `n_chromosomes_altered`
* `has_minus5_or_del5q`, `has_minus7_or_del7q`, `has_plus8`
* `has_t_8_21`, `has_inv16_or_t_16_16`, `has_t_15_17`
* `is_complex_karyotype`, `is_monosomal_karyotype`
* `total_metaphases` et des **proportions clonales**: `prop_any_abnormal`, `prop_adverse_5_7`, `prop_plus8`, etc.
* Option: `eln_like_flag_adverse_cyto` basé ici sur 5/7 ou MK ou complex.

[1]: https://pmc.ncbi.nlm.nih.gov/articles/PMC3069222/?utm_source=chatgpt.com "Acute myeloid leukemia with monosomal karyotype at the ..."
[2]: https://www.cancernetwork.com/view/unfavorable-complex-and-monosomal-karyotypes-most-challenging-forms-acute-myeloid-leukemia?utm_source=chatgpt.com "Unfavorable, Complex, and Monosomal Karyotypes"
[3]: https://ashpublications.org/blood/article/140/12/1345/485817/Diagnosis-and-management-of-AML-in-adults-2022?utm_source=chatgpt.com "Diagnosis and management of AML in adults - ASH Publications"
[4]: https://www.mdpi.com/2072-6694/13/22/5679?utm_source=chatgpt.com "Risk Stratification, Measurable Residual Disease, and ..."
[5]: https://pubmed.ncbi.nlm.nih.gov/34839499/?utm_source=chatgpt.com "[Introduction and interpretation of the updated contents of ..."


In [None]:
import re
from typing import List, Dict
import pandas as pd

# --- REGEX patterns ---
_ISCN_EVENT_RE = re.compile(r'(del|dup|inv|ins|i|t|add|der)\s*\(', re.IGNORECASE)
_MONOSOMY_RE   = re.compile(r'(?<![pq])-(\d{1,2}|X|Y)(?![pq])', re.IGNORECASE)
_TRISOMY_RE    = re.compile(r'(?<![pq])\+(\d{1,2}|X|Y)(?![pq])', re.IGNORECASE)
_CHR_NUM_RE    = re.compile(r'(?<![pq])(\d{1,2}|X|Y)(?![pq])', re.IGNORECASE)

# specific adverse/favorable (de base)
_MINUS5_OR_DEL5Q_RE = re.compile(r'-(?:5)(?![pq])|del\s*\(\s*5\s*\)\s*\(\s*q', re.IGNORECASE)
_MINUS7_OR_DEL7Q_RE = re.compile(r'-(?:7)(?![pq])|del\s*\(\s*7\s*\)\s*\(\s*q', re.IGNORECASE)
_PLUS8_RE           = re.compile(r'\+8(?![pq])', re.IGNORECASE)
_T_8_21_RE          = re.compile(r't\s*\(\s*8\s*;\s*21\s*\)', re.IGNORECASE)
_INV16_OR_T_16_16_RE= re.compile(r'(inv\s*\(\s*16\s*\)|t\s*\(\s*16\s*;\s*16\s*\))', re.IGNORECASE)
_T_15_17_RE         = re.compile(r't\s*\(\s*15\s*;\s*17\s*\)', re.IGNORECASE)
_STRUCTURAL_RE      = re.compile(r'(del|dup|inv|ins|i|t|add|der)\s*\(', re.IGNORECASE)

# nouveaux patterns ELN-like
_INV3_OR_T3_3_RE    = re.compile(r'(inv\s*\(\s*3\s*\)\s*\(q21q26\)|t\s*\(\s*3\s*;\s*3\s*\)\s*\(q21;q26\))', re.IGNORECASE)
_T_6_9_RE           = re.compile(r't\s*\(\s*6\s*;\s*9\s*\)\s*\(p23;q34\)', re.IGNORECASE)
_T_9_22_RE          = re.compile(r't\s*\(\s*9\s*;\s*22\s*\)\s*\(q34;q11\)', re.IGNORECASE)
_ABN_17P_RE         = re.compile(r'(del\s*\(\s*17\s*\)\s*\(\s*p|del\s*\(\s*17p\s*\)|-17(?![pq])|add\s*\(\s*17\s*\)\s*\(\s*p)', re.IGNORECASE)

# baseline chr count (ex: 46,XX,del(5q)... -> 46)
_BASELINE_CHR_RE    = re.compile(r'^\s*(\d{2})\s*,', re.IGNORECASE)

# normal karyotype (simple)
_NORMAL_KARYO_RE    = re.compile(r'^\s*46\s*,\s*(XX|XY)\s*(\[\d+\])?\s*$', re.IGNORECASE)


# --- helpers ---
def _split_clones(karyo: str) -> List[str]:
    """Split ISCN string into clones separated by '/'."""
    return [c.strip() for c in str(karyo).split('/') if c.strip()]


def _extract_metaphases(clone: str) -> int:
    """Extract number of metaphases from [n] in clone."""
    m = re.search(r'\[(\d+)\]', clone)
    return int(m.group(1)) if m else 0


def _count_events(clone: str) -> int:
    """
    Count events in a clone:
    structural + trisomies + autosomal monosomies (ignore -Y).
    """
    n_struct = len(_ISCN_EVENT_RE.findall(clone))
    n_mono   = len(_MONOSOMY_RE.findall(clone))
    n_tri    = len(_TRISOMY_RE.findall(clone))
    n_mono_minusY = len(re.findall(r'(?<![pq])-(?:Y)(?![pq])', clone, flags=re.IGNORECASE))
    return n_struct + n_tri + max(n_mono - n_mono_minusY, 0)


def _chromosomes_altered(clone: str) -> int:
    """Number of distinct autosomes/sex chromosomes altered (ignore Y)."""
    nums = set()
    for m in _MONOSOMY_RE.finditer(clone):
        nums.add(m.group(1).upper())
    for m in _TRISOMY_RE.finditer(clone):
        nums.add(m.group(1).upper())
    for ev in re.finditer(r'(del|dup|inv|ins|i|t|add|der)\s*\(([^)]+)\)', clone, flags=re.IGNORECASE):
        for x in re.split(r'[;,\s]+', ev.group(2)):
            if _CHR_NUM_RE.fullmatch(x.strip()):
                nums.add(x.strip().upper())
    nums.discard('Y')
    return len(nums)


def _has_structural(clone: str) -> bool:
    return bool(_STRUCTURAL_RE.search(clone))


def _autosomic_monosomies(clone: str) -> List[int]:
    """List of autosomal monosomies in clone (ignore X/Y)."""
    return [int(m.group(1)) for m in _MONOSOMY_RE.finditer(clone) if m.group(1).upper() not in ('X', 'Y')]


def _is_monosomal_karyotype(karyo: str) -> bool:
    clones = _split_clones(karyo)
    autosomal_monosomies = set()
    any_struct = False
    for c in clones:
        autosomal_monosomies.update(_autosomic_monosomies(c))
        any_struct = any_struct or _has_structural(c)
    return (len(autosomal_monosomies) >= 2) or (len(autosomal_monosomies) >= 1 and any_struct)


def _is_complex_karyotype(karyo: str) -> bool:
    clones = _split_clones(karyo)
    total_events = 0
    for c in clones:
        c_wo_minusY = re.sub(r'(?<![pq])-(?:Y)(?![pq])', '', c, flags=re.IGNORECASE)
        total_events += _count_events(c_wo_minusY)
    return total_events >= 3


def _extract_baseline_chr_count(karyo: str) -> int:
    """
    Extract baseline chromosome number at start of ISCN (ex: 46,XX,... -> 46).
    Return -1 if not found.
    """
    if not isinstance(karyo, str):
        return -1
    m = _BASELINE_CHR_RE.match(karyo)
    if not m:
        return -1
    try:
        return int(m.group(1))
    except ValueError:
        return -1


def _clone_flags(clone: str) -> Dict[str, bool]:
    """
    Flags at clone-level (adverse/favorable, counts, etc.)
    """
    return {
        # anomalies "classiques"
        'minus5_or_del5q': bool(_MINUS5_OR_DEL5Q_RE.search(clone)),
        'minus7_or_del7q': bool(_MINUS7_OR_DEL7Q_RE.search(clone)),
        'plus8':           bool(_PLUS8_RE.search(clone)),
        't_8_21':          bool(_T_8_21_RE.search(clone)),
        'inv16_or_t_16_16':bool(_INV16_OR_T_16_16_RE.search(clone)),
        't_15_17':         bool(_T_15_17_RE.search(clone)),

        # anomalies ELN "rares" supplémentaires
        'inv3_or_t3_3':    bool(_INV3_OR_T3_3_RE.search(clone)),
        't_6_9':           bool(_T_6_9_RE.search(clone)),
        't_9_22':          bool(_T_9_22_RE.search(clone)),
        'abn17p':          bool(_ABN_17P_RE.search(clone)),

        # structure & events
        'has_structural':        _has_structural(clone),
        'events_count':          _count_events(clone),
        'chrs_altered':          _chromosomes_altered(clone),
        'has_any_abn':           bool(_ISCN_EVENT_RE.search(clone) or
                                      _MONOSOMY_RE.search(clone) or
                                      _TRISOMY_RE.search(clone)),
        # counts plus détaillés
        'n_monosomies':          len(_MONOSOMY_RE.findall(clone)),
        'n_trisomies':           len(_TRISOMY_RE.findall(clone)),
        'n_structural_events':   len(_ISCN_EVENT_RE.findall(clone)),
    }


# --- main featurizer ---
def add_cytogenetics_features(df: pd.DataFrame, col: str = "CYTOGENETICS") -> pd.DataFrame:
    rows = []

    for k in df[col]:
        # --- cas "missing / not done" ---
        if not isinstance(k, str) or not k.strip() or k.strip().lower() in {"nan", "na", "nd", "notdone", "failed", "failure"}:
            rows.append({
                'is_cyto_missing_or_failed': 1,
                'is_normal_karyotype': 0,
                'is_abnormal_karyotype': 0,
                'has_any_abnormality': 0,
                'n_events': 0,
                'n_chromosomes_altered': 0,
                'n_monosomies_total': 0,
                'n_trisomies_total': 0,
                'n_structural_events_total': 0,
                'has_minus5_or_del5q': 0,
                'has_minus7_or_del7q': 0,
                'has_plus8': 0,
                'has_t_8_21': 0,
                'has_inv16_or_t_16_16': 0,
                'has_t_15_17': 0,
                'has_inv3_or_t3_3': 0,
                'has_t_6_9': 0,
                'has_t_9_22': 0,
                'has_abn17p': 0,
                'is_monosomal_karyotype': 0,
                'is_complex_karyotype': 0,
                'eln_like_flag_adverse_cyto': 0,
                'eln_like_flag_favorable_cyto': 0,
                'eln_like_flag_intermediate_cyto': 0,
                'eln_like_risk_cyto': -1,   # -1 = missing
                'baseline_chr_count': -1,
                'is_hypodiploid': 0,
                'is_hyperdiploid': 0,
                'is_near_tetraploid': 0,
                'total_metaphases': 0,
                'max_clone_size': 0.0,
                'max_adverse_clone_size': 0.0,
                'has_small_adverse_subclone': 0,
                'prop_any_abnormal': 0.0,
                'prop_adverse_5_7': 0.0,
                'prop_plus8': 0.0,
                'prop_favorable_core': 0.0,
            })
            continue

        clones = _split_clones(k)
        clone_info = []
        total_meta_known = 0

        for c in clones:
            n_meta = _extract_metaphases(c)
            flags = _clone_flags(c)
            clone_info.append((c, n_meta, flags))
            total_meta_known += n_meta

        # base stats
        any_abn   = any(f['has_any_abn'] for _, _, f in clone_info)
        n_events  = sum(f['events_count'] for _, _, f in clone_info)
        n_chrs    = max([f['chrs_altered'] for _, _, f in clone_info] + [0])

        n_mono_tot   = sum(f['n_monosomies'] for _, _, f in clone_info)
        n_tris_tot   = sum(f['n_trisomies'] for _, _, f in clone_info)
        n_struct_tot = sum(f['n_structural_events'] for _, _, f in clone_info)

        # anomalies spécifiques
        has_minus5_or_del5q = any(f['minus5_or_del5q'] for _, _, f in clone_info)
        has_minus7_or_del7q = any(f['minus7_or_del7q'] for _, _, f in clone_info)
        has_plus8           = any(f['plus8'] for _, _, f in clone_info)
        has_t_8_21          = any(f['t_8_21'] for _, _, f in clone_info)
        has_inv16_or_t_16_16= any(f['inv16_or_t_16_16'] for _, _, f in clone_info)
        has_t_15_17         = any(f['t_15_17'] for _, _, f in clone_info)
        has_inv3_or_t3_3    = any(f['inv3_or_t3_3'] for _, _, f in clone_info)
        has_t_6_9           = any(f['t_6_9'] for _, _, f in clone_info)
        has_t_9_22          = any(f['t_9_22'] for _, _, f in clone_info)
        has_abn17p          = any(f['abn17p'] for _, _, f in clone_info)

        # MK / complex
        is_mk  = _is_monosomal_karyotype(k)
        is_ck  = _is_complex_karyotype(k)

        # baseline chr + ploidie
        baseline_chr = _extract_baseline_chr_count(k)
        is_hypo      = int(baseline_chr != -1 and baseline_chr < 46)
        is_hyper     = int(baseline_chr != -1 and 46 < baseline_chr < 50)
        is_near_tet  = int(baseline_chr != -1 and baseline_chr >= 80)

        # normal / abnormal / missing
        is_normal    = int(bool(_NORMAL_KARYO_RE.match(k)))
        is_abnormal  = int(not is_normal and any_abn)
        is_missing   = 0  # déjà filtré avant

        # ELN-like : favorable / adverse / intermédiaire
        eln_favorable = bool(has_t_8_21 or has_inv16_or_t_16_16 or has_t_15_17)
        eln_adverse_basic = bool(is_mk or is_ck or has_minus5_or_del5q or has_minus7_or_del7q)
        eln_adverse_extended = bool(
            eln_adverse_basic
            or has_inv3_or_t3_3
            or has_t_6_9
            or has_t_9_22
            or has_abn17p
        )

        if is_missing:
            eln_risk = -1
        else:
            if eln_adverse_extended:
                eln_risk = 2
            elif eln_favorable:
                eln_risk = 0
            else:
                # le reste (avec info cytogénétique) = intermédiaire
                eln_risk = 1

        # clonality helpers
        def _prop(cond_fn):
            if total_meta_known == 0:
                return 0.0
            pos = sum(n_meta for _, n_meta, f in clone_info if n_meta and cond_fn(f))
            return pos / total_meta_known if total_meta_known else 0.0

        # proportion de clones anormaux, défavorables, etc.
        prop_any_abnormal = float(_prop(lambda f: f['has_any_abn']))
        prop_adverse_5_7  = float(_prop(lambda f: f['minus5_or_del5q'] or f['minus7_or_del7q']))
        prop_plus8        = float(_prop(lambda f: f['plus8']))
        prop_favorable_core = float(_prop(lambda f: f['t_8_21'] or f['inv16_or_t_16_16']))

        # clonality plus fine : max clone, max clone "adverse"
        max_clone_prop = 0.0
        max_adverse_prop = 0.0
        has_small_adverse_subclone = 0

        if total_meta_known > 0:
            for _, n_meta, f in clone_info:
                if not n_meta:
                    continue
                p = n_meta / total_meta_known
                if p > max_clone_prop:
                    max_clone_prop = p

                is_extended_adverse_clone = (
                    f['minus5_or_del5q'] or f['minus7_or_del7q'] or
                    f['inv3_or_t3_3'] or f['t_6_9'] or f['t_9_22'] or f['abn17p']
                )
                if is_extended_adverse_clone:
                    if p > max_adverse_prop:
                        max_adverse_prop = p
                    if 0.0 < p < 0.3:
                        has_small_adverse_subclone = 1

        rows.append({
            'is_cyto_missing_or_failed': int(is_missing),
            'is_normal_karyotype': int(is_normal),
            'is_abnormal_karyotype': int(is_abnormal),
            'has_any_abnormality': int(any_abn),
            'n_events': int(n_events),
            'n_chromosomes_altered': int(n_chrs),
            'n_monosomies_total': int(n_mono_tot),
            'n_trisomies_total': int(n_tris_tot),
            'n_structural_events_total': int(n_struct_tot),
            'has_minus5_or_del5q': int(has_minus5_or_del5q),
            'has_minus7_or_del7q': int(has_minus7_or_del7q),
            'has_plus8': int(has_plus8),
            'has_t_8_21': int(has_t_8_21),
            'has_inv16_or_t_16_16': int(has_inv16_or_t_16_16),
            'has_t_15_17': int(has_t_15_17),
            'has_inv3_or_t3_3': int(has_inv3_or_t3_3),
            'has_t_6_9': int(has_t_6_9),
            'has_t_9_22': int(has_t_9_22),
            'has_abn17p': int(has_abn17p),
            'is_monosomal_karyotype': int(is_mk),
            'is_complex_karyotype': int(is_ck),
            'eln_like_flag_adverse_cyto': int(eln_adverse_extended),
            'eln_like_flag_favorable_cyto': int(eln_favorable),
            'eln_like_flag_intermediate_cyto': int(eln_risk == 1),
            'eln_like_risk_cyto': int(eln_risk),
            'baseline_chr_count': int(baseline_chr),
            'is_hypodiploid': int(is_hypo),
            'is_hyperdiploid': int(is_hyper),
            'is_near_tetraploid': int(is_near_tet),
            'total_metaphases': int(total_meta_known),
            'max_clone_size': float(max_clone_prop),
            'max_adverse_clone_size': float(max_adverse_prop),
            'has_small_adverse_subclone': int(has_small_adverse_subclone),
            'prop_any_abnormal': float(prop_any_abnormal),
            'prop_adverse_5_7': float(prop_adverse_5_7),
            'prop_plus8': float(prop_plus8),
            'prop_favorable_core': float(prop_favorable_core),
        })

    features_df = pd.DataFrame(rows, index=df.index)
    # On enlève la colonne brute CYTOGENETICS (comme tu le faisais)
    return pd.concat([df.copy(), features_df], axis=1).drop(columns=[col])


In [None]:
# === Enrichissement avec les features cytogénétiques ===
X_enhanced = add_cytogenetics_features(X_w_mutation)
X_eval_enhanced = add_cytogenetics_features(X_eval_w_mutation)

# Liste élargie de features cytogénétiques (toutes créées par add_cytogenetics_features)
cytogenetics_features = [
    # statut global / qualité
    'is_cyto_missing_or_failed',
    'is_normal_karyotype',
    'is_abnormal_karyotype',

    # complexité / volume d’anomalies
    'has_any_abnormality',
    'n_events',
    'n_chromosomes_altered',
    'n_monosomies_total',
    'n_trisomies_total',
    'n_structural_events_total',

    # anomalies spécifiques défavorables / favorables
    'has_minus5_or_del5q',
    'has_minus7_or_del7q',
    'has_plus8',
    'has_t_8_21',
    'has_inv16_or_t_16_16',
    'has_t_15_17',
    'has_inv3_or_t3_3',
    'has_t_6_9',
    'has_t_9_22',
    'has_abn17p',

    # MK / complexe
    'is_monosomal_karyotype',
    'is_complex_karyotype',

    # résumé type ELN-like
    'eln_like_flag_adverse_cyto',
    'eln_like_flag_favorable_cyto',
    'eln_like_flag_intermediate_cyto',
    'eln_like_risk_cyto',

    # ploidie
    'baseline_chr_count',
    'is_hypodiploid',
    'is_hyperdiploid',
    'is_near_tetraploid',

    # clonalité
    'total_metaphases',
    'max_clone_size',
    'max_adverse_clone_size',
    'has_small_adverse_subclone',

    # proportions clonales sur certains patterns
    'prop_any_abnormal',
    'prop_adverse_5_7',
    'prop_plus8',
    'prop_favorable_core',
]

# === 1) Détection des features quasi constantes (>=95% identique) sur le TRAIN ===
nearly_constant_features = []
for col in cytogenetics_features:
    value_counts = X_enhanced[col].value_counts(dropna=False)
    if len(value_counts) > 0:
        max_proportion = value_counts.iloc[0] / len(X_enhanced)
        if max_proportion >= 0.95:
            nearly_constant_features.append(col)
            print(f"Removing {col}: {max_proportion:.2%} of values are {value_counts.index[0]}")

# === 2) Suppression des features quasi constantes dans train et eval ===
X_enhanced = X_enhanced.drop(columns=nearly_constant_features)
X_eval_enhanced = X_eval_enhanced.drop(columns=nearly_constant_features)

# Mettre à jour la liste des features cyto réellement utilisées
cytogenetics_features = [f for f in cytogenetics_features if f not in nearly_constant_features]

# === 3) RobustScaler sur les features cyto restantes ===
cytogenetics_scaler = RobustScaler()
print(
    f"Fitting RobustScaler for {len(cytogenetics_features)} cytogenetics features "
    "on training data, transforming training and evaluation data."
)
X_enhanced[cytogenetics_features] = cytogenetics_scaler.fit_transform(
    X_enhanced[cytogenetics_features]
)
X_eval_enhanced[cytogenetics_features] = cytogenetics_scaler.transform(
    X_eval_enhanced[cytogenetics_features]
)


Removing has_t_8_21: 100.00% of values are 0
Removing has_inv16_or_t_16_16: 100.00% of values are 0
Removing has_t_15_17: 99.97% of values are 0
Removing has_inv3_or_t3_3: 99.81% of values are 0
Removing has_t_6_9: 99.97% of values are 0
Removing has_t_9_22: 100.00% of values are 0
Removing has_abn17p: 97.86% of values are 0
Removing eln_like_flag_favorable_cyto: 99.97% of values are 0
Removing is_near_tetraploid: 99.94% of values are 0
Removing has_small_adverse_subclone: 96.56% of values are 0
Removing prop_favorable_core: 100.00% of values are 0.0
Fitting RobustScaler for 26 cytogenetics features on training data, transforming training and evaluation data.


In [None]:
df_enhanced = X_enhanced.merge(target_df, on='ID', how='left')
df_eval_enhanced = X_eval_enhanced


In [None]:
import pandas as pd

import pandas as pd

def add_gene_features(df_clinical_enhanced, df_molecular, gene_list=None, top_k=10):
    """
    Ajoute des features one-hot pour les gènes :
    - Si gene_list est None : utilise les top_k gènes les plus fréquents dans df_molecular
      (fréquence = nombre de patients distincts dans lesquels le gène apparaît).
    - Sinon : force l'usage de gene_list (pour que train et val aient les mêmes colonnes)
    """
    # 1) Définir la liste de gènes de référence
    if gene_list is None:
        # fréquence par gène = nb d'ID distincts où le gène est présent
        gene_counts = (
            df_molecular[['ID', 'GENE']]
            .drop_duplicates()['GENE']
            .value_counts()
        )
        # on garde les top_k gènes les plus fréquents
        gene_list = gene_counts.nlargest(top_k).index.tolist()
        print(f"Nombre de gènes utilisés (top {top_k}) : {len(gene_list)}")

    # 2) Ne garder que les lignes correspondant aux gènes de la liste
    df_filtered = df_molecular[df_molecular['GENE'].isin(gene_list)].copy()

    # 3) Table ID x GENE (one-hot)
    gene_pivot = pd.crosstab(df_filtered['ID'], df_filtered['GENE'])

    # 4) S'assurer que toutes les colonnes de gene_list existent
    for g in gene_list:
        if g not in gene_pivot.columns:
            gene_pivot[g] = 0

    # 5) Réordonner les colonnes exactement selon gene_list
    gene_pivot = gene_pivot[gene_list]

    # 6) Binariser et renommer les colonnes
    gene_pivot = (gene_pivot > 0).astype(int)
    gene_pivot.columns = [f'Gene_{col}' for col in gene_pivot.columns]

    # 7) Merge avec les features cliniques
    df_final = df_clinical_enhanced.merge(gene_pivot, on='ID', how='left')

    # 8) Remplir les NaN des colonnes de gènes avec 0
    new_cols = [c for c in df_final.columns if c.startswith('Gene_')]
    df_final[new_cols] = df_final[new_cols].fillna(0)

    return df_final, gene_list



# ======================================================================
# Lecture des données
# ======================================================================
mol_train_raw = pd.read_csv("../../data/molecular_train.csv")
mol_val_raw   = pd.read_csv("../../data/molecular_val.csv")

# df_enhanced et df_df_val_pivot sont tes jeux cliniques déjà préparés
# df_enhanced      : train clinique
# df_df_val_pivotal clinique

# ======================================================================
# Construction des features gènes pour train et val
# ======================================================================
df_train_pivot, gene_list_ref = add_gene_features(
    df_clinical_enhanced=df_enhanced,
    df_molecular=mol_train_raw,
    gene_list=None,  # on déduit les top 70 gènes du train
    top_k=70
)

df_val_pivot, _ = add_gene_features(
    df_clinical_enhanced=df_eval_enhanced,
    df_molecular=mol_val_raw,
    gene_list=gene_list_ref  # même liste (top 70) que le train
)


# ======================================================================
# Harmonisation des colonnes entre train et val
# ======================================================================

# Colonnes de gènes (identiques pour les deux par construction)
gene_cols = [c for c in df_train_pivot.columns if c.startswith('Gene_')]

# Colonnes cliniques pour les données d'entraînement (incluant OS_YEARS et OS_STATUS)
train_clinical_cols = [c for c in df_train_pivot.columns if not c.startswith('Gene_')]

# Colonnes cliniques pour les données de validation (excluant OS_YEARS et OS_STATUS)
val_clinical_cols = [c for c in df_val_pivot.columns if not c.startswith('Gene_')]

# Pour l'harmonisation, on exclut temporairement OS_YEARS et OS_STATUS
train_clinical_cols_for_common = [c for c in train_clinical_cols if c not in ['OS_YEARS', 'OS_STATUS']]

# On garde uniquement les colonnes cliniques communes (sans les targets)
common_clinical = sorted(set(train_clinical_cols_for_common).intersection(val_clinical_cols))

# Ordre final pour les données d'entraînement : colonnes cliniques communes + gènes + targets
final_features_train = common_clinical + gene_cols + ['OS_YEARS', 'OS_STATUS']

# Ordre final pour les données de validation : colonnes cliniques communes + gènes (pas de targets)
final_features_val = common_clinical + gene_cols

# Sélection des colonnes dans le bon ordre pour train et val
df_train_pivot = df_train_pivot[final_features_train]
df_val_pivot = df_val_pivot[final_features_val]

print(f"Nombre de colonnes finales train (avec targets) : {len(final_features_train)}")
print(f"Nombre de colonnes finales val (sans targets) : {len(final_features_val)}")
print(f"Shape train : {df_train_pivot.shape}")
print(f"Shape val   : {df_val_pivot.shape}")

# Vérification que OS_YEARS et OS_STATUS sont bien présents dans les données d'entraînement
print(f"OS_YEARS dans train : {'OS_YEARS' in df_train_pivot.columns}")
print(f"OS_STATUS dans train : {'OS_STATUS' in df_train_pivot.columns}")



Nombre de gènes utilisés (top 70) : 70
Nombre de colonnes finales train (avec targets) : 139
Nombre de colonnes finales val (sans targets) : 137
Shape train : (3173, 139)
Shape val   : (1193, 137)
OS_YEARS dans train : True
OS_STATUS dans train : True


In [None]:
df_train_pivot['cyto_risk_score'] = (
    3 * df_train_pivot['is_monosomal_karyotype'] +
    3 * df_train_pivot['is_complex_karyotype'] +
    2 * df_train_pivot['has_minus7_or_del7q'] +
    2 * df_train_pivot['has_minus5_or_del5q'] +
    1 * df_train_pivot['has_plus8']
)

df_val_pivot['cyto_risk_score'] = (
    3 * df_val_pivot['is_monosomal_karyotype'] +
    3 * df_val_pivot['is_complex_karyotype'] +
    2 * df_val_pivot['has_minus7_or_del7q'] +
    2 * df_val_pivot['has_minus5_or_del5q'] +
    1 * df_val_pivot['has_plus8']
)

df_train_pivot['TP53_complex_interaction'] = df_train_pivot['Gene_TP53'] * df_train_pivot['is_complex_karyotype']
df_train_pivot['ASXL1_minus7_interaction'] = df_train_pivot['Gene_ASXL1'] * df_train_pivot['has_minus7_or_del7q']
df_train_pivot['NPM1_normal_interaction'] = df_train_pivot['Gene_NPM1'] * (1 - df_train_pivot['prop_any_abnormal'])

df_val_pivot['TP53_complex_interaction'] = df_val_pivot['Gene_TP53'] * df_val_pivot['is_complex_karyotype']
df_val_pivot['ASXL1_minus7_interaction'] = df_val_pivot['Gene_ASXL1'] * df_val_pivot['has_minus7_or_del7q']
df_val_pivot['NPM1_normal_interaction'] = df_val_pivot['Gene_NPM1'] * (1 - df_val_pivot['prop_any_abnormal'])

df_train_pivot['high_risk_chr_load'] = (
    (df_val_pivot["CHR_5_count"]) +
    df_train_pivot['CHR_7_count'] +
    df_train_pivot['CHR_17_count']
)

df_val_pivot['high_risk_chr_load'] = (
    (df_val_pivot["CHR_5_count"]) +
    df_val_pivot['CHR_7_count'] +
    df_val_pivot['CHR_17_count']
)

In [None]:
df_train_pivot['risk_score_high_genes'] = (
    df_train_pivot['Gene_TP53'] +
    df_train_pivot['Gene_ASXL1'] +
    df_train_pivot['Gene_RUNX1']
)

df_train_pivot['risk_score_favorable_genes'] = df_train_pivot['Gene_NPM1'] + df_train_pivot['Gene_CEBPA']

df_val_pivot['risk_score_high_genes'] = (
    df_val_pivot['Gene_TP53'] +    
    df_val_pivot['Gene_ASXL1'] +
    df_val_pivot['Gene_RUNX1']
)

df_val_pivot['risk_score_favorable_genes'] = df_val_pivot['Gene_NPM1'] + df_val_pivot['Gene_CEBPA']


df_train_pivot['n_splicing_mut'] = df_train_pivot[['Gene_U2AF1','Gene_SRSF2','Gene_SF3B1','Gene_ZRSR2']].sum(axis=1)
df_train_pivot['n_signaling_mut'] = df_train_pivot[['Gene_NRAS','Gene_KRAS','Gene_JAK2','Gene_CBL']].sum(axis=1)

df_val_pivot['n_splicing_mut'] = df_val_pivot[['Gene_U2AF1','Gene_SRSF2','Gene_SF3B1','Gene_ZRSR2']].sum(axis=1)
df_val_pivot['n_signaling_mut'] = df_val_pivot[['Gene_NRAS','Gene_KRAS','Gene_JAK2','Gene_CBL']].sum(axis=1)

df_train_pivot['TP53_VAF_interaction'] = df_train_pivot['Gene_TP53'] * df_train_pivot['VAF_avg']
df_val_pivot['TP53_VAF_interaction'] = df_val_pivot['Gene_TP53'] * df_val_pivot['VAF_avg']


df_train_pivot['ANC_WBC_ratio'] = df_train_pivot['ANC'] / (df_train_pivot['WBC']+1)
df_train_pivot['BLAST_WBC_ratio'] = df_train_pivot['BM_BLAST'] / (df_train_pivot['WBC']+1)

df_val_pivot['ANC_WBC_ratio'] = df_val_pivot['ANC'] / (df_val_pivot['WBC']+1)
df_val_pivot['BLAST_WBC_ratio'] = df_val_pivot['BM_BLAST'] / (df_val_pivot['WBC']+1)







df_train_pivot['major_clone_VAF'] = df_train_pivot['VAF_max']
df_train_pivot['subclonality'] = df_train_pivot['VAF_std'] / (df_train_pivot['VAF_avg']+1e-6)

df_val_pivot['major_clone_VAF'] = df_val_pivot['VAF_max']
df_val_pivot['subclonality'] = df_val_pivot['VAF_std'] / (df_val_pivot['VAF_avg']+1e-6)


df_train_pivot['karyo_score_clinical'] = (
    3 * df_train_pivot['is_monosomal_karyotype'] +
    2 * df_train_pivot['is_complex_karyotype'] +
    2 * df_train_pivot['has_minus7_or_del7q'] +
    1 * df_train_pivot['has_plus8']
)

df_val_pivot['karyo_score_clinical'] = (
    3 * df_val_pivot['is_monosomal_karyotype'] +
    2 * df_val_pivot['is_complex_karyotype'] +
    2 * df_val_pivot['has_minus7_or_del7q'] +
    1 * df_val_pivot['has_plus8']
)



In [None]:
import numpy as np
import pandas as pd

def compute_vaf_entropy(df_mut):
    """
    df_mut doit contenir au minimum :
        - 'ID'  : identifiant patient
        - 'VAF' : fraction variant allele
    Retourne un dataframe avec une seule ligne par patient :
        ID | vaf_entropy
    """

    # Fonction locale de Shannon entropy
    def entropy_from_vaf(vaf_list):
        vaf_arr = np.array(vaf_list)
        
        # Normalisation → proportions p_i
        p = vaf_arr / vaf_arr.sum()

        # Somme seulement sur p_i > 0 sinon log pose pb.
        p = p[p > 0]

        return -np.sum(p * np.log(p))

    entropy_per_patient = (
        df_mut.groupby('ID')['VAF']
              .apply(entropy_from_vaf)
              .reset_index()
              .rename(columns={'VAF': 'vaf_entropy'})
    )

    return entropy_per_patient


entropy_train = compute_vaf_entropy(maf_df)
entropy_eval = compute_vaf_entropy(maf_eval)
    
df_train_pivot = df_train_pivot.merge(entropy_train, on='ID', how='left')
df_val_pivot = df_val_pivot.merge(entropy_eval, on='ID', how='left')

df_train_pivot["vaf_entropy"] = df_train_pivot["vaf_entropy"].fillna(0)
df_val_pivot["vaf_entropy"] = df_val_pivot["vaf_entropy"].fillna(0)


In [None]:
df_train_pivot["vaf_entropy"] = df_train_pivot["vaf_entropy"].fillna(0)
df_val_pivot["vaf_entropy"] = df_val_pivot["vaf_entropy"].fillna(0)

In [None]:
def safe_ratio(num, den):
    num = num.astype(float)
    den = den.astype(float)
    res = num / den
    res[~np.isfinite(res)] = np.nan   # remplace inf, -inf, nan par NaN
    return res

# 1) Remplacer les -1 par NaN dans HB
for df in [df_train_pivot, df_val_pivot]:
    df.loc[df["HB"] == -1, "HB"] = np.nan

# 2) Calculer la médiane de HB sur le train
hb_median = df_train_pivot["HB"].median()

# 3) Imputer les NaN de HB avec cette médiane (même valeur pour train et eval)
df_train_pivot["HB"] = df_train_pivot["HB"].fillna(hb_median)
df_val_pivot["HB"]  = df_val_pivot["HB"].fillna(hb_median)

# 4) Calculer le ratio PLT_HB_ratio
for df in [df_train_pivot, df_val_pivot]:
    df["PLT_HB_ratio"] = safe_ratio(df["PLT"], df["HB"] + 1)

In [None]:
for df_ in [df_train_pivot, df_val_pivot]:
    for col in ['WBC', 'ANC', 'PLT', 'MONOCYTES', 'BM_BLAST', 'Nmut', 'DEPTH_avg']:
        if col in df_.columns:
            df_[f"log1p_{col}"] = np.log1p(df_[col].clip(lower=0))


In [None]:
mutation_cols = [c for c in df_train_pivot.columns if c.startswith("EFFECT_") and c.endswith("_count")]

low_freq_cols = []
for c in mutation_cols:
    if (df_train_pivot[c] > 0).mean() < 0.01:   # présent chez <1% des patients
        low_freq_cols.append(c)

df_train_pivot = df_train_pivot.drop(columns=low_freq_cols)
df_val_pivot   = df_val_pivot.drop(columns=[c for c in low_freq_cols if c in df_val_pivot.columns])


In [None]:
print("EFFECT_LOF_count" in df_train_pivot.columns)
print([c for c in df_train_pivot.columns if "EFFECT" in c])


False
['EFFECT_FV_count', 'EFFECT_LOF_ratio', 'EFFECT_NS_count', 'EFFECT_SG_count', 'EFFECT_nunique']


In [None]:
for df_ in [df_train_pivot, df_val_pivot]:
    df_["mutation_burden_score"] = (
        0.5 * df_["Nmut"] +
        1.5 * df_["EFFECT_LOF_ratio"] +
        1.0 * df_["vaf_entropy"]
    )


In [None]:
for df_ in [df_train_pivot, df_val_pivot]:
    for col in ["n_events", "n_chromosomes_altered", "n_monosomies_total", "n_trisomies_total"]:
        if col in df_.columns:
            df_[f"log1p_{col}"] = np.log1p(df_[col])


In [None]:
# ======================================================================
# Sauvegarde
# ======================================================================
df_train_pivot.to_csv('../../data/train_pivot4.csv', index=False)
df_val_pivot.to_csv('../../data/eval_pivot4.csv',  index=False)