# Importation des librairies

In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import os.path

from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored , concordance_index_ipcw
from sklearn.impute import SimpleImputer
from sksurv.util import Surv
from lifelines.utils import concordance_index



# Preprocessing

In [3]:
clinical_val=pd.read_csv('../data/clinical_val.csv')
clinical_train=pd.read_csv('../data/clinical_train.csv')

molecular_val=pd.read_csv('../data/molecular_val.csv')
molecular_train=pd.read_csv('../data/molecular_train.csv')


In [4]:
target_df=pd.read_csv('../data/target_train.csv')

# Drop rows where 'OS_YEARS' is NaN if conversion caused any issues
target_df.dropna(subset=['OS_YEARS', 'OS_STATUS'], inplace=True)


# Contarget_dfvert 'OS_YEARS' to numeric if it isn’t already
target_df['OS_YEARS'] = pd.to_numeric(target_df['OS_YEARS'], errors='coerce')

# Ensure 'OS_STATUS' is boolean
target_df['OS_STATUS'] = target_df['OS_STATUS'].astype(bool)

In [5]:
from sklearn.impute import KNNImputer

def precise_missing_values(df):
    # Imputation KNN pour les variables numériques
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    imputer = KNNImputer(n_neighbors=5)
    df[num_cols] = imputer.fit_transform(df[num_cols])

    # Pour les variables catégorielles, on remplace par 'Missing'
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        df[col] = df[col].fillna('Missing')
    return df

clinical_train = precise_missing_values(clinical_train)
clinical_val = precise_missing_values(clinical_val)
molecular_train = precise_missing_values(molecular_train)
molecular_val = precise_missing_values(molecular_val)

In [6]:
import pandas as pd
import numpy as np

def aggregate_leukemia_data_improved(df, clinical_df=None):
    """
    Agrégation avancée des données de mutations par patient avec intégration
    des facteurs pronostiques ELN 2022 et des recommandations cliniques.
    (Version sans intégration des données cliniques)
    """
    # === DÉFINITIONS DES GÈNES SELON ELN 2022 ===
    adverse_genes = {
        'TP53', 'ASXL1', 'RUNX1', 'EZH2', 'SF3B1', 'SRSF2', 'U2AF1', 
        'ZRSR2', 'STAG2', 'BCOR', 'SETBP1'
    }
    dna_methylation_genes = {'DNMT3A', 'TET2', 'IDH1', 'IDH2'}
    favorable_genes = {'NPM1', 'CEBPA'}
    all_genes = [
        'GNB1', 'CSF3R', 'MPL', 'NRAS', 'HAX1', 'RIT1', 'SMC3', 'WT1', 'ATM', 'CBL',
        'ETV6', 'ETNK1', 'KRAS', 'ARID2', 'NFE2', 'SH2B3', 'PTPN11', 'FLT3', 'BRCA2',
        'PDS5B', 'IDH2', 'BLM', 'CREBBP', 'CTCF', 'PRPF8', 'TP53', 'NF1', 'SUZ12',
        'STAT5B', 'STAT3', 'PPM1D', 'SRSF2', 'SETBP1', 'BCL2', 'EPOR', 'CALR', 'CEBPA',
        'U2AF2', 'DNMT3A', 'ASXL2', 'SF3B1', 'IDH1', 'ASXL1', 'GNAS', 'RUNX1', 'U2AF1',
        'CHEK2', 'MYD88', 'GATA2', 'KIT', 'TET2', 'TERT', 'IRF1', 'CSNK1A1', 'NPM1',
        'NSD1', 'DDX41', 'JARID2', 'CCND3', 'VEGFA', 'IKZF1', 'EGFR', 'SBDS', 'CUX1',
        'BRAF', 'EZH2', 'RAD21', 'JAK2', 'CDKN2A', 'FANCG', 'NOTCH1', 'PIGA', 'ZRSR2',
        'BCOR', 'USP9X', 'KDM6A', 'SMC1A', 'MED12', 'STAG2', 'BCORL1', 'PHF6', 'BRCC3',
        'MLL'
    ]
    gene_weights = {
        'TP53': 5.0,
        'ASXL1': 2.0, 'RUNX1': 2.0, 'EZH2': 2.0,
        'SF3B1': 1.8, 'SRSF2': 1.8, 'U2AF1': 1.8, 'ZRSR2': 1.8,
        'STAG2': 1.5, 'BCOR': 1.5, 'SETBP1': 1.5,
        'DNMT3A': 1.5, 'TET2': 1.5, 'IDH1': 1.8, 'IDH2': 1.8,
        'FLT3': 2.0,
        'NPM1': -2.5,
        'CEBPA': -3.0,
        'KIT': 1.2, 'NRAS': 0.8, 'KRAS': 0.8, 'PTPN11': 0.8
    }
    bad_effects = ['nonsense', 'frameshift', 'splice_site', 'stop_gained']
    # === AGRÉGATION BASIQUE ===
    result = df.groupby('ID').agg({
        'GENE': ['count', 'nunique'],
        'CHR': 'nunique',
        'VAF': ['mean', 'max', 'median', 'min', 'var'],
        'DEPTH': ['mean', 'median', 'min', 'max', 'var']
    }).reset_index()
    result.columns = [
        'ID', 'nb_mutations', 'nb_genes', 'nb_chromosomes',
        'vaf_mean', 'vaf_max', 'vaf_median', 'vaf_min', 'vaf_var',
        'depth_mean', 'depth_median', 'depth_min', 'depth_max', 'depth_var'
    ]
    # === MATRICE BINAIRE GÈNE x PATIENT ===
    has_gene = (
        df.pivot_table(index='ID', columns='GENE', values='CHR', aggfunc='size', fill_value=0)
        .reindex(columns=all_genes, fill_value=0)
        .astype(int)
    )
    has_gene.columns = [f'has_{g}' for g in has_gene.columns]
    has_gene.reset_index(inplace=True)
    result = result.merge(has_gene, on='ID', how='left')
    # === NOUVEAUX INDICATEURS GÉNÉTIQUES ===
    adverse_cols = [f'has_{g}' for g in adverse_genes if f'has_{g}' in result.columns]
    result['has_adverse_gene'] = result[adverse_cols].sum(axis=1).clip(upper=1)
    result['nb_adverse_genes'] = result[adverse_cols].sum(axis=1)
    methylation_cols = [f'has_{g}' for g in dna_methylation_genes if f'has_{g}' in result.columns]
    result['has_methylation_gene'] = result[methylation_cols].sum(axis=1).clip(upper=1)
    result['nb_methylation_genes'] = result[methylation_cols].sum(axis=1)
    result['has_NPM1_favorable'] = result.get('has_NPM1', 0)
    result['has_CEBPA_favorable'] = result.get('has_CEBPA', 0)
    result['high_mutation_burden'] = (result['nb_mutations'] > 3).astype(int)
    result['mutation_burden_score'] = np.where(
        result['nb_mutations'] > 3, 
        (result['nb_mutations'] - 3) * 0.5, 
        0
    )
    result['nb_bad_effects'] = df.groupby('ID')['EFFECT'].apply(lambda x: x.isin(bad_effects).sum()).values
    result['nb_high_vaf'] = df.groupby('ID')['VAF'].apply(lambda x: (x > 0.4).sum()).values
    result['nb_very_high_vaf'] = df.groupby('ID')['VAF'].apply(lambda x: (x > 0.6).sum()).values
    # === CALCUL DU SCORE DE RISQUE AMÉLIORÉ (SANS CLINIQUE) ===
    has_cols = [f'has_{g}' for g in gene_weights if f'has_{g}' in result.columns]
    weights_series = pd.Series([gene_weights[g] for g in gene_weights if f'has_{g}' in result.columns], 
                              index=has_cols)
    gene_score = result[has_cols].dot(weights_series)
    methylation_penalty = np.where(result['nb_methylation_genes'] >= 2, 
                                  result['nb_methylation_genes'] * 0.8, 0)
    npm1_modulation = 0
    if 'has_NPM1' in result.columns and 'has_FLT3' in result.columns:
        npm1_modulation = np.where(
            (result['has_NPM1'] == 1) & (result['has_FLT3'] == 1),
            1.5,
            0
        )
    result['risk_score_genetic'] = (
        gene_score +
        methylation_penalty +
        npm1_modulation +
        result['mutation_burden_score'] +
        0.3 * result['nb_high_vaf'] +
        0.5 * result['nb_very_high_vaf']
    )
    result['risk_score_raw'] = result['risk_score_genetic']
    def enhanced_logistic(score, intercept=3.0, scale=2.5, floor=0.05, ceiling=0.95):
        z = (score - intercept) / scale
        prob = 1.0 / (1.0 + np.exp(-z))
        return np.clip(prob, floor, ceiling)
    result['risk_score_prob'] = result['risk_score_raw'].apply(enhanced_logistic)
    def classify_eln_risk(row):
        if ((row.get('has_NPM1', 0) == 1 and row.get('has_FLT3', 0) == 0) or
            row.get('has_CEBPA', 0) == 1):
            if row['has_adverse_gene'] == 0:
                return 'Favorable'
        if (row['has_adverse_gene'] == 1 or 
            row.get('has_TP53', 0) == 1 or
            row['nb_adverse_genes'] >= 2):
            return 'Adverse'
        return 'Intermediate'
    result['eln_risk_category'] = result.apply(classify_eln_risk, axis=1)
    result['genetic_complexity'] = (
        result['nb_genes'] + 
        result['nb_adverse_genes'] * 2 + 
        result['nb_methylation_genes']
    )
    result['vaf_heterogeneity'] = result['vaf_var'].fillna(0)
    return result

def aggregation(molecular_train, molecular_val, clinical_train=None, clinical_val=None):
    molecular_train_agg = aggregate_leukemia_data_improved(molecular_train, clinical_train)
    molecular_val_agg = aggregate_leukemia_data_improved(molecular_val, clinical_val)
    return molecular_train_agg, molecular_val_agg

# Exemple d'utilisation :
molecular_train_agg, molecular_val_agg = aggregation(molecular_train, molecular_val, clinical_train, clinical_val)


In [6]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_cytogenetic_features(data):
    """Extrait les anomalies chromosomiques fréquentes"""
    abnormalities = {
        "monosomy_7": r"[-]7|\bdel\(7",
        "monosomy_5": r"[-]5|\bdel\(5",
        "del_20q": r"del\(20\)\(q",
        "inv_16": r"inv\(16\)",
        "t_8_21": r"t\(8;21\)",
        "t_15_17": r"t\(15;17\)",
        "t_3_3": r"t\(3;3",
        "del_3_q26": r"del\(3\)\(q26",
        "t_3_9": r"t\(3;9\)",
        "trisomy_8": r"\+8"
    }

    for name, regex in abnormalities.items():
        data[name] = data["CYTOGENETICS"].str.contains(regex, case=False, na=False).astype(int)

    # Complex karyotype = 3 anomalies ou plus
    def is_complex(k):
        if pd.isna(k):
            return 0
        # Compter les délétions, translocations, inversions, etc.
        anomalies = re.findall(r"del\(|t\(|inv\(|ins\(|\+\d+|-\d+", str(k))
        return int(len(anomalies) >= 3)
    
    data["complex_karyotype"] = data["CYTOGENETICS"].apply(is_complex)
    return data

def assign_eln_risk(row):
    """Classification ELN 2022 des risques pronostiques"""
    # Risque favorable
    if row["t_8_21"] == 1 or row["inv_16"] == 1 or row["t_15_17"] == 1:
        return "favorable"
    # Risque défavorable
    elif (row["monosomy_7"] == 1 or row["monosomy_5"] == 1 or 
          row["complex_karyotype"] == 1 or row["t_3_3"] == 1 or row["del_3_q26"] == 1):
        return "adverse"
    # Risque intermédiaire
    else:
        return "intermediate"

def extract_structural_numerical_anomalies(data):
    """Extrait les anomalies structurelles vs numériques"""
    def count_structural(k):
        if pd.isna(k):
            return 0
        # Anomalies structurelles : délétions, translocations, inversions, insertions
        structural = re.findall(r"del\(|t\(|inv\(|ins\(", str(k))
        return len(structural)
    
    def count_numerical(k):
        if pd.isna(k):
            return 0
        # Anomalies numériques : gains (+) et pertes (-) de chromosomes entiers
        numerical = re.findall(r"\+\d+|-\d+", str(k))
        return len(numerical)
    
    data["structural_anomalies_count"] = data["CYTOGENETICS"].apply(count_structural)
    data["numerical_anomalies_count"] = data["CYTOGENETICS"].apply(count_numerical)
    return data

def extract_chromosome_details(data):
    """Extrait les détails des chromosomes impliqués"""
    def get_involved_chromosomes(k):
        if pd.isna(k):
            return []
        # Extraire tous les numéros de chromosomes mentionnés
        chromosomes = re.findall(r"(?:del\(|t\(|inv\(|ins\()(\d+)", str(k))
        chromosomes += re.findall(r"[+-](\d+)", str(k))
        return list(set(chromosomes))  # Supprimer les doublons
    
    data["involved_chromosomes"] = data["CYTOGENETICS"].apply(get_involved_chromosomes)
    data["num_involved_chromosomes"] = data["involved_chromosomes"].apply(len)
    
    # Créer des colonnes binaires pour les chromosomes les plus fréquemment impliqués
    common_chromosomes = ['3', '5', '7', '8', '9', '11', '15', '16', '17', '20', '21', '22']
    for chr_num in common_chromosomes:
        data[f"chr_{chr_num}_involved"] = data["involved_chromosomes"].apply(
            lambda x: 1 if chr_num in x else 0
        )
    
    return data

def create_cytogenetic_embeddings(data, max_features=100):
    """Crée des embeddings TF-IDF sur les chaînes CYTOGENETICS"""
    # Préparation des données pour TF-IDF
    cyto_texts = data["CYTOGENETICS"].fillna("normal").astype(str)
    
    # Tokenization spéciale pour les données cytogénétiques
    def cyto_tokenizer(text):
        # Extraire les éléments cytogénétiques comme tokens
        tokens = []
        tokens.extend(re.findall(r"del\(\d+\)\([pq]\d*\)", text))  # délétions
        tokens.extend(re.findall(r"t\(\d+;\d+\)", text))  # translocations
        tokens.extend(re.findall(r"inv\(\d+\)", text))  # inversions
        tokens.extend(re.findall(r"[+-]\d+", text))  # gains/pertes
        tokens.extend(re.findall(r"\d+,XX|\d+,XY", text))  # formules de base
        return tokens
    
    # Créer le vectoriseur TF-IDF
    vectorizer = TfidfVectorizer(
        tokenizer=cyto_tokenizer,
        max_features=max_features,
        lowercase=False
    )
    
    # Ajuster et transformer
    tfidf_matrix = vectorizer.fit_transform(cyto_texts)
    
    # Convertir en DataFrame
    feature_names = [f"tfidf_{i}" for i in range(tfidf_matrix.shape[1])]
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(), 
        columns=feature_names, 
        index=data.index
    )
    
    # Joindre au DataFrame principal
    data = pd.concat([data, tfidf_df], axis=1)
    return data

def preprocess(data, include_embeddings=True):
    """
    Fonction de preprocessing complète pour les données cytogénétiques
    
    Parameters:
    -----------
    data : pd.DataFrame
        DataFrame contenant la colonne CYTOGENETICS
    include_embeddings : bool
        Si True, inclut les embeddings TF-IDF (peut être coûteux en mémoire)
    
    Returns:
    --------
    pd.DataFrame
        DataFrame enrichi avec toutes les features cytogénétiques
    """
    
    # 1. Normalisation de la casse des chromosomes (tout en majuscule)
    data["CYTOGENETICS"] = data["CYTOGENETICS"].str.upper()
    
    # 2. Indicateur si le caryotype est anormal (inversé par rapport à l'original)
    data["is_abnormal"] = (~data["CYTOGENETICS"].str.contains("46,XX|46,XY", case=False, na=False)).astype(int)
    # Garder aussi l'indicateur normal pour compatibilité
    data["is_normal"] = 1 - data["is_abnormal"]
    
    # 3. Extraction du nombre total de chromosomes
    data["total_chromosomes"] = data["CYTOGENETICS"].str.extract(r"^(\d+)", expand=False).astype(float)
    # Remplir avec 46 si non détecté et caryotype normal
    data.loc[~data["total_chromosomes"].notna(), "total_chromosomes"] = 46
    
    # 4. Extraction du sexe brut (ex: XY, XX)
    data["sex_raw"] = data["CYTOGENETICS"].str.extract(r"\b(XX|XY|XYY|XXY|XXX|YY)\b", expand=False)
    
    # 5. Normalisation du sexe : catégorisation des cas atypiques
    data["sex"] = data["sex_raw"].apply(lambda s: s if s in ["XX", "XY"] else "Other")
    
    # 6. Extraction des anomalies chromosomiques fréquentes
    data = extract_cytogenetic_features(data)
    
    # 7. Classification ELN des risques
    data["eln_risk"] = data.apply(assign_eln_risk, axis=1)
    
    # 8. Encodage ordinal du risque ELN
    eln_risk_mapping = {"favorable": 0, "intermediate": 1, "adverse": 2}
    data["eln_risk_ordinal"] = data["eln_risk"].map(eln_risk_mapping)
    
    # 9. Nombre de clones
    data["number_of_clones"] = data["CYTOGENETICS"].str.findall(r"\[(\d+)\]").apply(
        lambda x: sum(map(int, x)) if x else 0
    )
    
    # 10. Anomalies structurelles vs numériques
    data = extract_structural_numerical_anomalies(data)
    
    # 11. Détails des chromosomes impliqués
    data = extract_chromosome_details(data)
    
    # 12. Ratio anomalies structurelles/numériques
    data["structural_numerical_ratio"] = data["structural_anomalies_count"] / (
        data["numerical_anomalies_count"] + 1
    )  # +1 pour éviter division par zéro
    
    # 13. Score de complexité globale
    data["complexity_score"] = (
        data["structural_anomalies_count"] + 
        data["numerical_anomalies_count"] + 
        data["num_involved_chromosomes"]
    )
    
    # 14. Embeddings TF-IDF (optionnel)
    if include_embeddings:
        data = create_cytogenetic_embeddings(data)
    
    # 15. Nettoyage des colonnes intermédiaires
    columns_to_drop = ["sex_raw", "involved_chromosomes"]
    data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True)
    
    return data

# Exemple d'utilisation avec gestion des erreurs
def preprocess_safe(data, include_embeddings=True):
    """Version sécurisée du preprocessing avec gestion des erreurs"""
    try:
        return preprocess(data.copy(), include_embeddings=include_embeddings)
    except Exception as e:
        print(f"Erreur lors du preprocessing: {e}")
        print("Retour de la version de base...")
        return preprocess_basic(data.copy())

def preprocess_basic(data):
    """Version de base du preprocessing en cas d'erreur"""
    data["CYTOGENETICS"] = data["CYTOGENETICS"].str.upper()
    data["is_abnormal"] = (~data["CYTOGENETICS"].str.contains("46,XX|46,XY", case=False, na=False)).astype(int)
    data["total_chromosomes"] = data["CYTOGENETICS"].str.extract(r"^(\d+)", expand=False).astype(float)
    data.loc[~data["total_chromosomes"].notna(), "total_chromosomes"] = 46
    return data

# Appliquer la fonction aux datasets
clinical_val = preprocess(clinical_val)
clinical_train = preprocess(clinical_train)




In [7]:
clinical_val = clinical_val.drop('CYTOGENETICS', axis=1)
clinical_train = clinical_train.drop('CYTOGENETICS', axis=1)

In [8]:
df_train = clinical_train.merge(molecular_train_agg, on='ID', how='left').fillna(0)
df_val = clinical_val.merge(molecular_val_agg, on='ID', how='left').fillna(0)

In [9]:
df_train['mutation_burden'] = df_train['nb_mutations'] * df_train['vaf_mean']
df_val['mutation_burden'] = df_val['nb_mutations'] * df_val['vaf_mean']


for col in ['HB', 'PLT', 'BM_BLAST']:
    lower = df_train[col].quantile(0.01)
    upper = df_train[col].quantile(0.99)
    df_train[col] = df_train[col].clip(lower, upper)
    df_val[col] = df_val[col].clip(lower, upper)


In [10]:
from sklearn.preprocessing import RobustScaler

# Colonnes numériques hors cibles, présentes dans le train
features = [col for col in df_train.select_dtypes(include='number').columns if col not in ['OS_YEARS', 'OS_STATUS']]

# Ajoute les colonnes manquantes dans df_val
for col in features:
    if col not in df_val.columns:
        df_val[col] = 0

# Aligne l'ordre des colonnes
df_val = df_val.reindex(columns=df_train.columns)

scaler = RobustScaler()
df_train_scaled = df_train.copy()
df_val_scaled = df_val.copy()

df_train_scaled[features] = scaler.fit_transform(df_train[features])
df_val_scaled[features] = scaler.transform(df_val[features][features])

In [11]:
df_train_scaled = df_train_scaled.merge(target_df, on='ID', how='inner')

In [12]:
df_train_scaled = df_train_scaled.drop(['CENTER'], axis=1)
df_val_scaled = df_val_scaled.drop(['CENTER'], axis=1)

In [13]:
# Get categorical columns
categorical_columns = df_train_scaled.select_dtypes(include=['object']).columns

# Perform dummy encoding on both datasets
df_train_scaled = pd.get_dummies(df_train_scaled, columns=categorical_columns)
df_val_scaled = pd.get_dummies(df_val_scaled, columns=categorical_columns)

# Ensure validation set has same columns as training set
missing_cols = set(df_train_scaled.columns) - set(df_val_scaled.columns)
for col in missing_cols:
    df_val_scaled[col] = 0

# Ensure columns are in same order
df_val_scaled = df_val_scaled[df_train_scaled.columns]

  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0
  df_val_scaled[col] = 0


In [15]:
df_train_scaled.to_csv('../data/old_arthur/df_train.csv', index=False)
df_val_scaled.to_csv('../data/old_arthur/df_eval.csv', index=False)