# Importation des librairies

In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import os.path

from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored , concordance_index_ipcw
from sklearn.impute import SimpleImputer
from sksurv.util import Surv
from lifelines.utils import concordance_index



# Preprocessing

In [3]:
# Load the clinical tables (test and train splits) from the working directory.
clinical_test=pd.read_csv('clinical_test.csv')
clinical_train=pd.read_csv('clinical_train.csv')

# Load the molecular tables (test and train splits) from the working directory.
molecular_test=pd.read_csv('molecular_test.csv')
molecular_train=pd.read_csv('molecular_train.csv')


In [4]:
target_df = pd.read_csv('target_train.csv')

# Convert 'OS_YEARS' to numeric first: with errors='coerce' any unparseable
# value becomes NaN, so the conversion has to run before the NaN rows are dropped.
target_df['OS_YEARS'] = pd.to_numeric(target_df['OS_YEARS'], errors='coerce')

# Drop rows with a missing survival time or status, including any NaN
# introduced by the conversion above.
target_df.dropna(subset=['OS_YEARS', 'OS_STATUS'], inplace=True)

# Ensure 'OS_STATUS' is boolean
target_df['OS_STATUS'] = target_df['OS_STATUS'].astype(bool)

In [5]:
def handle_missing_values(df):
    """Fill missing values column by column, in place, and return the frame.

    Numeric columns of any integer or float dtype (not only int64/float64)
    are filled with their median. Every other column is filled with the
    literal string 'Missing'. Columns without missing values are untouched.
    A numeric column that is entirely missing has no median and stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue

        # Booleans count as "numeric" for pandas but a median is meaningless
        # for them, so they are handled like the categorical columns.
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))
        if is_numeric:
            # Numeric: replace with the median
            df[col] = column.fillna(column.median())
        else:
            # Categorical: replace with 'Missing'
            df[col] = column.fillna('Missing')
    return df

In [6]:
import pandas as pd
import numpy as np

def aggregate_leukemia_data(df, high_vaf_threshold=0.4):
    """Aggregate per-mutation rows into one feature row per patient.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per mutation. Must contain the columns 'ID', 'GENE', 'CHR',
        'EFFECT', 'VAF' and 'DEPTH'.
    high_vaf_threshold : float, default 0.4
        A mutation counts towards 'nb_high_vaf' when its VAF is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'ID' with, in this order:
        'ID', 'nb_mutations', 'nb_genes', 'nb_chromosomes', 'vaf_mean',
        'vaf_max', 'vaf_median', 'depth_mean', 'depth_min',
        'nb_high_risk_genes', 'nb_good_genes', 'nb_bad_effects',
        'has_TP53', 'has_FLT3', 'has_NPM1', 'nb_high_vaf', 'risk_score'.

    Notes
    -----
    All per-patient features are computed with grouped operations instead of
    re-filtering the whole frame once per patient, and the feature columns
    are always created, even when the input contains no patient.
    """
    # High-risk genes
    high_risk_genes = ['TP53', 'ASXL1', 'RUNX1', 'FLT3', 'EZH2', 'DNMT3A', 'TET2', 'IDH1', 'IDH2']

    # Genes with a good prognosis
    good_genes = ['NPM1', 'CEBPA']

    # Deleterious effects
    bad_effects = ['nonsense', 'frameshift', 'splice_site', 'stop_gained']

    # Show the available columns first
    print("Colonnes disponibles:", df.columns.tolist())

    # Base aggregation
    result = df.groupby('ID').agg({
        'GENE': ['count', 'nunique'],
        'CHR': 'nunique',
        'VAF': ['mean', 'max', 'median'],
        'DEPTH': ['mean', 'min']
    }).reset_index()

    # Flatten the column names
    result.columns = ['ID', 'nb_mutations', 'nb_genes', 'nb_chromosomes',
                      'vaf_mean', 'vaf_max', 'vaf_median', 'depth_mean', 'depth_min']

    def per_patient(values_by_id):
        # Align an ID-indexed series with the rows of `result`; patients that
        # do not appear in it get 0. Cast to float so every feature column has
        # one consistent numeric dtype.
        return result['ID'].map(values_by_id).fillna(0).astype(float)

    def distinct_genes_in(gene_list):
        # Number of distinct genes from `gene_list` mutated in each patient.
        subset = df.loc[df['GENE'].isin(gene_list), ['ID', 'GENE']]
        return subset.groupby('ID')['GENE'].nunique()

    # One 0/1 flag per mutation row; summed or maxed per patient below.
    mutation_flags = pd.DataFrame({
        'ID': df['ID'].to_numpy(),
        'nb_bad_effects': df['EFFECT'].isin(bad_effects).to_numpy().astype(int),
        'has_TP53': df['GENE'].eq('TP53').to_numpy().astype(int),
        'has_FLT3': df['GENE'].eq('FLT3').to_numpy().astype(int),
        'has_NPM1': df['GENE'].eq('NPM1').to_numpy().astype(int),
        'nb_high_vaf': df['VAF'].gt(high_vaf_threshold).to_numpy().astype(int),
    })
    flags_by_patient = mutation_flags.groupby('ID')
    flag_counts = flags_by_patient[['nb_bad_effects', 'nb_high_vaf']].sum()
    flag_presence = flags_by_patient[['has_TP53', 'has_FLT3', 'has_NPM1']].max()

    # Risk-related genes
    result['nb_high_risk_genes'] = per_patient(distinct_genes_in(high_risk_genes))
    result['nb_good_genes'] = per_patient(distinct_genes_in(good_genes))

    # Deleterious effects
    result['nb_bad_effects'] = per_patient(flag_counts['nb_bad_effects'])

    # Key mutations
    result['has_TP53'] = per_patient(flag_presence['has_TP53'])
    result['has_FLT3'] = per_patient(flag_presence['has_FLT3'])
    result['has_NPM1'] = per_patient(flag_presence['has_NPM1'])

    # High VAF (mutational burden)
    result['nb_high_vaf'] = per_patient(flag_counts['nb_high_vaf'])

    # Simple risk score
    result['risk_score'] = (result['nb_high_risk_genes'] * 2 +
                           result['nb_bad_effects'] +
                           result['nb_high_vaf'] * 0.5 -
                           result['nb_good_genes'])

    return result

# Replace the per-mutation tables with their per-patient aggregates.
# NOTE: the names are rebound, so re-running this cell without reloading the
# CSVs would pass already-aggregated frames (without the raw mutation
# columns) back into the function.
molecular_train = aggregate_leukemia_data(molecular_train)
molecular_test = aggregate_leukemia_data(molecular_test)

Colonnes disponibles: ['ID', 'CHR', 'START', 'END', 'REF', 'ALT', 'GENE', 'PROTEIN_CHANGE', 'EFFECT', 'VAF', 'DEPTH']
Colonnes disponibles: ['ID', 'CHR', 'START', 'END', 'REF', 'ALT', 'GENE', 'PROTEIN_CHANGE', 'EFFECT', 'VAF', 'DEPTH']


In [7]:
def add_cytogenetic_features(data):
    """Derive karyotype features from the 'CYTOGENETICS' text column.

    The frame is modified in place and also returned. Columns added:

    - 'is_normal': 1 when the text contains "normal" (any case), else 0.
    - 'total_chromosomes': the leading integer of the text as a float,
      forced to 46 for rows flagged as normal, NaN when there is none.
    - 'sex': "XX" or "XY" when found as a whole token, "Other" for the
      remaining recognised sex-chromosome tokens, "Unknown" otherwise.

    'CYTOGENETICS' itself is upper-cased as a side effect.
    """
    karyotype = data["CYTOGENETICS"]

    # Flag normal karyotypes (case-insensitive; missing text counts as not normal).
    normal_flag = karyotype.str.contains("Normal", case=False, na=False)
    data["is_normal"] = normal_flag.astype(int)

    # Chromosome count = leading digits of the karyotype string.
    leading_digits = karyotype.str.extract(r"^(\d+)", expand=False)
    data["total_chromosomes"] = leading_digits.astype(float)
    # A normal karyotype defaults to 46 chromosomes.
    data.loc[normal_flag, "total_chromosomes"] = 46

    # Upper-case the text so the sex-chromosome match below ignores case.
    data["CYTOGENETICS"] = karyotype.str.upper()

    def _sex_label(token):
        # Standard sexes pass through; other matched tokens are atypical;
        # no match at all means the sex could not be determined.
        if token in ["XX", "XY"]:
            return token
        return "Other" if pd.notna(token) else "Unknown"

    # Temporary column holding the raw matched token (e.g. XY, XX).
    data["sex_raw"] = data["CYTOGENETICS"].str.extract(
        r"\b(XX|XY|XYY|XXY|XXX|YY)\b", expand=False
    )
    data["sex"] = data["sex_raw"].map(_sex_label)

    # Remove the intermediate column again.
    del data["sex_raw"]

    return data

# Apply the function to both datasets (each frame is modified in place and
# returned, so the rebinding keeps the same objects).
clinical_test = add_cytogenetic_features(clinical_test)
clinical_train = add_cytogenetic_features(clinical_train)


In [8]:
# Drop the raw 'CYTOGENETICS' text column from both clinical frames; the
# features derived from it were added by add_cytogenetic_features.
clinical_test = clinical_test.drop('CYTOGENETICS', axis=1)
clinical_train = clinical_train.drop('CYTOGENETICS', axis=1)

In [9]:
# Left-join the per-patient molecular features onto the clinical rows (every
# clinical row is kept), then replace every remaining NaN with 0.
# NOTE(review): fillna(0) applies to all columns, not only the molecular ones,
# so any NaN coming from the clinical side (e.g. 'total_chromosomes') also
# becomes 0 — confirm this is intended; handle_missing_values is defined
# above but is not called here.
df_train = clinical_train.merge(molecular_train, on='ID', how='left').fillna(0)
df_test = clinical_test.merge(molecular_test, on='ID', how='left').fillna(0)

In [10]:
from sklearn.preprocessing import StandardScaler

# Numeric feature columns of each frame.
numerical_cols_train = df_train.select_dtypes(include=np.number).columns.tolist()
numerical_cols_test = df_test.select_dtypes(include=np.number).columns.tolist()

# Defensive: never scale the target variables if they are already present.
if 'OS_YEARS' in numerical_cols_train:
    numerical_cols_train.remove('OS_YEARS') # Exclude the target variable
if 'OS_STATUS' in numerical_cols_train:
    numerical_cols_train.remove('OS_STATUS') # Exclude the target variable

# The scaler is fitted on a fixed list of training columns, so the test frame
# has to provide every one of them as a numeric column. Check this before
# anything is modified: transforming only a subset of the fitted columns would
# otherwise fail inside scaler.transform with a feature-count mismatch, after
# df_train had already been overwritten with scaled values.
numerical_cols_test_filtered = [col for col in numerical_cols_train if col in numerical_cols_test]
missing_in_test = [col for col in numerical_cols_train if col not in numerical_cols_test]
if missing_in_test:
    raise ValueError(
        "df_test is missing (or has as non-numeric) columns the scaler is "
        f"fitted on: {missing_in_test}"
    )

# Initialize StandardScaler
scaler = StandardScaler()

# Fit on the training data only, then apply the same scaling to the test data
# (same columns, same order).
df_train[numerical_cols_train] = scaler.fit_transform(df_train[numerical_cols_train])
df_test[numerical_cols_test_filtered] = scaler.transform(df_test[numerical_cols_test_filtered])

In [11]:
# Attach the survival targets to the training features. The inner join keeps
# only the IDs present in both df_train and target_df.
df_train = df_train.merge(target_df, on='ID', how='inner')

In [12]:
# prompt: remove the ID and CENTER variables from df_train and df_test
# Keep the test IDs in a separate variable before the column is dropped.
ID_test = df_test['ID']

# Drop the identifier and the 'CENTER' column from both frames.
df_train = df_train.drop(['ID', 'CENTER'], axis=1)
df_test = df_test.drop(['ID', 'CENTER'], axis=1)