# Importation des librairies

In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import os.path

from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored , concordance_index_ipcw
from sklearn.impute import SimpleImputer
from sksurv.util import Surv
from lifelines.utils import concordance_index



# Preprocessing

In [2]:
clinical_test=pd.read_csv('clinical_test.csv')
clinical_train=pd.read_csv('clinical_train.csv')

molecular_test=pd.read_csv('molecular_test.csv')
molecular_train=pd.read_csv('molecular_train.csv')


In [3]:
target_df=pd.read_csv('target_train.csv')

# Drop rows where 'OS_YEARS' is NaN if conversion caused any issues
target_df.dropna(subset=['OS_YEARS', 'OS_STATUS'], inplace=True)


# Contarget_dfvert 'OS_YEARS' to numeric if it isn’t already
target_df['OS_YEARS'] = pd.to_numeric(target_df['OS_YEARS'], errors='coerce')

# Ensure 'OS_STATUS' is boolean
target_df['OS_STATUS'] = target_df['OS_STATUS'].astype(bool)

In [4]:
def handle_missing_values(df):
    for col in df.columns:
        if df[col].isnull().any():
            if df[col].dtype in [np.int64, np.float64]:
                # Numérique : remplacer par la médiane
                median_val = df[col].median()
                df[col] = df[col].fillna(median_val)
            else:
                # Catégoriel : remplacer par 'Missing'
                df[col] = df[col].fillna('Missing')
    return df

In [5]:
import pandas as pd
import numpy as np

def aggregate_leukemia_data(df):
    """
    Agrégation simple des données de mutations par patient
    """
    
    # Gènes à haut risque
    high_risk_genes = ['TP53', 'ASXL1', 'RUNX1', 'FLT3', 'EZH2', 'DNMT3A', 'TET2', 'IDH1', 'IDH2']
    
    # Gènes à bon pronostic  
    good_genes = ['NPM1', 'CEBPA']
    
    # Effets délétères
    bad_effects = ['nonsense', 'frameshift', 'splice_site', 'stop_gained']
    
    # D'abord voir les colonnes disponibles
    print("Colonnes disponibles:", df.columns.tolist())
    
    # Agrégation de base
    result = df.groupby('ID').agg({
        'GENE': ['count', 'nunique'],
        'CHR': 'nunique',
        'VAF': ['mean', 'max', 'median'],
        'DEPTH': ['mean', 'min']
    }).reset_index()
    
    # Simplifier les noms de colonnes
    result.columns = ['ID', 'nb_mutations', 'nb_genes', 'nb_chromosomes', 
                      'vaf_mean', 'vaf_max', 'vaf_median', 'depth_mean', 'depth_min']
    
    # Ajouter les variables spécifiques
    for patient_id in result['ID']:
        patient_data = df[df['ID'] == patient_id]
        
        # Gènes à risque
        result.loc[result['ID'] == patient_id, 'nb_high_risk_genes'] = len(set(patient_data['GENE']) & set(high_risk_genes))
        result.loc[result['ID'] == patient_id, 'nb_good_genes'] = len(set(patient_data['GENE']) & set(good_genes))
        
        # Effets délétères
        result.loc[result['ID'] == patient_id, 'nb_bad_effects'] = patient_data['EFFECT'].isin(bad_effects).sum()
        
        # Mutations importantes
        result.loc[result['ID'] == patient_id, 'has_TP53'] = int('TP53' in patient_data['GENE'].values)
        result.loc[result['ID'] == patient_id, 'has_FLT3'] = int('FLT3' in patient_data['GENE'].values)
        result.loc[result['ID'] == patient_id, 'has_NPM1'] = int('NPM1' in patient_data['GENE'].values)
        
        # VAF élevée (charge mutationelle)
        result.loc[result['ID'] == patient_id, 'nb_high_vaf'] = (patient_data['VAF'] > 0.4).sum()
    
    # Score de risque simple
    result['risk_score'] = (result['nb_high_risk_genes'] * 2 + 
                           result['nb_bad_effects'] + 
                           result['nb_high_vaf'] * 0.5 - 
                           result['nb_good_genes'])
    
    return result

molecular_train = aggregate_leukemia_data(molecular_train)
molecular_test = aggregate_leukemia_data(molecular_test)

Colonnes disponibles: ['ID', 'CHR', 'START', 'END', 'REF', 'ALT', 'GENE', 'PROTEIN_CHANGE', 'EFFECT', 'VAF', 'DEPTH']
Colonnes disponibles: ['ID', 'CHR', 'START', 'END', 'REF', 'ALT', 'GENE', 'PROTEIN_CHANGE', 'EFFECT', 'VAF', 'DEPTH']


In [6]:
# import pandas as pd
# import numpy as np


# print("=== ANALYSE DES VARIABLES CATÉGORIELLES DE molecular_test (SAUF ID) ===\n")

# # Identifier les variables catégorielles (exclure ID)
# categorical_cols = molecular_test.select_dtypes(include=['object']).columns
# categorical_cols = [col for col in categorical_cols if col != 'ID']  # Exclure ID
# numerical_cols = molecular_test.select_dtypes(include=['number']).columns

# print(f"Variables catégorielles (sauf ID): {list(categorical_cols)}")
# print(f"Variables numériques: {list(numerical_cols)}\n")

# # Analyser chaque variable catégorielle
# for col in categorical_cols:
#     print(f"--- {col} ---")
#     unique_values = molecular_test[col].unique()
#     print(f"Nombre de valeurs uniques: {len(unique_values)}")
    
#     # Gérer le tri en séparant les NaN et les valeurs non-numériques
#     try:
#         # Essayer de trier normalement
#         sorted_values = sorted(unique_values)
#         print(f"Valeurs uniques: {sorted_values}")
#     except TypeError:
#         # Si erreur de tri, afficher sans trier
#         print(f"Valeurs uniques (non triées): {list(unique_values)}")
    
#     print(f"Valeurs manquantes: {molecular_test[col].isnull().sum()}")
#     print()

# # Analyse spéciale pour les variables numériques qui pourraient être catégorielles
# print("=== VARIABLES NUMÉRIQUES QUI POURRAIENT ÊTRE CATÉGORIELLES ===\n")

# for col in numerical_cols:
#     unique_count = molecular_test[col].nunique()
#     total_count = len(molecular_test)
    
#     # Si moins de 20% de valeurs uniques, considérer comme potentiellement catégorielle
#     if unique_count / total_count < 0.2:
#         print(f"--- {col} (potentiellement catégorielle) ---")
#         print(f"Nombre de valeurs uniques: {unique_count}")
#         try:
#             sorted_values = sorted(molecular_test[col].unique())
#             print(f"Valeurs uniques: {sorted_values}")
#         except TypeError:
#             print(f"Valeurs uniques (non triées): {list(molecular_test[col].unique())}")
#         print(f"Valeurs manquantes: {molecular_test[col].isnull().sum()}")
#         print()

# # Statistiques générales
# print("=== STATISTIQUES GÉNÉRALES ===\n")
# print(f"Nombre total de lignes: {len(molecular_test)}")
# print(f"Nombre de patients uniques: {molecular_test['ID'].nunique()}")
# print(f"Nombre de gènes uniques: {molecular_test['GENE'].nunique()}")
# print(f"Nombre de chromosomes uniques: {molecular_test['CHR'].nunique()}")
# print(f"Nombre d'effets uniques: {molecular_test['EFFECT'].nunique()}")

In [7]:
def add_cytogenetic_features(data):
    # Indicateur si le caryotype est "Normal"
    data["is_normal"] = data["CYTOGENETICS"].str.contains("Normal", case=False, na=False).astype(int)

    # Extraction du nombre total de chromosomes
    data["total_chromosomes"] = data["CYTOGENETICS"].str.extract(r"^(\d+)", expand=False).astype(float)
    data.loc[data["is_normal"] == 1, "total_chromosomes"] = 46  # Valeur par défaut pour caryotype normal

    # Normalisation de la casse des chromosomes (tout en majuscule)
    data["CYTOGENETICS"] = data["CYTOGENETICS"].str.upper()

    # Extraction du sexe brut (ex: XY, XX)
    data["sex_raw"] = data["CYTOGENETICS"].str.extract(r"\b(XX|XY|XYY|XXY|XXX|YY)\b", expand=False)

    # Normalisation du sexe : catégorisation des cas atypiques
    def normalize_sex(s):
        if s in ["XX", "XY"]:
            return s  # Sexe standard
        elif pd.notna(s):
            return "Other"  # Cas atypiques connus
        return "Unknown"  # Non défini

    data["sex"] = data["sex_raw"].apply(normalize_sex)

    # Nettoyage des colonnes intermédiaires
    data.drop(columns=["sex_raw"], inplace=True)

    return data

# Appliquer la fonction aux datasets
clinical_test = add_cytogenetic_features(clinical_test)
clinical_train = add_cytogenetic_features(clinical_train)


In [8]:
clinical_test = clinical_test.drop('CYTOGENETICS', axis=1)
clinical_train = clinical_train.drop('CYTOGENETICS', axis=1)

In [9]:
df_train = clinical_train.merge(molecular_train, on='ID', how='left').fillna(0)
df_test = clinical_test.merge(molecular_test, on='ID', how='left').fillna(0)

In [10]:
from sklearn.preprocessing import StandardScaler

# Select numerical columns excluding 'OS_YEARS' and ID (which is an identifier)
numerical_cols_train = df_train.select_dtypes(include=np.number).columns.tolist()
numerical_cols_test = df_test.select_dtypes(include=np.number).columns.tolist()


if 'OS_YEARS' in numerical_cols_train:
    numerical_cols_train.remove('OS_YEARS') # Exclude the target variable
if 'OS_STATUS' in numerical_cols_train:
    numerical_cols_train.remove('OS_STATUS') # Exclude the target variable

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform on the training data using the filtered numerical columns
df_train[numerical_cols_train] = scaler.fit_transform(df_train[numerical_cols_train])

# Transform on the test data using the filtered numerical columns (and scaler fitted on training data)
# Need to ensure the columns in df_test match the columns used for fitting the scaler on df_train
numerical_cols_test_filtered = [col for col in numerical_cols_train if col in numerical_cols_test]
df_test[numerical_cols_test_filtered] = scaler.transform(df_test[numerical_cols_test_filtered])

In [11]:
df_train = df_train.merge(target_df, on='ID', how='inner')

In [12]:
# prompt: supprime les variables ID et CENTER de df_train et de df_test
ID_test = df_test['ID']

df_train = df_train.drop(['ID', 'CENTER'], axis=1)
df_test = df_test.drop(['ID', 'CENTER'], axis=1)

# MODELE FLAML

In [None]:
import h2o
from h2o.estimators import H2OCoxProportionalHazardsEstimator

h2o.init(max_mem_size="4G")

# Préparation des données
train = df_train.copy()
test = df_test.copy()

# Forcer les bons types
train['OS_YEARS'] = train['OS_YEARS'].astype(float)
train['OS_STATUS'] = train['OS_STATUS'].astype(int).astype('category')
test['OS_YEARS'] = 0.0  # ou np.nan si inconnu
test['OS_STATUS'] = 0   # ou np.nan si inconnu
test['OS_STATUS'] = test['OS_STATUS'].astype(int).astype('category')

# Encodage des variables catégorielles
for col in train.select_dtypes(include='object').columns:
    train[col] = train[col].astype('category')
    if col in test.columns:
        test[col] = test[col].astype('category')

h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

x = [col for col in train.columns if col not in ['OS_YEARS', 'OS_STATUS']]
time_col = 'OS_YEARS'
event_col = 'OS_STATUS'

# Entraînement du modèle CoxPH (event_column à l'instanciation)
coxph = H2OCoxProportionalHazardsEstimator()
coxph.train(x=x, y=time_col, training_frame=h2o_train)

# Prédiction sur le test
pred = coxph.predict(h2o_test)
risk_score = pred.as_data_frame()['risk_index']  # plus haut = plus risqué

submission = pd.DataFrame({'ID': ID_test.values, 'risk_score': risk_score})
submission.to_csv('submission_h2o_coxph.csv', index=False)

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,6 mins 38 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,3 months and 21 days
H2O_cluster_name:,H2O_from_python_arthr_4us451
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.341 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


TypeError: H2OCoxProportionalHazardsEstimator.__init__() got an unexpected keyword argument 'event_column'