In [133]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import os.path

from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored , concordance_index_ipcw
from sklearn.impute import SimpleImputer
from sksurv.util import Surv
from lifelines.utils import concordance_index



In [134]:
# Clinical tables: test split first, then train.
clinical_test, clinical_train = (
    pd.read_csv(csv_name) for csv_name in ('clinical_test.csv', 'clinical_train.csv')
)

# Molecular tables: test split first, then train.
molecular_test, molecular_train = (
    pd.read_csv(csv_name) for csv_name in ('molecular_test.csv', 'molecular_train.csv')
)


In [135]:
# Load the survival targets and clean them in a single, re-runnable chain.
# Order matters: 'OS_YEARS' is coerced to numeric BEFORE dropping missing
# rows, so values that could not be parsed (turned into NaN by
# errors='coerce') are removed as well instead of leaking into the model.
target_df = (
    pd.read_csv('target_train.csv')
    # Convert 'OS_YEARS' to numeric if it isn't already; unparsable -> NaN
    .assign(OS_YEARS=lambda frame: pd.to_numeric(frame['OS_YEARS'], errors='coerce'))
    # Drop rows where either target component is missing
    .dropna(subset=['OS_YEARS', 'OS_STATUS'])
    # Ensure 'OS_STATUS' is boolean (safe now that no NaN remains)
    .astype({'OS_STATUS': bool})
)


In [None]:
def handle_missing_values(df):
    """Fill missing values column by column and return the same frame.

    Numeric columns (any numeric dtype, not only int64/float64) are filled
    with the column median; every other column is filled with the literal
    string 'Missing', which acts as an extra category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call-and-reassign usage.
    """
    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue

        if pd.api.types.is_numeric_dtype(column):
            # Numerical variable: replace with median
            fill_value = column.median()
        else:
            # Categorical variable: replace with a new class
            fill_value = 'Missing'

        # Assign back explicitly: `df[col].fillna(..., inplace=True)` acts on
        # a temporary Series and does not update `df` under Copy-on-Write.
        df[col] = column.fillna(fill_value)
    return df

# Impute all four tables, in the order train/test clinical then train/test molecular.
clinical_train, clinical_test, molecular_train, molecular_test = (
    handle_missing_values(frame)
    for frame in (clinical_train, clinical_test, molecular_train, molecular_test)
)

In [137]:
import pandas as pd
import numpy as np

def aggregate_leukemia_data(df):
    """Aggregate mutation rows into one feature row per patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Mutation table containing at least the columns ``ID``, ``GENE``,
        ``CHR``, ``VAF``, ``DEPTH`` and ``EFFECT``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``ID`` with, in this order: ``ID``,
        ``nb_mutations``, ``nb_genes``, ``nb_chromosomes``, ``vaf_mean``,
        ``vaf_max``, ``vaf_median``, ``depth_mean``, ``depth_min``,
        ``nb_high_risk_genes``, ``nb_good_genes``, ``nb_bad_effects``,
        ``has_TP53``, ``has_FLT3``, ``has_NPM1``, ``nb_high_vaf`` and
        ``risk_score``. The count/flag columns are stored as floats.

    Notes
    -----
    Prints the available column names as a side effect.
    """
    # High-risk genes
    high_risk_genes = ['TP53', 'ASXL1', 'RUNX1', 'FLT3', 'EZH2', 'DNMT3A', 'TET2', 'IDH1', 'IDH2']

    # Favourable-prognosis genes
    good_genes = ['NPM1', 'CEBPA']

    # Deleterious effects
    bad_effects = ['nonsense', 'frameshift', 'splice_site', 'stop_gained']

    # Show the available columns first
    print("Colonnes disponibles:", df.columns.tolist())

    # Base aggregation
    result = df.groupby('ID').agg({
        'GENE': ['count', 'nunique'],
        'CHR': 'nunique',
        'VAF': ['mean', 'max', 'median'],
        'DEPTH': ['mean', 'min']
    }).reset_index()

    # Flatten the column names
    result.columns = ['ID', 'nb_mutations', 'nb_genes', 'nb_chromosomes',
                      'vaf_mean', 'vaf_max', 'vaf_median', 'depth_mean', 'depth_min']

    # Patient-specific features, computed with grouped reductions instead of
    # filtering the whole table once per patient.
    patient = df['ID']
    gene = df['GENE']
    per_patient = pd.DataFrame({
        # Distinct genes of each list carried by the patient; genes outside
        # the list are masked to NaN, which `nunique` ignores.
        'nb_high_risk_genes': gene.where(gene.isin(high_risk_genes)).groupby(patient).nunique(),
        'nb_good_genes': gene.where(gene.isin(good_genes)).groupby(patient).nunique(),
        # Number of mutations with a deleterious effect
        'nb_bad_effects': df['EFFECT'].isin(bad_effects).groupby(patient).sum(),
        # Presence flags for key mutations
        'has_TP53': gene.eq('TP53').groupby(patient).any(),
        'has_FLT3': gene.eq('FLT3').groupby(patient).any(),
        'has_NPM1': gene.eq('NPM1').groupby(patient).any(),
        # Number of mutations with a VAF above 0.4
        'nb_high_vaf': df['VAF'].gt(0.4).groupby(patient).sum(),
    }).astype(float)  # float columns, as produced by the former .loc-based fill

    result = result.join(per_patient, on='ID')

    # Simple risk score
    result['risk_score'] = (result['nb_high_risk_genes'] * 2 +
                            result['nb_bad_effects'] +
                            result['nb_high_vaf'] * 0.5 -
                            result['nb_good_genes'])

    return result

# Collapse both mutation tables to one row per patient (train first, then test).
molecular_train, molecular_test = (
    aggregate_leukemia_data(mutations)
    for mutations in (molecular_train, molecular_test)
)



Colonnes disponibles: ['ID', 'CHR', 'START', 'END', 'REF', 'ALT', 'GENE', 'PROTEIN_CHANGE', 'EFFECT', 'VAF', 'DEPTH']
Colonnes disponibles: ['ID', 'CHR', 'START', 'END', 'REF', 'ALT', 'GENE', 'PROTEIN_CHANGE', 'EFFECT', 'VAF', 'DEPTH']


In [127]:
import pandas as pd
import numpy as np

# Load the raw molecular data under its OWN name. Reusing `molecular_test`
# here would overwrite the aggregated per-patient table when the notebook is
# run top to bottom, and the later merge into `df_test` would then use raw rows.
molecular_test_raw = pd.read_csv('molecular_test.csv')

print("=== ANALYSE DES VARIABLES CATÉGORIELLES DE molecular_test (SAUF ID) ===\n")

# Identify the categorical variables (excluding ID)
categorical_cols = molecular_test_raw.select_dtypes(include=['object']).columns
categorical_cols = [col for col in categorical_cols if col != 'ID']  # Exclude ID
numerical_cols = molecular_test_raw.select_dtypes(include=['number']).columns

print(f"Variables catégorielles (sauf ID): {list(categorical_cols)}")
print(f"Variables numériques: {list(numerical_cols)}\n")

# Inspect each categorical variable
for col in categorical_cols:
    print(f"--- {col} ---")
    unique_values = molecular_test_raw[col].unique()
    print(f"Nombre de valeurs uniques: {len(unique_values)}")

    # Sorting fails when NaN is mixed with strings, so fall back to the raw order
    try:
        # Try a plain sort first
        sorted_values = sorted(unique_values)
        print(f"Valeurs uniques: {sorted_values}")
    except TypeError:
        # Not sortable: print unsorted
        print(f"Valeurs uniques (non triées): {list(unique_values)}")

    print(f"Valeurs manquantes: {molecular_test_raw[col].isnull().sum()}")
    print()

# Special pass over numeric variables that may actually be categorical
print("=== VARIABLES NUMÉRIQUES QUI POURRAIENT ÊTRE CATÉGORIELLES ===\n")

for col in numerical_cols:
    unique_count = molecular_test_raw[col].nunique()
    total_count = len(molecular_test_raw)

    # Fewer than 20% unique values: treat as potentially categorical
    if unique_count / total_count < 0.2:
        print(f"--- {col} (potentiellement catégorielle) ---")
        print(f"Nombre de valeurs uniques: {unique_count}")
        try:
            sorted_values = sorted(molecular_test_raw[col].unique())
            print(f"Valeurs uniques: {sorted_values}")
        except TypeError:
            print(f"Valeurs uniques (non triées): {list(molecular_test_raw[col].unique())}")
        print(f"Valeurs manquantes: {molecular_test_raw[col].isnull().sum()}")
        print()

# Overall statistics
print("=== STATISTIQUES GÉNÉRALES ===\n")
print(f"Nombre total de lignes: {len(molecular_test_raw)}")
print(f"Nombre de patients uniques: {molecular_test_raw['ID'].nunique()}")
print(f"Nombre de gènes uniques: {molecular_test_raw['GENE'].nunique()}")
print(f"Nombre de chromosomes uniques: {molecular_test_raw['CHR'].nunique()}")
print(f"Nombre d'effets uniques: {molecular_test_raw['EFFECT'].nunique()}")

=== ANALYSE DES VARIABLES CATÉGORIELLES DE molecular_test (SAUF ID) ===

Variables catégorielles (sauf ID): ['CHR', 'REF', 'ALT', 'GENE', 'PROTEIN_CHANGE', 'EFFECT']
Variables numériques: ['START', 'END', 'VAF', 'DEPTH']

--- CHR ---
Nombre de valeurs uniques: 23
Valeurs uniques (non triées): ['1', '10', '11', '12', '13', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9', 'X', nan]
Valeurs manquantes: 69

--- REF ---
Nombre de valeurs uniques: 141
Valeurs uniques (non triées): ['T', 'G', 'A', '-', 'C', 'TT', 'AC', 'CTGTAGA', 'AGG', 'CAC', 'ATG', 'AAAC', 'AATT', 'TAACT', 'CAA', 'GGGGGCTGGGCCGGGGGTGG', 'CTGG', 'TGCGGAGATTCTCTTCCTC', 'CTCTTCCTCTGTGCGCCGGTCTCTCCC', 'GGCGGCTCATAG', 'TC', 'CTTCCACTCGG', 'CA', 'CGGGGGTGTGGAATCA', 'GAATCAACCCACAGCT', 'CCACAGCTGCACAGGGCAGGTCTTGG', 'GGGG', 'GGGAGTACTGTAGGAAGAGGAAGGAGACAGAGTTGAAAGTCA', 'GCAGGGGC', 'TGTT', 'TCAGTGCATAACC', 'ACTT', 'AA', 'CT', 'TA', 'GA', 'TCTGGCCCCCTGA', 'GAAGAAGCATAGACGAAATG', 'GGCGGCTGTGGTGTG

In [138]:
def add_cytogenetic_features(data):
    """Derive karyotype features from the ``CYTOGENETICS`` column.

    Adds three columns to ``data`` (in place) and returns it:

    * ``is_normal`` -- 1 when the text contains "normal" (any case), else 0.
    * ``total_chromosomes`` -- leading integer of the text as a float,
      forced to 46 for rows flagged ``is_normal``.
    * ``sex`` -- "XX" / "XY" when found as a whole token, "Other" for the
      other recognised sex-chromosome tokens, "Unknown" when none is found.

    ``CYTOGENETICS`` itself is upper-cased as a side effect.
    """
    # Flag karyotypes described as "Normal" (case-insensitive, missing -> 0).
    normal_flag = data["CYTOGENETICS"].str.contains("Normal", case=False, na=False)
    data["is_normal"] = normal_flag.astype(int)

    # Leading digits give the chromosome count; normal karyotypes default to 46.
    leading_count = data["CYTOGENETICS"].str.extract(r"^(\d+)", expand=False)
    data["total_chromosomes"] = leading_count.astype(float)
    data.loc[data["is_normal"] == 1, "total_chromosomes"] = 46

    # Upper-case the karyotype text before looking for sex chromosomes.
    data["CYTOGENETICS"] = data["CYTOGENETICS"].str.upper()

    # Raw sex-chromosome token (e.g. XY, XX), kept only as a local intermediate.
    sex_token = data["CYTOGENETICS"].str.extract(r"\b(XX|XY|XYY|XXY|XXX|YY)\b", expand=False)

    def _categorise(token):
        # Standard sexes pass through; other matched tokens collapse to "Other".
        if token in ["XX", "XY"]:
            return token
        return "Other" if pd.notna(token) else "Unknown"

    data["sex"] = sex_token.apply(_categorise)

    return data

# Apply the feature extraction to both datasets (test first, then train)
clinical_test, clinical_train = (
    add_cytogenetic_features(clinical_frame)
    for clinical_frame in (clinical_test, clinical_train)
)


In [139]:
# The raw karyotype text is no longer needed once its features are extracted.
clinical_test = clinical_test.drop(columns=['CYTOGENETICS'])
clinical_train = clinical_train.drop(columns=['CYTOGENETICS'])

In [140]:
# Left-join the molecular features onto the clinical rows by patient ID;
# every value left missing after the join is set to 0.
df_train = pd.merge(clinical_train, molecular_train, how='left', on='ID').fillna(0)
df_test = pd.merge(clinical_test, molecular_test, how='left', on='ID').fillna(0)

In [141]:
from sklearn.preprocessing import StandardScaler

# Numeric columns of each frame. The target columns are excluded from scaling
# if they happen to be present.
numerical_cols_train = df_train.select_dtypes(include=np.number).columns.tolist()
numerical_cols_test = df_test.select_dtypes(include=np.number).columns.tolist()

for target_col in ('OS_YEARS', 'OS_STATUS'):
    if target_col in numerical_cols_train:
        numerical_cols_train.remove(target_col)  # Exclude the target variable

# The scaler is fitted on the full training column set, so the test frame must
# provide every one of those columns. Silently filtering the list would only
# defer the failure to an opaque scikit-learn shape/feature-name error.
missing_in_test = [col for col in numerical_cols_train if col not in numerical_cols_test]
if missing_in_test:
    raise ValueError(
        f"df_test is missing numeric columns required by the scaler: {missing_in_test}"
    )

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform on the training data using the filtered numerical columns
df_train[numerical_cols_train] = scaler.fit_transform(df_train[numerical_cols_train])

# Transform the test data with the scaler fitted on the training data,
# using the same columns in the same order.
numerical_cols_test_filtered = list(numerical_cols_train)
df_test[numerical_cols_test_filtered] = scaler.transform(df_test[numerical_cols_test_filtered])

In [142]:
# Keep only training rows that have a target, matched on patient ID.
df_train = pd.merge(df_train, target_df, how='inner', on='ID')

In [144]:
# Remove the ID and CENTER variables from df_train and df_test.
# The test IDs are saved first because the submission file needs them.
ID_test = df_test['ID']

non_feature_cols = ['ID', 'CENTER']
df_train = df_train.drop(columns=non_feature_cols)
df_test = df_test.drop(columns=non_feature_cols)

In [120]:
# Display the assembled test feature table as the cell output.
df_test 

Unnamed: 0,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,is_normal,total_chromosomes,sex,nb_mutations,...,depth_mean,depth_min,nb_high_risk_genes,nb_good_genes,nb_bad_effects,has_TP53,has_FLT3,has_NPM1,nb_high_vaf,risk_score
0,8.274147,-0.293278,-0.511697,-0.26787,-1.139539,-0.799348,-0.049125,0.069638,XY,0.301076,...,-0.830263,-0.932376,0.858131,4.804498,0.633309,-0.358799,8.090735,7.638016,-0.798072,0.228721
1,3.878339,-0.320717,-0.383280,-0.26787,0.056230,-0.908284,-0.049125,0.056814,XY,-0.123394,...,-0.516271,-0.565598,-1.160679,-0.195611,0.633309,-0.358799,-0.123598,-0.130924,-0.124635,-0.660007
2,0.015357,0.616280,1.078234,-0.26787,1.202176,-0.955943,-0.049125,0.069638,XX,-0.123394,...,0.293707,0.763458,0.858131,-0.195611,1.908754,-0.358799,-0.123598,-0.130924,-0.798072,0.939704
3,7.341703,-0.079862,-0.223511,-0.26787,-0.940245,-0.826582,20.356203,0.056814,Unknown,-0.123394,...,0.038854,-0.446086,0.858131,-0.195611,0.633309,-0.358799,8.090735,-0.130924,0.548801,0.939704
4,-0.517468,-0.520921,-0.481916,-0.26787,-0.641302,-0.942326,-0.049125,0.018342,XY,-0.123394,...,-0.296744,0.031962,-0.151274,-0.195611,1.908754,-0.358799,-0.123598,-0.130924,-0.124635,0.406467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1188,0.015357,-0.349172,-0.372319,-0.26787,-0.740950,-0.622328,-0.049125,-0.533082,Unknown,-0.547864,...,-0.605532,-0.586204,-1.160679,-0.195611,-0.642137,-0.358799,-0.123598,-0.130924,0.548801,-0.837753
1189,0.015357,-0.349172,-0.372319,-0.26787,-0.740950,-0.622328,-0.049125,-0.533082,Unknown,-0.123394,...,-1.427181,-1.235277,-0.151274,-0.195611,-0.642137,-0.358799,-0.123598,-0.130924,0.548801,-0.126770
1190,0.015357,-0.349172,-0.372319,-0.26787,-0.740950,-0.622328,-0.049125,-0.533082,Unknown,-0.972334,...,-0.067124,0.468799,-0.151274,-0.195611,-0.642137,2.787079,-0.123598,-0.130924,-0.798072,-0.482262
1191,0.015357,-0.349172,-0.372319,-0.26787,-0.740950,-0.622328,-0.049125,-0.533082,Unknown,-0.123394,...,-0.880256,-0.629475,-0.151274,-0.195611,-0.642137,-0.358799,-0.123598,-0.130924,-0.124635,-0.304516


# Modèle MTLR

In [149]:
import torch
import torch.nn as nn
from torchmtlr import (MTLR, mtlr_neg_log_likelihood, mtlr_survival, mtlr_survival_at_times)
from torchmtlr.utils import encode_survival, make_time_bins
import numpy as np
import pandas as pd

# Training design matrix: one-hot encode categoricals, dropping the first
# level of each one as the baseline.
X_features = df_train.drop(columns=['OS_YEARS', 'OS_STATUS'])
X_features = pd.get_dummies(X_features, drop_first=True)
X_features = X_features.astype(float)

# Test design matrix: encode ALL levels (no drop_first) and let `reindex`
# keep exactly the training columns. With drop_first=True the level dropped
# would be the first one present in the *test* data; if that differs from the
# training baseline, those rows would come out all-zero after reindexing and
# be read as the training baseline category.
X_features_test = pd.get_dummies(df_test)
X_features_test = X_features_test.reindex(columns=X_features.columns, fill_value=0)
X_features_test = X_features_test.astype(float)

X_train = torch.tensor(X_features.values, dtype=torch.float32)
X_test = torch.tensor(X_features_test.values, dtype=torch.float32)
y_time = torch.tensor(df_train['OS_YEARS'].values, dtype=torch.float32)
y_event = torch.tensor(df_train['OS_STATUS'].values, dtype=torch.float32)

# Build the time bins from the training targets, then encode the targets on them
time_bins = make_time_bins(y_time, event=y_event)
target = encode_survival(y_time, y_event, time_bins)

In [150]:
# Seed PyTorch before building the model so the weight initialisation, and
# therefore the reported loss and scores, are reproducible across runs.
torch.manual_seed(42)

# One hidden layer followed by the MTLR head sized from the time bins.
model = nn.Sequential(
    nn.Linear(X_train.shape[1], 64),
    nn.ReLU(inplace=True),
    MTLR(64, len(time_bins))
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
n_epochs = 100

# Training: full-batch gradient steps on the MTLR negative log-likelihood
for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    logits = model(X_train)
    loss = mtlr_neg_log_likelihood(logits, target, model[-1], C1=1., average=True)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{n_epochs} - Loss: {loss.item():.4f}")

# Train-set prediction
model.eval()
with torch.no_grad():
    logits_train = model(X_train)
    surv_train = mtlr_survival(logits_train)

# Per-patient time estimate: the time bin at the first index where the
# predicted survival curve is <= 0.5. Falls back to the last time bin when the
# curve never gets that low or the index lies beyond `time_bins`.
median_pred = []
for surv in surv_train:
    below_half = np.where(surv.numpy() <= 0.5)[0]
    if below_half.size > 0 and below_half[0] < len(time_bins):
        median_pred.append(time_bins[below_half[0]].item())
    else:
        median_pred.append(time_bins[-1].item())

# Évaluation
from sksurv.util import Surv
from sksurv.metrics import concordance_index_ipcw

# Structured (event, time) array built from the training targets.
train_events = df_train['OS_STATUS'].values.astype(bool)
train_times = df_train['OS_YEARS'].values
y_train_struct = Surv.from_arrays(event=train_events, time=train_times)

# The negated time estimate is passed as the risk score; the training data
# serves both as the censoring reference and as the evaluated set.
train_risk = -np.array(median_pred)
score_ipwc = concordance_index_ipcw(y_train_struct, y_train_struct, train_risk)[0]
print(f"Concordance index IPCW (train): {score_ipwc:.3f}")


Epoch 10/100 - Loss: 24.4030
Epoch 20/100 - Loss: 21.8184
Epoch 30/100 - Loss: 19.5813
Epoch 40/100 - Loss: 17.6148
Epoch 50/100 - Loss: 15.8886
Epoch 60/100 - Loss: 14.3693
Epoch 70/100 - Loss: 13.0287
Epoch 80/100 - Loss: 11.8434
Epoch 90/100 - Loss: 10.7934
Epoch 100/100 - Loss: 9.8620
Concordance index IPCW (train): 0.698


In [153]:
# Test-set prediction
with torch.no_grad():
    logits_test = model(X_test)
    surv_test = mtlr_survival(logits_test)

# Same rule as on the training set: take the time bin at the first index where
# the survival curve is <= 0.5, otherwise fall back to the last time bin.
n_time_bins = len(time_bins)
median_pred_test = []
for curve in surv_test:
    crossings = np.flatnonzero(curve.numpy() <= 0.5)
    has_valid_crossing = crossings.size > 0 and crossings[0] < n_time_bins
    bin_index = crossings[0] if has_valid_crossing else -1
    median_pred_test.append(time_bins[bin_index].item())

# Submission: the risk score is the negated time estimate
risk_score_test = -np.array(median_pred_test)
submission = pd.DataFrame({
    "ID": ID_test.values,
    "risk_score": risk_score_test
})
submission.to_csv("submission_mtlr.csv", index=False)