In [50]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import os.path

from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored , concordance_index_ipcw
from sklearn.impute import SimpleImputer
from sksurv.util import Surv
from lifelines.utils import concordance_index


In [63]:
# Load the clinical and the molecular tables for both splits.
# Each pair is read with the same default CSV settings.
clinical_train, clinical_test = map(
    pd.read_csv, ('clinical_train.csv', 'clinical_test.csv')
)
molecular_train, molecular_test = map(
    pd.read_csv, ('molecular_train.csv', 'molecular_test.csv')
)


In [64]:
target_df = pd.read_csv('target_train.csv')

# Coerce 'OS_YEARS' to numeric FIRST: entries that cannot be parsed become NaN
# here, so the dropna below removes them as well. (Dropping before the
# coercion would leave those freshly created NaN survival times in the table.)
target_df['OS_YEARS'] = pd.to_numeric(target_df['OS_YEARS'], errors='coerce')

# Discard rows that lack either outcome column. This must happen before the
# boolean cast below, because a NaN status would silently cast to True.
target_df = target_df.dropna(subset=['OS_YEARS', 'OS_STATUS'])

# Ensure 'OS_STATUS' is boolean
target_df['OS_STATUS'] = target_df['OS_STATUS'].astype(bool)


In [None]:
def handle_missing_values(df):
    """Impute missing values column by column, modifying ``df`` in place.

    Numeric columns (any integer or float dtype, not only int64/float64) are
    filled with their own median; every other column is filled with the
    literal category ``'Missing'``. Columns without missing values are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    A numeric column made up entirely of missing values has a NaN median and
    therefore stays NaN.
    """
    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue

        if pd.api.types.is_numeric_dtype(column):
            # Numerical variable: replace with median
            fill_value = column.median()
        else:
            # Categorical variable: replace with a new class
            fill_value = 'Missing'

        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: that chained in-place call
        # operates on a temporary and does not update ``df`` when pandas
        # Copy-on-Write is active.
        df[col] = column.fillna(fill_value)
    return df

# Impute each of the four raw tables independently: every table is filled
# from its own columns (a test table uses its own medians, not the training ones).
clinical_train, clinical_test, molecular_train, molecular_test = map(
    handle_missing_values,
    (clinical_train, clinical_test, molecular_train, molecular_test),
)

In [None]:
# Left-join the molecular table onto the clinical table by patient 'ID', then
# replace every value that is still missing (in any column) with 0.
# NOTE(review): the molecular table holds several rows per ID, so this join
# yields one row per mutation rather than one per patient -- confirm intended.
# NOTE(review): an earlier comment referred to an 'Nmut' column, but the
# molecular table displayed further down has no such column -- confirm.
df_train = (
    clinical_train
    .merge(molecular_train, on='ID', how='left')
    .fillna(0)
)
df_test = (
    clinical_test
    .merge(molecular_test, on='ID', how='left')
    .fillna(0)
)

In [67]:
def add_cytogenetic_features(data):
    """Derive karyotype features from the ``CYTOGENETICS`` column, in place.

    Columns added to ``data``:

    * ``is_normal`` -- 1 when the karyotype text contains "Normal"
      (case-insensitive), otherwise 0 (missing text counts as 0).
    * ``total_chromosomes`` -- the leading run of digits as a float, forced
      to 46 when ``is_normal`` is 1, NaN when there are no leading digits.
    * ``sex`` -- "XX" or "XY" when that token is found, "Other" for the
      remaining recognised tokens (XYY, XXY, XXX, YY), "Unknown" otherwise.

    ``CYTOGENETICS`` itself is upper-cased as a side effect. The same
    DataFrame object is returned.
    """

    def _sex_category(token):
        # Standard sex-chromosome pairs are kept verbatim.
        if token in ("XX", "XY"):
            return token
        # Any other recognised token is atypical; no token at all is unknown.
        return "Other" if pd.notna(token) else "Unknown"

    karyotype = data["CYTOGENETICS"]

    # Flag karyotypes reported as "Normal".
    normal_flag = karyotype.str.contains("Normal", case=False, na=False).astype(int)
    data["is_normal"] = normal_flag

    # Total chromosome count = leading digits; default to 46 for a normal karyotype.
    leading_count = karyotype.str.extract(r"^(\d+)", expand=False).astype(float)
    data["total_chromosomes"] = leading_count.mask(normal_flag == 1, 46)

    # Normalise the karyotype text to upper case before looking for sex tokens.
    data["CYTOGENETICS"] = karyotype.str.upper()

    # Raw sex-chromosome token (e.g. XY, XX), kept only as a temporary column.
    data["sex_raw"] = data["CYTOGENETICS"].str.extract(
        r"\b(XX|XY|XYY|XXY|XXX|YY)\b", expand=False
    )
    data["sex"] = data["sex_raw"].map(_sex_category)

    # Remove the temporary column.
    del data["sex_raw"]

    return data

# Apply the karyotype feature extraction to both clinical tables.
# NOTE(review): df_train / df_test were already built from clinical_train /
# clinical_test in an earlier cell, so the columns added here do not reach
# them unless that merge is re-run afterwards -- confirm the intended cell order.
clinical_test, clinical_train = [
    add_cytogenetic_features(frame) for frame in (clinical_test, clinical_train)
]


In [69]:
# The raw karyotype text is no longer needed on the clinical tables.
clinical_test = clinical_test.drop(columns='CYTOGENETICS')
clinical_train = clinical_train.drop(columns='CYTOGENETICS')

In [70]:
# Attach the survival targets by 'ID'; the inner join keeps only rows whose
# ID is present in both df_train and target_df.
df_train = pd.merge(df_train, target_df, on='ID', how='inner')

In [71]:
from sklearn.preprocessing import StandardScaler

# Numeric columns of each frame.
numerical_cols_train = df_train.select_dtypes(include=np.number).columns.tolist()
numerical_cols_test = df_test.select_dtypes(include=np.number).columns.tolist()

# The target variables must never be scaled.
numerical_cols_train = [
    col for col in numerical_cols_train if col not in ('OS_YEARS', 'OS_STATUS')
]

# Numeric training columns that are also numeric in the test frame (training
# column order preserved), and the remainder that exists only on the training side.
numerical_cols_test_filtered = [col for col in numerical_cols_train if col in numerical_cols_test]
train_only_numeric_cols = [col for col in numerical_cols_train if col not in numerical_cols_test]

# Fit the shared scaler on exactly the columns it will later transform in the
# test frame. A scaler fitted on the full training list rejects a test frame
# with a different set of columns, so fitting on all columns and transforming
# a subset would fail as soon as the two lists differ.
scaler = StandardScaler()
if numerical_cols_test_filtered:
    df_train[numerical_cols_test_filtered] = scaler.fit_transform(
        df_train[numerical_cols_test_filtered]
    )
    # Test data is transformed with the statistics learned on the training data.
    df_test[numerical_cols_test_filtered] = scaler.transform(
        df_test[numerical_cols_test_filtered]
    )

# Training-only numeric columns are still standardised. Standardisation is
# computed per column, so using a separate scaler gives the same values as
# fitting them together with the shared columns.
if train_only_numeric_cols:
    df_train[train_only_numeric_cols] = StandardScaler().fit_transform(
        df_train[train_only_numeric_cols]
    )

In [72]:
# Remove the ID and CENTER variables from df_train and df_test.
# The test IDs are saved first so they stay available after the drop.
ID_test = df_test['ID']

df_train = df_train.drop(columns=['ID', 'CENTER'])
df_test = df_test.drop(columns=['ID', 'CENTER'])

In [73]:
# Display the training mutation table after imputation
# (gaps in non-numeric columns appear as 'Missing').
molecular_train

Unnamed: 0,ID,CHR,START,END,REF,ALT,GENE,PROTEIN_CHANGE,EFFECT,VAF,DEPTH
0,P100000,11,119149248.0,119149248.0,G,A,CBL,p.C419Y,non_synonymous_codon,0.08300,1308.0
1,P100000,5,131822301.0,131822301.0,G,T,IRF1,p.Y164*,stop_gained,0.02200,532.0
2,P100000,3,77694060.0,77694060.0,G,C,ROBO2,p.?,splice_site_variant,0.41000,876.0
3,P100000,4,106164917.0,106164917.0,G,T,TET2,p.R1262L,non_synonymous_codon,0.43000,826.0
4,P100000,2,25468147.0,25468163.0,ACGAAGAGGGGGTGTTC,A,DNMT3A,p.E505fs*141,frameshift_variant,0.08980,942.0
...,...,...,...,...,...,...,...,...,...,...,...
10930,P131472,Missing,74732959.0,74732959.0,Missing,Missing,MLL,MLL_PTD,PTD,0.32125,975.0
10931,P131505,Missing,74732959.0,74732959.0,Missing,Missing,MLL,MLL_PTD,PTD,0.32125,975.0
10932,P131816,Missing,74732959.0,74732959.0,Missing,Missing,MLL,MLL_PTD,PTD,0.32125,975.0
10933,P132717,Missing,74732959.0,74732959.0,Missing,Missing,MLL,MLL_PTD,PTD,0.32125,975.0


In [74]:
# List every distinct value of the GENE column in the training mutations.
unique_genes = molecular_train['GENE'].unique()
print(unique_genes)

['CBL' 'IRF1' 'ROBO2' 'TET2' 'DNMT3A' 'CHEK2' 'PIK3CA' 'TP53' 'STAG2'
 'EP300' 'ETNK1' 'JAK2' 'SRSF2' 'EZH2' 'SF3B1' 'CSF3R' 'GATA2' 'MYC'
 'CREBBP' 'NRAS' 'ASXL1' 'RUNX1' 'CEBPA' 'U2AF1' 'IDH2' 'BCOR' 'NPM1'
 'U2AF2' 'RB1' 'ATRX' 'MPL' 'DDX41' 'STAG1' 'CTCF' 'PHF6' 'ZRSR2' 'CALR'
 'ZNF318' 'WT1' 'BRCC3' 'CUX1' 'PTPN11' 'SPRED2' 'STAT5A' 'ZBTB33' 'MGA'
 'SH2B3' 'KRAS' 'LUC7L2' 'BCORL1' 'NF1' 'KMT2C' 'KIT' 'SETBP1' 'SUZ12'
 'RAD21' 'RAC1' 'ROBO1' 'IDH1' 'ARID2' 'CDKN1B' 'PPM1D' 'CSNK1A1' 'RRAS'
 'DHX33' 'CDKN2A' 'ASXL2' 'GNAS' 'KDM6A' 'SMC1A' 'KMT2D' 'ETV6' 'NF2'
 'DDX54' 'GNB1' 'EGFR' 'RAD50' 'BRAF' 'STAT3' 'DNMT3B' 'NOTCH1' 'ARID1A'
 'EED' 'PRPF8' 'NIPBL' 'TERT' 'NFE2' 'SMG1' 'SF1' 'IRF8' 'KDM5C' 'DICER1'
 'GATA1' 'DDX4' 'CDKN2B' 'FLT3' 'SMC3' 'MLL' 'PTEN' 'ZMYM3' 'SETD2'
 'SAMHD1' 'PAX5' 'SRCAP' 'IRF4' 'H3F3A' 'DDX23' 'RBBP4' 'PAPD5' 'FAM175A'
 'CDK4' 'PRPF40A' 'NOTCH2' 'CSF1R' 'PTPRF' 'JAK3' 'BAP1' 'WHSC1' 'HIPK2'
 'NXF1' 'CDKN2C' 'BCL10' 'ABL1' 'JARID2']
