In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_auc_score

# Load Dataset


In [4]:
data=pd.read_csv("/content/Classeur_TP_fevr2023.csv")
data.head()

Unnamed: 0,NUMERO ANONYME,STATUT EN 01/22,statut,time_months,Failure_first_ttt,Status_2y,Status_5y,Relapse_after_CR,Primary_response,Overall_response,...,VOI40_SPECT_Asphericity,VOI40_SPECT_NormalizedHocRadiusSphere,VOI40_SPECT_Maximum3DDiameter,VOI40_SPECT_Maximum3DDiameter_normHeight,VOI40_SPECT_Mean_Ratio_Mean_Liver,VOI40_SPECT_Skewness,VOI40_SPECT_Kurtosis,VOI40_SPECT_MaximumGreyLevel_Ratio_Mean_Liver,Visual_uptake,Visual_heterogeneity
0,62,EN VIE,0,35,No,Alive,,No,PR,PR,...,0.983674,0.479734,120.588307,1.488745,0.745309,0.37153,0.130404,1.267516,Uptake_lower_liver,Heterogeneous
1,67,DECES,1,33,Yes,Alive,Dead,No,SD,PR,...,0.581737,0.500597,228.502253,2.285023,4.13303,-0.104899,-0.392836,6.51827,Upatke_higher_liver,Heterogeneous
2,68,DECES,1,34,No,Alive,Dead,Yes,PR,PR,...,0.329291,0.554013,28.116246,0.251038,1.579389,0.517022,-1.731853,1.751086,Upatke_higher_liver,Heterogeneous
3,105,EN VIE,0,74,Yes,Alive,Alive,No,PR,PR,...,0.408325,0.54158,111.550046,1.616667,2.030242,-0.478963,0.055675,3.212246,Upatke_higher_liver,Homogeneous
4,118,PERDU DE VUE,0,1,,,,,,,...,0.850333,1.179227,152.549411,1.326517,3.259366,0.283834,-0.101313,5.814274,Upatke_higher_liver,Heterogeneous


# Binary Classification Model

In [5]:
target_candidate=["statut","time_months","Failure_first_ttt","Status_2y","Status_5y","Relapse_after_CR","Primary_response","Overall_response","Intitial_global_response"]

In [6]:
data.isnull().sum()

NUMERO ANONYME                                    0
STATUT EN 01/22                                   0
statut                                            0
time_months                                       0
Failure_first_ttt                                 1
Status_2y                                         1
Status_5y                                        39
Relapse_after_CR                                  1
Primary_response                                  1
Overall_response                                  1
Intitial_global_response                          1
Initial_ttt                                       0
Sex                                               0
Age_months                                        0
Lower_18m                                         0
MKI                                              24
INPC                                             18
Genomic_class                                    15
MYCN                                              1
INRGSS      

In [7]:
data.dtypes

NUMERO ANONYME                                     int64
STATUT EN 01/22                                   object
statut                                             int64
time_months                                        int64
Failure_first_ttt                                 object
Status_2y                                         object
Status_5y                                         object
Relapse_after_CR                                  object
Primary_response                                  object
Overall_response                                  object
Intitial_global_response                          object
Initial_ttt                                       object
Sex                                               object
Age_months                                       float64
Lower_18m                                         object
MKI                                               object
INPC                                              object
Genomic_class                  

In [8]:
#Selection des features numériques = quantitatives
#Suppression de la variable Numero Anonyme qui n'est pas à prédire

numeric_features=data.select_dtypes(exclude=['object']).columns.to_list()
numeric_features=[col for col in numeric_features if col not in target_candidate]
numeric_features.remove("NUMERO ANONYME")
numeric_features

['Age_months',
 'Baseline_siopen',
 'Post_ttt_siopen',
 'VOIm_SPECT_ApproximateVolume',
 'VOIm_SPECT_ApproximateVolume_normBSA',
 'VOIm_SPECT_Asphericity',
 'VOIm_SPECT_HocMaxNormalizedWithRadiusSphere',
 'VOIm_SPECT_Maximum3DDiameter',
 'VOIm_SPECT_Maximum3DDiameter_normHeight',
 'VOIm_SPECT_Mean_Ratio_Mean_Liver',
 'VOIm_SPECT_Skewness',
 'VOIm_SPECT_Kurtosis',
 'VOIm_SPECT_MaximumGreyLevel_Ratio_Mean_Liver',
 'VOIm_CT_MinimumGreyLevel',
 'VOIm_CT_10thPercentile',
 'VOIm_CT_90thPercentile',
 'VOIm_CT_MaximumGreyLevel',
 'VOI40_SPECT_ApproximateVolume',
 'VOI40_SPECT_ApproximateVolume_normBSA',
 'VOI40_SPECT_Asphericity',
 'VOI40_SPECT_NormalizedHocRadiusSphere',
 'VOI40_SPECT_Maximum3DDiameter',
 'VOI40_SPECT_Maximum3DDiameter_normHeight',
 'VOI40_SPECT_Mean_Ratio_Mean_Liver',
 'VOI40_SPECT_Skewness',
 'VOI40_SPECT_Kurtosis',
 'VOI40_SPECT_MaximumGreyLevel_Ratio_Mean_Liver']

In [9]:
#Selection des features qualitatives (categorical)
#Suppression de STATUT EN 01/22 qui n'est pas à prédire

categorical_features=data.select_dtypes(include=['object']).columns.to_list()
categorical_features=[col for col in categorical_features if col not in target_candidate]
categorical_features.remove("STATUT EN 01/22")
categorical_features

['Initial_ttt',
 'Sex',
 'Lower_18m',
 'MKI',
 'INPC',
 'Genomic_class',
 'MYCN',
 'INRGSS',
 'Primitif_compartment',
 'IDRF',
 'NRB_sabliers_variantes',
 'M_node',
 'M_bone',
 'M_om',
 'M_liver',
 'MSI_Index',
 'Subtype',
 'Baseline_LDH',
 'Baseline_ratio_VMA_HVA',
 'Baseline_risk_group',
 'Visual_uptake',
 'Visual_heterogeneity']

In [11]:
#Pipeline pour suite d'opérations redondantes
#Remlacement par des valeurs médianes

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

#Attribution de variables aux variables catégorical : 001, 010, 100... toutes les combinaisons possibles.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Classification binaire : statut dans 2 ans

In [12]:
target="Status_2y"

In [13]:
df=data.dropna(subset = [target])### drop null values in target

In [14]:
df[target].unique()

array(['Alive', 'Dead'], dtype=object)

In [15]:
replace_dict={"Alive":0,"Dead":1}
df[target]=df[target].replace(replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target]=df[target].replace(replace_dict)


In [16]:
#X = features nous permettant les prédictions
#Y = classes
X=df[numeric_features+categorical_features]
y=df[target]

In [18]:
#Equilibrage des classes 
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [19]:
#Classifier Gradient Boosting
clf = Pipeline(
    steps=[("preprocessor", preprocessor),("classifier", GradientBoostingClassifier())]
)

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.25,random_state=42)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
print("model auc score %.3f" % roc_auc_score( y_test,clf.predict(X_test)))
print(classification_report( y_test,clf.predict(X_test)))

model score: 0.975
model auc score 0.971
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        17
           1       0.96      1.00      0.98        23

    accuracy                           0.97        40
   macro avg       0.98      0.97      0.97        40
weighted avg       0.98      0.97      0.97        40



# Classification binaire : statut dans 5 ans 

In [20]:
target="Status_5y"

In [21]:
df=data.dropna(subset = [target])### drop null values in target

In [22]:
df[target].unique()

array(['Dead', 'Alive'], dtype=object)

In [23]:
replace_dict={"Dead":0,"Alive":1}
df[target]=df[target].replace(replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target]=df[target].replace(replace_dict)


In [24]:
X=df[numeric_features+categorical_features]
y=df[target]

In [25]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [26]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())]
)

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.3,random_state=42)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
print("model auc score %.3f" % roc_auc_score( y_test,clf.predict(X_test)))
print(classification_report( y_test,clf.predict(X_test)))

model score: 0.714
model auc score 0.697
              precision    recall  f1-score   support

           0       0.77      0.77      0.77        13
           1       0.62      0.62      0.62         8

    accuracy                           0.71        21
   macro avg       0.70      0.70      0.70        21
weighted avg       0.71      0.71      0.71        21



# Target : Statut

In [27]:
target="statut"

In [28]:
df=data.dropna(subset = [target])### drop null values in target

In [29]:
df[target].unique()

array([0, 1])

In [30]:
X=df[numeric_features+categorical_features]
y=df[target]

In [31]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [32]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier())]
)

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.2,random_state=42)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
print("model auc score %.3f" % roc_auc_score( y_test,clf.predict(X_test)))
print(classification_report( y_test,clf.predict(X_test)))

model score: 0.793
model auc score 0.798
              precision    recall  f1-score   support

           0       0.86      0.75      0.80        16
           1       0.73      0.85      0.79        13

    accuracy                           0.79        29
   macro avg       0.80      0.80      0.79        29
weighted avg       0.80      0.79      0.79        29



# Target=Primary_response

In [33]:
target="Primary_response"

In [34]:
df=data.dropna(subset = [target])### drop null values in target

In [35]:
df[target].unique()

array(['PR', 'SD', 'PD', 'CR'], dtype=object)

In [36]:
replace_dict={"PR":0,"SD":1,"PD":3,"CR":4}
df[target]=df[target].replace(replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target]=df[target].replace(replace_dict)


In [37]:
X=df[numeric_features+categorical_features]
y=df[target]

In [38]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [39]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())]
)

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.3,random_state=42)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
print(classification_report( y_test,clf.predict(X_test)))

model score: 0.961
              precision    recall  f1-score   support

           0       1.00      0.80      0.89        15
           1       0.92      1.00      0.96        22
           3       1.00      1.00      1.00        20
           4       0.95      1.00      0.98        20

    accuracy                           0.96        77
   macro avg       0.97      0.95      0.96        77
weighted avg       0.96      0.96      0.96        77



# Target : Overall Response

In [40]:
target="Overall_response"

In [41]:
df=data.dropna(subset = [target])### drop null values in target

In [42]:
replace_dict={"PR":0,"SD":1,"PD":3,"CR":4}
df[target]=df[target].replace(replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target]=df[target].replace(replace_dict)


In [43]:
X=df[numeric_features+categorical_features]
y=df[target]

In [44]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [45]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())]
)

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.3,random_state=42)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
print(classification_report( y_test,clf.predict(X_test)))

model score: 0.936
              precision    recall  f1-score   support

           0       0.83      0.88      0.86        17
           1       0.91      0.88      0.89        24
           3       1.00      1.00      1.00        18
           4       1.00      1.00      1.00        19

    accuracy                           0.94        78
   macro avg       0.94      0.94      0.94        78
weighted avg       0.94      0.94      0.94        78



# Target : initial global response

In [46]:
target="Intitial_global_response"

In [47]:
df=data.dropna(subset = [target])### drop null values in target

In [48]:
df[target].unique()

array(['CR_PR', 'PD', 'SD'], dtype=object)

In [49]:
replace_dict={"PR":0,"SD":1,"PD":3,"CR_PR":4}
df[target]=df[target].replace(replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target]=df[target].replace(replace_dict)


In [50]:
X=df[numeric_features+categorical_features]
y=df[target]

In [51]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [52]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())]
)

X_train, X_test, y_train, y_test = train_test_split(X_resampled,y_resampled, test_size=0.3,random_state=42)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
print(classification_report( y_test,clf.predict(X_test)))

model score: 0.969
              precision    recall  f1-score   support

           1       0.92      1.00      0.96        24
           3       1.00      1.00      1.00        24
           4       1.00      0.88      0.93        16

    accuracy                           0.97        64
   macro avg       0.97      0.96      0.96        64
weighted avg       0.97      0.97      0.97        64



# Kaplan-Meier Prediction

In [53]:
!pip install lifelines

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lifelines
  Downloading lifelines-0.27.4-py3-none-any.whl (349 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.7/349.7 KB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting formulaic>=0.2.2
  Downloading formulaic-0.5.2-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.1/77.1 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting autograd-gamma>=0.3
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting interface-meta>=1.2.0
  Downloading interface_meta-1.3.0-py3-none-any.whl (14 kB)
Collecting graphlib-backport>=1.0.0
  Downloading graphlib_backport-1.0.3-py3-none-any.whl (5.1 kB)
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filena

In [54]:
data.time_months.describe()

count     92.000000
mean      49.304348
std       26.342937
min        1.000000
25%       28.000000
50%       47.500000
75%       67.250000
max      107.000000
Name: time_months, dtype: float64

In [55]:
import pandas as pd
from lifelines import KaplanMeierFitter
data=data.dropna(subset = ['time_months','statut'])### drop null values 
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(data[['time_months']],data[['statut']])
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3,random_state=42)

# fit the Kaplan-Meier estimator on the training set
kmf = KaplanMeierFitter()
kmf.fit(X_train, y_train)

# classify patients in the test set based on survival
y_pred = (kmf.predict(X_test['time_months']) > 0.5).astype(int)

# compute classification metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_pred,y_test)
precision = precision_score(y_pred,y_test)
recall = recall_score(y_pred,y_test)
f1 = f1_score(y_pred,y_test)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.7272727272727273
Precision: 0.7619047619047619
Recall: 0.6956521739130435
F1-score: 0.7272727272727272


# Cox Analysis

In [56]:
import pandas as pd
from lifelines import CoxPHFitter

cox = CoxPHFitter().fit(data[['time_months','statut']],'time_months','statut')

cox.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time_months'
event col,'statut'
baseline estimation,breslow
number of observations,92
number of events observed,20
partial log-likelihood,-84.76
time fit was run,2023-03-02 09:55:30 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)

0,1
Concordance,0.50
Partial AIC,169.53
log-likelihood ratio test,0.00 on 0 df
-log2(p) of ll-ratio test,
