In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

## Data Loading and Preparation

In [None]:
# Read the clinical table and the four pre-scaled radiomics feature tables.
df_clinical = pd.read_excel('/path/to/clinical_data')

df_train_radiomics_standardized = pd.read_excel('/path/to/scaled_train_set')
scaled_features_val = pd.read_excel('/path/to/scaled_inval_set')
scaled_test_set = pd.read_excel('/path/to/scaled_test_set')
scaled_TCIA_TCGA = pd.read_excel('/path/to/scaled_TCIA_TCGA')

# The same standardization applied to the training set has already been applied to the other datasets.

# Expose the 'ORR_RECIST1.1' column under the shorter name 'ORR'.
df_clinical_final = df_clinical.rename(columns={'ORR_RECIST1.1': 'ORR'})

# Keep the join key next to the label: a right-hand frame that holds only
# 'ORR' has no 'ID' column, so merge(on='ID') would raise KeyError.
clinical_orr = df_clinical_final[['ID', 'ORR']]

# Left joins keep every radiomics row; rows without a clinical match get NaN in 'ORR'.
df_train_set = pd.merge(df_train_radiomics_standardized, clinical_orr, on='ID', how='left')
df_inval_set = pd.merge(scaled_features_val, clinical_orr, on='ID', how='left')
external_test_set = pd.merge(scaled_test_set, clinical_orr, on='ID', how='left')

# The TCGA/TCIA cohort has its own clinical table, joined on 'case_submitter_id'.
df_clinical_TCIA_set = pd.read_excel('/path/to/df_TCGA_clinical_data')
TCGA_TCIA_set = pd.merge(scaled_TCIA_TCGA, df_clinical_TCIA_set, on='case_submitter_id', how='left')

In [None]:
# Names of the retained features, read from the 'Feature' column of the list file.
prefixes_list = pd.read_excel('/path/to/final_features_GTR_list')
prefixes_to_keep = prefixes_list.loc[:, 'Feature'].to_list()

# Training cohort: design matrix and ORR labels.
y_df_train = df_train_set.loc[:, 'ORR']
X_df_train = df_train_set.loc[:, prefixes_to_keep]

# Internal validation cohort.
y_val = df_inval_set.loc[:, 'ORR']
X_val_K = df_inval_set.loc[:, prefixes_to_keep]

# External test cohort.
Y_external = external_test_set.loc[:, 'ORR']
X_external = external_test_set.loc[:, prefixes_to_keep]

# TCGA/TCIA cohort: only the feature matrix is extracted here.
X_TCGA_TCIA_set = TCGA_TCIA_set.loc[:, prefixes_to_keep]

## Modelling

In [None]:
######  Random Forest  ######
# Exhaustive grid search over forest size and tree-shape limits.
# random_state pins the forest's bootstrap/feature sampling so that a fresh
# kernel run selects the same hyper-parameters instead of drifting run to run.
rf = RandomForestClassifier(random_state=42)

n_estimators_range = [50, 100, 200, 300]
# None removes the depth cap; growth is then bounded only by the min_samples_* limits.
max_depth_range = [10, 50, 100, 150, 200, 300, 500, None]
min_samples_split_range = list(range(2, 11, 1))
min_samples_leaf_range = list(range(1, 11, 1))

param_grid_rf = {
    'n_estimators': n_estimators_range,
    'max_depth': max_depth_range,
    'min_samples_split': min_samples_split_range,
    'min_samples_leaf': min_samples_leaf_range
}

# 10-fold cross-validation on all available cores; scoring is left at its
# default, i.e. the estimator's own score (accuracy for classifiers).
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=10, n_jobs=-1, verbose=2)

grid_search_rf.fit(X_df_train, y_df_train)

print('Best parameters for Random Forest: ', grid_search_rf.best_params_)

In [None]:
######  Decision Tree  ######
# Exhaustive grid search over split criterion and tree-shape limits.
# random_state is fixed because the tree permutes features at every split, so
# ties between equally good splits would otherwise be broken differently on
# each run and the selected hyper-parameters could change.
dt = DecisionTreeClassifier(random_state=42)

max_depth_range = list(range(1, 21, 1))
min_samples_leaf_range = list(range(2, 21, 1))
min_samples_split = list(range(2, 21, 1))

param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': max_depth_range,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf_range,
}

# 10-fold cross-validation on all available cores; default scoring
# (the estimator's own score, accuracy for classifiers).
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=10, n_jobs=-1, verbose=2)

grid_search_dt.fit(X_df_train, y_df_train)

print('Best parameters for Decision Tree: ', grid_search_dt.best_params_)

In [None]:
######  Support Vector Machine  ######
# probability=True enables predict_proba through an internal cross-validated
# calibration that shuffles the data; random_state makes those probabilities
# repeatable across runs (it is ignored when probability is False).
svm = SVC(probability=True, random_state=42)

param_grid_svm = {
    'C': [0.1, 1, 2, 3, 10],
    'kernel': ['linear', 'rbf'],
    # gamma only affects the 'rbf' kernel here; the 'linear' candidates are
    # therefore fitted once per gamma value with identical results.
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 0.2, 1],
    # Single-value entry: a hard cap on solver iterations for every candidate.
    'max_iter': [1000000]
}

# 10-fold cross-validation on all available cores; default scoring
# (the estimator's own score, accuracy for classifiers).
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, cv=10, n_jobs=-1, verbose=2)

grid_search_svm.fit(X_df_train, y_df_train)

print('Best parameters for SVM: ', grid_search_svm.best_params_)

In [None]:
######  Logistic Regression  ######
# No hyper-parameter search is run for this model: despite its name,
# `grid_search_lr` is a plain LogisticRegression with default settings.
# `lr` is a second, unfitted default instance.
lr, grid_search_lr = LogisticRegression(), LogisticRegression()
grid_search_lr.fit(X=X_df_train, y=y_df_train)

In [None]:
######  XGBoost  ######
# Base estimator whose hyper-parameters are tuned below.
xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Candidate values for each tuned hyper-parameter.
learning_rate_range = [0.01, 0.02, 0.1, 0.15, 0.2]
subsample_range = [0.5, 0.7, 0.9, 1.0]
n_estimators_range = [50, 100, 150, 200, 300, 500]
max_depth_range = list(range(3, 11))
min_child_weight_range = list(range(1, 11))

param_grid_xgb = dict(
    n_estimators=n_estimators_range,
    max_depth=max_depth_range,
    learning_rate=learning_rate_range,
    subsample=subsample_range,
    min_child_weight=min_child_weight_range,
)

# Exhaustive search: 10-fold cross-validation, all cores, per-fit progress logging.
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv=10, n_jobs=-1, verbose=2)
grid_search_xgb.fit(X_df_train, y_df_train)

print('Best parameters for XGBoost: ', grid_search_xgb.best_params_)

## Ensemble learning

In [None]:
# Instantiate the optimal model
# Each base learner is rebuilt from the hyper-parameters chosen by its grid
# search. The stochastic learners get a fixed random_state so the stacked
# model is reproducible across kernel restarts.
opt_rf = RandomForestClassifier(**grid_search_rf.best_params_, random_state=42)
opt_svm = SVC(**grid_search_svm.best_params_, probability=True, random_state=42)
opt_lr = LogisticRegression()
opt_dt = DecisionTreeClassifier(**grid_search_dt.best_params_, random_state=42)
# best_params_ only holds the searched keys, so the fixed constructor
# arguments of the tuned XGBoost estimator are repeated here to keep the
# stacked learner configured like the one that was tuned.
opt_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **grid_search_xgb.best_params_)


# Define the stacking model
# The base learners' cross-validated (cv=5) predictions are the inputs of the
# logistic-regression meta-learner.
stack = StackingClassifier(
    estimators=[
        ('rf', opt_rf),
        ('svm', opt_svm),
        ('lr', opt_lr),
        ('dt', opt_dt),
        ('xgb', opt_xgb)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the stacking model
stack.fit(X_df_train, y_df_train)

# Validate the model performance
# score() returns mean accuracy; the training figure is computed on the data
# the model was fitted on, so it is an optimistic reference only.
stack_score_tra = stack.score(X_df_train, y_df_train)
stack_score_val = stack.score(X_val_K, y_val)
stack_score_ext = stack.score(X_external, Y_external)

print(f'Accuracy of stacked model in trainning: {stack_score_tra:.2f}')
print(f'Accuracy of stacked model in validation: {stack_score_val:.2f}')
print(f'Accuracy of stacked model in testing: {stack_score_ext:.2f}')

In [None]:
# Score every cohort with the fitted stacking model.
# `[:, 1]` keeps the second probability column, i.e. the one for
# stack.classes_[1] (presumably the positive ORR class -- confirm the label coding).

# Training cohort
probs_train = stack.predict_proba(X_df_train)[:, 1]
pred_train = stack.predict(X_df_train)

# Internal validation cohort
probs_internal_test = stack.predict_proba(X_val_K)[:, 1]
pred_internal_test = stack.predict(X_val_K)

# External test cohort
probs_external_test = stack.predict_proba(X_external)[:, 1]
pred_external_test = stack.predict(X_external)

# TCGA/TCIA cohort
probs_TCGA_TCIA_set = stack.predict_proba(X_TCGA_TCIA_set)[:, 1]
pred_TCGA_TCIA_set = stack.predict(X_TCGA_TCIA_set)