Pancancer Training - We use 80% of the samples for training and 20% for testing; stratified splitting ensures that every tissue is represented in both the training and test sets in the same proportions. IMPORTANT: The code assumes that the .env file in the root directory defines both the location of the Datasets folder (where GDC_samples.csv resides; `THYROID_PATH` in the code) and the location of the raw GDC samples (`GDC_PATH`).

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from keras import Input,Model
from keras.layers import Dense, Conv1D, AveragePooling1D, Flatten, Activation, Concatenate, Dropout, AlphaDropout, GlobalAveragePooling1D
import os
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split

#Add the parent directory to access ENV variables
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

#Import of necessary paths ( GDC data Path and Dataset folder)
from config import GDC_PATH,THYROID_PATH, TENSORBOARD_PATH, MODEL_PATH

# Loading GDC Pancancer Data and Train/Test Splitting

In [None]:
# The sample table sits in the datasets folder; its first column becomes the index.
samples_dataframe_path = Path(THYROID_PATH) / 'GDC_samples.csv'
samples_dataframe = pd.read_csv(samples_dataframe_path, index_col=0)
# Prefix every stored path with GDC_PATH
samples_dataframe['Path'] = samples_dataframe['Path'].map(lambda stored: Path(GDC_PATH, stored))

To stratify on both the normal/cancer imbalance and the tissue of origin, we create a dummy placeholder variable that combines both conditions. Num_ID is the number of the GDC/TCGA study; each study contains only one tissue.

In [None]:
# One stratification label per (study, target) combination: "<Num_ID>_<Target>"
study_label = samples_dataframe['Num_ID'].astype(str)
target_label = samples_dataframe['Target'].astype(str)
samples_dataframe['dummy'] = study_label + '_' + target_label

# 80/20 split with a fixed seed, stratified on the combined label
train, test = train_test_split(
    samples_dataframe,
    test_size=0.2,
    random_state=2046,
    stratify=samples_dataframe[['dummy']],
)
train.head()

In [None]:
# Progress marker printed once the cells above have run
print("Data Loading Complete")

# Data Generator


In [None]:
class CustomDataGen(tf.keras.utils.Sequence):
    """Batch generator that reads one tab-separated sample file per dataframe row.

    Every batch is a contiguous slice of ``df``. Inputs are loaded from the
    file paths in ``df[X_col['path']]``; labels come from
    ``df[y_col['target']]``.

    Args:
        df: Sample table. It is copied, so the caller's frame is never
            reordered by the per-epoch shuffle.
        X_col: Mapping with key ``'path'`` naming the file-path column.
        y_col: Mapping with keys ``'study'`` and ``'target'`` naming the
            study-id and label columns.
        batch_size: Number of rows per batch.
        input_size: Stored on the instance; not used by the generator itself.
        shuffle: When true, rows are reshuffled at the end of every epoch.
        to_fit: When true, ``__getitem__`` returns ``(X, y)``; otherwise it
            returns only ``X`` (prediction mode).
    """

    def __init__(self, df, X_col, y_col,
                 batch_size,
                 input_size=(485577,1),
                 shuffle=True, to_fit = True):
        # Let the Keras base class set up its own state (newer Keras versions
        # warn when this call is missing).
        super().__init__()

        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.shuffle = shuffle
        self.to_fit = to_fit

        self.n = len(self.df)
        self.n_study = df[y_col['study']].nunique()
        self.n_target = df[y_col['target']].nunique()

    def on_epoch_end(self):
        """Reshuffle the rows (and reset the index) when ``shuffle`` is set."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __get_input(self, path):
        """Load one sample file as a single-column ('value') frame.

        Rows are ordered by their sorted index labels so that every sample
        presents its rows in the same order; missing values become 0.0.
        """
        temp = pd.read_csv(path,sep='\t',index_col=0,names=['value'])
        temp = temp.reindex(sorted(temp.index))
        temp = temp.fillna(0.0)
        return temp

    def __get_output(self, label, num_classes):
        """One-hot encode ``label`` over ``num_classes`` classes."""
        return tf.keras.utils.to_categorical(label, num_classes=num_classes)

    def __get_data(self, batches):
        """Build the arrays for one slice of the sample table.

        Returns ``(X_batch, y_batch)`` when ``to_fit`` is true, else ``X_batch``.
        """
        path_batch = batches[self.X_col['path']]
        X_batch = np.asarray([self.__get_input(path) for path in path_batch])

        if not self.to_fit:
            # Prediction mode: labels are not needed, so the target column
            # is not touched at all.
            return X_batch

        target_batch = batches[self.y_col['target']]
        y_batch = np.asarray(list(target_batch))
        return (X_batch, y_batch)

    def __getitem__(self, index):
        """Return batch number ``index``.

        The result of ``__get_data`` is passed through unchanged, so this
        also works when ``to_fit`` is false and only inputs are produced
        (unpacking into ``X, y`` here used to fail in that mode).
        """
        batches = self.df[index * self.batch_size:(index + 1) * self.batch_size]
        return self.__get_data(batches)

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return self.n // self.batch_size

# Train and Test Generator

In [None]:
batch_size = 8
traingen = CustomDataGen(train,X_col={'path':'Path'},y_col={'study':'Num_ID','target':'Target'},batch_size=batch_size)
# Validation rows are not shuffled: the generator drops the trailing partial batch,
# so reshuffling every epoch would score each epoch on a different subset of `test`
# and add noise to the monitored validation metrics.
validgen = CustomDataGen(test,X_col={'path':'Path'},y_col={'study':'Num_ID','target':'Target'},batch_size=batch_size,shuffle=False)
num_classes = traingen.n_study

# Balanced class weights: weight = total / (2 * class_count), so both classes
# carry the same total weight in the loss regardless of their frequency.
pos = np.sum(train['Target']==1)
neg = np.sum(train['Target']==0)
total = pos+neg
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}

# CNN - 1D

In [None]:
def conv(i, filters=16, kernel_size=5, strides=1):
    """Apply Conv1D -> BatchNormalization -> LeakyReLU -> SpatialDropout1D(0.1) to `i`.

    The convolution uses 'same' padding and Glorot-uniform initialisation.
    An L2 activity regularizer (1e-6) on the convolution is left disabled.
    """
    stages = (
        keras.layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding='same',
            kernel_initializer='glorot_uniform',
        ),
        keras.layers.BatchNormalization(),
        keras.layers.LeakyReLU(),
        keras.layers.SpatialDropout1D(0.1),
    )
    out = i
    for stage in stages:
        out = stage(out)
    return out

def residual_unit(x, filters, layers=3):
    """Run `layers` successive `conv` stages and add the unit's input back onto the result."""
    shortcut = x
    out = x
    for _ in range(layers):
        out = conv(out, filters)
    return keras.layers.add([out, shortcut])

def conv_block(x, filters, strides):
    """One `conv` stage followed by a residual unit, then optional average pooling.

    Pooling (pool size and stride both equal to `strides`) is applied only
    when `strides` is greater than 1.
    """
    features = residual_unit(conv(x, filters), filters)
    if strides <= 1:
        return features
    return keras.layers.AveragePooling1D(strides, strides)(features)

def get_uncompiled_model(input_shape = (485577,1)):
    """Assemble the 1D CNN without compiling it.

    Three conv blocks (16, 32 and 64 filters) are separated by stride-2
    average pooling; the result is flattened and fed to a single sigmoid unit.

    Args:
        input_shape: Shape of one input sample (batch axis excluded).

    Returns:
        The uncompiled Keras model.
    """
    inp = keras.layers.Input(shape=input_shape, dtype=tf.float32)

    net = conv_block(inp, 16, 1)
    net = AveragePooling1D(pool_size=2, strides=2, name='avgpool1',
                           data_format="channels_last")(net)
    net = conv_block(net, 32, 2)
    net = AveragePooling1D(pool_size=2, strides=2, name='avgpool2',
                           data_format="channels_last")(net)
    net = conv_block(net, 64, 2)

    net = Flatten()(net)
    output = Dense(1, activation='sigmoid', name='output')(net)

    return keras.models.Model(inp, output)

def get_compiled_model(metrics=None):
    """Build the CNN and compile it with Adam and binary cross-entropy.

    Args:
        metrics: Metrics passed to `compile`. When None, a default set is
            used: confusion-matrix counts, binary accuracy, precision,
            recall, ROC AUC ('auc') and precision-recall AUC ('prc').

    Returns:
        The compiled Keras model.
    """
    if metrics is None:
        km = keras.metrics
        metrics = [
            km.TruePositives(name='tp'),
            km.FalsePositives(name='fp'),
            km.TrueNegatives(name='tn'),
            km.FalseNegatives(name='fn'),
            km.BinaryAccuracy(name='accuracy'),
            km.Precision(name='precision'),
            km.Recall(name='recall'),
            km.AUC(name='auc'),
            km.AUC(name='prc', curve='PR'),  # area under the precision-recall curve
        ]

    network = get_uncompiled_model()
    network.compile(optimizer="adam", loss='binary_crossentropy', metrics=metrics)
    return network


In [None]:
# Continue training from the latest checkpoint if one exists; otherwise start from scratch.

def make_or_restore_model(checkpoint_dir = "ckpt"):
    """Return the most recently saved checkpoint model, or a fresh compiled model.

    Args:
        checkpoint_dir: Folder holding the checkpoints; created if missing.

    Returns:
        The model loaded from the newest entry of `checkpoint_dir` (newest by
        `os.path.getctime`), or the result of `get_compiled_model()` when the
        folder is empty.
    """
    # exist_ok replaces the separate exists() check and its race with makedirs.
    os.makedirs(checkpoint_dir, exist_ok=True)
    # os.path.join instead of manual "/" concatenation keeps the paths portable.
    checkpoints = [os.path.join(checkpoint_dir, name) for name in os.listdir(checkpoint_dir)]
    if checkpoints:
        latest_checkpoint = max(checkpoints, key=os.path.getctime)
        print("Restoring from", latest_checkpoint)
        return keras.models.load_model(latest_checkpoint)
    print("Creating a new model")
    return get_compiled_model()


In [None]:
#model = get_uncompiled_model() #Get the uncompiled model
#model.summary() #Summary of model before training

In [None]:
#keras.utils.plot_model(model) #Plot the model architecture

# Pan-cancer - All Samples with Leaky RELU

In [None]:
# All TensorBoard runs are collected under this folder
root_logdir = Path(TENSORBOARD_PATH)

def get_run_logdir():
    """Return a path under `root_logdir` named after the current local time."""
    import time
    return os.path.join(root_logdir, time.strftime("run_%Y_%m_%d-%H_%M_%S"))

run_logdir = get_run_logdir() # e.g., './my_logs/run_2025_03_16-11_28_43'

In [None]:
checkpoint_dir = 'ckpt-leakyrelu'

# Stop after 10 epochs without a better validation AUC and restore the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', mode='max', patience=10,
    restore_best_weights=True, verbose=1)

# Scale the learning rate by 0.2 after 5 stagnant epochs (no `monitor` given,
# so Keras' default quantity is watched), never going below 1e-7.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, min_lr=1e-7)

tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

# Save the model every 100 batches; the current loss is part of the checkpoint name.
checkpoint_path = os.path.join(checkpoint_dir, "ckpt-loss={loss:.2f}")
checkpoint_cb = keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_freq=100)

callbacks = [checkpoint_cb,early_stopping_cb,tensorboard_cb,lr_schedule]
model = make_or_restore_model(checkpoint_dir)


In [None]:
# The Sequence already yields batches of `batch_size` rows; the Keras docs say not to
# pass `batch_size` to fit() for generator/Sequence inputs, so it is omitted here.
model_history = model.fit(traingen, epochs=100, validation_data=validgen, callbacks=callbacks, class_weight=class_weight)

In [None]:
#Save training history
history_path = Path("fit_history", "pan-cancer-leaky-relu.csv")
# to_csv does not create missing folders; create it so a finished run's history is not lost.
history_path.parent.mkdir(parents=True, exist_ok=True)
history_df = pd.DataFrame(model_history.history)
history_df.to_csv(history_path)

In [None]:
# Save the trained Leaky ReLU model under MODEL_PATH
model.save(os.path.join(MODEL_PATH,"pan-cancer-leaky-relu"))

# Pan-cancer - All samples with Standard RELU 

To switch from Leaky ReLU to standard ReLU, we only need to redefine the conv function.

In [None]:
def conv(i, filters=16, kernel_size=5, strides=1):
    """Apply Conv1D -> BatchNormalization -> ReLU -> SpatialDropout1D(0.1) to `i`.

    The convolution uses 'same' padding and Glorot-uniform initialisation.
    An L2 activity regularizer (1e-6) on the convolution is left disabled.
    """
    stages = (
        keras.layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            strides=strides,
            padding='same',
            kernel_initializer='glorot_uniform',
        ),
        keras.layers.BatchNormalization(),
        keras.layers.ReLU(),
        keras.layers.SpatialDropout1D(0.1),
    )
    out = i
    for stage in stages:
        out = stage(out)
    return out

In [None]:
# All TensorBoard runs are collected under this folder
root_logdir = Path(TENSORBOARD_PATH)

def get_run_logdir():
    """Return a path under `root_logdir` named after the current local time."""
    import time
    return os.path.join(root_logdir, time.strftime("run_%Y_%m_%d-%H_%M_%S"))

run_logdir = get_run_logdir() # e.g., './my_logs/run_2025_03_16-11_28_43'

In [None]:
checkpoint_dir = 'ckpt-standardrelu'

# Stop after 10 epochs without a better validation AUC and restore the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', mode='max', patience=10,
    restore_best_weights=True, verbose=1)

# Scale the learning rate by 0.2 after 5 stagnant epochs (no `monitor` given,
# so Keras' default quantity is watched), never going below 1e-7.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, min_lr=1e-7)

tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

# Save the model every 100 batches; the current loss is part of the checkpoint name.
checkpoint_path = os.path.join(checkpoint_dir, "ckpt-loss={loss:.2f}")
checkpoint_cb = keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_freq=100)

callbacks = [checkpoint_cb,early_stopping_cb,tensorboard_cb,lr_schedule]
model = make_or_restore_model(checkpoint_dir)

In [None]:
# The Sequence already yields batches of `batch_size` rows; the Keras docs say not to
# pass `batch_size` to fit() for generator/Sequence inputs, so it is omitted here.
model_history = model.fit(traingen, epochs=100, validation_data=validgen, callbacks=callbacks, class_weight=class_weight)

In [None]:
#Save training history
history_path = Path("fit_history", "pan-cancer-standard-relu.csv")
# to_csv does not create missing folders; create it so a finished run's history is not lost.
history_path.parent.mkdir(parents=True, exist_ok=True)
history_df = pd.DataFrame(model_history.history)
history_df.to_csv(history_path)

In [None]:
# Save the trained standard ReLU model under MODEL_PATH
model.save(os.path.join(MODEL_PATH,"pan-cancer-standard-relu"))

# Pan cancer - Solid Tumors with Standard RELU 

We reuse the standard ReLU model and remove the blood samples from the training and test sets.

In [None]:
# Projects listed here are excluded from the solid-tumor subset
blood_studies = ('TCGA-LAML','TARGET-AML')
is_blood = samples_dataframe['Project.ID'].isin(blood_studies)
solid_samples = samples_dataframe.loc[~is_blood]

In [None]:
# Same 80/20 seeded, stratified split as before, restricted to the solid-tumor samples
solid_strata = solid_samples[['dummy']]
train, test = train_test_split(
    solid_samples,
    test_size=0.2,
    random_state=2046,
    stratify=solid_strata,
)
train.head()

In [None]:
batch_size = 8
traingen = CustomDataGen(train,X_col={'path':'Path'},y_col={'study':'Num_ID','target':'Target'},batch_size=batch_size)
# Validation rows are not shuffled: the generator drops the trailing partial batch,
# so reshuffling every epoch would score each epoch on a different subset of `test`
# and add noise to the monitored validation metrics.
validgen = CustomDataGen(test,X_col={'path':'Path'},y_col={'study':'Num_ID','target':'Target'},batch_size=batch_size,shuffle=False)
num_classes = traingen.n_study

# Balanced class weights: weight = total / (2 * class_count), so both classes
# carry the same total weight in the loss regardless of their frequency.
pos = np.sum(train['Target']==1)
neg = np.sum(train['Target']==0)
total = pos+neg
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}

In [None]:
# All TensorBoard runs are collected under this folder
root_logdir = Path(TENSORBOARD_PATH)

def get_run_logdir():
    """Return a path under `root_logdir` named after the current local time."""
    import time
    return os.path.join(root_logdir, time.strftime("run_%Y_%m_%d-%H_%M_%S"))

run_logdir = get_run_logdir() # e.g., './my_logs/run_2025_03_16-11_28_43'

In [None]:
checkpoint_dir = 'ckpt-solidonly'

# Stop after 10 epochs without a better validation AUC and restore the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', mode='max', patience=10,
    restore_best_weights=True, verbose=1)

# Scale the learning rate by 0.2 after 5 stagnant epochs (no `monitor` given,
# so Keras' default quantity is watched), never going below 1e-7.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=5, min_lr=1e-7)

tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)

# Save the model every 100 batches; the current loss is part of the checkpoint name.
checkpoint_path = os.path.join(checkpoint_dir, "ckpt-loss={loss:.2f}")
checkpoint_cb = keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_freq=100)

callbacks = [checkpoint_cb,early_stopping_cb,tensorboard_cb,lr_schedule]
model = make_or_restore_model(checkpoint_dir)

In [None]:
# The Sequence already yields batches of `batch_size` rows; the Keras docs say not to
# pass `batch_size` to fit() for generator/Sequence inputs, so it is omitted here.
model_history = model.fit(traingen, epochs=100, validation_data=validgen, callbacks=callbacks, class_weight=class_weight)

In [None]:
#Save training history
history_path = Path("fit_history", "pan-cancer-solid-only.csv")
# to_csv does not create missing folders; create it so a finished run's history is not lost.
history_path.parent.mkdir(parents=True, exist_ok=True)
history_df = pd.DataFrame(model_history.history)
history_df.to_csv(history_path)

In [None]:
# Save the trained solid-tumor-only model under MODEL_PATH
model.save(os.path.join(MODEL_PATH,"pan-cancer-solid-only"))