In [1]:
import pandas as pd
import numpy as np
import sys,os
from tqdm import tqdm
sys.path.insert(0, '/home/shenwanxiang/Research/aliyun_sync/COMPASS/')
from baseline.immnue_score.scorer import ssGSEA

from compass.utils import plot_embed_with_label
from compass import PreTrainer, FineTuner, loadcompass #, get_minmal_epoch
from compass.utils import plot_embed_with_label,plot_performance, score2
from compass.tokenizer import CANCER_CODE, CONCEPT

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from copy import deepcopy

import seaborn as sns
from matplotlib import pyplot as plt
from samecode.survival.plot import KMPlot
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index as lfcindex

def onehot(S):
    """One-hot encode a label Series, keeping missing labels as all-NaN rows.

    Parameters
    ----------
    S : pd.Series
        Labels to encode; missing entries (NaN/None) are allowed.

    Returns
    -------
    pd.DataFrame
        Float frame with the same index as ``S`` and one column per observed
        label. Columns are ordered by descending column sum, i.e. the most
        frequent label comes first. Rows whose label is missing contain NaN
        in every column.

    Raises
    ------
    AssertionError
        If ``S`` is not a ``pd.Series`` (subclasses are accepted).
    """
    assert isinstance(S, pd.Series), 'Input type should be pd.Series'

    # Positional mask of missing labels. Blanking by position (not by index
    # label) keeps valid rows intact when the index contains duplicates.
    missing = S.isna().to_numpy()

    # Build the dummies as floats from the start so NaN can be written without
    # relying on implicit bool/uint8 -> float/object upcasting.
    dfd = pd.get_dummies(S, dtype=float)
    dfd.loc[missing, :] = np.nan

    # Most frequent class first (NaN rows are skipped by sum()).
    cols = dfd.sum().sort_values(ascending=False).index.tolist()
    return dfd[cols]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clear():
    """Free Python garbage and, when CUDA is usable, cached GPU memory.

    Runs the garbage collector first so that unreachable objects drop their
    references, then asks PyTorch to release its cached CUDA memory and to
    collect memory released through CUDA IPC. The CUDA calls are skipped when
    no CUDA device is available, so the helper is a harmless no-op on
    CPU-only hosts.

    Returns
    -------
    None
    """
    # Imported lazily, as in the original cell, so defining the helper does
    # not require torch to be importable.
    import gc
    import torch

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

In [3]:
# Fine-tuning settings; the training cell unpacks this mapping as keyword
# arguments to FineTuner (and overrides 'max_epochs' / 'seed' per final run).
params = dict(
    # run set-up
    mode='PFT',
    seed=42,
    lr=1e-3,
    device='cuda:1',  # hard-coded GPU index — adjust for the local machine
    weight_decay=1e-4,
    batch_size=16,
    # training length
    patience=1e10,  # presumably large enough that early stopping never triggers — confirm
    max_epochs=100,
    # task head / loss
    task_loss_weight=1,
    load_decoder=False,
    task_loss_type='ce_loss',
    task_type='c',
    task_dense_layer=[16],
    task_batch_norms=True,
    entropy_weight=0.0,
    # logging / checkpointing
    with_wandb=False,
    save_best_model=False,
    verbose=False,
)

In [4]:
# Folder holding the pickled ITRP tables (machine-specific absolute path).
data_path = '/home/shenwanxiang/Research/aliyun_sync/COMPASS/paper/00_data/'

# Patient annotation table first, then the TPM expression table.
# NOTE(review): read_pickle can execute arbitrary code — only load trusted files.
df_label, df_tpm = (
    pd.read_pickle(os.path.join(data_path, fname))
    for fname in ('ITRP.PATIENT.TABLE.ALIGN', 'ITRP.TPM.TABLE')
)
df_tpm.shape, df_label.shape  # not the cell's last expression, so nothing is displayed

# Model input: the mapped cancer code as first column, followed by the TPM
# columns joined on the shared index.
dfcx = (
    df_label['cancer_type']
    .map(CANCER_CODE)
    .to_frame('cancer_code')
    .join(df_tpm)
)

# Pre-trained COMPASS checkpoint that every fine-tuning run starts from.
pretrainer = loadcompass('/home/shenwanxiang/Research/aliyun_sync/COMPASS/paper/checkpoint/latest/pretrainer.pt')


In [5]:
# Clinical table: keep only patients with both an overall-survival time and an
# event flag, expose them under the generic names 'time' / 'event', and keep
# the columns used downstream.
# Built as one non-mutating chain so no column is assigned onto a filtered
# frame (the pattern that triggers pandas' SettingWithCopyWarning).
y = (
    pd.read_csv('../Clinical_Transformer/data/ITRP_clinical.csv', index_col=0)
    .dropna(subset=['OS_Months', 'OS_Event'])
    .assign(time=lambda d: d['OS_Months'],
            event=lambda d: d['OS_Event'])
    .loc[:, ['time', 'event', 'cohort', 'ICI_target', 'ICI',
             'cancer_type', 'response_label', 'TMB']]
)
cohort_list = y.cohort.unique()
print(len(y))

860


In [6]:
# Resampling plan used by the training cell: number of inner train/validation
# splits per held-out cohort, and the seeds of the final runs.
seeds = [24, 42, 64]
repetitions = 10

# Restrict the model input table to the patients kept in `y`, in `y`'s row
# order (raises KeyError if a patient is missing from `dfcx`).
dfcx = dfcx.loc[y.index, :]
features = dfcx.columns

In [7]:
# Distinct cohort names present in the filtered clinical table.
y['cohort'].unique()

array(['IMVigor210', 'Rose', 'Snyder', 'SU2CLC1', 'SU2CLC2', 'Hugo',
       'Gide', 'Liu', 'Riaz', 'Allen', 'MGH'], dtype=object)

In [None]:
# Leave-one-cohort-out fine-tuning.
# For every held-out cohort:
#   1. fit `repetitions` models on random 90/10 splits of the remaining
#      cohorts and log the per-epoch train / validation loss,
#   2. pick the epoch with the lowest mean validation loss,
#   3. retrain on all remaining cohorts for that many epochs, once per seed,
#      and write predictions for the held-out cohort.
# NOTE(review): 'IMVigor210', 'Gide' and 'Liu' are present in
# y.cohort.unique() but are not iterated here — confirm this is intentional.
for cohort in [ 'SU2CLC1', 'Rose', 'Snyder', 'SU2CLC2', 'Hugo',
               'Riaz', 'Allen', 'MGH']:

    data_train = y[y.cohort != cohort][['time', 'event', 'response_label']].join(dfcx)
    data_test = y[y.cohort == cohort][['time', 'event','response_label']].join(dfcx)

    work_dir = f'./LOCO_PFT/{cohort}'
    os.makedirs(work_dir, exist_ok=True)

    # ---- Stage 1: loss curves on repeated inner train/validation splits ----
    for fd in range(repetitions):
        inner_train_data, inner_valid_data = train_test_split(
            data_train,
            test_size=0.1,
            random_state=fd,
            stratify=data_train[['event']])

        train_cohort_name = f'leave_{cohort}_fold_{fd}'
        Y = onehot(inner_train_data['response_label'])
        X = inner_train_data[features]

        # onehot() orders columns by class frequency, which is computed
        # separately for each split; force the validation targets into the
        # training column order so both describe the same classes per column.
        Y_val = onehot(inner_valid_data['response_label']).reindex(
            columns=Y.columns, fill_value=0.0)
        X_val = inner_valid_data[features]

        # Always start from a fresh copy of the loaded checkpoint and leave
        # the global `pretrainer` untouched; rebinding it to its own copy
        # would carry any in-place changes made by FineTuner into later folds
        # and cohorts (whether FineTuner mutates its input is not visible
        # here — this is the safe choice either way).
        fold_pretrainer = pretrainer.copy()
        finetuner = FineTuner(fold_pretrainer, **params,
                              work_dir= work_dir,
                              task_name = '%s' % train_cohort_name)

        finetuner = finetuner.tune_with_test(dfcx_train = X,
                                             dfy_train = Y,
                                             dfcx_test=X_val,
                                             dfy_test=Y_val,)

        # Columns 0, 3 and 6 of the performance log are the ones this
        # notebook labels epoch / loss / val_loss.
        dfh = pd.DataFrame(finetuner.performance)[[0, 3, 6]].copy()
        dfh.columns = ['epoch', 'loss', 'val_loss']
        # presumably turns a 0-based epoch counter into an epoch count — confirm
        dfh['epoch'] = dfh['epoch'] + 1
        dfh['seed'] = fd
        dfh.to_csv(os.path.join(work_dir, f'train_val_loss_fd_{fd}.csv'))
        clear()

    # ---- Stage 2: epoch with the lowest mean validation loss ----
    # The histories are read back from the CSVs written above.
    res = []
    for i in range(repetitions):
        df = pd.read_csv(os.path.join(work_dir, f'train_val_loss_fd_{i}.csv'), index_col=0)
        res.append(df)

    dfp = pd.concat(res)
    # Cast to a plain int: it is passed on as `max_epochs`, which the
    # defaults in `params` supply as a Python int.
    best_epoch = int(dfp.groupby('epoch').val_loss.mean().idxmin())

    # ---- Stage 3: final models on all training cohorts, one per seed ----
    for seed in seeds:
        seed_pretrainer = pretrainer.copy()
        myparams = {**params, 'max_epochs': best_epoch, 'seed': seed}
        my_work_dir = os.path.join(work_dir, f'seed_{seed}')
        os.makedirs(my_work_dir, exist_ok=True)

        finetuner = FineTuner(seed_pretrainer, **myparams,
                              work_dir= my_work_dir,
                              task_name = f'leave_{cohort}')

        finetuner = finetuner.tune(dfcx_train = data_train[features],
                                   dfy_train = onehot(data_train['response_label']))

        clear()

        # Predict the held-out cohort and tag the rows with the run settings.
        dfc, dfp = finetuner.predict(data_test[features], batch_size= 128)
        dfp['seed'] = seed
        dfp['best_epoch'] = best_epoch
        dfp['test_cohort'] = cohort
        dfp.to_csv(os.path.join(my_work_dir, 'test_data.csv'))