In [1]:
import pandas as pd
import sys,os
from tqdm import tqdm
from joblib import Parallel, delayed

sys.path.insert(0, '/home/shenwanxiang/Research/aliyun_sync/COMPASS/')
from baseline.immnue_score.scorer import ssGSEA, avgAbundance

from compass.utils import plot_embed_with_label
from compass import PreTrainer, FineTuner, loadcompass #, get_minmal_epoch
from compass.utils import plot_embed_with_label,plot_performance, score2
from compass.tokenizer import CANCER_CODE, CONCEPT


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the aligned patient/clinical table and the TPM expression table.
# NOTE: pd.read_pickle can execute code embedded in the file; only load pickles from a trusted source.
data_path = '/home/shenwanxiang/Research/aliyun_sync/COMPASS/paper/00_data/'
df_label = pd.read_pickle(os.path.join(data_path, 'ITRP.PATIENT.TABLE.ALIGN'))
df_tpm = pd.read_pickle(os.path.join(data_path, 'ITRP.TPM.TABLE'))

# Model input table: the mapped cancer code as first column, then df_tpm's columns
# (DataFrame.join defaults to a left join on the index, so rows follow df_label).
dfcx = df_label.cancer_type.map(CANCER_CODE).to_frame('cancer_code').join(df_tpm)

# Every export in this notebook is written under ./data; create it up front so
# to_csv does not fail on a fresh checkout where the directory is missing.
os.makedirs('./data', exist_ok=True)
df_label.to_csv('./data/ITRP_clinical.csv')

# Show both table shapes; only a cell's last expression is rendered.
df_tpm.shape, df_label.shape

## Signature-level

In [3]:
# Split each entry of CONCEPT's colon-delimited `Genes` column into a list of gene names.
gene_strings = CONCEPT.Genes
signature_genes = gene_strings.apply(lambda joined: joined.split(':'))
signature_genes.head()  # quick look only; not rendered because it is not the cell's last expression
 
def run_one_ssgsea(name, g_list, df_tpm):
    """Score a single gene set on an expression table with the project's ssGSEA scorer.

    Parameters
    ----------
    name : label passed to ``ssGSEA`` as its second argument.
    g_list : genes of the set, passed to ``ssGSEA`` as its first argument.
    df_tpm : expression table handed to ``fit_transform``.

    Returns
    -------
    Whatever ``ssGSEA(...).fit_transform(df_tpm)`` returns (the caller concatenates
    these column-wise, so presumably a per-sample DataFrame — confirm in scorer.py).
    """
    scorer = ssGSEA(g_list, name)
    scores = scorer.fit_transform(df_tpm)
    return scores

# One deferred ssGSEA call per signature. The generator is lazy, so the tqdm bar
# advances as the worker pool pulls tasks from it.
tasks = (delayed(run_one_ssgsea)(name, g_list, df_tpm)
         for name, g_list in tqdm(signature_genes.items(), ascii=True, total = len(signature_genes)))
res = Parallel(n_jobs=8, backend="loky")(tasks)

# Column-bind the per-signature results and remove the literal "NES_" from the column labels.
dfc = pd.concat(res, axis=1)
dfc.columns = dfc.columns.str.replace("NES_", "", regex=False)

# Rows are written in df_label's order. The column count in the file name is taken
# from the data (as the concept-level exports do) instead of a hard-coded 132, so
# the name cannot drift from the content if the signature set changes.
dfc.loc[df_label.index].to_csv(f'./data/ITRP_ssGSEA_signature_{dfc.shape[1]}.csv')

100%|#################################################################################| 132/132 [03:25<00:00,  1.56s/it]


In [4]:
def run_one_avg(name, g_list, df_tpm):
    """Score a single gene set on an expression table with the project's avgAbundance scorer.

    Parameters
    ----------
    name : label passed to ``avgAbundance`` as its second argument.
    g_list : genes of the set, passed to ``avgAbundance`` as its first argument.
    df_tpm : expression table handed to ``fit_transform``.

    Returns
    -------
    Whatever ``avgAbundance(...).fit_transform(df_tpm)`` returns (the caller concatenates
    these column-wise, so presumably a per-sample DataFrame — confirm in scorer.py).
    """
    scorer = avgAbundance(g_list, name)
    scores = scorer.fit_transform(df_tpm)
    return scores

# One deferred average-abundance call per signature. The generator is lazy, so the
# tqdm bar advances as the worker pool pulls tasks from it.
tasks = (delayed(run_one_avg)(name, g_list, df_tpm)
         for name, g_list in tqdm(signature_genes.items(), ascii=True, total = len(signature_genes)))
res = Parallel(n_jobs=8, backend="loky")(tasks)

# Column-bind the per-signature results and remove the literal "NAG_" from the column labels.
dfc = pd.concat(res, axis=1)
dfc.columns = dfc.columns.str.replace("NAG_", "", regex=False)

# Rows are written in df_label's order. The column count in the file name is taken
# from the data (as the concept-level exports do) instead of a hard-coded 132, so
# the name cannot drift from the content if the signature set changes.
dfc.loc[df_label.index].to_csv(f'./data/ITRP_avg_signature_{dfc.shape[1]}.csv')

100%|#################################################################################| 132/132 [00:10<00:00, 12.75it/s]


## Concept-level

In [5]:
def _connect(x):
    from itertools import chain
    return list(set(chain(*[i.split(':') for i in x])))
    
# For every BroadCelltypePathway group, collect its `Genes` strings into a list and
# merge them into a single de-duplicated gene list via _connect.
genes_by_concept = CONCEPT.groupby('BroadCelltypePathway').Genes
concept_genes = genes_by_concept.apply(lambda grp: grp.tolist()).apply(_connect)

def run_one_ssgsea(name, g_list, df_tpm):
    """Run the project's ssGSEA scorer for one gene set.

    Same definition as the one in the signature-level section; repeated here so
    this cell can be run on its own.

    Parameters
    ----------
    name : label passed to ``ssGSEA`` as its second argument.
    g_list : genes of the set, passed to ``ssGSEA`` as its first argument.
    df_tpm : expression table handed to ``fit_transform``.

    Returns
    -------
    The result of ``ssGSEA(g_list, name).fit_transform(df_tpm)``.
    """
    return ssGSEA(g_list, name).fit_transform(df_tpm)

# One deferred ssGSEA call per concept; the lazy generator is consumed by the worker pool below.
tasks = (
    delayed(run_one_ssgsea)(concept, members, df_tpm)
    for concept, members in tqdm(concept_genes.items(), total=len(concept_genes), ascii=True)
)
res = Parallel(backend="loky", n_jobs=8)(tasks)

# Column-bind the per-concept results and remove the literal "NES_" from the column labels.
dfc = pd.concat(res, axis=1)
dfc.columns = dfc.columns.str.replace("NES_", "", regex=False)

# Rows are written in df_label's order; the file name carries the number of score columns.
out_csv = f'./data/ITRP_ssGSEA_concept_{dfc.shape[1]}.csv'
dfc.loc[df_label.index].to_csv(out_csv)

100%|###################################################################################| 43/43 [00:55<00:00,  1.30s/it]


In [6]:
def run_one_avg(name, g_list, df_tpm):
    """Run the project's avgAbundance scorer for one gene set.

    Same definition as the one in the signature-level section; repeated here so
    this cell can be run on its own.

    Parameters
    ----------
    name : label passed to ``avgAbundance`` as its second argument.
    g_list : genes of the set, passed to ``avgAbundance`` as its first argument.
    df_tpm : expression table handed to ``fit_transform``.

    Returns
    -------
    The result of ``avgAbundance(g_list, name).fit_transform(df_tpm)``.
    """
    return avgAbundance(g_list, name).fit_transform(df_tpm)

# One deferred average-abundance call per concept; the lazy generator is consumed by the worker pool below.
tasks = (
    delayed(run_one_avg)(concept, members, df_tpm)
    for concept, members in tqdm(concept_genes.items(), total=len(concept_genes), ascii=True)
)
res = Parallel(backend="loky", n_jobs=8)(tasks)

# Column-bind the per-concept results and remove the literal "NAG_" from the column labels.
dfc = pd.concat(res, axis=1)
dfc.columns = dfc.columns.str.replace("NAG_", "", regex=False)

# Rows are written in df_label's order; the file name carries the number of score columns.
out_csv = f'./data/ITRP_avg_concept_{dfc.shape[1]}.csv'
dfc.loc[df_label.index].to_csv(out_csv)

100%|###################################################################################| 43/43 [00:02<00:00, 18.30it/s]


## COMPASS representation

In [7]:
# Load the pretrained COMPASS checkpoint and extract its feature tables from dfcx.
# NOTE(review): 'cuda:1' presumably requires a second CUDA device on this machine — confirm before re-running.
pretrainer = loadcompass(
    '../../checkpoint/latest/pretrainer.pt',
    map_location='cuda:1',
)
# extract() is unpacked into three tables. Note that `dfc` is rebound here, replacing
# the score table of the same name built in the earlier cells.
dfe, dfg, dfc = pretrainer.extract(dfcx, with_gene_level=True, batch_size=128)

100%|#####################################################################################| 9/9 [00:11<00:00,  1.28s/it]


In [8]:
# Export of `dfe` is currently disabled:
#dfe.loc[df_label.index].to_csv(f'./data/ITRP_COMPASS_PT_{dfe.shape[1]}.csv')
# Write dfg and dfc with rows in df_label's order; each file name carries that table's column count.
for table in (dfg, dfc):
    table.loc[df_label.index].to_csv(f'./data/ITRP_COMPASS_PT_{table.shape[1]}.csv')