In [3]:
# Standard-library and third-party imports used by the notebook.
import os
from tqdm import tqdm
from itertools import chain
import pandas as pd
import numpy as np
import random, torch
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn look: white background, fonts scaled by 1.5.
sns.set(style = 'white', font_scale=1.5)

import sys
# Put a local directory at the front of the module search path before the
# `compass` imports below.
# NOTE(review): machine-specific absolute path, presumably the COMPASS source
# checkout; adjust per environment.
sys.path.insert(0, '/home/was966/Research/mims-compass/')
# NOTE(review): plot_embed_with_label is imported again two lines below.
from compass.utils import plot_embed_with_label
from compass import PreTrainer, FineTuner, loadcompass
from compass.utils import plot_embed_with_label, score
from compass.tokenizer import CANCER_CODE


def onehot(S):
    """One-hot encode a Series, keeping missing entries as all-NaN rows.

    Parameters
    ----------
    S : pd.Series
        Values to encode; missing entries are allowed.

    Returns
    -------
    pd.DataFrame
        float64 frame sharing ``S``'s index, with one 0./1. column per
        category produced by ``pd.get_dummies``. Every row where ``S`` is
        missing is NaN in all columns. Columns are ordered by descending
        column sum (i.e. most frequent category first).

    Raises
    ------
    AssertionError
        If ``S`` is not a ``pd.Series``.
    """
    assert isinstance(S, pd.Series), 'Input type should be pd.Series'
    dfd = pd.get_dummies(S, dummy_na=True)
    # Positional boolean mask of the missing rows. Using positions instead of
    # index labels keeps valid rows intact when the index has duplicates, and
    # the explicit bool cast works whether get_dummies returned bool or uint8.
    is_missing = dfd[np.nan].astype(bool).to_numpy()
    # Cast to float BEFORE writing NaN: assigning NaN into bool/uint8 columns
    # would upcast them to object (deprecated in recent pandas).
    dfd = dfd.drop(columns=[np.nan]).astype(float)
    dfd.loc[is_missing, :] = np.nan
    # Most frequent category first; NaN rows are skipped by sum().
    cols = dfd.sum().sort_values(ascending=False).index.tolist()
    dfd = dfd[cols]
    return dfd

In [4]:
data_path = '/home/shenwanxiang/Research/aliyun_sync/COMPASS/paper/00_data/'

# Read the three pickled TCGA tables from data_path, in this order:
# TPM table, processed patient table, original patient table.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
df_tpm, df_label, df_label_org = (
    pd.read_pickle(os.path.join(data_path, table_name))
    for table_name in (
        'TCGA.TPM.TABLE',
        'TCGA.PATIENT.PROCESSED.TABLE',
        'TCGA.PATIENT.TABLE',
    )
)
# Displayed as the cell output.
df_tpm.shape, df_label.shape

((10184, 15672), (10184, 13))

In [8]:
# Load the pretrained COMPASS checkpoint on CPU; it is used below as a
# feature extractor.
pretrainer = loadcompass(
    '../checkpoint/latest/pretrainer.pt',
    map_location='cpu',
)

# Keep direct handles to the two sub-modules of the latent projector.
genesetprojector = pretrainer.model.latentprojector.genesetprojector
cellpathwayprojector = pretrainer.model.latentprojector.cellpathwayprojector

# Select df_tpm by the checkpoint's feature_name
# (presumably the model's input gene columns — TODO confirm).
df_tpm = df_tpm[pretrainer.feature_name]

# Displayed as the cell output.
pretrainer.count_parameters()

1019421

In [9]:
# Model input: strip the 'TCGA-' prefix from cancer_type, translate it through
# CANCER_CODE, and place the resulting 'cancer_code' column in front of the
# df_tpm columns (joined on the index).
dfcx = (
    df_label.cancer_type
    .apply(lambda name: name.replace('TCGA-', ''))
    .map(CANCER_CODE)
    .to_frame('cancer_code')
    .join(df_tpm)
)

# Two outputs; the next cell saves them as the geneset / celltype readouts.
dfgeneset, dfcelltype = pretrainer.extract(dfcx, batch_size=128)

100%|###################################################################################| 80/80 [04:58<00:00,  3.74s/it]


In [10]:
# Make sure the output folder exists so this cell also runs on a fresh
# checkout (to_csv does not create missing parent directories).
os.makedirs('./TCGA', exist_ok=True)

# Clinical labels: original and processed tables.
# The file name spelling ('orignal') is kept unchanged so existing consumers
# of this file keep working.
df_label_org.to_csv('./TCGA/00_clinical_label_orignal.csv')
df_label.to_csv('./TCGA/00_clinical_label.csv')

# The two outputs of pretrainer.extract.
dfgeneset.to_csv('./TCGA/01_readouts_geneset.csv')
dfcelltype.to_csv('./TCGA/02_readouts_celltype.csv')

In [11]:
# Two outputs; the following cells split their index labels on '$$' into a
# patient barcode and a feature name before writing them to disk.
dfg, dfc = pretrainer.project(
    dfcx,
    batch_size=128,
)

100%|###################################################################################| 80/80 [04:55<00:00,  3.70s/it]


In [12]:
# Each dfg index label is split on '$$': the first part is stored as the
# patient barcode, the second as the feature name.
pid = dfg.index.map(lambda key: key.split('$$')[0])
fid = dfg.index.map(lambda key: key.split('$$')[1])

# Prepend the two id columns to the dfg values, then order rows by feature
# and, within a feature, by patient.
df = (
    pd.DataFrame(index=dfg.index)
    .assign(bcr_patient_barcode=pid, feature_name=fid)
    .join(dfg)
    .sort_values(['feature_name', 'bcr_patient_barcode'])
)
df.to_csv('./TCGA/03_features_geneset.gzip', compression='gzip')

In [13]:
# Each dfc index label is split on '$$': the first part is stored as the
# patient barcode, the second as the feature name.
pid = dfc.index.map(lambda key: key.split('$$')[0])
fid = dfc.index.map(lambda key: key.split('$$')[1])

# Prepend the two id columns to the dfc values, then order rows by feature
# and, within a feature, by patient.
df = (
    pd.DataFrame(index=dfc.index)
    .assign(bcr_patient_barcode=pid, feature_name=fid)
    .join(dfc)
    .sort_values(['feature_name', 'bcr_patient_barcode'])
)
df.to_csv('./TCGA/04_features_celltype.gzip', compression='gzip')