In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [2]:
# Root directory of the TCGA / GDC_v37 data. The per-cancer input tables are globbed from
# its sub-directories, and the derived tables and maps are pickled back into it.
tcga_data_path = '/n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37'

In [3]:
def _find_tables(filename):
    """Return every `<tcga_data_path>/<sub-directory>/<filename>` path, sorted.

    glob() yields paths in whatever order the file system reports them, so the
    result is sorted to make the processing order identical from run to run.
    """
    return sorted(glob(os.path.join(tcga_data_path, '*', filename)))


# One list of file paths per table type; each list holds one path per sub-directory.
tpms = _find_tables('rnaSeq_tpm_matrix.csv')
samples = _find_tables('rnaSeq_sample_table.txt')
genes = _find_tables('rnaSeq_gene_table.txt')
scnvs = _find_tables('SCNV_table.txt')
snvs = _find_tables('SNV_table.txt')
drugs = _find_tables('clinical_drug_table.txt')
patients = _find_tables('clinical_patient_table.txt')
proteins = _find_tables('proteome_table.txt')

In [4]:
# Number of files found for each table type, in the order the lists were created.
tuple(len(paths) for paths in (tpms, samples, genes, scnvs, snvs, drugs, patients, proteins))

(33, 33, 33, 33, 33, 33, 33, 32)

# Narrow Down Samples

### Step1. TCGA (33 cancers) RNA-seq TPM matrix


In [5]:
# Load each TPM matrix, transpose it so that rows are samples, and label every row with
# the cancer type, taken from the name of the directory that holds the file.
tpm_tbs = []
for tpm in tpms:
    cancer_type = os.path.basename(os.path.dirname(tpm))
    df = pd.read_csv(tpm, index_col=0).T
    # insert() adds the label as the first column directly. Building an empty frame and
    # joining the matrix onto it re-aligned every gene column and would multiply rows
    # if a sample id ever appeared twice.
    df.insert(0, 'cancer_type', cancer_type)
    tpm_tbs.append(df)

df1 = pd.concat(tpm_tbs, axis=0)
# The patient barcode is the first 12 characters of the sample id.
bcr_patient_barcode = df1.index.map(lambda x: x[:12])
# Positional insert (no index join): column order becomes
# [bcr_patient_barcode, cancer_type, <gene columns>], which later cells slice by position.
df1.insert(0, 'bcr_patient_barcode', bcr_patient_barcode.values)

In [6]:
# (samples, genes): skip the two leading label columns (patient barcode, cancer type).
df1.iloc[:, 2:].shape

(11274, 60660)

### Step2. Sample type selection (exclude Solid Tissue Normal samples)

In [7]:
# Load every sample annotation table, indexed by its first column (used below as the
# sample id), and stack them into one frame.
sdef = [pd.read_csv(sample, sep='\t', index_col=0) for sample in samples]
df_samples = pd.concat(sdef)

# sample id -> sample type. rename()/rename_axis() return new objects, so df_samples is
# not modified through a shared reference, as assigning .name / .index.name on the
# selected column would do.
sample_id_type_map = df_samples['definition'].rename('sample_type').rename_axis('sample_id')
sample_id_type_map.to_pickle(os.path.join(tcga_data_path, 'sample_id_type.map'))

# Sample-type vocabularies. Only `normal_samples` drives the filter below;
# `tumer_samples` is not used by the selection and is kept for reference only.
tumer_samples = ['Primary solid Tumor', 'Metastatic', 'Recurrent Solid Tumor',
                 'Primary Blood Derived Cancer - Peripheral Blood']
normal_samples = ['Solid Tissue Normal']

# Keep every sample whose annotated type is not a normal-tissue type.
sidx = sample_id_type_map[~sample_id_type_map.isin(normal_samples)].index
# Boolean filtering (the same pattern Step3 uses) tolerates annotated samples that have
# no TPM row and repeated sample ids; df1.loc[sidx] would raise KeyError or repeat rows.
# A stable sort keeps the row order within each cancer type reproducible.
df2 = df1[df1.index.isin(sidx)].sort_values('cancer_type', kind='mergesort')
df2[df2.columns[2:]].shape

(10534, 60660)

### Step3. Remove prior-treatment samples and FFPE (`is_ffpe`) samples

In [8]:
# Sample ids whose prior_treatment is 'No', and sample ids whose is_ffpe is False.
untreated_mask = df_samples.prior_treatment == 'No'
non_ffpe_mask = df_samples.is_ffpe == False
no_prior_treatment_set = set(df_samples.loc[untreated_mask].index)
no_is_ffpe_set = set(df_samples.loc[non_ffpe_mask].index)
# To keep FFPE samples instead, use: no_is_ffpe_set = set(df_samples.index)

# A sample is retained only if it passes both criteria.
selected_samples = no_prior_treatment_set.intersection(no_is_ffpe_set)
df3 = df2.loc[df2.index.isin(selected_samples)]
df3.iloc[:, 2:].shape

(10305, 60660)

In [9]:
# Leftover scratch cell: a bare literal equal to the row count displayed by the previous cell.
10305

10305

### Step4. Samples to Patients (Average TPM values)


In [10]:
# Number of retained samples per patient.
s = df3.groupby(['bcr_patient_barcode']).size()
# Gene columns; the two leading columns are the patient barcode and the cancer type.
rnacols = df3.columns[2:]
# Patients with several samples (to be averaged) and patients with exactly one sample.
df_dup = df3[df3.bcr_patient_barcode.isin(s[s > 1].index)]
df_idt = df3[df3.bcr_patient_barcode.isin(s[s == 1].index)]
# Average only the gene columns. Calling DataFrame.mean() on each whole group also
# covers the string label columns, which raises TypeError on pandas >= 2.0, and runs a
# Python-level function per patient instead of the built-in grouped mean.
df_dup_idt = (
    df_dup.groupby(['bcr_patient_barcode', 'cancer_type'])[list(rnacols)]
    .mean()
    .reset_index()
)
# One row per patient: index = patient barcode, first column = cancer_type, then genes.
df4 = pd.concat([df_idt, df_dup_idt]).sort_values('cancer_type').set_index('bcr_patient_barcode')
df4[df4.columns[1:]].shape

(10184, 60660)

# Narrow Down Genes

### Step5. Gene id-name unique mapping (var criteria)

In [11]:
# Read every gene annotation table once, indexed by gene_id, and collect its
# gene_name and gene_type columns.
gene_tables = [pd.read_csv(gf, sep='\t').set_index('gene_id') for gf in genes]
names = [table.gene_name for table in gene_tables]
types = [table.gene_type for table in gene_tables]

# Stack the columns from all tables and drop repeated (gene_id, value) pairs, leaving
# gene_id -> gene_name and gene_id -> gene_type lookups.
name_pairs = pd.concat(names).reset_index().drop_duplicates()
type_pairs = pd.concat(types).reset_index().drop_duplicates()
gene_id_name_map = name_pairs.set_index('gene_id')['gene_name']
gene_id_type_map = type_pairs.set_index('gene_id')['gene_type']

gene_id_name_map.to_pickle(os.path.join(tcga_data_path, 'gene_id_name.map'))
gene_id_type_map.to_pickle(os.path.join(tcga_data_path, 'gene_id_type.map'))
# Not the last expression of the cell, so this value is computed but not displayed.
gene_id_type_map.shape

# Variance of log2(TPM + 1) for every gene column of df4 (its first column is cancer_type).
dfv = np.log2(df4[df4.columns[1:]] + 1)
v = dfv.var().to_frame(name='variance')
v['gene_name'] = v.index.map(gene_id_name_map)
## use gene id with the largest variance
# The built-in grouped idxmax gives the same gene id per gene name as applying a Python
# lambda to every group, without the per-group Python call.
gene_name_id_map = v.groupby('gene_name')['variance'].idxmax()
# The apply-based form produced an unnamed Series; keep the pickled maps unnamed too.
gene_name_id_map.name = None
gene_name_type_map = gene_name_id_map.map(gene_id_type_map)

gene_name_id_map.to_pickle(os.path.join(tcga_data_path, 'gene_name_id.map'))
gene_name_type_map.to_pickle(os.path.join(tcga_data_path, 'gene_name_type.map'))

### define the subtype genes
gene_type = gene_id_type_map.value_counts()
# Every gene type whose label contains the text 'pseudogene'.
pseudogenes = [label for label in gene_type.index if 'pseudogene' in label]

# gene type -> supertype: everything starts as 'other', then the pseudogene types and the
# two types that keep their own label are overwritten.
gene_type_supertype_map = pd.Series('other', index=gene_id_type_map.unique(), name='gene_supertype')
gene_type_supertype_map.loc[pseudogenes] = 'pseudogene'
for kept_label in ('protein_coding', 'lncRNA'):
    gene_type_supertype_map.loc[kept_label] = kept_label

# Chain the lookups: gene id -> type -> supertype, and gene name -> chosen id -> supertype.
gene_id_supertype_map = gene_id_type_map.map(gene_type_supertype_map)
gene_name_supertype_map = gene_name_id_map.map(gene_id_supertype_map)
gene_id_supertype_map.to_pickle(os.path.join(tcga_data_path, 'gene_id_supertype.map'))
gene_name_supertype_map.to_pickle(os.path.join(tcga_data_path, 'gene_name_supertype.map'))

# Versionless id = the part of each gene_id before its first '.'.
versionless_ids = [gene_id.split('.')[0] for gene_id in gene_id_name_map.index]
# versionless id -> gene name. Series.drop_duplicates() compares the VALUES (gene names),
# so only the first id encountered for each gene name is kept.
gene_ensid_name_map = pd.Series(
    gene_id_name_map.values,
    index=pd.Index(versionless_ids, name='ensid'),
    name='gene_name',
).drop_duplicates()
gene_ensid_name_map.to_pickle(os.path.join(tcga_data_path, 'gene_ensid_name.map'))

###### gene table
# One row per entry of gene_ensid_name_map (index: versionless id, column: gene_name).
# The other columns are looked up by gene name: the chosen versioned gene id, its gene
# type and its supertype.
df_gene = gene_ensid_name_map.to_frame().assign(
    ensid_v36=lambda frame: frame['gene_name'].map(gene_name_id_map),
    gene_type=lambda frame: frame['gene_name'].map(gene_name_type_map),
    gene_supertype=lambda frame: frame['gene_name'].map(gene_name_supertype_map),
)
df_gene.to_pickle(os.path.join(tcga_data_path, 'GENE.TABLE'))

## convert to entrezgene IDs

In [None]:
import mygene

# Query MyGene for the 'symbol' and 'entrezgene' fields of every id in df_gene's index.
ensg_list = df_gene.index.tolist()
mg = mygene.MyGeneInfo()
mygene_info = mg.getgenes(
    ensg_list,
    fields="symbol,entrezgene",
    species='human',
    email='wanxiang_shen@hms.harvard.edu',
)

# Keep the first returned record per 'query' value, index by it, and cache the two
# annotation columns on disk.
mygene_hits = pd.DataFrame(mygene_info).drop_duplicates('query')
dfmg = mygene_hits.set_index('query')[['entrezgene', 'symbol']]
dfmg.to_pickle(os.path.join(tcga_data_path, 'MYGENE.TABLE'))

In [41]:
# Reload the cached MyGene annotation table and attach its columns to the gene table.
dfmg = pd.read_pickle(os.path.join(tcga_data_path, 'MYGENE.TABLE'))
# Drop any annotation columns left by an earlier run of this cell before joining:
# df_gene is overwritten here, so a second run would otherwise fail with
# "columns overlap but no suffix specified".
df_gene = df_gene.drop(columns=dfmg.columns, errors='ignore').join(dfmg)
df_gene.to_pickle(os.path.join(tcga_data_path, 'GENE.TABLE'))

### Step5b. Remove genes with zero total TPM across all patients

In [42]:
# Gene columns only (df4's first column is cancer_type).
df5 = df4.iloc[:, 1:]
# Column totals over all rows; a gene is kept when its total is not exactly zero.
s = df5.sum()
expressed_genes = s[s.ne(0)].index
df6 = df5.loc[:, expressed_genes]
df6.shape

(10184, 58379)

### Step6. Keep unique genes 

In [43]:
# Keep the gene columns whose id appears in df_gene.ensid_v36 (the id selected for each
# gene name). Filtering df6.columns directly preserves the matrix's own column order;
# list(set & set) gave an order that depends on string hashing and can differ between
# kernel sessions, so the saved table's column order was not reproducible.
cols = df6.columns[df6.columns.isin(df_gene.ensid_v36)].tolist()
df7 = df6[cols]
df7.shape

(10184, 57269)

In [44]:
# Save the filtered expression matrix as a pickle in the data directory.
tpm_table_path = os.path.join(tcga_data_path, 'TPM.TABLE')
df7.to_pickle(tpm_table_path)

### Patient table with mutation columns

We use the clinical follow-up and outcome data for the TCGA PanCancer Atlas from Liu et al. [1].

[1] Liu et al., "An Integrated TCGA Pan-Cancer Clinical Data Resource to Drive High-Quality Survival Outcome Analytics" (https://pubmed.ncbi.nlm.nih.gov/29625055/)

In [45]:
# Per-patient clinical table and per-patient mutation summary, loaded from pickles in the
# data directory.
df_clinic = pd.read_pickle(os.path.join(tcga_data_path, 'df_patient_clinic.pkl'))
df_mut = pd.read_pickle(os.path.join(tcga_data_path, 'df_patient_mut.pkl'))

# MSI label = name of the column holding the row-wise maximum of the three MSI columns.
# NOTE(review): idxmax returns the first column ('MSI-H') when values tie. The preview
# of df_patients shows 'MSI-H' next to low msi_score values, so confirm these columns
# never tie.
msi_label = df_mut[['MSI-H', 'MSI-L', 'MSS']].idxmax(axis=1)
# Add the label and short aliases for three source columns, then keep only the
# columns that go into the patient table.
df_mut = df_mut.assign(
    msi=msi_label,
    tmb=df_mut['tmb_wang'],
    cnv=df_mut['scnv_jin_mean'],
    msi_score=df_mut['MSIsensorScore'],
)[['tmb', 'cnv', 'msi', 'truncate_mut', 'nontruncate_mut', 'msi_score']]

# pt_tbs = [pd.read_csv(patient, sep='\t') for patient in patients]
# df_patients = pd.concat(pt_tbs, axis=0)
# df_patients = df_patients.set_index('bcr_patient_barcode')
# scols = ['project', 'gender', 'height', 'weight', 'age_at_initial_pathologic_diagnosis', 'race_list', 
#          'vital_status', 'days_to_last_followup', 'days_to_death', 'person_neoplasm_cancer_status', 
#          'year_of_initial_pathologic_diagnosis', 'stage_event_clinical_stage', ]
# df_patients = df_patients[scols]
# df_patients = df_patients.loc[df4.index]
# df_patients = df_patients.rename(columns={'project':'cancer_type'})

# Left-join both tables onto the patients of df4, so there is one row per df4 patient.
df_patients = df4[['cancer_type']].join(df_clinic).join(df_mut)
df_patients.to_pickle(os.path.join(tcga_data_path, 'PATIENTS.TABLE'))

In [46]:
# Display the assembled patient table (the rendered output is truncated to its first and last rows).
df_patients

Unnamed: 0_level_0,cancer_type,age,gender,race,stage,vital_status,tumor_status,treatment_outcome,os_status,os_time,pfi_status,pfi_time,tmb,cnv,msi,truncate_mut,nontruncate_mut,msi_score
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
TCGA-OR-A5KT,TCGA-ACC,44.0,FEMALE,WHITE,Stage I,Alive,TUMOR FREE,R,alive,2895.0,Progression,2099.0,-0.913960,0.164212,MSS,8.0,11.0,0.00
TCGA-OR-A5J9,TCGA-ACC,22.0,FEMALE,WHITE,Stage II,Alive,WITH TUMOR,R,alive,1352.0,Progression,414.0,-0.407000,-0.396697,MSI-H,1.0,26.0,0.90
TCGA-OR-A5K0,TCGA-ACC,69.0,FEMALE,WHITE,Stage II,Alive,WITH TUMOR,NR,alive,1029.0,Progression,659.0,0.264377,-0.266140,MSI-H,7.0,36.0,0.29
TCGA-OR-A5L6,TCGA-ACC,60.0,MALE,WHITE,Stage II,Alive,TUMOR FREE,R,alive,861.0,censored,861.0,-0.207691,-0.245839,MSI-H,3.0,28.0,2.35
TCGA-OR-A5LT,TCGA-ACC,57.0,MALE,[Not Evaluated],Stage III,Alive,TUMOR FREE,R,alive,549.0,censored,549.0,-1.074425,0.045561,MSI-H,7.0,10.0,1.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-V4-A9EX,TCGA-UVM,55.0,FEMALE,WHITE,Stage IIIA,Dead,WITH TUMOR,,dead,730.0,Progression,381.0,-2.354533,-0.600253,MSI-H,2.0,5.0,0.26
TCGA-WC-A883,TCGA-UVM,76.0,FEMALE,WHITE,Stage IIIA,Dead,WITH TUMOR,,dead,241.0,Progression,241.0,-2.161888,-0.171429,MSI-H,1.0,7.0,0.13
TCGA-V4-A9F7,TCGA-UVM,78.0,FEMALE,WHITE,Stage IIB,Alive,TUMOR FREE,,alive,1256.0,censored,1256.0,-1.576925,-0.325085,MSI-L,2.0,10.0,0.04
TCGA-V4-A9F5,TCGA-UVM,85.0,FEMALE,WHITE,Stage IV,Alive,WITH TUMOR,,alive,203.0,Progression,78.0,-2.161888,-0.472147,MSI-H,1.0,7.0,0.10


In [47]:
ls -lh /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/*.xlsx

-rw-rw-r-- 1 was966 was966 2.9M Oct 12 13:42 /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/Liu_TCGA_clinical.xlsx
-rw-rw-r-- 1 was966 was966 6.4M Nov 10 14:17 /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/Noushin_Nat_Med_2023.xlsx
