In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob


In [2]:
# Root directory of the TCGA export used throughout this notebook: every input
# table is read from it (one sub-directory per cohort) and every output pickle is
# written back into it. Set the TCGA_DATA_PATH environment variable to point the
# notebook at another copy; the default is the original cluster location.
tcga_data_path = os.environ.get(
    'TCGA_DATA_PATH',
    '/n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37',
)

In [3]:
def _cohort_files(filename):
    """Return the paths of `filename` found in each sub-directory of tcga_data_path.

    The list is sorted because glob() yields paths in a filesystem-dependent
    order, and that order determines the row order of the tables concatenated
    from these lists in later cells.
    """
    return sorted(glob(os.path.join(tcga_data_path, '*', filename)))


# One list of per-cohort file paths for each table type.
tpms = _cohort_files('rnaSeq_tpm_matrix.csv')
samples = _cohort_files('rnaSeq_sample_table.txt')
genes = _cohort_files('rnaSeq_gene_table.txt')
scnvs = _cohort_files('SCNV_table.txt')
snvs = _cohort_files('SNV_table.txt')
drugs = _cohort_files('clinical_drug_table.txt')
patients = _cohort_files('clinical_patient_table.txt')
proteins = _cohort_files('proteome_table.txt')

In [4]:
# Number of files found per table type, in the order the lists were built.
# A count lower than the others means some cohort directory lacks that file.
tuple(
    len(paths)
    for paths in (tpms, samples, genes, scnvs, snvs, drugs, patients, proteins)
)

(33, 33, 33, 33, 33, 33, 33, 32)

# Narrow Down Samples

### Step1. TCGA (33 cancers) RNA-seq TPM matrix


In [5]:
def _read_cohort_tpm(path):
    """Read one cohort's TPM CSV and prepend a `cancer_type` column.

    The CSV is transposed so that its columns become the row index (sliced into
    patient barcodes further down). The cohort label is the name of the
    directory that contains the file.
    """
    expr = pd.read_csv(path, index_col=0).T
    label = pd.DataFrame(
        {'cancer_type': os.path.basename(os.path.dirname(path))},
        index=expr.index,
    )
    return label.join(expr)


# One frame per cohort, stacked row-wise into a single table.
tpm_tbs = [_read_cohort_tpm(path) for path in tpms]
df1 = pd.concat(tpm_tbs, axis=0)

# The first 12 characters of each row label are used as the patient barcode;
# put that column first so the layout is
# [bcr_patient_barcode, cancer_type, <expression columns...>].
patient_ids = pd.DataFrame(
    {'bcr_patient_barcode': [label[:12] for label in df1.index]},
    index=df1.index,
)
df1 = patient_ids.join(df1)

In [6]:
# Shape of the expression part only: skip the two leading annotation columns
# (bcr_patient_barcode, cancer_type).
df1.iloc[:, 2:].shape

(11274, 60660)

### Step2. Sample type selection (Solid Tissue Normal)

In [16]:
# Stack the per-cohort sample annotation tables; the first column of each file
# becomes the index and is used below as the sample ID.
sdef = [pd.read_csv(sample, sep='\t', index_col=0) for sample in samples]
df_samples = pd.concat(sdef)

# Sample ID -> sample type lookup, saved for reuse. rename()/rename_axis()
# return new objects, so df_samples (whose index the column shares) keeps its
# own labels instead of being renamed as a side effect.
sample_id_type_map = (
    df_samples['definition']
    .rename('sample_type')
    .rename_axis('sample_id')
)
sample_id_type_map.to_pickle(os.path.join(tcga_data_path, 'sample_id_type.map'))

# Sample type categories. This notebook keeps only the normal tissue samples;
# the tumor list is defined for reference and is not used in the selection.
tumer_samples = ['Primary solid Tumor', 'Metastatic', 'Recurrent Solid Tumor',
                 'Primary Blood Derived Cancer - Peripheral Blood']
normal_samples = ['Solid Tissue Normal']

# NOTE: .loc with a list of labels raises KeyError if any selected sample ID has
# no row in df1, i.e. every normal sample is expected to have expression data.
sidx = sample_id_type_map[sample_id_type_map.isin(normal_samples)].index
df2 = df1.loc[sidx].sort_values('cancer_type')
df2[df2.columns[2:]].shape

(740, 60660)

### Step3. Remove samples with prior treatment or FFPE preservation (is_ffpe)

In [17]:
# Row masks over the sample annotation table.
untreated = df_samples['prior_treatment'].eq('No')
not_ffpe = df_samples['is_ffpe'].eq(False)

# Work with sets of sample IDs: a sample is kept when its ID appears in both.
no_prior_treatment_set = set(df_samples.loc[untreated].index)
no_is_ffpe_set = set(df_samples.loc[not_ffpe].index)
selected_samples = no_prior_treatment_set.intersection(no_is_ffpe_set)

df3 = df2.loc[df2.index.isin(selected_samples)]
# Shape of the expression part (without the two leading annotation columns).
df3.iloc[:, 2:].shape

(714, 60660)

### Step4. Samples to Patients (Average TPM values)


In [18]:
# Collapse samples to one row per patient: patients with a single sample are
# kept as they are, patients with several samples get the mean of their
# expression values.
s = df3.groupby(['bcr_patient_barcode']).size()   # samples per patient
rnacols = df3.columns[2:]                         # expression columns only
df_dup = df3[df3.bcr_patient_barcode.isin(s[s > 1].index)]
df_idt = df3[df3.bcr_patient_barcode.isin(s[s == 1].index)]

if df_dup.empty:
    # No patient has more than one sample; nothing to average.
    df_dup_idt = df_dup
else:
    # Average whenever any multi-sample patient exists (a single patient with
    # two samples counts too). The mean is restricted to the expression columns
    # so the string annotation columns are never passed to mean().
    df_dup_idt = (
        df_dup
        .groupby(['bcr_patient_barcode', 'cancer_type'])[list(rnacols)]
        .mean()
        .reset_index()
    )

df4 = pd.concat([df_idt, df_dup_idt]).sort_values('cancer_type').set_index('bcr_patient_barcode')
df4[df4.columns[1:]].shape

(714, 60660)

# Narrow Down Genes

In [19]:
# Restrict the normal-sample table to exactly the columns of the previously
# saved TPM.TABLE (presumably the tumor counterpart of this table; verify
# against the notebook that writes it), keeping that table's column order.
reference_table_path = os.path.join(tcga_data_path, 'TPM.TABLE')
df_cancer = pd.read_pickle(reference_table_path)
df7 = df4.loc[:, df_cancer.columns]
df7.shape

(714, 57269)

In [20]:
# Persist the column-matched normal-sample table next to the input data.
normal_tpm_path = os.path.join(tcga_data_path, 'NORMAL.TPM.TABLE')
df7.to_pickle(normal_tpm_path)

### Patient table with mutation columns

In [22]:
# Per-patient clinical and mutation tables saved earlier as pickles.
df_clinic = pd.read_pickle(os.path.join(tcga_data_path, 'df_patient_clinic.pkl'))

# Derive the short-named mutation columns, then keep only those six.
# `msi` is the label of whichever of the three MSI columns is largest per row.
df_mut = (
    pd.read_pickle(os.path.join(tcga_data_path, 'df_patient_mut.pkl'))
    .assign(
        msi=lambda d: d[['MSI-H', 'MSI-L', 'MSS']].idxmax(axis=1),
        tmb=lambda d: d['tmb_wang'],
        cnv=lambda d: d['scnv_jin_mean'],
        msi_score=lambda d: d['MSIsensorScore'],
    )
    [['tmb', 'cnv', 'msi', 'truncate_mut', 'nontruncate_mut', 'msi_score']]
)

# Left-join both tables onto the selected patients (index of df4), so every
# patient in df4 keeps a row even without clinical or mutation data.
df_patients = (
    df4[['cancer_type']]
    .join(df_clinic)
    .join(df_mut)
)
df_patients.to_pickle(os.path.join(tcga_data_path, 'NORMAL.PATIENTS.TABLE'))

In [23]:
# Rows: one per entry of df4's index; columns: cancer_type plus the joined
# clinical and mutation fields.
df_patients.shape

(714, 18)

In [24]:
%ls -lh /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/*PATIENTS.TABLE

-rw-rw-r-- 1 was966 zitnik  75K Oct 12 15:25 /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/NORMAL.PATIENTS.TABLE
-rw-rw-r-- 1 was966 zitnik 1.1M Oct 12 15:16 /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/PATIENTS.TABLE
