In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob


In [2]:
# Root directory of the GDC download; the globs below expect one sub-directory
# per cancer type underneath it. Set the TCGA_DATA_PATH environment variable to
# run the notebook against another copy of the data; the default is the
# original cluster location.
tcga_data_path = os.environ.get(
    'TCGA_DATA_PATH',
    '/n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37',
)

In [3]:
def _find_tables(filename):
    """Return the sorted paths of `filename` located one directory below `tcga_data_path`.

    `glob` yields paths in arbitrary, filesystem-dependent order; sorting makes
    the order of every downstream concatenation reproducible across machines.
    """
    return sorted(glob(os.path.join(tcga_data_path, '*', filename)))


# One list of per-cancer-type files for each kind of table.
tpms = _find_tables('rnaSeq_tpm_matrix.csv')
samples = _find_tables('rnaSeq_sample_table.txt')
genes = _find_tables('rnaSeq_gene_table.txt')
scnvs = _find_tables('SCNV_table.txt')
snvs = _find_tables('SNV_table.txt')
drugs = _find_tables('clinical_drug_table.txt')
patients = _find_tables('clinical_patient_table.txt')
proteins = _find_tables('proteome_table.txt')

In [4]:
# Number of files found for each table type, in the order they were globbed.
tuple(len(paths) for paths in (tpms, samples, genes, scnvs, snvs, drugs, patients, proteins))

(33, 33, 33, 33, 33, 33, 33, 32)

# Narrow Down Samples

### Step1. TCGA (33 cancers) RNA-seq TPM matrix


In [5]:
# Load every per-cancer TPM matrix as samples x genes and label its rows with
# the cancer type, taken from the name of the directory holding the file.
tpm_tbs = []
for tpm in tpms:
    cancer_type = os.path.basename(os.path.dirname(tpm))
    df = pd.read_csv(tpm, index_col=0).T  # transpose so that rows are samples
    dft = pd.DataFrame({'cancer_type': cancer_type}, index=df.index)
    tpm_tbs.append(dft.join(df))

# Stack all cohorts, then prepend the patient barcode (the first 12 characters
# of the sample id) so the columns read: barcode, cancer_type, genes...
df1 = pd.concat(tpm_tbs, axis=0)
bcr_patient_barcode = df1.index.str[:12]
bcode = pd.DataFrame({'bcr_patient_barcode': bcr_patient_barcode}, index=df1.index)
df1 = bcode.join(df1)

In [6]:
# (samples, genes): skip the two leading annotation columns
# (bcr_patient_barcode, cancer_type).
df1.iloc[:, 2:].shape

(11274, 60660)

### Step2. Sample type selection (drop Solid Tissue Normal, keep all tumor sample types)

In [7]:
# Stack the per-cohort sample tables (indexed by their first column).
sdef = [pd.read_csv(sample, sep='\t', index_col=0) for sample in samples]
df_samples = pd.concat(sdef)

# sample_id -> sample_type map. rename/rename_axis return new objects, so the
# `definition` column and the index of df_samples are left untouched
# (assigning .name / .index.name would rename them in place as well).
sample_id_type_map = df_samples['definition'].rename('sample_type').rename_axis('sample_id')
sample_id_type_map.to_pickle(os.path.join(tcga_data_path, 'sample_id_type.map'))

# Tumor sample types, listed for reference only: the filter below does not use
# this list, it keeps every sample whose type is not in `normal_samples`.
tumer_samples = ['Primary solid Tumor', 'Metastatic', 'Recurrent Solid Tumor',
                 'Primary Blood Derived Cancer - Peripheral Blood']
normal_samples = ['Solid Tissue Normal']

# Drop normal-tissue samples. Membership filtering (as in Step 3) avoids the
# KeyError that label-based .loc raises when a sample id from the sample table
# is missing from the TPM matrix.
sidx = sample_id_type_map[~sample_id_type_map.isin(normal_samples)].index
df2 = df1[df1.index.isin(sidx)].sort_values('cancer_type')
df2[df2.columns[2:]].shape

(10534, 60660)

### Step3. Remove samples with prior treatment and FFPE samples (`is_ffpe`)

In [8]:
# Sample ids flagged as untreated, and sample ids flagged as not FFPE.
untreated = df_samples['prior_treatment'].eq('No')
not_ffpe = df_samples['is_ffpe'].eq(False)
no_prior_treatment_set = set(df_samples.index[untreated.values])
no_is_ffpe_set = set(df_samples.index[not_ffpe.values])

# Keep only the samples that satisfy both conditions.
selected_samples = no_prior_treatment_set.intersection(no_is_ffpe_set)
df3 = df2.loc[df2.index.isin(selected_samples)]
df3.iloc[:, 2:].shape

(10305, 60660)

### Step4. Samples to Patients (Average TPM values)


In [9]:
# Number of remaining RNA-seq samples per patient.
s = df3.groupby(['bcr_patient_barcode']).size()
# Gene columns: everything after bcr_patient_barcode and cancer_type.
rnacols = df3.columns[2:]
df_dup = df3[df3.bcr_patient_barcode.isin(s[s > 1].index)]   # patients with several samples
df_idt = df3[df3.bcr_patient_barcode.isin(s[s == 1].index)]  # patients with exactly one sample

# Average the TPM values of patients with several samples. Only the gene
# columns are aggregated: calling DataFrame.mean() on the whole group relied
# on pandas silently dropping the string columns, which raises a TypeError
# from pandas 2.0 on, and the built-in GroupBy.mean is also much faster than
# a Python-level apply.
df_dup_idt = (
    df_dup.groupby(['bcr_patient_barcode', 'cancer_type'])[rnacols]
    .mean()
    .reset_index()
)

# One row per patient, indexed by barcode; column 0 is cancer_type.
df4 = pd.concat([df_idt, df_dup_idt]).sort_values('cancer_type').set_index('bcr_patient_barcode')
df4[df4.columns[1:]].shape

(10184, 60660)

# Narrow Down Genes

### Step5. Unique gene id–name mapping (per gene name, keep the gene id with the largest variance)

In [10]:
# Collect the gene_id -> gene_name and gene_id -> gene_type columns from every
# cohort's gene table.
names = []
types = []
for gf in genes:
    g = pd.read_csv(gf,sep='\t').set_index('gene_id')
    gname = g.gene_name
    gtype = g.gene_type
    names.append(gname)
    types.append(gtype)
# The same (gene_id, value) pair is repeated once per cohort; keep one copy.
gene_id_name_map = pd.concat(names).reset_index().drop_duplicates().set_index('gene_id').gene_name
gene_id_type_map = pd.concat(types).reset_index().drop_duplicates().set_index('gene_id').gene_type
gene_id_name_map.to_pickle(os.path.join(tcga_data_path, 'gene_id_name.map'))
gene_id_type_map.to_pickle(os.path.join(tcga_data_path, 'gene_id_type.map'))

# Variance of log2(TPM + 1) per gene id across the patient-level matrix
# (column 0 of df4 is cancer_type and is skipped).
dfv = np.log2(df4[df4.columns[1:]] + 1)
v = dfv.var().to_frame(name='variance')
v['gene_name'] = v.index.map(gene_id_name_map)
## use gene id with the largest variance
gene_name_id_map = v.groupby('gene_name')['variance'].idxmax()
# Keep the map unnamed so that the maps derived from it via .map() do not
# inherit the column name 'variance'.
gene_name_id_map.name = None
gene_name_type_map = gene_name_id_map.map(gene_id_type_map)

gene_name_id_map.to_pickle(os.path.join(tcga_data_path, 'gene_name_id.map'))
gene_name_type_map.to_pickle(os.path.join(tcga_data_path, 'gene_name_type.map'))

### define the gene supertypes
# Every gene type is 'other' unless its name contains 'pseudogene' or it is
# exactly 'protein_coding' or 'lncRNA'.
gene_type = gene_id_type_map.value_counts()
pseudogenes = [i for i in gene_type.index if 'pseudogene' in i]
s = pd.DataFrame(index=gene_id_type_map.unique())
s['gene_supertype'] = 'other'
s.loc[pseudogenes, 'gene_supertype'] = 'pseudogene'
s.loc['protein_coding', 'gene_supertype'] = 'protein_coding'
s.loc['lncRNA', 'gene_supertype'] = 'lncRNA'
gene_type_supertype_map = s.gene_supertype

gene_id_supertype_map = gene_id_type_map.map(gene_type_supertype_map)
gene_name_supertype_map = gene_name_id_map.map(gene_id_supertype_map)
gene_id_supertype_map.to_pickle(os.path.join(tcga_data_path, 'gene_id_supertype.map'))
gene_name_supertype_map.to_pickle(os.path.join(tcga_data_path, 'gene_name_supertype.map'))

# ensid = gene id with everything from the first '.' onwards removed.
gene_ensid_name_map = gene_id_name_map.to_frame().reset_index()
gene_ensid_name_map['ensid'] = gene_ensid_name_map.gene_id.str.split('.').str[0]
# NOTE(review): Series.drop_duplicates() de-duplicates on the gene_name VALUES,
# not on the ensid index, so only the first ensid seen for each gene name is
# kept. Confirm that one row per gene name is the intended result.
gene_ensid_name_map = gene_ensid_name_map.set_index('ensid')['gene_name'].drop_duplicates()
gene_ensid_name_map.to_pickle(os.path.join(tcga_data_path, 'gene_ensid_name.map'))

###### gene table
# Indexed by ensid; ensid_v36 is the max-variance gene id chosen for the name.
df_gene = gene_ensid_name_map.to_frame()
df_gene['ensid_v36'] = df_gene.gene_name.map(gene_name_id_map)
df_gene['gene_type'] = df_gene.gene_name.map(gene_name_type_map)
df_gene['gene_supertype'] = df_gene.gene_name.map(gene_name_supertype_map)
df_gene.to_pickle(os.path.join(tcga_data_path, 'GENE.TABLE'))

### Step5b. Remove genes whose total TPM is zero across all samples

In [15]:
# Gene-only view of the patient matrix (column 0 of df4 is cancer_type).
df5 = df4.loc[:, df4.columns[1:]]
# Total TPM per gene across all patients; keep the genes whose total is non-zero.
s = df5.sum()
expressed_genes = s.index[(s != 0).values]
df6 = df5[expressed_genes]
df6.shape

(10184, 58379)

### Step6. Keep unique genes (one gene id per gene name)

In [16]:
# Keep the gene ids selected in the gene table (one id per gene name).
# Iterating df6.columns instead of a set intersection keeps the original column
# order, so the saved matrix has the same layout on every run (set iteration
# order of strings changes between interpreter sessions).
keep_ids = set(df_gene.ensid_v36)
cols = [gene_id for gene_id in df6.columns if gene_id in keep_ids]
df7 = df6[cols]
df7.shape

(10184, 57269)

In [18]:
# Persist the final patients x genes TPM matrix.
tpm_table_path = os.path.join(tcga_data_path, 'TPM.TABLE')
df7.to_pickle(tpm_table_path)

### Patient table with mutation columns

In [67]:
# Clinical columns carried into the patient table.
scols = ['project', 'gender', 'height', 'weight', 'age_at_initial_pathologic_diagnosis', 'race_list', 
         'vital_status', 'days_to_last_followup', 'days_to_death', 'person_neoplasm_cancer_status', 
         'year_of_initial_pathologic_diagnosis', 'stage_event_clinical_stage', ]

# Stack the per-cohort clinical tables and index them by patient barcode.
pt_tbs = [pd.read_csv(path, sep='\t') for path in patients]
df_patients = pd.concat(pt_tbs, axis=0).set_index('bcr_patient_barcode')

# Restrict to the selected columns and to the patients of the TPM matrix
# (in df4 row order); 'project' is exposed as 'cancer_type'.
df_patients = df_patients.loc[df4.index, scols].rename(columns={'project': 'cancer_type'})

# Append the per-patient mutation columns, skipping the first column of df_mut.
# NOTE(review): df_patient_mut.pkl is not created in the visible part of this
# notebook; presumably it is produced by a separate step. Confirm it exists
# before running this cell.
df_mut = pd.read_pickle(os.path.join(tcga_data_path, 'df_patient_mut.pkl'))
mut_cols = df_mut.columns[1:]
df_patients = df_patients.join(df_mut[mut_cols])
df_patients.to_pickle(os.path.join(tcga_data_path, 'PATIENTS.TABLE'))

In [82]:
# (patients, columns); rows were selected with df4.index, so the row count
# should match the patient-level TPM matrix.
df_patients.shape

(10184, 22)

In [19]:
ls -lh /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/*.TABLE

-rw-rw-r-- 1 was966 zitnik 3.4M Sep  1 14:32 /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/GENE.TABLE
-rw-rw-r-- 1 was966 zitnik 1.7M Sep  1 13:33 /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/PATIENTS.TABLE
-rw-rw-r-- 1 was966 zitnik 4.4G Sep  1 14:35 /n/data1/hms/dbmi/zitnik/lab/users/was966/TCGA/GDC_v37/TPM.TABLE
