In [1]:
import cdapython as cda
import pandas as pd

In [2]:
# Fetch every research-project name known to the CDA and show the ones
# whose name contains "TCGA" as a single comma-separated string.
cohorts = cda.unique_terms('ResearchSubject.member_of_research_project')
', '.join(cohort for cohort in cohorts if 'TCGA' in cohort)

'CPTAC-TCGA, TCGA-ACC, TCGA-BLCA, TCGA-BRCA, TCGA-CESC, TCGA-CHOL, TCGA-COAD, TCGA-DLBC, TCGA-ESCA, TCGA-GBM, TCGA-HNSC, TCGA-KICH, TCGA-KIRC, TCGA-KIRP, TCGA-LAML, TCGA-LGG, TCGA-LIHC, TCGA-LUAD, TCGA-LUSC, TCGA-MESO, TCGA-OV, TCGA-PAAD, TCGA-PCPG, TCGA-PRAD, TCGA-READ, TCGA-SARC, TCGA-SKCM, TCGA-STAD, TCGA-TGCT, TCGA-THCA, TCGA-THYM, TCGA-UCEC, TCGA-UCS, TCGA-UVM'

In [11]:
# Research project (cohort) to build the BAM manifest for. It is used in the
# CDA query, in the output file name, and to gate the sequencing-type
# annotation, which is only implemented for "TCGA-SKCM".
TCGA_cohort = "TCGA-SKCM"

In [4]:
def get_cohort_research_subjects(cohort_name, query_limit=5000):
    """Query the CDA for the members of one research project.

    The query text is printed before it is executed so the notebook output
    records exactly what was asked.

    Parameters
    ----------
    cohort_name : str
        Value matched against ``ResearchSubject.member_of_research_project``.
    query_limit : int, default 5000
        Forwarded to ``Q.run`` as ``limit``.

    Returns
    -------
    Whatever ``cda.Q(...).run(...)`` returns; ``create_file_records`` iterates
    over it as a sequence of nested dict-like records.
    """
    cohort_filter = f'ResearchSubject.member_of_research_project = "{cohort_name}"'
    print(cohort_filter)
    return cda.Q(cohort_filter).run(limit=query_limit)

def create_file_records(research_subjects, return_dataframe=False):
    """Flatten nested query results into one flat record per (specimen, file).

    Parameters
    ----------
    research_subjects : iterable of dict-like
        Each item must provide ``'id'`` and ``'ResearchSubject'``. Only the
        FIRST entry of the ``'ResearchSubject'`` list is read; its
        ``'Specimen'`` list is walked, and for every specimen its ``'File'``
        list is walked.
    return_dataframe : bool, default False
        When true, return a ``pandas.DataFrame`` built from the records
        instead of the plain list.

    Returns
    -------
    list of dict, or pandas.DataFrame
        One entry per file, carrying the subject id, the specimen fields and
        the file fields. ``gdc_identifier`` is the ``'value'`` of the first
        element of the file's ``'identifier'`` list.
        NOTE(review): presumably that first identifier is the GDC one -
        confirm against the CDA schema.
    """
    def _iter_rows():
        for subject in research_subjects:
            subject_id = subject['id']
            for sample in subject['ResearchSubject'][0]['Specimen']:
                # Fields shared by every file attached to this specimen.
                sample_fields = {
                    'research_subject_id': subject_id,
                    'specimen_id': sample['id'],
                    'specimen_source_material_type': sample['source_material_type'],
                    'specimen_type': sample['specimen_type'],
                }
                for entry in sample['File']:
                    yield {
                        **sample_fields,
                        'file_id': entry['id'],
                        'file_format': entry['file_format'],
                        'label': entry['label'],
                        'data_category': entry['data_category'],
                        'data_modality': entry['data_modality'],
                        'data_type': entry['data_type'],
                        'checksum': entry['checksum'],
                        'byte_size': entry['byte_size'],
                        'gdc_identifier': entry['identifier'][0]['value'],
                    }

    rows = list(_iter_rows())
    return pd.DataFrame(rows) if return_dataframe else rows
    
def create_gdc_manifest(dataframe, state='validated'):
    """Build a GDC-style download manifest from a file-record table.

    The manifest has the columns ``id``, ``filename``, ``md5``, ``size`` and
    ``state``, in that order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``file_id``, ``label``, ``checksum`` and
        ``byte_size``. Any other columns are dropped. The input is not
        modified.
    state : str, default 'validated'
        Value written to the ``state`` column of every row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the manifest columns.

    Raises
    ------
    KeyError
        If one of the required source columns is missing.
    """
    # source column -> manifest column
    column_map = {
        'file_id': 'id',
        'label': 'filename',
        'checksum': 'md5',
        'byte_size': 'size',
    }

    return (dataframe
            .rename(columns=column_map)
            # An explicit list gives .loc a plain list-like column selector.
            .loc[:, list(column_map.values())]
            .assign(state=state)
           )

In [5]:
# Query the CDA for the selected cohort, then flatten the nested result into
# a DataFrame with one row per (specimen, file) pair.
research_subjects = get_cohort_research_subjects(TCGA_cohort)
file_records = create_file_records(research_subjects, return_dataframe=True)

ResearchSubject.member_of_research_project = "TCGA-READ"
Getting results from database

Total execution time: 23815 ms


In [6]:
# Show which columns the flattened file table provides.
file_records.columns

Index(['research_subject_id', 'specimen_id', 'specimen_source_material_type',
       'specimen_type', 'file_id', 'file_format', 'label', 'data_category',
       'data_modality', 'data_type', 'checksum', 'byte_size',
       'gdc_identifier'],
      dtype='object')

In [7]:
# Row masks: files attached to an aliquot-level specimen, files in BAM format,
# and the rows satisfying both. The final line displays how many rows match.
idx_aliquot = file_records['specimen_type'] == 'aliquot'
idx_bam = file_records['file_format'] == 'BAM'
idx = idx_aliquot & idx_bam
idx.value_counts()

False    44755
True       678
dtype: int64

In [8]:
# Keep only the aliquot-level BAM rows and renumber them from 0.
bams = file_records.loc[idx, :].reset_index(drop=True)

In [9]:
# Manually annotate the sequencing type.
# This is not yet captured in the CDA and would be a wonderful feature to add.
# The GDC has two additional metadata fields for files:
#   Experimental Strategy - this seems to be the sequencing type
#   Platform - this seems to be the sequencing platform
# Until those are available, the type is guessed from the file name (label).
if TCGA_cohort == 'TCGA-SKCM':
    bams['sequencing_type'] = pd.NA

    # na=False: a row with a missing label matches nothing, instead of
    # putting an NA into the mask (which .loc refuses to index with).
    idx_mi_rna_seq = bams['label'].str.contains('mirna', na=False)
    bams.loc[idx_mi_rna_seq, 'sequencing_type'] = 'miRNA-Seq'

    idx_atac_seq = bams['label'].str.contains('atac', na=False)
    bams.loc[idx_atac_seq, 'sequencing_type'] = 'ATAC-Seq'

    # Labels made of exactly four dot-separated parts are treated as WES.
    # This rule runs last, so it overrides the two rules above on overlap.
    idx_wes = bams['label'].str.split('.').str.len().eq(4)
    bams.loc[idx_wes, 'sequencing_type'] = 'WES'

    # Everything still unlabelled is assumed to be RNA-seq. Assign the result
    # back to the column: fillna(inplace=True) on a column selection acts on
    # a temporary object and does not reliably update `bams`.
    bams['sequencing_type'] = bams['sequencing_type'].fillna('RNA-seq')

In [10]:
# Build the manifest table and write it as a tab-separated file named after
# the cohort. index=False keeps the pandas row index out of the file, so the
# first column is the file id rather than an unnamed counter.
manifest = create_gdc_manifest(bams)
manifest.to_csv(f'{TCGA_cohort}.bams.created_with_cda.tsv', sep='\t', index=False)