## Creates a map from GDC file UUIDs to patient and sample barcodes
- adapted from: https://seandavi.github.io/post/2017/12/genomicdatacommons-example-uuid-to-tcga-and-target-barcode-translation/
- other resources: https://bioconductor.org/packages/devel/bioc/manuals/GenomicDataCommons/man/GenomicDataCommons.pdf

In [1]:
library(GenomicDataCommons)
library(magrittr)

Loading required package: magrittr

Attaching package: ‘GenomicDataCommons’

The following object is masked from ‘package:stats’:

    filter



In [20]:
# Translate GDC file UUIDs into patient / sample barcodes.
#
# Queries the GDC `files` endpoint for the given UUIDs and flattens the answer
# into one row per returned file.
#
# Arguments:
#   file_ids  vector of GDC file UUIDs (character, or a factor as produced by
#             read.csv on older R versions).
#   legacy    passed straight through to GenomicDataCommons::files().
#
# Returns a data.frame with the columns file_id, file_name, barcode,
# sample_barcode, disease, type and data_format. `barcode` / `sample_barcode`
# are NA when a file is not tied to exactly one value (e.g. VCF files that
# reference several samples), because then the barcode cannot tell whether the
# file is a tumor or a normal sample.
TCGAtranslateID <- function(file_ids, legacy = FALSE) {
    # Shape of the result when there is nothing to report.
    empty_result <- data.frame(file_id = character(0),
                               file_name = character(0),
                               barcode = character(0),
                               sample_barcode = character(0),
                               disease = character(0),
                               type = character(0),
                               data_format = character(0))
    if (length(file_ids) == 0) {
        return(empty_result)
    }

    # The filter needs the UUID strings themselves, not factor codes.
    file_ids <- as.character(file_ids)

    # results() only returns the first page (10 records by default), which
    # silently truncates larger requests; results_all() fetches every match.
    info <- files(legacy = legacy) %>%
        filter( ~ file_id %in% file_ids) %>%
        select(c('cases.submitter_id', 'file_name', 'data_format', 'cases.samples.submitter_id', 'type', 'cases.project.project_id')) %>%
        results_all()

    ### to view all possible fields, inspect:
    #     files(legacy = legacy) %>%
    #         filter( ~ file_id %in% file_ids) %>%
    #         select(available_fields('files'))

    if (length(info$file_id) == 0) {
        return(empty_result)
    }

    # Collapse an extracted field to exactly one string per file. Anything
    # that is not a single value (several barcodes, or none at all) becomes NA
    # so every column keeps one entry per file.
    single_or_na <- function(value) {
        if (length(value) == 1) as.character(value) else NA_character_
    }
    extract_field <- function(position) {
        vapply(info$cases,
               function(a) single_or_na(a[[position]][[1]][[1]]),
               character(1),
               USE.NAMES = FALSE)
    }

    # NOTE(review): the positions below assume each `cases` entry is ordered
    # project (1), samples (2), submitter_id (3) -- confirm against the API
    # response if the selected fields ever change.
    disease <- extract_field(1)         # patient disease type / project id
    sample_barcode <- extract_field(2)  # sample barcode, NA if not unique
    barcode <- extract_field(3)         # patient barcode, NA if not unique

    # And build the data.frame
    data.frame(file_id = info$file_id,
               file_name = info$file_name,
               barcode = barcode,
               sample_barcode = sample_barcode,
               disease = disease,
               type = info$type,
               data_format = info$data_format)
}

In [21]:
# Smoke test: translate one VCF file UUID and one BAM file UUID in a single call.
TCGAtranslateID(
    c(
        '88be35fc-3a1d-47ad-93b4-90d786341be7',
        'cb4a2233-e962-4e43-b1d1-e12e3c142476'
    )
)
# Single-UUID variants of the same check:
# TCGAtranslateID('cb4a2233-e962-4e43-b1d1-e12e3c142476')
# TCGAtranslateID('23fa7b4b-9d68-429b-aece-658b11124bb3')

file_id,file_name,barcode,sample_barcode,disease,type,data_format
88be35fc-3a1d-47ad-93b4-90d786341be7,88be35fc-3a1d-47ad-93b4-90d786341be7.vep.vcf.gz,TCGA-VM-A8CH,,TCGA-LGG,annotated_somatic_mutation,VCF
cb4a2233-e962-4e43-b1d1-e12e3c142476,TCGA-OR-A5KS-01A-11D-A30A-10_Illumina_gdc_realn.bam,TCGA-OR-A5KS,TCGA-OR-A5KS-01A,TCGA-ACC,aligned_reads,BAM


In [22]:
# Read one tab-separated GDC manifest into a data frame.
read_manifest <- function(path) {
    read.csv(path, sep='\t', row.names=NULL)
}

# One manifest per GDC data category.
biospecimen_df   <- read_manifest('~/GDC_barcodes/gdc_manifest.2018-05-14_biospecimen.txt')
clinical_df      <- read_manifest('~/GDC_barcodes/gdc_manifest.2018-05-14_clinical.txt')
snv_df           <- read_manifest('~/GDC_barcodes/gdc_manifest.2018-05-14_SNV.txt')
cnv_df           <- read_manifest('~/GDC_barcodes/gdc_manifest.2018-05-14_CNV.txt')
methyl_df        <- read_manifest('~/GDC_barcodes/gdc_manifest.2018-05-14_DNA_methylation.txt')
raw_df           <- read_manifest('~/GDC_barcodes/gdc_manifest.2018-05-14_raw_seq_data.txt')
transcriptome_df <- read_manifest('~/GDC_barcodes/gdc_manifest.2018-05-14_transcriptome_profiling.txt')

In [23]:
# The biospecimen and clinical manifests are left out for now because of
# issues with their UUIDs; stack the remaining manifests row-wise.
manifests_to_stack <- list(snv_df, cnv_df, methyl_df, raw_df, transcriptome_df)
total_df <- do.call(rbind, manifests_to_stack)
head(total_df, n = 6)

id,filename,md5,size,state
88be35fc-3a1d-47ad-93b4-90d786341be7,88be35fc-3a1d-47ad-93b4-90d786341be7.vep.vcf.gz,d07cf83738686ae3053fe72e95c409e1,165208,live
ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c,ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c.vep.vcf.gz,35f2de2ef0304fe3ee8bb1229eb4c3b8,1069229,live
0b1c205c-c227-43dd-a308-1a46b4f733df,0b1c205c-c227-43dd-a308-1a46b4f733df.vep.vcf.gz,9af292e502a6419905c3f25763e36b9c,110968,live
9c26fefe-1876-41d5-ae27-a665ed72643f,9c26fefe-1876-41d5-ae27-a665ed72643f.vcf.gz,fb3358bdea79e85074b9ab99845a790a,198682,live
2c9c09c4-4849-4333-b1ad-53699fde6072,2c9c09c4-4849-4333-b1ad-53699fde6072.vep.vcf.gz,5c289ee2e699d1b856c31735e1134a43,181009,live
0f7e2d94-046d-437c-aa42-12b79e4bf80a,0f7e2d94-046d-437c-aa42-12b79e4bf80a.vcf.gz,8bbec722d7fbe95cbdfdb880de4a4b8d,295326,live


In [24]:
# Number of manifest rows sent to TCGAtranslateID per request.
step <- 10000
# First row of every chunk: 1, 1 + step, 1 + 2 * step, ...
indices <- seq(from = 1, to = nrow(total_df), by = step)

In [25]:
# TCGAtranslateID does not seem to be able to handle more than ~10,000 UUIDs
# at a time, so the manifest is translated in chunks of `step` rows.
#
# Each chunk's result is stored in a pre-allocated list and the pieces are
# bound together once at the end. This avoids re-copying the growing data
# frame on every iteration and no longer depends on the first chunk
# (index == 1) having been processed to initialise `total_output`.
n_rows <- nrow(total_df)
chunk_results <- vector("list", length(indices))

for (chunk in seq_along(indices)) {
    start <- indices[chunk]
#     if (start < 19991) {
#         next
#     }
    # The last chunk is usually shorter than `step`.
    end <- min(start + step - 1, n_rows)

    # format() keeps round numbers readable (100000 instead of 1e+05).
    print(paste("start: ", format(start, scientific = FALSE),
                " end: ", format(end, scientific = FALSE)))

    temp_uuids <- total_df[start:end, 'id']
    temp_barcode_df <- TCGAtranslateID(temp_uuids)
    chunk_results[[chunk]] <- temp_barcode_df
}

total_output <- do.call(rbind, chunk_results)

[1] "start:  1  end:  10000"
[1] "start:  10001  end:  20000"
[1] "start:  20001  end:  30000"
[1] "start:  30001  end:  40000"
[1] "start:  40001  end:  50000"
[1] "start:  50001  end:  60000"
[1] "start:  60001  end:  70000"
[1] "start:  70001  end:  80000"
[1] "start:  80001  end:  90000"
[1] "start:  90001  end:  1e+05"
[1] "start:  100001  end:  110000"
[1] "start:  110001  end:  120000"
[1] "start:  120001  end:  130000"
[1] "start:  130001  end:  140000"
[1] "start:  140001  end:  150000"
[1] "start:  150001  end:  160000"
[1] "start:  160001  end:  170000"
[1] "start:  170001  end:  180000"
[1] "start:  180001  end:  190000"
[1] "start:  190001  end:  2e+05"
[1] "start:  200001  end:  210000"
[1] "start:  210001  end:  220000"
[1] "start:  220001  end:  230000"
[1] "start:  230001  end:  240000"
[1] "start:  240001  end:  250000"
[1] "start:  250001  end:  260000"
[1] "start:  260001  end:  270000"
[1] "start:  270001  end:  280000"
[1] "start:  280001  end:  288277"


In [28]:
# Preview the first 20 translated rows.
head(total_output, n = 20)

file_id,file_name,barcode,sample_barcode,disease,type,data_format
2c9c09c4-4849-4333-b1ad-53699fde6072,2c9c09c4-4849-4333-b1ad-53699fde6072.vep.vcf.gz,TARGET-30-PARMLF,,TARGET-NBL,annotated_somatic_mutation,VCF
3513cd49-6a46-4265-ac5e-029b0fe219e1,3513cd49-6a46-4265-ac5e-029b0fe219e1.vcf.gz,TARGET-30-PASFGG,,TARGET-NBL,simple_somatic_mutation,VCF
c6d73185-de17-4184-bb19-f94a15d765c2,c6d73185-de17-4184-bb19-f94a15d765c2.vep.vcf.gz,TARGET-30-PALPGG,,TARGET-NBL,annotated_somatic_mutation,VCF
83a84c6c-8f39-4837-b583-8ec5696be5e4,83a84c6c-8f39-4837-b583-8ec5696be5e4.vep.vcf.gz,TARGET-30-PARSBI,,TARGET-NBL,annotated_somatic_mutation,VCF
d8a2f831-eec1-4888-94be-738bb6b914f4,d8a2f831-eec1-4888-94be-738bb6b914f4.vep.vcf.gz,TARGET-30-PARJMX,,TARGET-NBL,annotated_somatic_mutation,VCF
78398f5d-c956-43ba-a5d6-b90b096323af,78398f5d-c956-43ba-a5d6-b90b096323af.vep.vcf.gz,TARGET-30-PARDCK,,TARGET-NBL,annotated_somatic_mutation,VCF
b00e2179-07c9-46ef-ba03-f349e10dc72d,b00e2179-07c9-46ef-ba03-f349e10dc72d.vep.vcf.gz,TARGET-30-PASTKC,,TARGET-NBL,annotated_somatic_mutation,VCF
27db172d-0969-4910-80f8-88f72d374f9a,27db172d-0969-4910-80f8-88f72d374f9a.vcf.gz,TARGET-30-PAPNEP,,TARGET-NBL,simple_somatic_mutation,VCF
0b459668-f0c1-47af-af32-9cf61334a9a0,0b459668-f0c1-47af-af32-9cf61334a9a0.vcf.gz,TARGET-30-PANRHJ,,TARGET-NBL,simple_somatic_mutation,VCF
1d93b566-a62f-4d16-9180-cc66f17d3c75,1d93b566-a62f-4d16-9180-cc66f17d3c75.vcf,TARGET-30-PASNEF,,TARGET-NBL,simple_somatic_mutation,VCF


In [27]:
# Save the UUID -> barcode map as a tab-separated file without quoting or row names.
write.table(
    total_output,
    file = '~/GDC_barcodes/uuid_barcode_map.txt',
    sep = '\t',
    quote = FALSE,
    row.names = FALSE
)