## Creates map of sample UUIDs and barcodes
- adapted from: https://seandavi.github.io/post/2017/12/genomicdatacommons-example-uuid-to-tcga-and-target-barcode-translation/
- other resources: https://bioconductor.org/packages/3.7/bioc/manuals/GenomicDataCommons/man/GenomicDataCommons.pdf

In [10]:
library(GenomicDataCommons)
library(magrittr)

In [357]:
TCGAtranslateID = function(file_ids, legacy = FALSE) {
    info = files(legacy = legacy) %>%
        filter( ~ file_id %in% file_ids) %>%
        select(c('cases.submitter_id', 'file_name', 'data_format', 'cases.samples.submitter_id', 'type')) %>%
        results_all()
#     print(info)
    
    ### to view all possible fields, uncomment the following block
#     info2 = files(legacy = legacy) %>%
#         filter( ~ file_id %in% file_ids) %>%
#         select(available_fields('files')) #%>%
#     print(info2)
    
    file_id  = info$file_id
    file_name = info$file_name
    data_format = info$data_format
    type = info$type
    
    ### gets patient barcode
    barcode_list <- (lapply(info$cases, function(a) { a[[2]][[1]][[1]] }))
    barcode_lengths <- lapply(barcode_list, length)
    barcode_mask <- unlist(lapply(barcode_lengths, function(a) {a > 1}))
    barcode_list[barcode_mask] <- NA
    
    '''
    gets sample barcodes - if has more than 1 barcode (e.g. VCF files) then it is not the 
    type of experiment (e.g. BAM) where barcode can tell you if it is tumor/normal sample
    '''
    sample_barcode_list <- (lapply(info$cases, function(a) {a[[1]][[1]][[1]] }))
    sample_barcode_lengths <- lapply(sample_barcode_list, length)
    sample_barcode_mask <- unlist(lapply(sample_barcode_lengths, function(a) {a > 1}))
    sample_barcode_list[sample_barcode_mask] <- NA
    
    
#     print('----------')
#     print(paste('fild_id: ', file_id))
#     print(paste('file_name: ', file_name))
#     print(paste('data_format:', data_format))
#     print(paste('barcode:', unlist(barcode_list)))
#     print(paste('sample_barcode:', unlist(sample_barcode_list)))
#     print('----------')
    
    
    # And build the data.frame
    return(data.frame(file_id = file_id,
                      file_name = file_name, 
                      barcode = unlist(barcode_list, use.names=F), 
                      sample_barcode = unlist(sample_barcode_list, use.names=F),
                      type = type,
                      data_format = data_format))
    }

ERROR: Error in parse(text = x, srcfile = src): <text>:25:7: unexpected string constant
27:     type of experiment (e.g. BAM) where barcode can tell you if it is tumor/normal sample
28:     '
          ^


In [354]:
TCGAtranslateID(c('88be35fc-3a1d-47ad-93b4-90d786341be7', 'cb4a2233-e962-4e43-b1d1-e12e3c142476'))
# TCGAtranslateID('cb4a2233-e962-4e43-b1d1-e12e3c142476')
# TCGAtranslateID('23fa7b4b-9d68-429b-aece-658b11124bb3')

file_id,file_name,barcode,sample_barcode,type,data_format
88be35fc-3a1d-47ad-93b4-90d786341be7,88be35fc-3a1d-47ad-93b4-90d786341be7.vep.vcf.gz,TCGA-VM-A8CH,,annotated_somatic_mutation,VCF
cb4a2233-e962-4e43-b1d1-e12e3c142476,TCGA-OR-A5KS-01A-11D-A30A-10_Illumina_gdc_realn.bam,TCGA-OR-A5KS,TCGA-OR-A5KS-01A,aligned_reads,BAM


In [309]:
biospecimen_df <- read.csv('~/GDC_barcodes/gdc_manifest.2018-05-14_biospecimen.txt', sep='\t', row.names=NULL)
clinical_df <- read.csv('~/GDC_barcodes/gdc_manifest.2018-05-14_clinical.txt', sep='\t', row.names=NULL)
snv_df <- read.csv('~/GDC_barcodes/gdc_manifest.2018-05-14_SNV.txt', sep='\t', row.names=NULL)
cnv_df <- read.csv('~/GDC_barcodes/gdc_manifest.2018-05-14_CNV.txt', sep='\t', row.names=NULL)
methyl_df <- read.csv('~/GDC_barcodes/gdc_manifest.2018-05-14_DNA_methylation.txt', sep='\t', row.names=NULL)
raw_df <- read.csv('~/GDC_barcodes/gdc_manifest.2018-05-14_raw_seq_data.txt', sep='\t', row.names=NULL)
transcriptome_df <- read.csv('~/GDC_barcodes/gdc_manifest.2018-05-14_transcriptome_profiling.txt', sep='\t', row.names=NULL)

In [337]:
# issues with biospecimen and clinical uuid - skip for now
total_df <- rbind(snv_df, cnv_df, methyl_df, raw_df, transcriptome_df)
head(total_df)

id,filename,md5,size,state
88be35fc-3a1d-47ad-93b4-90d786341be7,88be35fc-3a1d-47ad-93b4-90d786341be7.vep.vcf.gz,d07cf83738686ae3053fe72e95c409e1,165208,live
ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c,ad3dd7e9-808a-4e4b-8d43-dc3de68c8c5c.vep.vcf.gz,35f2de2ef0304fe3ee8bb1229eb4c3b8,1069229,live
0b1c205c-c227-43dd-a308-1a46b4f733df,0b1c205c-c227-43dd-a308-1a46b4f733df.vep.vcf.gz,9af292e502a6419905c3f25763e36b9c,110968,live
9c26fefe-1876-41d5-ae27-a665ed72643f,9c26fefe-1876-41d5-ae27-a665ed72643f.vcf.gz,fb3358bdea79e85074b9ab99845a790a,198682,live
2c9c09c4-4849-4333-b1ad-53699fde6072,2c9c09c4-4849-4333-b1ad-53699fde6072.vep.vcf.gz,5c289ee2e699d1b856c31735e1134a43,181009,live
0f7e2d94-046d-437c-aa42-12b79e4bf80a,0f7e2d94-046d-437c-aa42-12b79e4bf80a.vcf.gz,8bbec722d7fbe95cbdfdb880de4a4b8d,295326,live


In [338]:
step = 10000
indices <- seq(1, nrow(total_df), by=step)

In [355]:
# TCGAtranslateID does not seem to be able to handle more than ~10,000 UUIDs at a time
for (index in indices) {
#     if (index < 19991) {
#         next
#     }
    start <- index
    end <- index + step - 1
    
    if (index == tail(indices, 1)) {
        end <- nrow(total_df)
    }
    print(paste("start: ", start, " end: ", end))
    temp_df <- total_df[start:end,]
    temp_uuids <- temp_df[,'id']
    temp_barcode_df <- TCGAtranslateID(temp_uuids)
    if (index == 1) {
        total_output <- temp_barcode_df
    } 
    else {
        total_output <- rbind(total_output, temp_barcode_df)
    }
}

[1] "start:  1  end:  10000"
[1] "start:  10001  end:  20000"
[1] "start:  20001  end:  30000"
[1] "start:  30001  end:  40000"
[1] "start:  40001  end:  50000"
[1] "start:  50001  end:  60000"
[1] "start:  60001  end:  70000"
[1] "start:  70001  end:  80000"
[1] "start:  80001  end:  90000"
[1] "start:  90001  end:  1e+05"
[1] "start:  100001  end:  110000"
[1] "start:  110001  end:  120000"
[1] "start:  120001  end:  130000"
[1] "start:  130001  end:  140000"
[1] "start:  140001  end:  150000"
[1] "start:  150001  end:  160000"
[1] "start:  160001  end:  170000"
[1] "start:  170001  end:  180000"
[1] "start:  180001  end:  190000"
[1] "start:  190001  end:  2e+05"
[1] "start:  200001  end:  210000"
[1] "start:  210001  end:  220000"
[1] "start:  220001  end:  230000"
[1] "start:  230001  end:  240000"
[1] "start:  240001  end:  250000"
[1] "start:  250001  end:  260000"
[1] "start:  260001  end:  270000"
[1] "start:  270001  end:  280000"
[1] "start:  280001  end:  288277"


In [358]:
tail(total_output, 20)

Unnamed: 0,file_id,file_name,barcode,sample_barcode,type,data_format
288258,6d127809-3000-47b3-a38c-cf23bfaf7b0d,dae3ad4e-f035-49f6-a2be-75b1f69800c4.mirbase21.isoforms.quantification.txt,TCGA-HZ-A8P0,TCGA-HZ-A8P0-01A,mirna_expression,TSV
288259,9b1c4409-235e-43f0-a9e2-7aa489cc897c,a79def22-f416-42e9-9ef6-3ee554b5b17d.mirbase21.mirnas.quantification.txt,TCGA-EK-A2RN,TCGA-EK-A2RN-01A,mirna_expression,TSV
288260,a8b8f998-4973-466a-95f3-df082aa4c42a,6f4fcf38-5623-4984-b5f5-68d1ec2f8b5c.mirbase21.isoforms.quantification.txt,TCGA-BT-A20V,TCGA-BT-A20V-01A,mirna_expression,TSV
288261,c83df78c-5765-4d1e-b5e7-3a0df4d1ebc8,ff4c5ac9-fd9a-40a9-9d70-1c890ca04332.FPKM-UQ.txt.gz,TCGA-2Z-A9JQ,TCGA-2Z-A9JQ-01A,gene_expression,TXT
288262,949c5b4e-a6f2-4d9e-9549-625b92c79292,610600b6-c285-4a32-894b-14c80040f750.mirbase21.mirnas.quantification.txt,TCGA-AL-3468,TCGA-AL-3468-01A,mirna_expression,TSV
288263,6dc73d52-1767-4b69-a476-eb4c1ca84067,3ebca0e5-d7ba-4c6f-b746-abfac50a04bc.mirbase21.isoforms.quantification.txt,TARGET-50-PAJNCJ,TARGET-50-PAJNCJ-01A,mirna_expression,TSV
288264,a0f0dd67-f2ca-44ae-8564-1b265040c7ee,a5e07df6-2166-422f-b852-dc5da2e3a876.FPKM.txt.gz,TCGA-UZ-A9PM,TCGA-UZ-A9PM-01A,gene_expression,TXT
288265,090330b6-5568-45b5-bd4d-3d4b3b8d846c,ca57d990-f70a-4506-ae83-4e0a9b46a02d.htseq.counts.gz,TARGET-50-PALDTE,TARGET-50-PALDTE-01A,gene_expression,TXT
288266,525f0127-146d-4805-aa1d-bd50706b2a34,672d5558-5679-4710-a3bb-baf3a6e53552.mirbase21.isoforms.quantification.txt,TCGA-UY-A9PA,TCGA-UY-A9PA-01A,mirna_expression,TSV
288267,bff1004a-5c0b-4d9e-bb2e-d193e1c03c75,98bdb2ac-c7d9-4f11-a078-a3af8b92ea18.mirbase21.isoforms.quantification.txt,TCGA-SQ-A6I4,TCGA-SQ-A6I4-11A,mirna_expression,TSV


In [359]:
write.table(total_output, file='~/GDC_barcodes/uuid_barcode_map.txt', quote=FALSE, sep='\t', row.names=FALSE)