From dc8d09db44d31046d6e47b1997aaf312eb15bd51 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Sat, 7 Jan 2017 21:26:01 -0500 Subject: [PATCH] WDLs for trivial BAM manipulations - WDL to extract the SAM headers from a list of BAMs - WDL to validate a list of SAM/BAM files --- .../ExtractSamHeadersWf_170107.inputs.json | 10 +++ .../ExtractSamHeadersWf_170107.wdl | 84 ++++++++++++++++++++++ .../ValidateBamsWf_170107.inputs.json | 12 ++++ .../broad_dsde_workflows/ValidateBamsWf_170107.wdl | 80 +++++++++++++++++++++ scripts/broad_dsde_workflows/generic.options.json | 2 +- 5 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.inputs.json create mode 100644 scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.wdl create mode 100644 scripts/broad_dsde_workflows/ValidateBamsWf_170107.inputs.json create mode 100644 scripts/broad_dsde_workflows/ValidateBamsWf_170107.wdl diff --git a/scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.inputs.json b/scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.inputs.json new file mode 100644 index 0000000..a835b1f --- /dev/null +++ b/scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.inputs.json @@ -0,0 +1,10 @@ +{ + "ExtractSamHeadersWf.bam_list": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ], + + "ExtractSamHeadersWf.ExtractSAMHeader.mem_size": "1 GB", + "ExtractSamHeadersWf.ExtractSAMHeader.disk_size": 200 +} diff --git a/scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.wdl b/scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.wdl new file mode 100644 index 0000000..b82bb99 --- /dev/null +++ 
b/scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.wdl @@ -0,0 +1,84 @@ +## Copyright Broad Institute, 2017 +## +## This WDL extracts the headers from a list of SAM/BAMs +## +## Requirements/expectations : +## - List of valid SAM or BAM files +## +## Outputs: +## - Set of .txt files containing the header, one per input file +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## Note that this is a really dumb way to get a SAM/BAM header when running on GCP +## because it will require localizing the entire SAM/BAM file, which can take a while. +## A much better way is to use gsutil like this: +## +## gsutil cp gs://bucket/path/your.bam - | samtools view -H - +## +## And yes there's a weird trailing dash (-) there at the end; it's not a typo, leave +## it in. +## +## If you're running locally or on a cloud platform that doesn't require localizing +## files then it should be fine. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+
+# TASK DEFINITIONS
+# Extract the header of one SAM or BAM file with "samtools view -H" and write it to output_name.
+# Inputs: bam_file to read, output_name of the text file to create, disk_size (local disk, GB), mem_size (e.g. "1 GB").
+task ExtractSAMHeader {
+  File bam_file
+  String output_name
+  Int disk_size
+  String mem_size
+  # Both paths are quoted so that file names containing spaces survive shell word splitting.
+  command {
+    samtools view -H "${bam_file}" > "${output_name}"
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: mem_size
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+  output {
+    File output_header = "${output_name}"
+  }
+}
+
+# WORKFLOW DEFINITION: extract the header of every file in bam_list, one scatter shard per file.
+workflow ExtractSamHeadersWf {
+  Array[File] bam_list
+
+  # Process the input files in parallel
+  scatter (input_bam in bam_list) {
+    # Regexes reducing the input path to a base name: drop everything up to the last "/" (any URL scheme or local directory), then a trailing ".bam" or ".sam".
+    String sub_strip_path = "^.*/"
+    String sub_strip_suffix = "[.](bam|sam)$"
+
+    # Extract the header to a text file named after the input's base name, with ".header.txt" appended
+    call ExtractSAMHeader {
+      input:
+        bam_file = input_bam,
+        output_name = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".header.txt"
+    }
+  }
+
+  # Outputs that will be retained when execution is complete
+  output {
+    Array[File] output_headers = ExtractSAMHeader.output_header
+  }
+}
+
diff --git a/scripts/broad_dsde_workflows/ValidateBamsWf_170107.inputs.json b/scripts/broad_dsde_workflows/ValidateBamsWf_170107.inputs.json
new file mode 100644
index 0000000..5c81dde
--- /dev/null
+++ b/scripts/broad_dsde_workflows/ValidateBamsWf_170107.inputs.json
@@ -0,0 +1,12 @@
+{
+  "ValidateBamsWf.bam_list": [
+    "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam",
+    "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam",
+    "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam"
+  ],
+
+  "ValidateBamsWf.ValidateBAM.validation_mode": "SUMMARY",
+
+  "ValidateBamsWf.ValidateBAM.mem_size": "4 GB",
+  "ValidateBamsWf.ValidateBAM.disk_size": 200
+}
diff --git a/scripts/broad_dsde_workflows/ValidateBamsWf_170107.wdl b/scripts/broad_dsde_workflows/ValidateBamsWf_170107.wdl
new file mode 100644
index 
0000000..c92d566 --- /dev/null +++ b/scripts/broad_dsde_workflows/ValidateBamsWf_170107.wdl @@ -0,0 +1,80 @@ +## Copyright Broad Institute, 2017 +## +## This WDL validates a list of SAM/BAMs +## +## Requirements/expectations : +## - List of SAM or BAM files to validate +## - Explicit request of either SUMMARY or VERBOSE mode in inputs.json +## +## Outputs: +## - Set of .txt files containing the validation report, one per input file +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+
+# TASK DEFINITIONS
+# Validate one SAM or BAM file with Picard ValidateSamFile; the report is written to ${output_basename}_${validation_mode}.txt.
+# validation_mode is SUMMARY or VERBOSE; mem_size should be larger than the 3000 MB JVM heap requested in the command.
+task ValidateBAM {
+  File bam_file
+  String output_basename
+  String validation_mode
+  Int disk_size
+  String mem_size
+
+  String output_name = "${output_basename}_${validation_mode}.txt"
+  command {
+    java -Xmx3000m -jar /usr/gitc/picard.jar \
+      ValidateSamFile \
+      I="${bam_file}" \
+      OUTPUT="${output_name}" \
+      MODE=${validation_mode}
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    memory: mem_size
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+    # ValidateSamFile exits non-zero when it finds problems in the file; keep going so the report is still collected.
+    continueOnReturnCode: true
+  }
+  output {
+    File validation_report = "${output_name}"
+  }
+}
+
+# WORKFLOW DEFINITION: validate every file in bam_list, one scatter shard per file.
+workflow ValidateBamsWf {
+  Array[File] bam_list
+
+  # Process the input files in parallel
+  scatter (input_bam in bam_list) {
+    # Regexes reducing the input path to a base name: drop everything up to the last "/" (any URL scheme or local directory), then a trailing ".bam" or ".sam".
+    String sub_strip_path = "^.*/"
+    String sub_strip_suffix = "[.](bam|sam)$"
+    # Run the validation; the report name is built from the input's base name with ".validation" appended
+    call ValidateBAM {
+      input:
+        bam_file = input_bam,
+        output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".validation"
+    }
+  }
+
+  # Outputs that will be retained when execution is complete
+  output {
+    Array[File] validation_reports = ValidateBAM.validation_report
+  }
+}
+
diff --git a/scripts/broad_dsde_workflows/generic.options.json b/scripts/broad_dsde_workflows/generic.options.json
index 949c569..ea81d83 100644
--- a/scripts/broad_dsde_workflows/generic.options.json
+++ b/scripts/broad_dsde_workflows/generic.options.json
@@ -1,6 +1,6 @@
 {
   "read_from_cache":false,
-  "defaultRuntimeOptions": {
+  "default_runtime_attributes": {
     "zones": "us-central1-a us-central1-b us-central1-c"
   }
 }
\ No newline at end of file