Permalink
Please sign in to comment.
Browse files
WDLs for trivial BAM manipulations
- WDL to extract the SAM headers from a list of BAMs
- WDL to validate a list of SAM/BAM files
Loading branch information...
Showing
with
187 additions
and 1 deletion.
- +10 −0 scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.inputs.json
- +84 −0 scripts/broad_dsde_workflows/ExtractSamHeadersWf_170107.wdl
- +12 −0 scripts/broad_dsde_workflows/ValidateBamsWf_170107.inputs.json
- +80 −0 scripts/broad_dsde_workflows/ValidateBamsWf_170107.wdl
- +1 −1 scripts/broad_dsde_workflows/generic.options.json
| @@ -0,0 +1,10 @@ | ||
| +{ | ||
| + "ExtractSamHeadersWf.bam_list": [ | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" | ||
| + ], | ||
| + | ||
| + "ExtractSamHeadersWf.ExtractSAMHeader.mem_size": "1 GB", | ||
| + "ExtractSamHeadersWf.ExtractSAMHeader.disk_size": 200 | ||
| +} |
| @@ -0,0 +1,84 @@ | ||
| +## Copyright Broad Institute, 2017 | ||
| +## | ||
| +## This WDL extracts the headers from a list of SAM/BAMs | ||
| +## | ||
| +## Requirements/expectations : | ||
| +## - List of valid SAM or BAM files | ||
| +## | ||
| +## Outputs: | ||
| +## - Set of .txt files containing the header, one per input file | ||
| +## | ||
| +## Cromwell version support | ||
| +## - Successfully tested on v24 | ||
| +## - Does not work on versions < v23 due to output syntax | ||
| +## | ||
| +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
| +## For program versions, see docker containers. | ||
| +## | ||
| +## Note that this is a really dumb way to get a SAM/BAM header when running on GCP | ||
| +## because it will require localizing the entire SAM/BAM file, which can take a while. | ||
| +## A much better way is to use gsutil like this: | ||
| +## | ||
| +## gsutil cp gs://bucket/path/your.bam - | samtools view -H - | ||
| +## | ||
| +## And yes there's a weird trailing dash (-) there at the end; it's not a typo, leave | ||
| +## it in. | ||
| +## | ||
| +## If you're running locally or on a cloud platform that doesn't require localizing | ||
| +## files then it should be fine. | ||
| +## | ||
| +## LICENSING : | ||
| +## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
| +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
| +## be subject to different licenses. Users are responsible for checking that they are | ||
| +## authorized to run all programs before running this script. Please see the docker | ||
| +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
| +## licensing information pertaining to the included programs. | ||
| + | ||
| +# TASK DEFINITIONS | ||
| + | ||
| +# Extract the header from a SAM or BAM using samtools view | ||
| +# | ||
| +# Inputs: | ||
| +#   bam_file    - SAM or BAM file whose header is extracted | ||
| +#   output_name - name of the text file the header is written to | ||
| +#   disk_size   - size of the local HDD requested in the runtime block | ||
| +#                 (presumably GB, per Cromwell's "local-disk" convention - confirm) | ||
| +#   mem_size    - memory request passed to the runtime block, e.g. "1 GB" | ||
| +# Output: | ||
| +#   output_header - the text file written by the command | ||
| +task ExtractSAMHeader { | ||
| +  File bam_file | ||
| +  String output_name | ||
| +  Int disk_size | ||
| +  String mem_size | ||
| + | ||
| +  command { | ||
| +    # Paths are quoted so names containing spaces or glob characters are not word-split by the shell | ||
| +    samtools view -H "${bam_file}" > "${output_name}" | ||
| +  } | ||
| +  runtime { | ||
| +    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" | ||
| +    memory: mem_size | ||
| +    cpu: "1" | ||
| +    disks: "local-disk " + disk_size + " HDD" | ||
| +  } | ||
| +  output { | ||
| +    File output_header = "${output_name}" | ||
| +  } | ||
| +} | ||
| + | ||
| +# WORKFLOW DEFINITION | ||
| +# Runs ExtractSAMHeader once per entry of bam_list and collects the resulting header files. | ||
| +# The call does not set mem_size or disk_size, so they must be supplied through the | ||
| +# inputs JSON (ExtractSamHeadersWf.ExtractSAMHeader.mem_size / .disk_size). | ||
| +workflow ExtractSamHeadersWf { | ||
| +  Array[File] bam_list | ||
| + | ||
| +  # Process the input files in parallel | ||
| +  scatter (input_bam in bam_list) { | ||
| + | ||
| +    # Regex matching a gs:// path up to and including its last "/". | ||
| +    # Paths that do not start with gs:// are left with their directory part intact. | ||
| +    String sub_strip_path = "gs://.*/" | ||
| +    # Regex matching a literal ".bam" extension. "[.]" is used instead of a bare "." | ||
| +    # so that the dot is not treated as a match-any-character wildcard | ||
| +    # (a bare ".bam$" would also strip e.g. the trailing "rbam" of "tumorbam"). | ||
| +    # Names with any other extension (e.g. ".sam") are kept unchanged. | ||
| +    String sub_strip_suffix = "[.]bam$" | ||
| + | ||
| +    # Extract the header to a text file named <input base name>.header.txt | ||
| +    call ExtractSAMHeader { | ||
| +      input: | ||
| +        bam_file = input_bam, | ||
| +        output_name = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".header.txt" | ||
| +    } | ||
| +  } | ||
| + | ||
| +  # Outputs that will be retained when execution is complete | ||
| +  output { | ||
| +    Array[File] output_headers = ExtractSAMHeader.output_header | ||
| +  } | ||
| +} | ||
| + |
| @@ -0,0 +1,12 @@ | ||
| +{ | ||
| + "ValidateBamsWf.bam_list": [ | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" | ||
| + ], | ||
| + | ||
| + "ValidateBamsWf.ValidateBAM.validation_mode": "SUMMARY", | ||
| + | ||
| + "ValidateBamsWf.ValidateBAM.mem_size": "1 GB", | ||
| + "ValidateBamsWf.ValidateBAM.disk_size": 200 | ||
| +} |
| @@ -0,0 +1,80 @@ | ||
| +## Copyright Broad Institute, 2017 | ||
| +## | ||
| +## This WDL validates a list of SAM/BAMs | ||
| +## | ||
| +## Requirements/expectations : | ||
| +## - List of SAM or BAM files to validate | ||
| +## - Explicit request of either SUMMARY or VERBOSE mode in inputs.json | ||
| +## | ||
| +## Outputs: | ||
| +## - Set of .txt files containing the validation report, one per input file | ||
| +## | ||
| +## Cromwell version support | ||
| +## - Successfully tested on v24 | ||
| +## - Does not work on versions < v23 due to output syntax | ||
| +## | ||
| +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
| +## For program versions, see docker containers. | ||
| +## | ||
| +## LICENSING : | ||
| +## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
| +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
| +## be subject to different licenses. Users are responsible for checking that they are | ||
| +## authorized to run all programs before running this script. Please see the docker | ||
| +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
| +## licensing information pertaining to the included programs. | ||
| + | ||
| +# TASK DEFINITIONS | ||
| + | ||
| +# Validate a SAM or BAM using Picard ValidateSamFile | ||
| +# | ||
| +# Inputs: | ||
| +#   bam_file        - SAM or BAM file to validate | ||
| +#   output_basename - prefix of the report file name | ||
| +#   validation_mode - value passed to Picard's MODE argument (SUMMARY or VERBOSE) | ||
| +#   disk_size       - size of the local HDD requested in the runtime block | ||
| +#                     (presumably GB, per Cromwell's "local-disk" convention - confirm) | ||
| +#   mem_size        - memory request passed to the runtime block, e.g. "1 GB" | ||
| +# Output: | ||
| +#   validation_report - <output_basename>_<validation_mode>.txt written by Picard | ||
| +task ValidateBAM { | ||
| +  File bam_file | ||
| +  String output_basename | ||
| +  String validation_mode | ||
| +  Int disk_size | ||
| +  String mem_size | ||
| + | ||
| +  # Report name combines the basename with the mode so SUMMARY and VERBOSE runs do not collide | ||
| +  String output_name = "${output_basename}_${validation_mode}.txt" | ||
| + | ||
| +  command { | ||
| +    # NOTE(review): the JVM heap ceiling is fixed at 3000 MB while the container memory | ||
| +    # comes from mem_size; the sample inputs request only "1 GB" - confirm the two agree. | ||
| +    # Paths are quoted so names containing spaces or glob characters are not word-split by the shell. | ||
| +    java -Xmx3000m -jar /usr/gitc/picard.jar \ | ||
| +      ValidateSamFile \ | ||
| +      I="${bam_file}" \ | ||
| +      OUTPUT="${output_name}" \ | ||
| +      MODE=${validation_mode} | ||
| +  } | ||
| +  runtime { | ||
| +    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" | ||
| +    memory: mem_size | ||
| +    cpu: "1" | ||
| +    disks: "local-disk " + disk_size + " HDD" | ||
| +  } | ||
| +  output { | ||
| +    File validation_report = "${output_name}" | ||
| +  } | ||
| +} | ||
| + | ||
| +# WORKFLOW DEFINITION | ||
| +# Runs ValidateBAM once per entry of bam_list and collects the validation reports. | ||
| +# The call does not set validation_mode, mem_size or disk_size, so they must be supplied | ||
| +# through the inputs JSON (ValidateBamsWf.ValidateBAM.validation_mode / .mem_size / .disk_size). | ||
| +workflow ValidateBamsWf { | ||
| +  Array[File] bam_list | ||
| + | ||
| +  # Process the input files in parallel | ||
| +  scatter (input_bam in bam_list) { | ||
| + | ||
| +    # Regex matching a gs:// path up to and including its last "/". | ||
| +    # Paths that do not start with gs:// are left with their directory part intact. | ||
| +    String sub_strip_path = "gs://.*/" | ||
| +    # Regex matching a literal ".bam" extension. "[.]" is used instead of a bare "." | ||
| +    # so that the dot is not treated as a match-any-character wildcard | ||
| +    # (a bare ".bam$" would also strip e.g. the trailing "rbam" of "tumorbam"). | ||
| +    # Names with any other extension (e.g. ".sam") are kept unchanged. | ||
| +    String sub_strip_suffix = "[.]bam$" | ||
| + | ||
| +    # Run the validation; the report is named <input base name>.validation_<mode>.txt | ||
| +    call ValidateBAM { | ||
| +      input: | ||
| +        bam_file = input_bam, | ||
| +        output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".validation" | ||
| +    } | ||
| +  } | ||
| + | ||
| +  # Outputs that will be retained when execution is complete | ||
| +  output { | ||
| +    Array[File] validation_reports = ValidateBAM.validation_report | ||
| +  } | ||
| +} | ||
| + |
| @@ -1,6 +1,6 @@ | ||
| { | ||
| "read_from_cache":false, | ||
| - "defaultRuntimeOptions": { | ||
| + "default_runtime_attributes": { | ||
| "zones": "us-central1-a us-central1-b us-central1-c" | ||
| } | ||
| } |
0 comments on commit
dc8d09d