WDLs for trivial bam manipulation #82

Merged
merged 1 commit into from Jan 21, 2017
Jump to file or symbol
Failed to load files and symbols.
+187 −1
Split
@@ -0,0 +1,10 @@
+{
+ "ExtractSamHeadersWf.bam_list": [
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam",
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam",
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam"
+ ],
+
+ "ExtractSamHeadersWf.ExtractSAMHeader.mem_size": "1 GB",
+ "ExtractSamHeadersWf.ExtractSAMHeader.disk_size": 200
+}
@@ -0,0 +1,84 @@
+## Copyright Broad Institute, 2017
+##
+## This WDL extracts the headers from a list of SAM/BAMs
+##
+## Requirements/expectations :
+## - List of valid SAM or BAM files
+##
+## Outputs:
+## - Set of .txt files containing the header, one per input file
+##
+## Cromwell version support
+## - Successfully tested on v24
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## Note that this is a really dumb way to get a SAM/BAM header when running on GCP
+## because it will require localizing the entire SAM/BAM file, which can take a while.
+## A much better way is to use gsutil like this:
+##
+## gsutil cp gs://bucket/path/your.bam - | samtools view -H -
+##
+## And yes there's a weird trailing dash (-) there at the end; it's not a typo, leave
+## it in.
+##
+## If you're running locally or on a cloud platform that doesn't require localizing
+## files then it should be fine.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# TASK DEFINITIONS
+
+# Extract the header from a SAM or BAM using samtools view
+task ExtractSAMHeader {
+ # Runs `samtools view -H` on one input file and captures stdout in a text file.
+ #
+ # Inputs:
+ # bam_file - file handed to samtools
+ # output_name - name of the file that stdout is redirected to
+ # disk_size - number inserted into the "local-disk <n> HDD" disks attribute
+ # mem_size - string passed through unchanged as the runtime memory attribute
+ File bam_file
+ String output_name
+ Int disk_size
+ String mem_size
+
+ command {
+ samtools view -H ${bam_file} > ${output_name}
+ }
+ runtime {
+ docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+ memory: mem_size
+ cpu: "1"
+ disks: "local-disk " + disk_size + " HDD"
+ }
+ # The declared output is the file written by the shell redirection above.
+ output {
+ File output_header = "${output_name}"
+ }
+}
+
+# WORKFLOW DEFINITION
+workflow ExtractSamHeadersWf {
+ # Files whose headers will be extracted, one ExtractSAMHeader call per file
+ Array[File] bam_list
+
+ # Process the input files in parallel
+ scatter (input_bam in bam_list) {
+
+ # Regex patterns used to derive the output file name from the input path:
+ # - sub_strip_path greedily removes a gs:// prefix up to the last "/"
+ # - sub_strip_suffix removes a trailing ".bam"; the dot is escaped so that only
+ # a literal ".bam" extension is stripped (an unescaped "." matches any
+ # character, which would also truncate names such as "sample_bam")
+ String sub_strip_path = "gs://.*/"
+ String sub_strip_suffix = "\\.bam$"
+
+ # Extract the header to a text file
+ call ExtractSAMHeader {
+ input:
+ bam_file = input_bam,
+ output_name = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".header.txt"
+ }
+ }
+
+ # Outputs that will be retained when execution is complete:
+ # one header file per element of bam_list, gathered from the scatter
+ output {
+ Array[File] output_headers = ExtractSAMHeader.output_header
+ }
+}
+
@@ -0,0 +1,12 @@
+{
+ "ValidateBamsWf.bam_list": [
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam",
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam",
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam"
+ ],
+
+ "ValidateBamsWf.ValidateBAM.validation_mode": "SUMMARY",
+
+ "ValidateBamsWf.ValidateBAM.mem_size": "1 GB",
+ "ValidateBamsWf.ValidateBAM.disk_size": 200
+}
@@ -0,0 +1,80 @@
+## Copyright Broad Institute, 2017
+##
+## This WDL validates a list of SAM/BAMs
+##
+## Requirements/expectations :
+## - List of SAM or BAM files to validate
+## - Explicit request of either SUMMARY or VERBOSE mode in inputs.json
+##
+## Outputs:
+## - Set of .txt files containing the validation report, one per input file
+##
+## Cromwell version support
+## - Successfully tested on v24
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# TASK DEFINITIONS
+
+# Validate a SAM or BAM using Picard ValidateSamFile
+task ValidateBAM {
+ # Runs Picard ValidateSamFile on one input file and writes the report to a text file.
+ #
+ # Inputs:
+ # bam_file - file passed to Picard as I=
+ # output_basename - prefix of the report file name
+ # validation_mode - value passed to Picard as MODE= and appended to the report name
+ # disk_size - number inserted into the "local-disk <n> HDD" disks attribute
+ # mem_size - string passed through unchanged as the runtime memory attribute
+ File bam_file
+ String output_basename
+ String validation_mode
+ Int disk_size
+ String mem_size
+
+ # Report file name: <output_basename>_<validation_mode>.txt
+ String output_name = "${output_basename}_${validation_mode}.txt"
+
+ # NOTE(review): the JVM heap limit below is fixed at 3000m regardless of mem_size,
+ # and the accompanying example inputs request "1 GB" - confirm that mem_size is
+ # meant to be at least as large as the heap limit.
+ command {
+ java -Xmx3000m -jar /usr/gitc/picard.jar \
+ ValidateSamFile \
+ I=${bam_file} \
+ OUTPUT=${output_name} \
+ MODE=${validation_mode}
+ }
+ runtime {
+ docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+ memory: mem_size
+ cpu: "1"
+ disks: "local-disk " + disk_size + " HDD"
+ }
+ # The declared output is the file named by Picard's OUTPUT= argument above.
+ output {
+ File validation_report = "${output_name}"
+ }
+}
+
+# WORKFLOW DEFINITION
+workflow ValidateBamsWf {
+ # Files to validate, one ValidateBAM call per file
+ Array[File] bam_list
+
+ # Process the input files in parallel
+ scatter (input_bam in bam_list) {
+
+ # Regex patterns used to derive the report base name from the input path:
+ # - sub_strip_path greedily removes a gs:// prefix up to the last "/"
+ # - sub_strip_suffix removes a trailing ".bam"; the dot is escaped so that only
+ # a literal ".bam" extension is stripped (an unescaped "." matches any
+ # character, which would also truncate names such as "sample_bam")
+ String sub_strip_path = "gs://.*/"
+ String sub_strip_suffix = "\\.bam$"
+
+ # Run the validation
+ call ValidateBAM {
+ input:
+ bam_file = input_bam,
+ output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".validation"
+ }
+ }
+
+ # Outputs that will be retained when execution is complete:
+ # one validation report per element of bam_list, gathered from the scatter
+ output {
+ Array[File] validation_reports = ValidateBAM.validation_report
+ }
+}
+
@@ -1,6 +1,6 @@
{
"read_from_cache":false,
- "defaultRuntimeOptions": {
+ "default_runtime_attributes": {
"zones": "us-central1-a us-central1-b us-central1-c"
}
}