From 575a3c387515ff280e1bfe0c5729e0ad34590895 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Sat, 7 Jan 2017 21:26:01 -0500 Subject: [PATCH] WDLs for generic conversion/reversion use cases - revert batch of single-readgroup SAM/BAMs to FASTQ - convert batch of paired FASTQs to uBAM per readgroup - revert multi-readgroup SAM/BAM to uBAM --- ...rtPairedFastQToUnmappedBamWf_170107.inputs.json | 32 +++++++ .../ConvertPairedFastQToUnmappedBamWf_170107.wdl | 97 ++++++++++++++++++++++ .../RevertBamToUnmappedRGBamsWf_170107.inputs.json | 16 ++++ .../RevertBamToUnmappedRGBamsWf_170107.wdl | 77 +++++++++++++++++ ...RevertRGBamsToPairedFastQsWf_170107.inputs.json | 11 +++ .../RevertRGBamsToPairedFastQsWf_170107.wdl | 81 ++++++++++++++++++ 6 files changed, 314 insertions(+) create mode 100644 scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.inputs.json create mode 100644 scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.wdl create mode 100644 scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.inputs.json create mode 100644 scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.wdl create mode 100644 scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.inputs.json create mode 100644 scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.wdl diff --git a/scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.inputs.json b/scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.inputs.json new file mode 100644 index 0000000..ccb545d --- /dev/null +++ b/scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.inputs.json @@ -0,0 +1,32 @@ +{ + "ConvertPairedFastQsToUnmappedBamWf.readgroup_list": [ + "NA12878_A", "NA12878_B", "NA12878_C" + ], + "ConvertPairedFastQsToUnmappedBamWf.metadata": { + "NA12878_A": [ + "NA12878", "Solexa-NA12878", "H06HDADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" + ], + "NA12878_B": [ +
"NA12878", "Solexa-NA12878", "H06HDADXX130110.2.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" + ], + "NA12878_C": [ + "NA12878", "Solexa-NA12878", "H06JUADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" + ] + }, + "ConvertPairedFastQsToUnmappedBamWf.fastq_pairs": { + "NA12878_A": [ + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_1.fastq", + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_2.fastq" + ], + "NA12878_B": [ + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_1.fastq", + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_2.fastq" + ], + "NA12878_C": [ + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_1.fastq", + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_2.fastq" + ] + }, + "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.mem_size": "3500 MB", + "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.disk_size": 200 +} diff --git a/scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.wdl b/scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.wdl new file mode 100644 index 0000000..cec8133 --- /dev/null +++ b/scripts/broad_dsde_workflows/ConvertPairedFastQToUnmappedBamWf_170107.wdl @@ -0,0 +1,97 @@ +## Copyright Broad Institute, 2017 +## +## This WDL converts paired FASTQ to uBAM and adds read group information +## +## Requirements/expectations : +## - Paired-end sequencing data in FASTQ format (one file per orientation) +## - One or more read groups, one per pair of FASTQ files +## +## Outputs : +## - Set of unmapped BAMs, one per read group +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Convert one pair of FASTQs to a single uBAM named after readgroup_name with Picard FastqToSam, stamping the supplied read group metadata on it. Every interpolated value is shell-quoted so metadata containing spaces stays one argument. The JVM heap is fixed at 3000m, so mem_size should be set above that. +task PairedFastQsToUnmappedBAM { + File fastq_1 + File fastq_2 + String readgroup_name + String sample_name + String library_name + String platform_unit + String run_date + String platform_name + String sequencing_center + Int disk_size + String mem_size + + command { + java -Xmx3000m -jar /usr/gitc/picard.jar \ + FastqToSam \ + FASTQ="${fastq_1}" \ + FASTQ2="${fastq_2}" \ + OUTPUT="${readgroup_name}.bam" \ + READ_GROUP_NAME="${readgroup_name}" \ + SAMPLE_NAME="${sample_name}" \ + LIBRARY_NAME="${library_name}" \ + PLATFORM_UNIT="${platform_unit}" \ + RUN_DATE="${run_date}" \ + PLATFORM="${platform_name}" \ + SEQUENCING_CENTER="${sequencing_center}" + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${readgroup_name}.bam" + } +} + +# WORKFLOW DEFINITION +workflow ConvertPairedFastQsToUnmappedBamWf { + Array[String] readgroup_list + Map[String, Array[File]] fastq_pairs + Map[String, Array[String]] metadata + + # Convert multiple pairs of input fastqs in parallel, one scatter shard per entry of readgroup_list + scatter (readgroup in readgroup_list) { + + # Convert pair of FASTQs to uBAM; metadata[readgroup] is read positionally: [0] sample, [1] library, [2] platform unit, [3] run date, [4] platform, [5] sequencing center + call PairedFastQsToUnmappedBAM { + input: + fastq_1 = fastq_pairs[readgroup][0], + fastq_2 = fastq_pairs[readgroup][1], + readgroup_name = readgroup, + sample_name = metadata[readgroup][0],
+ library_name = metadata[readgroup][1], + platform_unit = metadata[readgroup][2], + run_date = metadata[readgroup][3], + platform_name = metadata[readgroup][4], + sequencing_center = metadata[readgroup][5] + } + } + + # Outputs that will be retained when execution is complete + output { + Array[File] output_bams = PairedFastQsToUnmappedBAM.output_bam + } +} + diff --git a/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.inputs.json b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.inputs.json new file mode 100644 index 0000000..5511207 --- /dev/null +++ b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.inputs.json @@ -0,0 +1,16 @@ +{ + + "RevertBamToUnmappedRGBamsWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", + "RevertBamToUnmappedRGBamsWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "RevertBamToUnmappedRGBamsWf.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bam", + + "RevertBamToUnmappedRGBamsWf.output_dir": ".", + + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.max_discard_pct": 0.01, + + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.disk_size": 10, + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.mem_size": "2 GB", + "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.disk_size": 10, + "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.mem_size": "3500 MB" +} diff --git a/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.wdl b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.wdl new file mode 100644 index 0000000..b75682b --- /dev/null +++ b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.wdl @@ -0,0 +1,77 @@ +## Copyright Broad Institute, 2017 +## +## This WDL reverts a SAM or BAM file to uBAMs, one per readgroup +## +## Requirements/expectations : +## - Paired-end sequencing data in SAM or BAM format +## - One or more read groups +## +## Outputs : +## - Set of unmapped
BAMs, one per read group, with reads sorted by queryname +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Revert a BAM to uBAMs, one per readgroup, written into output_dir by Picard RevertSam (OUTPUT_BY_READGROUP=true). MAX_DISCARD_FRACTION is only passed when the optional max_discard_pct is set, and the outputs are every .bam file found inside output_dir. +task RevertBamToUnmappedRGBams { + File input_bam + String output_dir + Float? max_discard_pct + Int disk_size + String mem_size + + command { + java -Xmx1000m -jar /usr/gitc/picard.jar \ + RevertSam \ + INPUT="${input_bam}" \ + O="${output_dir}" \ + OUTPUT_BY_READGROUP=true \ + VALIDATION_STRINGENCY=LENIENT \ + SANITIZE=TRUE \ + ${"MAX_DISCARD_FRACTION=" + max_discard_pct} \ + ATTRIBUTE_TO_CLEAR=FT \ + SORT_ORDER=queryname + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + disks: "local-disk " + disk_size + " HDD" + memory: mem_size + } + output { + Array[File] unmapped_bams = glob("${output_dir}/*.bam") + } +} + +# WORKFLOW DEFINITION (NOTE(review): ref_fasta and ref_fasta_index are declared below but are not passed to any call in this workflow) +workflow RevertBamToUnmappedRGBamsWf { + File input_bam + File ref_fasta + File ref_fasta_index + String output_dir + + # Revert inputs to unmapped + call RevertBamToUnmappedRGBams { + input: + input_bam = input_bam, + output_dir = output_dir + } + + # Outputs that will be retained when execution is complete + output { + Array[File] unmapped_bams_output=RevertBamToUnmappedRGBams.unmapped_bams + } +} diff --git
a/scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.inputs.json b/scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.inputs.json new file mode 100644 index 0000000..253a221 --- /dev/null +++ b/scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.inputs.json @@ -0,0 +1,11 @@ +{ + "RevertRGBamsToPairedFastQsWf.bam_list": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ], + + "RevertRGBamsToPairedFastQsWf.RevertBAMToPairedFASTQ.mem_size": "3500 MB", + "RevertRGBamsToPairedFastQsWf.RevertBAMToPairedFASTQ.disk_size": 200 + +} diff --git a/scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.wdl b/scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.wdl new file mode 100644 index 0000000..0781341 --- /dev/null +++ b/scripts/broad_dsde_workflows/RevertRGBamsToPairedFastQsWf_170107.wdl @@ -0,0 +1,81 @@ +## Copyright Broad Institute, 2017 +## +## This WDL reverts a set of single-readgroup BAMs to paired FASTQs +## +## Requirements/expectations: +## - List of valid BAM files +## - At most one readgroup per BAM file. If there are more, the distinctions will be lost! +## +## Outputs: +## - Sets of two FASTQ files of paired reads (*_1 and *_2) plus one FASTQ file of +## unpaired reads (*_unp) per input file. +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl).
Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Run Picard SamToFastq to revert one BAM into output_basename_1.fastq, output_basename_2.fastq and output_basename_unp.fastq (unpaired reads). Every .fastq file in the working directory is collected as output. The JVM heap is fixed at 3000m, so mem_size should be set above that. +task RevertBAMToPairedFASTQ { + File bam_file + String output_basename + Int disk_size + String mem_size + + command { + java -Xmx3000m -jar /usr/gitc/picard.jar \ + SamToFastq \ + I="${bam_file}" \ + FASTQ="${output_basename}_1.fastq" \ + SECOND_END_FASTQ="${output_basename}_2.fastq" \ + UNPAIRED_FASTQ="${output_basename}_unp.fastq" \ + INCLUDE_NON_PRIMARY_ALIGNMENTS=true \ + INCLUDE_NON_PF_READS=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: mem_size + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + Array[File] output_fastqs = glob("*.fastq") + } +} + +# WORKFLOW DEFINITION +workflow RevertRGBamsToPairedFastQsWf { + Array[File] bam_list + + # Process input files in parallel + scatter (input_bam in bam_list) { + + String sub_strip_path = "^.*/" + String sub_strip_suffix = "\\.bam$" + + # Revert inputs to paired FASTQ; output_basename is the input file name with its directory path and trailing .bam removed + call RevertBAMToPairedFASTQ { + input: + bam_file = input_bam, + output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + } + } + + # Outputs that will be retained when execution is complete + output { + Array[Array[File]] output_fastqs_globs=RevertBAMToPairedFASTQ.output_fastqs + } +} +