WDLs to revert/convert between BAM, FASTQ and uBAM #83

Merged
merged 1 commit into from Jan 22, 2017
Jump to file or symbol
Failed to load files and symbols.
+314 −0
Split
@@ -0,0 +1,32 @@
+{
+ "ConvertPairedFastQsToUnmappedBamWf.readgroup_list": [
+ "NA12878_A", "NA12878_B", "NA12878_C"
+ ],
+ "ConvertPairedFastQsToUnmappedBamWf.metadata": {
+ "NA12878_A": [
+ "NA12878", "Solexa-NA12878", "H06HDADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI"
+ ],
+ "NA12878_B": [
+ "NA12878", "Solexa-NA12878", "H06HDADXX130110.2.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI"
+ ],
+ "NA12878_C": [
+ "NA12878", "Solexa-NA12878", "H06JUADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI"
+ ]
+ },
+ "ConvertPairedFastQsToUnmappedBamWf.fastq_pairs": {
+ "NA12878_A": [
+ "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_1.fastq",
+ "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_2.fastq"
+ ],
+ "NA12878_B": [
+ "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_1.fastq",
+ "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_2.fastq"
+ ],
+ "NA12878_C": [
+ "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_1.fastq",
+ "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_2.fastq"
+ ]
+ },
+ "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.mem_size": "1 GB",
+ "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.disk_size": 200
+}
@@ -0,0 +1,97 @@
+## Copyright Broad Institute, 2017
+##
+## This WDL converts paired FASTQ to uBAM and adds read group information
+##
+## Requirements/expectations :
+## - Paired-end sequencing data in FASTQ format (one file per orientation)
+## - One or more read groups, one per pair of FASTQ files
+##
+## Outputs :
+## - Set of unmapped BAMs, one per read group
+##
+## Cromwell version support
+## - Successfully tested on v24
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# TASK DEFINITIONS
+
+# Convert a pair of FASTQs to uBAM
+task PairedFastQsToUnmappedBAM {
+  # Wraps Picard FastqToSam: takes one pair of FASTQ files and writes a single
+  # BAM named <readgroup_name>.bam in the working directory, passing the read
+  # group fields below straight through to the corresponding Picard arguments.
+
+  # FASTQ inputs, handed to FASTQ= and FASTQ2= respectively
+  File fastq_1
+  File fastq_2
+
+  # Read group fields; readgroup_name also determines the output file name
+  String readgroup_name
+  String sample_name
+  String library_name
+  String platform_unit
+  String run_date
+  String platform_name
+  String sequencing_center
+
+  # Runtime sizing: disk_size feeds the "local-disk" spec, mem_size the memory attribute
+  Int disk_size
+  String mem_size
+
+  # NOTE(review): the JVM heap is fixed at 3000m regardless of mem_size;
+  # confirm callers request more memory than that.
+  command {
+    java -Xmx3000m -jar /usr/gitc/picard.jar \
+      FastqToSam \
+      FASTQ=${fastq_1} \
+      FASTQ2=${fastq_2} \
+      OUTPUT=${readgroup_name}.bam \
+      READ_GROUP_NAME=${readgroup_name} \
+      SAMPLE_NAME=${sample_name} \
+      LIBRARY_NAME=${library_name} \
+      PLATFORM_UNIT=${platform_unit} \
+      RUN_DATE=${run_date} \
+      PLATFORM=${platform_name} \
+      SEQUENCING_CENTER=${sequencing_center}
+  }
+  output {
+    # The BAM written by OUTPUT= above
+    File output_bam = "${readgroup_name}.bam"
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+    cpu: "1"
+    memory: mem_size
+    disks: "local-disk " + disk_size + " HDD"
+  }
+}
+
+# WORKFLOW DEFINITION
+workflow ConvertPairedFastQsToUnmappedBamWf {
+  # Read group names to process; each name is the lookup key into both maps below
+  Array[String] readgroup_list
+
+  # Per read group, positional fields:
+  #   [0] sample name, [1] library name, [2] platform unit,
+  #   [3] run date, [4] platform name, [5] sequencing center
+  Map[String, Array[String]] metadata
+
+  # Per read group: [0] first FASTQ of the pair, [1] second FASTQ of the pair
+  Map[String, Array[File]] fastq_pairs
+
+  # One conversion per read group, run in parallel
+  scatter (readgroup in readgroup_list) {
+
+    # Build the uBAM for this read group from its FASTQ pair and metadata
+    call PairedFastQsToUnmappedBAM {
+      input:
+        readgroup_name = readgroup,
+        fastq_1 = fastq_pairs[readgroup][0],
+        fastq_2 = fastq_pairs[readgroup][1],
+        sample_name = metadata[readgroup][0],
+        library_name = metadata[readgroup][1],
+        platform_unit = metadata[readgroup][2],
+        run_date = metadata[readgroup][3],
+        platform_name = metadata[readgroup][4],
+        sequencing_center = metadata[readgroup][5]
+    }
+  }
+
+  # Final outputs: one unmapped BAM per entry in readgroup_list
+  output {
+    Array[File] output_bams = PairedFastQsToUnmappedBAM.output_bam
+  }
+}
+
@@ -0,0 +1,16 @@
+{
+
+ "RevertBamToUnmappedRGBamsWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta",
+ "RevertBamToUnmappedRGBamsWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai",
+
+ "RevertBamToUnmappedRGBamsWf.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bam",
+
+ "RevertBamToUnmappedRGBamsWf.output_dir": ".",
+
+ "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.max_discard_pct": 0.01,
+
+ "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.disk_size": 10,
+ "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.mem_size": "1 GB",
+ "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.disk_size": 10,
+ "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.mem_size": "3500 MB"
+}
@@ -0,0 +1,77 @@
+## Copyright Broad Institute, 2017
+##
+## This WDL reverts a SAM or BAM file to uBAMs, one per readgroup
+##
+## Requirements/expectations :
+## - Paired-end sequencing data in SAM or BAM format
+## - One or more read groups
+##
+## Outputs :
+## - Set of unmapped BAMs, one per read group, with reads sorted by queryname
+##
+## Cromwell version support
+## - Successfully tested on v24
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# TASK DEFINITIONS
+
+# Revert a BAM to uBAMs, one per readgroup
+task RevertBamToUnmappedRGBams {
+  # Runs Picard RevertSam on input_bam with OUTPUT_BY_READGROUP=true and
+  # SORT_ORDER=queryname, writing its output under output_dir, then collects
+  # every *.bam found in the task's working directory.
+
+  # BAM (or SAM) file to revert
+  File input_bam
+  # Directory handed to RevertSam's O= argument
+  String output_dir
+  # Optional value for MAX_DISCARD_FRACTION; when omitted the argument is
+  # left off the command line so Picard's own default applies
+  Float? max_discard_pct
+  # Runtime sizing: disk_size feeds the "local-disk" spec, mem_size the memory attribute
+  Int disk_size
+  String mem_size
+
+  command {
+    java -Xmx1000m -jar /usr/gitc/picard.jar \
+      RevertSam \
+      INPUT=${input_bam} \
+      O=${output_dir} \
+      OUTPUT_BY_READGROUP=true \
+      VALIDATION_STRINGENCY=LENIENT \
+      SANITIZE=TRUE \
+      ${"MAX_DISCARD_FRACTION=" + max_discard_pct} \
+      ATTRIBUTE_TO_CLEAR=FT \
+      SORT_ORDER=queryname
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    disks: "local-disk " + disk_size + " HDD"
+    memory: mem_size
+  }
+  output {
+    # NOTE(review): this glob only looks in the working directory, so it
+    # presumably finds the reverted BAMs only when output_dir is "." - confirm
+    # before passing any other output_dir.
+    Array[File] unmapped_bams = glob("*.bam")
+  }
+}
+
+# WORKFLOW DEFINITION
+workflow RevertBamToUnmappedRGBamsWf {
+  # BAM to revert, and the directory passed to the revert task as its output location
+  File input_bam
+  String output_dir
+
+  # Declared as workflow inputs but not referenced by any call in this workflow
+  File ref_fasta
+  File ref_fasta_index
+
+  # Split the input into unmapped BAMs via the RevertBamToUnmappedRGBams task
+  call RevertBamToUnmappedRGBams {
+    input:
+      output_dir = output_dir,
+      input_bam = input_bam
+  }
+
+  # Final outputs: the BAMs collected by the revert task
+  output {
+    Array[File] unmapped_bams_output = RevertBamToUnmappedRGBams.unmapped_bams
+  }
+}
@@ -0,0 +1,11 @@
+{
+ "RevertRGBamsToPairedFastQsWf.bam_list": [
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam",
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam",
+ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam"
+ ],
+
+ "RevertRGBamsToPairedFastQsWf.RevertBAMToPairedFASTQ.mem_size": "1 GB",
+ "RevertRGBamsToPairedFastQsWf.RevertBAMToPairedFASTQ.disk_size": 200
+
+}
@@ -0,0 +1,81 @@
+## Copyright Broad Institute, 2017
+##
+## This WDL reverts a set of single-readgroup BAMs to paired FASTQs
+##
+## Requirements/expectations:
+## - List of valid BAM files
+## - At most one readgroup per BAM file. If there are more, the distinctions will be lost!
+##
+## Outputs:
+## - Sets of two FASTQ files of paired reads (*_1 and *_2) plus one FASTQ file of
+## unpaired reads (*_unp) per input file.
+##
+## Cromwell version support
+## - Successfully tested on v24
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# TASK DEFINITIONS
+
+# Run SamToFASTQ to revert the bam
+task RevertBAMToPairedFASTQ {
+  # Wraps Picard SamToFastq: writes <output_basename>_1.fastq,
+  # <output_basename>_2.fastq and <output_basename>_unp.fastq from bam_file,
+  # then collects every *.fastq found in the working directory.
+
+  # BAM to convert
+  File bam_file
+  # Prefix shared by the three FASTQ file names
+  String output_basename
+  # Runtime sizing: disk_size feeds the "local-disk" spec, mem_size the memory attribute
+  Int disk_size
+  String mem_size
+
+  # NOTE(review): the JVM heap is fixed at 3000m regardless of mem_size;
+  # confirm callers request more memory than that.
+  command {
+    java -Xmx3000m -jar /usr/gitc/picard.jar \
+      SamToFastq \
+      I=${bam_file} \
+      FASTQ=${output_basename}_1.fastq \
+      SECOND_END_FASTQ=${output_basename}_2.fastq \
+      UNPAIRED_FASTQ=${output_basename}_unp.fastq \
+      INCLUDE_NON_PRIMARY_ALIGNMENTS=true \
+      INCLUDE_NON_PF_READS=true
+  }
+  output {
+    # All FASTQ files present in the working directory once the command finishes
+    Array[File] output_fastqs = glob("*.fastq")
+  }
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018"
+    cpu: "1"
+    memory: mem_size
+    disks: "local-disk " + disk_size + " HDD"
+  }
+}
+
+# WORKFLOW DEFINITION
+workflow RevertRGBamsToPairedFastQsWf {
+  # BAM files to revert; each is converted independently
+  Array[File] bam_list
+
+  # Process input files in parallel
+  scatter (input_bam in bam_list) {
+
+    # Regexes used to derive the FASTQ basename from the input path:
+    #  - strip everything up to and including the last "/" (any path, not only gs://)
+    #  - strip a literal ".bam" suffix (the dot is escaped so it cannot match
+    #    an arbitrary character)
+    String sub_strip_path = "^.*/"
+    String sub_strip_suffix = "\\.bam$"
+
+    # Revert inputs to paired FASTQ
+    call RevertBAMToPairedFASTQ {
+      input:
+        bam_file = input_bam,
+        output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "")
+    }
+  }
+
+  # Outputs that will be retained when execution is complete:
+  # one array of FASTQ files per input BAM
+  output {
+    Array[Array[File]] output_fastqs_globs = RevertBAMToPairedFASTQ.output_fastqs
+  }
+}
+