WDLs to revert/convert between BAM, FASTQ and uBAM #83
Merged
Jump to file or symbol
Failed to load files and symbols.
| @@ -0,0 +1,32 @@ | ||
| +{ | ||
| + "ConvertPairedFastQsToUnmappedBamWf.readgroup_list": [ | ||
| + "NA12878_A", "NA12878_B", "NA12878_C" | ||
| + ], | ||
| + "ConvertPairedFastQsToUnmappedBamWf.metadata": { | ||
| + "NA12878_A": [ | ||
| + "NA12878", "Solexa-NA12878", "H06HDADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" | ||
| + ], | ||
| + "NA12878_B": [ | ||
| + "NA12878", "Solexa-NA12878", "H06HDADXX130110.2.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" | ||
| + ], | ||
| + "NA12878_C": [ | ||
| + "NA12878", "Solexa-NA12878", "H06JUADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" | ||
| + ] | ||
| + }, | ||
| + "ConvertPairedFastQsToUnmappedBamWf.fastq_pairs": { | ||
| + "NA12878_A": [ | ||
| + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_1.fastq", | ||
| + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.1.ATCACGAT.20k_reads_2.fastq" | ||
| + ], | ||
| + "NA12878_B": [ | ||
| + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_1.fastq", | ||
| + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06HDADXX130110.2.ATCACGAT.20k_reads_2.fastq" | ||
| + ], | ||
| + "NA12878_C": [ | ||
| + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_1.fastq", | ||
| + "gs://gatk-test-data/wgs_fastq/NA12878_20k/H06JUADXX130110.1.ATCACGAT.20k_reads_2.fastq" | ||
| + ] | ||
| + }, | ||
| + "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.mem_size": "3500 MB", | ||
| + "ConvertPairedFastQsToUnmappedBamWf.PairedFastQsToUnmappedBAM.disk_size": 200 | ||
| +} |
| @@ -0,0 +1,97 @@ | ||
| +## Copyright Broad Institute, 2017 | ||
| +## | ||
| +## This WDL converts paired FASTQ to uBAM and adds read group information | ||
| +## | ||
| +## Requirements/expectations : | ||
| +## - Paired-end sequencing data in FASTQ format (one file per orientation) | ||
| +## - One or more read groups, one per pair of FASTQ files | ||
| +## | ||
| +## Outputs : | ||
| +## - Set of unmapped BAMs, one per read group | ||
| +## | ||
| +## Cromwell version support | ||
| +## - Successfully tested on v24 | ||
| +## - Does not work on versions < v23 due to output syntax | ||
| +## | ||
| +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
| +## For program versions, see docker containers. | ||
| +## | ||
| +## LICENSING : | ||
| +## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
| +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
| +## be subject to different licenses. Users are responsible for checking that they are | ||
| +## authorized to run all programs before running this script. Please see the docker | ||
| +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
| +## licensing information pertaining to the included programs. | ||
| + | ||
| +# TASK DEFINITIONS | ||
| + | ||
| +# Convert one pair of FASTQ files into a single unmapped BAM (uBAM) with Picard FastqToSam, | ||
| +# writing the caller-supplied read group fields into the output. | ||
| +# | ||
| +# Inputs: | ||
| +#   fastq_1, fastq_2  - the two FASTQ files of the pair (passed as FASTQ and FASTQ2) | ||
| +#   readgroup_name    - read group name; also used as the basename of the output BAM | ||
| +#   sample_name, library_name, platform_unit, run_date, platform_name, sequencing_center | ||
| +#                     - passed through unchanged to the matching FastqToSam arguments | ||
| +#   disk_size         - number placed in the "local-disk <n> HDD" disk request | ||
| +#   mem_size          - memory request for the task, as a string such as "3500 MB" | ||
| +# | ||
| +# Output: | ||
| +#   output_bam        - <readgroup_name>.bam, written to the task working directory | ||
| +task PairedFastQsToUnmappedBAM { | ||
| +  File fastq_1 | ||
| +  File fastq_2 | ||
| +  String readgroup_name | ||
| +  String sample_name | ||
| +  String library_name | ||
| +  String platform_unit | ||
| +  String run_date | ||
| +  String platform_name | ||
| +  String sequencing_center | ||
| +  Int disk_size | ||
| +  String mem_size | ||
| + | ||
| +  # Every interpolated value is double-quoted so that a path or metadata value containing | ||
| +  # spaces reaches Picard as one argument instead of being split by the shell. | ||
| +  # NOTE: the JVM heap is fixed at 3000 MB below, so mem_size should be set above that. | ||
| +  command { | ||
| +    java -Xmx3000m -jar /usr/gitc/picard.jar \ | ||
| +      FastqToSam \ | ||
| +      FASTQ="${fastq_1}" \ | ||
| +      FASTQ2="${fastq_2}" \ | ||
| +      OUTPUT="${readgroup_name}.bam" \ | ||
| +      READ_GROUP_NAME="${readgroup_name}" \ | ||
| +      SAMPLE_NAME="${sample_name}" \ | ||
| +      LIBRARY_NAME="${library_name}" \ | ||
| +      PLATFORM_UNIT="${platform_unit}" \ | ||
| +      RUN_DATE="${run_date}" \ | ||
| +      PLATFORM="${platform_name}" \ | ||
| +      SEQUENCING_CENTER="${sequencing_center}" | ||
| +  } | ||
| +  runtime { | ||
| +    docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | ||
| +    memory: mem_size | ||
| +    cpu: "1" | ||
| +    disks: "local-disk " + disk_size + " HDD" | ||
| +  } | ||
| +  output { | ||
| +    File output_bam = "${readgroup_name}.bam" | ||
| +  } | ||
| +} | ||
| + | ||
| +# WORKFLOW DEFINITION | ||
| +workflow ConvertPairedFastQsToUnmappedBamWf { | ||
| +  # Per-read-group lookup tables, both keyed by the names listed in readgroup_list: | ||
| +  #   metadata[name]    = [sample, library, platform unit, run date, platform, sequencing center] | ||
| +  #   fastq_pairs[name] = [first FASTQ, second FASTQ] | ||
| +  Map[String, Array[String]] metadata | ||
| +  Map[String, Array[File]] fastq_pairs | ||
| +  Array[String] readgroup_list | ||
| + | ||
| +  # One conversion per read group name, run in parallel | ||
| +  scatter (rg in readgroup_list) { | ||
| + | ||
| +    # Build the uBAM for this read group from its FASTQ pair and metadata entry | ||
| +    call PairedFastQsToUnmappedBAM { | ||
| +      input: | ||
| +        readgroup_name = rg, | ||
| +        fastq_1 = fastq_pairs[rg][0], | ||
| +        fastq_2 = fastq_pairs[rg][1], | ||
| +        sample_name = metadata[rg][0], | ||
| +        library_name = metadata[rg][1], | ||
| +        platform_unit = metadata[rg][2], | ||
| +        run_date = metadata[rg][3], | ||
| +        platform_name = metadata[rg][4], | ||
| +        sequencing_center = metadata[rg][5] | ||
| +    } | ||
| +  } | ||
| + | ||
| +  # Workflow-level outputs: one unmapped BAM per read group | ||
| +  output { | ||
| +    Array[File] output_bams = PairedFastQsToUnmappedBAM.output_bam | ||
| +  } | ||
| +} | ||
| + |
| @@ -0,0 +1,16 @@ | ||
| +{ | ||
| + | ||
| + "RevertBamToUnmappedRGBamsWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", | ||
| + "RevertBamToUnmappedRGBamsWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", | ||
| + | ||
| + "RevertBamToUnmappedRGBamsWf.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bam", | ||
| + | ||
| + "RevertBamToUnmappedRGBamsWf.output_dir": ".", | ||
| + | ||
| + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.max_discard_pct": 0.01, | ||
| + | ||
| + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.disk_size": 10, | ||
| + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.mem_size": "1 GB", | ||
| + "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.disk_size": 10, | ||
| + "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.mem_size": "3500 MB" | ||
| +} |
| @@ -0,0 +1,77 @@ | ||
| +## Copyright Broad Institute, 2017 | ||
| +## | ||
| +## This WDL reverts a SAM or BAM file to uBAMs, one per readgroup | ||
| +## | ||
| +## Requirements/expectations : | ||
| +## - Paired-end sequencing data in SAM or BAM format | ||
| +## - One or more read groups | ||
| +## | ||
| +## Outputs : | ||
| +## - Set of unmapped BAMs, one per read group, with reads sorted by queryname | ||
| +## | ||
| +## Cromwell version support | ||
| +## - Successfully tested on v24 | ||
| +## - Does not work on versions < v23 due to output syntax | ||
| +## | ||
| +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
| +## For program versions, see docker containers. | ||
| +## | ||
| +## LICENSING : | ||
| +## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
| +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
| +## be subject to different licenses. Users are responsible for checking that they are | ||
| +## authorized to run all programs before running this script. Please see the docker | ||
| +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
| +## licensing information pertaining to the included programs. | ||
| + | ||
| +# TASK DEFINITIONS | ||
| + | ||
| +# Revert a BAM to unmapped BAMs with Picard RevertSam, writing one output file per read | ||
| +# group (OUTPUT_BY_READGROUP=true) with reads sorted by queryname. | ||
| +# | ||
| +# Inputs: | ||
| +#   input_bam        - the BAM to revert | ||
| +#   output_dir       - directory passed to RevertSam as the output location | ||
| +#   max_discard_pct  - optional; passed to MAX_DISCARD_FRACTION, so despite the name it is | ||
| +#                      a fraction (the example inputs use 0.01). When it is not supplied | ||
| +#                      the argument is omitted entirely. | ||
| +#   disk_size        - number placed in the "local-disk <n> HDD" disk request | ||
| +#   mem_size         - memory request for the task; the JVM heap below is fixed at 1000 MB, | ||
| +#                      so this should leave headroom above that | ||
| +# | ||
| +# Output: | ||
| +#   unmapped_bams    - every *.bam matched by the glob in the output block | ||
| +task RevertBamToUnmappedRGBams { | ||
| +  File input_bam | ||
| +  String output_dir | ||
| +  Float? max_discard_pct | ||
| +  Int disk_size | ||
| +  String mem_size | ||
| + | ||
| +  # MAX_DISCARD_FRACTION is built with the "prefix + optional" form so that an unset | ||
| +  # max_discard_pct renders as nothing rather than a dangling "MAX_DISCARD_FRACTION=". | ||
| +  # Paths are double-quoted so that spaces do not split them into separate arguments. | ||
| +  command { | ||
| +    java -Xmx1000m -jar /usr/gitc/picard.jar \ | ||
| +      RevertSam \ | ||
| +      INPUT="${input_bam}" \ | ||
| +      O="${output_dir}" \ | ||
| +      OUTPUT_BY_READGROUP=true \ | ||
| +      VALIDATION_STRINGENCY=LENIENT \ | ||
| +      SANITIZE=TRUE \ | ||
| +      ${"MAX_DISCARD_FRACTION=" + max_discard_pct} \ | ||
| +      ATTRIBUTE_TO_CLEAR=FT \ | ||
| +      SORT_ORDER=queryname | ||
| +  } | ||
| +  runtime { | ||
| +    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" | ||
| +    disks: "local-disk " + disk_size + " HDD" | ||
| +    memory: mem_size | ||
| +  } | ||
| +  output { | ||
| +    # NOTE(review): this pattern has no directory component, so it presumably only picks up | ||
| +    # files when output_dir is "." (as in the example inputs) - confirm before using | ||
| +    # another output_dir. | ||
| +    Array[File] unmapped_bams = glob("*.bam") | ||
| +  } | ||
| +} | ||
| + | ||
| +# WORKFLOW DEFINITION | ||
| +workflow RevertBamToUnmappedRGBamsWf { | ||
| +  # BAM to revert, and the directory handed to RevertSam as its output location | ||
| +  File input_bam | ||
| +  String output_dir | ||
| + | ||
| +  # ref_fasta and ref_fasta_index are not passed to any call in this workflow, so they are | ||
| +  # optional. They stay declared so that existing inputs files which set them keep working. | ||
| +  File? ref_fasta | ||
| +  File? ref_fasta_index | ||
| + | ||
| +  # Revert inputs to unmapped, one BAM per read group | ||
| +  call RevertBamToUnmappedRGBams { | ||
| +    input: | ||
| +      input_bam = input_bam, | ||
| +      output_dir = output_dir | ||
| +  } | ||
| + | ||
| +  # Outputs that will be retained when execution is complete | ||
| +  output { | ||
| +    Array[File] unmapped_bams_output = RevertBamToUnmappedRGBams.unmapped_bams | ||
| +  } | ||
| +} |
| @@ -0,0 +1,11 @@ | ||
| +{ | ||
| + "RevertRGBamsToPairedFastQsWf.bam_list": [ | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", | ||
| + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" | ||
| + ], | ||
| + | ||
| + "RevertRGBamsToPairedFastQsWf.RevertBAMToPairedFASTQ.mem_size": "3500 MB", | ||
| + "RevertRGBamsToPairedFastQsWf.RevertBAMToPairedFASTQ.disk_size": 200 | ||
| + | ||
| +} |
| @@ -0,0 +1,81 @@ | ||
| +## Copyright Broad Institute, 2017 | ||
| +## | ||
| +## This WDL reverts a set of single-readgroup BAMs to paired FASTQs | ||
| +## | ||
| +## Requirements/expectations: | ||
| +## - List of valid BAM files | ||
| +## - Max one readgroup per BAM file. If there are more, the distinctions will be lost! | ||
| +## | ||
| +## Outputs: | ||
| +## - Sets of two FASTQ files of paired reads (*_1 and *_2) plus one FASTQ file of | ||
| +## unpaired reads (*_unp) per input file. | ||
| +## | ||
| +## Cromwell version support | ||
| +## - Successfully tested on v24 | ||
| +## - Does not work on versions < v23 due to output syntax | ||
| +## | ||
| +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
| +## For program versions, see docker containers. | ||
| +## | ||
| +## LICENSING : | ||
| +## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
| +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
| +## be subject to different licenses. Users are responsible for checking that they are | ||
| +## authorized to run all programs before running this script. Please see the docker | ||
| +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
| +## licensing information pertaining to the included programs. | ||
| + | ||
| +# TASK DEFINITIONS | ||
| + | ||
| +# Run Picard SamToFastq to revert one BAM to FASTQ files. | ||
| +# | ||
| +# Inputs: | ||
| +#   bam_file         - the BAM to revert | ||
| +#   output_basename  - prefix for the FASTQ files written to the task working directory: | ||
| +#                      <basename>_1.fastq, <basename>_2.fastq and <basename>_unp.fastq | ||
| +#   disk_size        - number placed in the "local-disk <n> HDD" disk request | ||
| +#   mem_size         - memory request for the task, as a string such as "3500 MB" | ||
| +# | ||
| +# Output: | ||
| +#   output_fastqs    - every *.fastq file found in the task working directory | ||
| +task RevertBAMToPairedFASTQ { | ||
| +  File bam_file | ||
| +  String output_basename | ||
| +  Int disk_size | ||
| +  String mem_size | ||
| + | ||
| +  # Paths are double-quoted so that a file name containing spaces reaches Picard as one | ||
| +  # argument instead of being split by the shell. | ||
| +  # NOTE: the JVM heap is fixed at 3000 MB below, so mem_size should be set above that. | ||
| +  command { | ||
| +    java -Xmx3000m -jar /usr/gitc/picard.jar \ | ||
| +      SamToFastq \ | ||
| +      I="${bam_file}" \ | ||
| +      FASTQ="${output_basename}_1.fastq" \ | ||
| +      SECOND_END_FASTQ="${output_basename}_2.fastq" \ | ||
| +      UNPAIRED_FASTQ="${output_basename}_unp.fastq" \ | ||
| +      INCLUDE_NON_PRIMARY_ALIGNMENTS=true \ | ||
| +      INCLUDE_NON_PF_READS=true | ||
| +  } | ||
| +  runtime { | ||
| +    docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" | ||
| +    memory: mem_size | ||
| +    cpu: "1" | ||
| +    disks: "local-disk " + disk_size + " HDD" | ||
| +  } | ||
| +  output { | ||
| +    Array[File] output_fastqs = glob("*.fastq") | ||
| +  } | ||
| +} | ||
| + | ||
| +# WORKFLOW DEFINITION | ||
| +workflow RevertRGBamsToPairedFastQsWf { | ||
| +  # BAM files to revert; each one is processed independently | ||
| +  Array[File] bam_list | ||
| + | ||
| +  # Process input files in parallel | ||
| +  scatter (input_bam in bam_list) { | ||
| + | ||
| +    # Regexes used to turn the input path into an output basename. | ||
| +    # The first removes everything up to and including the last "/" for any path | ||
| +    # (not only gs:// URLs); the second removes a literal ".bam" suffix. | ||
| +    String sub_strip_path = "^.*/" | ||
| +    String sub_strip_suffix = "[.]bam$" | ||
| + | ||
| +    # Revert inputs to paired FASTQ | ||
| +    call RevertBAMToPairedFASTQ { | ||
| +      input: | ||
| +        bam_file = input_bam, | ||
| +        output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") | ||
| +    } | ||
| +  } | ||
| + | ||
| +  # Outputs that will be retained when execution is complete: | ||
| +  # one array of FASTQ files per input BAM | ||
| +  output { | ||
| +    Array[Array[File]] output_fastqs_globs = RevertBAMToPairedFASTQ.output_fastqs | ||
| +  } | ||
| +} | ||
| + |