diff --git a/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.b37.wgs.inputs.json b/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.b37.wgs.inputs.json new file mode 100644 index 0000000..895e21d --- /dev/null +++ b/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.b37.wgs.inputs.json @@ -0,0 +1,46 @@ +{ + "##_COMMENT1": "SAMPLE NAME AND UNMAPPED BAMS", + "GenericPreProcessingToGVCFWorkflow.sample_name": "NA12878", + "GenericPreProcessingToGVCFWorkflow.base_file_name": "NA12878_20k.b37", + "GenericPreProcessingToGVCFWorkflow.flowcell_unmapped_bams": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ], + "GenericPreProcessingToGVCFWorkflow.final_gvcf_name": "NA12878_20k.b37.g.vcf.gz", + "GenericPreProcessingToGVCFWorkflow.unmapped_bam_suffix": ".bam", + + "##_COMMENT2": "INTERVALS", + "GenericPreProcessingToGVCFWorkflow.calling_interval_list": "gs://gatk-legacy-bundles/b37/wgs_calling_regions.v1.interval_list", + "GenericPreProcessingToGVCFWorkflow.scattered_calling_intervals_list": "gs://gatk-legacy-bundles/b37/wgs_scattered_calling_intervals.txt", + "GenericPreProcessingToGVCFWorkflow.HaplotypeCaller.interval_padding": 0, + + "##_COMMENT3": "REFERENCE FILES", + "GenericPreProcessingToGVCFWorkflow.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict", + "GenericPreProcessingToGVCFWorkflow.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", + "GenericPreProcessingToGVCFWorkflow.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", + "GenericPreProcessingToGVCFWorkflow.ref_sa": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.sa", + "GenericPreProcessingToGVCFWorkflow.ref_amb": 
"gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.amb", + "GenericPreProcessingToGVCFWorkflow.ref_bwt": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.bwt", + "GenericPreProcessingToGVCFWorkflow.ref_ann": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.ann", + "GenericPreProcessingToGVCFWorkflow.ref_pac": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.pac", + + "##_COMMENT4": "KNOWN SITES RESOURCES", + "GenericPreProcessingToGVCFWorkflow.dbSNP_vcf": "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf", + "GenericPreProcessingToGVCFWorkflow.dbSNP_vcf_index": "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf.idx", + "GenericPreProcessingToGVCFWorkflow.known_indels_sites_VCFs": [ + "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" + ], + "GenericPreProcessingToGVCFWorkflow.known_indels_sites_indices": [ + "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.idx" + ], + + "##_COMMENT5": "DISK SIZES + PREEMPTIBLES", + "GenericPreProcessingToGVCFWorkflow.agg_small_disk": 200, + "GenericPreProcessingToGVCFWorkflow.agg_medium_disk": 300, + "GenericPreProcessingToGVCFWorkflow.agg_large_disk": 400, + "GenericPreProcessingToGVCFWorkflow.agg_preemptible_tries": 3, + "GenericPreProcessingToGVCFWorkflow.flowcell_small_disk": 100, + "GenericPreProcessingToGVCFWorkflow.flowcell_medium_disk": 200, + "GenericPreProcessingToGVCFWorkflow.preemptible_tries": 3 +} diff --git a/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.hg38.wgs.inputs.json b/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.hg38.wgs.inputs.json new file mode 100644 index 0000000..d6215ef --- /dev/null +++ b/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.hg38.wgs.inputs.json @@ -0,0 +1,49 @@ +{ + "##_COMMENT1": "SAMPLE NAME AND UNMAPPED BAMS", + "GenericPreProcessingToGVCFWorkflow.sample_name": "NA12878", + "GenericPreProcessingToGVCFWorkflow.base_file_name": "NA12878_20k.hg38", + 
"GenericPreProcessingToGVCFWorkflow.flowcell_unmapped_bams": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ], + "GenericPreProcessingToGVCFWorkflow.final_gvcf_name": "NA12878_20k.hg38.g.vcf.gz", + "GenericPreProcessingToGVCFWorkflow.unmapped_bam_suffix": ".bam", + + "##_COMMENT2": "INTERVALS", + "GenericPreProcessingToGVCFWorkflow.calling_interval_list": "gs://genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", + "GenericPreProcessingToGVCFWorkflow.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt", + "GenericPreProcessingToGVCFWorkflow.HaplotypeCaller.interval_padding": 0, + + "##_COMMENT3": "REFERENCE FILES", + "GenericPreProcessingToGVCFWorkflow.ref_dict": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", + "GenericPreProcessingToGVCFWorkflow.ref_fasta": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", + "GenericPreProcessingToGVCFWorkflow.ref_fasta_index": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "GenericPreProcessingToGVCFWorkflow.ref_alt": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", + "GenericPreProcessingToGVCFWorkflow.ref_sa": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", + "GenericPreProcessingToGVCFWorkflow.ref_amb": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", + "GenericPreProcessingToGVCFWorkflow.ref_bwt": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", + "GenericPreProcessingToGVCFWorkflow.ref_ann": 
"gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", + "GenericPreProcessingToGVCFWorkflow.ref_pac": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", + + "##_COMMENT4": "KNOWN SITES RESOURCES", + "GenericPreProcessingToGVCFWorkflow.dbSNP_vcf": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "GenericPreProcessingToGVCFWorkflow.dbSNP_vcf_index": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "GenericPreProcessingToGVCFWorkflow.known_indels_sites_VCFs": [ + "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" + ], + "GenericPreProcessingToGVCFWorkflow.known_indels_sites_indices": [ + "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" + ], + + "##_COMMENT5": "DISK SIZES + PREEMPTIBLES", + "GenericPreProcessingToGVCFWorkflow.agg_small_disk": 200, + "GenericPreProcessingToGVCFWorkflow.agg_medium_disk": 300, + "GenericPreProcessingToGVCFWorkflow.agg_large_disk": 400, + "GenericPreProcessingToGVCFWorkflow.agg_preemptible_tries": 3, + "GenericPreProcessingToGVCFWorkflow.flowcell_small_disk": 100, + "GenericPreProcessingToGVCFWorkflow.flowcell_medium_disk": 200, + "GenericPreProcessingToGVCFWorkflow.preemptible_tries": 3 +} diff --git a/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.wdl b/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.wdl new file mode 100644 index 0000000..1b1f534 --- /dev/null +++ b/scripts/broad_dsde_workflows/GenericPreProcessingToGVCF_170420.wdl @@ -0,0 +1,689 @@ +## Copyright Broad Institute, 2017 +## +## This WDL pipeline implements data pre-processing 
and initial variant calling (GVCF +## generation) according to the GATK Best Practices (June 2016) for germline SNP and +## Indel discovery in human sequencing data. Example JSONs are provided for the WGS +## use case but the workflow can be applied to Exomes and Targeted Panels by swapping +## out the intervals files and specifying a non-zero value for interval padding (we +## recommend 100). +## +## Requirements/expectations : +## - Pair-end sequencing data in unmapped BAM (uBAM) format +## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) +## - Input uBAM files must additionally comply with the following requirements: +## - - filenames all have the same suffix (we use ".unmapped.bam") +## - - files must pass validation by ValidateSamFile +## - - reads are provided in query-sorted order +## - - all reads must have an RG tag +## - GVCF output names must end in ".g.vcf.gz" +## +## Outputs : +## - A clean BAM file and its index, suitable for variant discovery analyses. +## - A GVCF file of intermediate calls produced by HaplotypeCaller, suitable for +## joint discovery analysis. +## +## Cromwell version support +## - Successfully tested on C25 +## - Does not work on versions < C25 due to optional File input (C24 bug) and output +## syntax (unsupported < C23) +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+ +# TASK DEFINITIONS + +# Get version of BWA +task GetBwaVersion { + command { + # Not setting "set -o pipefail" here because /bwa has a rc=1 and we don't want to allow rc=1 to succeed + # because the sed may also fail with that error and that is something we actually want to fail on. + /usr/gitc/bwa 2>&1 | \ + grep -e '^Version' | \ + sed 's/Version: //' + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "1 GB" + } + output { + String version = read_string(stdout()) + } +} + +# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment +task SamToFastqAndBwaMem { + File input_bam + String bwa_commandline + String output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + + # This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit), + # listing the reference contigs that are "alternative". Leave blank in JSON for legacy + # references such as b37 and hg19. + File? ref_alt + + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + File ref_sa + + Int disk_size + + command <<< + set -o pipefail + set -e + + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + + java -Xmx3000m -jar /usr/gitc/picard.jar \ + SamToFastq \ + INPUT=${input_bam} \ + FASTQ=/dev/stdout \ + INTERLEAVE=true \ + NON_PF=true | \ + /usr/gitc/${bwa_commandline} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \ + samtools view -1 - > ${output_bam_basename}.bam + + >>> + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "14 GB" + cpu: "16" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log" + } +} + +# Merge original input uBAM file with BWA-aligned BAM file +task MergeBamAlignment { + File unmapped_bam + String bwa_commandline + String bwa_version + File aligned_bam + String 
output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + + Int disk_size + + command { + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + java -Xmx2500m -jar /usr/gitc/picard.jar \ + MergeBamAlignment \ + VALIDATION_STRINGENCY=SILENT \ + EXPECTED_ORIENTATIONS=FR \ + ATTRIBUTES_TO_RETAIN=X0 \ + ALIGNED_BAM=${aligned_bam} \ + UNMAPPED_BAM=${unmapped_bam} \ + OUTPUT=${output_bam_basename}.bam \ + REFERENCE_SEQUENCE=${ref_fasta} \ + PAIRED_RUN=true \ + SORT_ORDER="unsorted" \ + IS_BISULFITE_SEQUENCE=false \ + ALIGNED_READS_ONLY=false \ + CLIP_ADAPTERS=false \ + MAX_RECORDS_IN_RAM=2000000 \ + ADD_MATE_CIGAR=true \ + MAX_INSERTIONS_OR_DELETIONS=-1 \ + PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ + PROGRAM_RECORD_ID="bwamem" \ + PROGRAM_GROUP_VERSION="${bwa_version}" \ + PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \ + PROGRAM_GROUP_NAME="bwamem" \ + UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ + ALIGNER_PROPER_PAIR_FLAGS=true \ + UNMAP_CONTAMINANT_READS=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3500 MB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + } +} + +# Sort BAM file by coordinate order and fix tag values for NM and UQ +task SortAndFixTags { + File input_bam + String output_bam_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + + command { + set -o pipefail + + java -Xmx4000m -jar /usr/gitc/picard.jar \ + SortSam \ + INPUT=${input_bam} \ + OUTPUT=/dev/stdout \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=false \ + CREATE_MD5_FILE=false | \ + java -Xmx500m -jar /usr/gitc/picard.jar \ + SetNmAndUqTags \ + INPUT=/dev/stdin \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + REFERENCE_SEQUENCE=${ref_fasta} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + disks: "local-disk " + disk_size + " HDD" + 
cpu: "1" + memory: "5000 MB" + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + } +} + +# Mark duplicate reads to avoid counting non-independent observations +task MarkDuplicates { + Array[File] input_bams + String output_bam_basename + String metrics_filename + Int disk_size + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly. + # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. + # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + command { + java -Xmx4000m -jar /usr/gitc/picard.jar \ + MarkDuplicates \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_basename}.bam \ + METRICS_FILE=${metrics_filename} \ + VALIDATION_STRINGENCY=SILENT \ + OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ + ASSUME_SORT_ORDER="queryname" \ + CREATE_MD5_FILE=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "7 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File duplicate_metrics = "${metrics_filename}" + } +} + +# Generate sets of intervals for scatter-gathering over chromosomes +task CreateSequenceGroupingTSV { + File ref_dict + + # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. + # It outputs to stdout where it is parsed into a wdl Array[Array[String]] + # e.g. 
[["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] + command <<< + python <>> + runtime { + docker: "python:2.7" + memory: "2 GB" + } + output { + Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") + Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") + } +} + +# Generate Base Quality Score Recalibration (BQSR) model +task BaseRecalibrator { + File input_bam + File input_bam_index + String recalibration_report_filename + Array[String] sequence_group_interval + File dbSNP_vcf + File dbSNP_vcf_index + Array[File] known_indels_sites_VCFs + Array[File] known_indels_sites_indices + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + + command { + java -Xmx4000m \ + -jar /usr/gitc/GATK35.jar \ + -T BaseRecalibrator \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -o ${recalibration_report_filename} \ + -knownSites ${dbSNP_vcf} \ + -knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "6 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File recalibration_report = "${recalibration_report_filename}" + } +} + +# Combine multiple recalibration tables from scattered BaseRecalibrator runs +# Note that when run from GATK 3.x the tool is not a walker and is invoked differently. 
+task GatherBqsrReports { + Array[File] input_bqsr_reports + String output_report_filename + Int disk_size + + command { + java -Xmx3000m \ + -cp /usr/gitc/GATK35.jar org.broadinstitute.gatk.tools.GatherBqsrReports \ + I= ${sep=' I= ' input_bqsr_reports} \ + O= ${output_report_filename} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bqsr_report = "${output_report_filename}" + } +} + +# Apply Base Quality Score Recalibration (BQSR) model +task ApplyBQSR { + File input_bam + File input_bam_index + String output_bam_basename + File recalibration_report + Array[String] sequence_group_interval + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + + command { + java -Xmx3000m \ + -jar /usr/gitc/GATK35.jar \ + -T PrintReads \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -o ${output_bam_basename}.bam \ + -BQSR ${recalibration_report} \ + -SQQ 10 -SQQ 20 -SQQ 30 \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File recalibrated_bam = "${output_bam_basename}.bam" + } +} + +# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs +task GatherBamFiles { + Array[File] input_bams + String output_bam_basename + Int disk_size + + command { + java -Xmx2000m -jar /usr/gitc/picard.jar \ + GatherBamFiles \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + 
} +} + +# Call variants on a single sample with HaplotypeCaller to produce a GVCF +task HaplotypeCaller { + File input_bam + File input_bam_index + File interval_list + Int interval_padding + String gvcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + + command { + java -Xmx8000m \ + -jar /usr/gitc/GATK35.jar \ + -T HaplotypeCaller \ + -R ${ref_fasta} \ + -o ${gvcf_basename}.vcf.gz \ + -I ${input_bam} \ + -L ${interval_list} \ + -ip ${interval_padding} \ + -ERC GVCF \ + --max_alternate_alleles 3 \ + -variant_index_parameter 128000 \ + -variant_index_type LINEAR \ + --read_filter OverclippedRead + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "10 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_gvcf = "${gvcf_basename}.vcf.gz" + File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi" + } +} + +# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs +task MergeVCFs { + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_vcf_name + Int disk_size + + # Using MergeVcfs instead of GatherVcfs so we can create indices + # See https://github.com/broadinstitute/picard/issues/789 for relevant GatherVcfs ticket + command { + java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${output_vcf_name} + } + output { + File output_vcf = "${output_vcf_name}" + File output_vcf_index = "${output_vcf_name}.tbi" + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +# WORKFLOW DEFINITION +workflow GenericPreProcessingToGVCFWorkflow { + + String sample_name + String base_file_name + String final_gvcf_name + Array[File] flowcell_unmapped_bams + String unmapped_bam_suffix + + File scattered_calling_intervals_list + File calling_interval_list + + File ref_fasta + File ref_fasta_index + File ref_dict + 
File? ref_alt + File ref_bwt + File ref_sa + File ref_amb + File ref_ann + File ref_pac + + File dbSNP_vcf + File dbSNP_vcf_index + Array[File] known_indels_sites_VCFs + Array[File] known_indels_sites_indices + + Int flowcell_small_disk + Int flowcell_medium_disk + Int agg_small_disk + Int agg_medium_disk + Int agg_large_disk + + String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" + + String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated" + + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + # Get the version of BWA to include in the PG record in the header of the BAM produced + # by MergeBamAlignment. + call GetBwaVersion + + # Align flowcell-level unmapped input bams in parallel + scatter (unmapped_bam in flowcell_unmapped_bams) { + + # Because of a wdl/cromwell bug this is not currently valid so we have to sub(sub()) in each task + # String base_name = sub(sub(unmapped_bam, "gs://.*/", ""), unmapped_bam_suffix + "$", "") + + String sub_strip_path = "gs://.*/" + String sub_strip_unmapped = unmapped_bam_suffix + "$" + + # Map reads to reference + call SamToFastqAndBwaMem { + input: + input_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmerged", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + ref_alt = ref_alt, + ref_bwt = ref_bwt, + ref_amb = ref_amb, + ref_ann = ref_ann, + ref_pac = ref_pac, + ref_sa = ref_sa, + disk_size = flowcell_medium_disk + } + + # Merge original uBAM and BWA-aligned BAM + call MergeBamAlignment { + input: + unmapped_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + bwa_version = GetBwaVersion.version, + aligned_bam = SamToFastqAndBwaMem.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".aligned.unsorted", + ref_fasta = 
ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = flowcell_medium_disk + } + + # Sort and fix tags in the merged BAM + call SortAndFixTags as SortAndFixReadGroupBam { + input: + input_bam = MergeBamAlignment.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = flowcell_medium_disk + } + + } + + # Aggregate aligned+merged flowcell BAM files and mark duplicates + # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output + # to avoid having to spend time just merging BAM files. + call MarkDuplicates { + input: + input_bams = MergeBamAlignment.output_bam, + output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked", + metrics_filename = base_file_name + ".duplicate_metrics", + disk_size = agg_large_disk + } + + # Sort aggregated+deduped BAM file and fix tags + call SortAndFixTags as SortAndFixSampleBam { + input: + input_bam = MarkDuplicates.output_bam, + output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_large_disk + } + + # Create list of sequences for scatter-gather parallelization + call CreateSequenceGroupingTSV { + input: + ref_dict = ref_dict + } + + # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { + # Generate the recalibration model by interval + call BaseRecalibrator { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + recalibration_report_filename = base_file_name + ".recal_data.csv", + sequence_group_interval = subgroup, + dbSNP_vcf = dbSNP_vcf, + dbSNP_vcf_index = dbSNP_vcf_index, + known_indels_sites_VCFs = 
known_indels_sites_VCFs, + known_indels_sites_indices = known_indels_sites_indices, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk + } + } + + # Merge the recalibration reports resulting from by-interval recalibration + call GatherBqsrReports { + input: + input_bqsr_reports = BaseRecalibrator.recalibration_report, + output_report_filename = base_file_name + ".recal_data.csv", + disk_size = flowcell_small_disk + } + + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { + + # Apply the recalibration model by interval + call ApplyBQSR { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + output_bam_basename = recalibrated_bam_basename, + recalibration_report = GatherBqsrReports.output_bqsr_report, + sequence_group_interval = subgroup, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk + } + } + + # Merge the recalibrated BAM files resulting from by-interval recalibration + call GatherBamFiles { + input: + input_bams = ApplyBQSR.recalibrated_bam, + output_bam_basename = base_file_name, + disk_size = agg_large_disk + } + + # Call variants in parallel over calling intervals + scatter (subInterval in scattered_calling_intervals) { + + # Generate GVCF by interval + call HaplotypeCaller { + input: + input_bam = GatherBamFiles.output_bam, + input_bam_index = GatherBamFiles.output_bam_index, + interval_list = subInterval, + gvcf_basename = base_file_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk + } + } + + # Combine by-interval GVCFs into a single sample GVCF file + call MergeVCFs { + input: + input_vcfs = HaplotypeCaller.output_gvcf, + input_vcfs_indexes = HaplotypeCaller.output_gvcf_index, + output_vcf_name = final_gvcf_name, + disk_size = agg_small_disk + } + + # Outputs that will 
be retained when execution is complete + output { + File duplication_metrics = MarkDuplicates.duplicate_metrics + File bqsr_report = GatherBqsrReports.output_bqsr_report + File analysis_ready_bam = GatherBamFiles.output_bam + File analysis_ready_bam_index = GatherBamFiles.output_bam_index + File analysis_ready_bam_md5 = GatherBamFiles.output_bam_md5 + File final_output_gvcf = MergeVCFs.output_vcf + File final_output_gvcf_index = MergeVCFs.output_vcf_index + } +} diff --git a/scripts/broad_dsde_workflows/GenericPreProcessing_170421.hg38.wgs.inputs.json b/scripts/broad_dsde_workflows/GenericPreProcessing_170421.hg38.wgs.inputs.json new file mode 100644 index 0000000..361fe6c --- /dev/null +++ b/scripts/broad_dsde_workflows/GenericPreProcessing_170421.hg38.wgs.inputs.json @@ -0,0 +1,43 @@ +{ + "##_COMMENT1": "SAMPLE NAME AND UNMAPPED BAMS", + "GenericPreProcessingWorkflow.sample_name": "NA12878", + "GenericPreProcessingWorkflow.base_file_name": "NA12878_20k.hg38", + "GenericPreProcessingWorkflow.flowcell_unmapped_bams": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ], + "GenericPreProcessingWorkflow.unmapped_bam_suffix": ".bam", + + "##_COMMENT2": "REFERENCE FILES", + "GenericPreProcessingWorkflow.ref_dict": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict", + "GenericPreProcessingWorkflow.ref_fasta": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta", + "GenericPreProcessingWorkflow.ref_fasta_index": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "GenericPreProcessingWorkflow.ref_alt": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", + 
"GenericPreProcessingWorkflow.ref_sa": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", + "GenericPreProcessingWorkflow.ref_amb": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", + "GenericPreProcessingWorkflow.ref_bwt": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", + "GenericPreProcessingWorkflow.ref_ann": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", + "GenericPreProcessingWorkflow.ref_pac": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", + + "##_COMMENT3": "KNOWN SITES RESOURCES", + "GenericPreProcessingWorkflow.dbSNP_vcf": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "GenericPreProcessingWorkflow.dbSNP_vcf_index": "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "GenericPreProcessingWorkflow.known_indels_sites_VCFs": [ + "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz" + ], + "GenericPreProcessingWorkflow.known_indels_sites_indices": [ + "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "gs://genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi" + ], + + "##_COMMENT4": "DISK SIZES + PREEMPTIBLES", + "GenericPreProcessingWorkflow.agg_small_disk": 200, + "GenericPreProcessingWorkflow.agg_medium_disk": 300, + "GenericPreProcessingWorkflow.agg_large_disk": 400, + "GenericPreProcessingWorkflow.agg_preemptible_tries": 3, + "GenericPreProcessingWorkflow.flowcell_small_disk": 100, + "GenericPreProcessingWorkflow.flowcell_medium_disk": 200, + "GenericPreProcessingWorkflow.preemptible_tries": 3 +} diff --git 
a/scripts/broad_dsde_workflows/GenericPreProcessing_170421.wdl b/scripts/broad_dsde_workflows/GenericPreProcessing_170421.wdl new file mode 100644 index 0000000..f874d10 --- /dev/null +++ b/scripts/broad_dsde_workflows/GenericPreProcessing_170421.wdl @@ -0,0 +1,583 @@ +## Copyright Broad Institute, 2017 +## +## This WDL pipeline implements data pre-processing according to the GATK Best Practices +## (June 2016) for SNP and Indel discovery. Example JSONs are provided for the WGS +## use case but the workflow can be applied to Exomes and Targeted Panels. +## +## Requirements/expectations : +## - Pair-end sequencing data in unmapped BAM (uBAM) format +## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) +## - Input uBAM files must additionally comply with the following requirements: +## - - filenames all have the same suffix (we use ".unmapped.bam") +## - - files must pass validation by ValidateSamFile +## - - reads are provided in query-sorted order +## - - all reads must have an RG tag +## +## Output : +## - A clean BAM file and its index, suitable for variant discovery analyses. +## +## Cromwell version support +## - Successfully tested on v25 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+ +# TASK DEFINITIONS + +# Get version of BWA +task GetBwaVersion { + command { + # Not setting "set -o pipefail" here because /bwa has a rc=1 and we don't want to allow rc=1 to succeed + # because the sed may also fail with that error and that is something we actually want to fail on. + /usr/gitc/bwa 2>&1 | \ + grep -e '^Version' | \ + sed 's/Version: //' + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "1 GB" + } + output { + String version = read_string(stdout()) + } +} + +# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment +task SamToFastqAndBwaMem { + File input_bam + String bwa_commandline + String output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + + # This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit), + # listing the reference contigs that are "alternative". Leave blank in JSON for legacy + # references such as b37 and hg19. + File? ref_alt + + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + File ref_sa + + Int disk_size + + command <<< + set -o pipefail + set -e + + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + + java -Xmx3000m -jar /usr/gitc/picard.jar \ + SamToFastq \ + INPUT=${input_bam} \ + FASTQ=/dev/stdout \ + INTERLEAVE=true \ + NON_PF=true | \ + /usr/gitc/${bwa_commandline} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \ + samtools view -1 - > ${output_bam_basename}.bam + + >>> + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "14 GB" + cpu: "16" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log" + } +} + +# Merge original input uBAM file with BWA-aligned BAM file +task MergeBamAlignment { + File unmapped_bam + String bwa_commandline + String bwa_version + File aligned_bam + String 
output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + + Int disk_size + + command { + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + java -Xmx2500m -jar /usr/gitc/picard.jar \ + MergeBamAlignment \ + VALIDATION_STRINGENCY=SILENT \ + EXPECTED_ORIENTATIONS=FR \ + ATTRIBUTES_TO_RETAIN=X0 \ + ALIGNED_BAM=${aligned_bam} \ + UNMAPPED_BAM=${unmapped_bam} \ + OUTPUT=${output_bam_basename}.bam \ + REFERENCE_SEQUENCE=${ref_fasta} \ + PAIRED_RUN=true \ + SORT_ORDER="unsorted" \ + IS_BISULFITE_SEQUENCE=false \ + ALIGNED_READS_ONLY=false \ + CLIP_ADAPTERS=false \ + MAX_RECORDS_IN_RAM=2000000 \ + ADD_MATE_CIGAR=true \ + MAX_INSERTIONS_OR_DELETIONS=-1 \ + PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ + PROGRAM_RECORD_ID="bwamem" \ + PROGRAM_GROUP_VERSION="${bwa_version}" \ + PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \ + PROGRAM_GROUP_NAME="bwamem" \ + UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ + ALIGNER_PROPER_PAIR_FLAGS=true \ + UNMAP_CONTAMINANT_READS=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3500 MB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + } +} + +# Sort BAM file by coordinate order and fix tag values for NM and UQ +task SortAndFixTags { + File input_bam + String output_bam_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + + command { + set -o pipefail + + java -Xmx4000m -jar /usr/gitc/picard.jar \ + SortSam \ + INPUT=${input_bam} \ + OUTPUT=/dev/stdout \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=false \ + CREATE_MD5_FILE=false | \ + java -Xmx500m -jar /usr/gitc/picard.jar \ + SetNmAndUqTags \ + INPUT=/dev/stdin \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + REFERENCE_SEQUENCE=${ref_fasta} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + disks: "local-disk " + disk_size + " HDD" + 
cpu: "1" + memory: "5000 MB" + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + } +} + +# Mark duplicate reads to avoid counting non-independent observations +task MarkDuplicates { + Array[File] input_bams + String output_bam_basename + String metrics_filename + Int disk_size + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly. + # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. + # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + command { + java -Xmx4000m -jar /usr/gitc/picard.jar \ + MarkDuplicates \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_basename}.bam \ + METRICS_FILE=${metrics_filename} \ + VALIDATION_STRINGENCY=SILENT \ + OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ + ASSUME_SORT_ORDER="queryname" + CREATE_MD5_FILE=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "7 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File duplicate_metrics = "${metrics_filename}" + } +} + +# Generate sets of intervals for scatter-gathering over chromosomes +task CreateSequenceGroupingTSV { + File ref_dict + + # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. + # It outputs to stdout where it is parsed into a wdl Array[Array[String]] + # e.g. 
[["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] + command <<< + python <>> + runtime { + docker: "python:2.7" + memory: "2 GB" + } + output { + Array[Array[String]] sequence_grouping = read_tsv("sequence_grouping.txt") + Array[Array[String]] sequence_grouping_with_unmapped = read_tsv("sequence_grouping_with_unmapped.txt") + } +} + +# Generate Base Quality Score Recalibration (BQSR) model +task BaseRecalibrator { + File input_bam + File input_bam_index + String recalibration_report_filename + Array[String] sequence_group_interval + File dbSNP_vcf + File dbSNP_vcf_index + Array[File] known_indels_sites_VCFs + Array[File] known_indels_sites_indices + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + + command { // + java -Xmx4000m \ + -jar /usr/gitc/GATK35.jar \ + -T BaseRecalibrator \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -o ${recalibration_report_filename} \ + -knownSites ${dbSNP_vcf} \ + -knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "6 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File recalibration_report = "${recalibration_report_filename}" + } +} + +# Combine multiple recalibration tables from scattered BaseRecalibrator runs +# Note that when run from GATK 3.x the tool is not a walker and is invoked differently. 
+task GatherBqsrReports { + Array[File] input_bqsr_reports + String output_report_filename + Int disk_size + + command { + java -Xmx3000m \ + -cp /usr/gitc/GATK35.jar org.broadinstitute.gatk.tools.GatherBqsrReports \ + I= ${sep=' I= ' input_bqsr_reports} \ + O= ${output_report_filename} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bqsr_report = "${output_report_filename}" + } +} + +# Apply Base Quality Score Recalibration (BQSR) model +task ApplyBQSR { + File input_bam + File input_bam_index + String output_bam_basename + File recalibration_report + Array[String] sequence_group_interval + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + + command { + java -Xmx3000m \ + -jar /usr/gitc/GATK35.jar \ + -T PrintReads \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -o ${output_bam_basename}.bam \ + -BQSR ${recalibration_report} \ + -SQQ 10 -SQQ 20 -SQQ 30 \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File recalibrated_bam = "${output_bam_basename}.bam" + } +} + +# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs +task GatherBamFiles { + Array[File] input_bams + String output_bam_basename + Int disk_size + + command { + java -Xmx2000m -jar /usr/gitc/picard.jar \ + GatherBamFiles \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + 
} +} + +# WORKFLOW DEFINITION +workflow GenericPreProcessingWorkflow { + + String sample_name + String base_file_name + Array[File] flowcell_unmapped_bams + String unmapped_bam_suffix + + File ref_fasta + File ref_fasta_index + File ref_dict + File ref_alt + File ref_bwt + File ref_sa + File ref_amb + File ref_ann + File ref_pac + + File dbSNP_vcf + File dbSNP_vcf_index + Array[File] known_indels_sites_VCFs + Array[File] known_indels_sites_indices + + Int flowcell_small_disk + Int flowcell_medium_disk + Int agg_small_disk + Int agg_medium_disk + Int agg_large_disk + + String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 -Y $bash_ref_fasta" + + String recalibrated_bam_basename = base_file_name + ".aligned.duplicates_marked.recalibrated" + + # Get the version of BWA to include in the PG record in the header of the BAM produced + # by MergeBamAlignment. + call GetBwaVersion + + # Align flowcell-level unmapped input bams in parallel + scatter (unmapped_bam in flowcell_unmapped_bams) { + + # Because of a wdl/cromwell bug this is not currently valid so we have to sub(sub()) in each task + # String base_name = sub(sub(unmapped_bam, "gs://.*/", ""), unmapped_bam_suffix + "$", "") + + String sub_strip_path = "gs://.*/" + String sub_strip_unmapped = unmapped_bam_suffix + "$" + + # Map reads to reference + call SamToFastqAndBwaMem { + input: + input_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmerged", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + ref_alt = ref_alt, + ref_bwt = ref_bwt, + ref_amb = ref_amb, + ref_ann = ref_ann, + ref_pac = ref_pac, + ref_sa = ref_sa, + disk_size = flowcell_medium_disk + } + + # Merge original uBAM and BWA-aligned BAM + call MergeBamAlignment { + input: + unmapped_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + bwa_version = GetBwaVersion.version, + aligned_bam = 
SamToFastqAndBwaMem.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".aligned.unsorted", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = flowcell_medium_disk + } + + # Sort and fix tags in the merged BAM + call SortAndFixTags as SortAndFixReadGroupBam { + input: + input_bam = MergeBamAlignment.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = flowcell_medium_disk + } + + } + + # Aggregate aligned+merged flowcell BAM files and mark duplicates + # We take advantage of the tool's ability to take multiple BAM inputs and write out a single output + # to avoid having to spend time just merging BAM files. + call MarkDuplicates { + input: + input_bams = MergeBamAlignment.output_bam, + output_bam_basename = base_file_name + ".aligned.unsorted.duplicates_marked", + metrics_filename = base_file_name + ".duplicate_metrics", + disk_size = agg_large_disk + } + + # Sort aggregated+deduped BAM file and fix tags + call SortAndFixTags as SortAndFixSampleBam { + input: + input_bam = MarkDuplicates.output_bam, + output_bam_basename = base_file_name + ".aligned.duplicate_marked.sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_large_disk + } + + # Create list of sequences for scatter-gather parallelization + call CreateSequenceGroupingTSV { + input: + ref_dict = ref_dict + } + + # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { + # Generate the recalibration model by interval + call BaseRecalibrator { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + recalibration_report_filename = 
base_file_name + ".recal_data.csv", + sequence_group_interval = subgroup, + dbSNP_vcf = dbSNP_vcf, + dbSNP_vcf_index = dbSNP_vcf_index, + known_indels_sites_VCFs = known_indels_sites_VCFs, + known_indels_sites_indices = known_indels_sites_indices, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk + } + } + + # Merge the recalibration reports resulting from by-interval recalibration + call GatherBqsrReports { + input: + input_bqsr_reports = BaseRecalibrator.recalibration_report, + output_report_filename = base_file_name + ".recal_data.csv", + disk_size = flowcell_small_disk + } + + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { + + # Apply the recalibration model by interval + call ApplyBQSR { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + output_bam_basename = recalibrated_bam_basename, + recalibration_report = GatherBqsrReports.output_bqsr_report, + sequence_group_interval = subgroup, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk + } + } + + # Merge the recalibrated BAM files resulting from by-interval recalibration + call GatherBamFiles { + input: + input_bams = ApplyBQSR.recalibrated_bam, + output_bam_basename = base_file_name, + disk_size = agg_large_disk + } + + # Outputs that will be retained when execution is complete + output { + File duplication_metrics = MarkDuplicates.duplicate_metrics + File bqsr_report = GatherBqsrReports.output_bqsr_report + File analysis_ready_bam = GatherBamFiles.output_bam + File analysis_ready_bam_index = GatherBamFiles.output_bam_index + File analysis_ready_bam_md5 = GatherBamFiles.output_bam_md5 + } +} diff --git a/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json b/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json index 3553eec..ef328fc 100644 --- 
a/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json +++ b/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json @@ -3,7 +3,7 @@ "JointDiscoveryWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", "JointDiscoveryWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", - "JointDiscoveryWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt", + "JointDiscoveryWf.scattered_calling_intervals_list": "gs://gatk-legacy-bundles/b37/wgs_scattered_calling_intervals.txt", "JointDiscoveryWf.cohort_vcf_name": "PlatinumTrio_b37",