Permalink
Please sign in to comment.
Browse files
WDLs for segments of the germline short variant calling pipeline
- HaplotypeCaller GVCF scatter-gathered across intervals
- GenotypeGVCFs scatter-gathered across intervals
- VQSR (no scatter)
- Joint discovery (GGVCFs through VQSR, no genotype refinement)- Loading branch information...
Showing
with
1,087 additions
and 0 deletions.
- +28 −0 scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.inputs.json
- +175 −0 scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.wdl
- +17 −0 scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.inputs.json
- +147 −0 scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.wdl
- +71 −0 scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json
- +364 −0 scripts/broad_dsde_workflows/JointDiscoveryWf_170305.wdl
- +56 −0 scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.inputs.json
- +229 −0 scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.wdl
| @@ -0,0 +1,28 @@ | ||
| +{ | ||
| + "GenotypeGVCFsScatterWf.cohort_vcf_name": "PlatinumTrio_b37", | ||
| + | ||
| + "GenotypeGVCFsScatterWf.input_gvcfs": [ | ||
| + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz", | ||
| + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz", | ||
| + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz" | ||
| + ], | ||
| + | ||
| + "GenotypeGVCFsScatterWf.input_gvcf_indices": [ | ||
| + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz.tbi", | ||
| + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz.tbi", | ||
| + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz.tbi" | ||
| + ], | ||
| + | ||
| + "GenotypeGVCFsScatterWf.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict", | ||
| + "GenotypeGVCFsScatterWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", | ||
| + "GenotypeGVCFsScatterWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", | ||
| + | ||
| + "GenotypeGVCFsScatterWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt", | ||
| + | ||
| + "GenotypeGVCFsScatterWf.UnzipGVCF.disk_size": 100, | ||
| + "GenotypeGVCFsScatterWf.UnzipGVCF.mem_size": "10 GB", | ||
| + "GenotypeGVCFsScatterWf.GenotypeGVCFs.disk_size": 100, | ||
| + "GenotypeGVCFsScatterWf.GenotypeGVCFs.mem_size": "10 GB", | ||
| + "GenotypeGVCFsScatterWf.MergeVCFs.disk_size": 100, | ||
| + "GenotypeGVCFsScatterWf.MergeVCFs.mem_size": "10 GB" | ||
| +} |
| @@ -0,0 +1,175 @@ | ||
| +## Copyright Broad Institute, 2017 | ||
| +## | ||
| +## This WDL workflow runs GenotypeGVCFs on a set of GVCFs to joint-call multiple | ||
| +## samples, scattered across intervals. | ||
| +## | ||
| +## Requirements/expectations : | ||
| +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode | ||
| +## | ||
| +## Outputs : | ||
| +## - A VCF file and its index, with genotypes for all samples represented in the | ||
| +## GVCF inputs. | ||
| +## | ||
| +## Cromwell version support | ||
| +## - Successfully tested on v24 | ||
| +## - Does not work on versions < v23 due to output syntax | ||
| +## | ||
| +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
| +## For program versions, see docker containers. | ||
| +## | ||
| +## LICENSING : | ||
| +## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
| +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
| +## be subject to different licenses. Users are responsible for checking that they are | ||
| +## authorized to run all programs before running this script. Please see the docker | ||
| +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
| +## licensing information pertaining to the included programs. | ||
| + | ||
| +# TASK DEFINITIONS | ||
| + | ||
| +# Unzip GVCFs | ||
| +task UnzipGVCF { | ||
| + File gzipped_gvcf | ||
| + String unzipped_basename | ||
| + Int disk_size | ||
| + String mem_size | ||
| + | ||
| + # HACK ALERT! Using .gvcf extension here to force IndexFeatureFile to make the right | ||
| + # kind of index, but afterward we need to change to .g.vcf which is the correct | ||
| + # for GVCFs. | ||
| + command <<< | ||
| + gunzip -c ${gzipped_gvcf} > ${unzipped_basename}.gvcf | ||
| + java -Xmx2g -jar /usr/gitc/GATK4.jar IndexFeatureFile -F ${unzipped_basename}.gvcf | ||
| + mv ${unzipped_basename}.gvcf ${unzipped_basename}.g.vcf | ||
| + mv ${unzipped_basename}.gvcf.idx ${unzipped_basename}.g.vcf.idx | ||
| + >>> | ||
| + | ||
| + runtime { | ||
| + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | ||
| + memory: mem_size | ||
| + disks: "local-disk " + disk_size + " HDD" | ||
| + } | ||
| + | ||
| + output { | ||
| + File unzipped_gvcf = "${unzipped_basename}.g.vcf" | ||
| + File gvcf_index = "${unzipped_basename}.g.vcf.idx" | ||
| + } | ||
| +} | ||
| + | ||
| +# Perform joint-genotyping | ||
| +task GenotypeGVCFs { | ||
| + Array[File] gvcfs | ||
| + Array[File] gvcf_indices | ||
| + String vcf_basename | ||
| + File ref_dict | ||
| + File ref_fasta | ||
| + File ref_fasta_index | ||
| + File interval_list | ||
| + Int disk_size | ||
| + String mem_size | ||
| + | ||
| + command { | ||
| + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ | ||
| + -jar /usr/gitc/GATK36.jar \ | ||
| + -T GenotypeGVCFs \ | ||
| + -R ${ref_fasta} \ | ||
| + --variant ${sep=' --variant ' gvcfs} \ | ||
| + -L ${interval_list} \ | ||
| + -o ${vcf_basename}.vcf.gz | ||
| + } | ||
| + | ||
| + output { | ||
| + File genotyped_vcf = "${vcf_basename}.vcf.gz" | ||
| + File genotyped_index = "${vcf_basename}.vcf.gz.tbi" | ||
| + } | ||
| + | ||
| + runtime { | ||
| + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | ||
| + memory: mem_size | ||
| + cpu: "1" | ||
| + disks: "local-disk " + disk_size + " HDD" | ||
| + } | ||
| +} | ||
| + | ||
| +# Combine multiple VCFs | ||
| +task MergeVCFs { | ||
| + Array [File] input_vcfs | ||
| + Array [File] input_vcfs_indices | ||
| + String vcf_name | ||
| + String vcf_index | ||
| + Int disk_size | ||
| + String mem_size | ||
| + | ||
| + command { | ||
| + java -Xmx2g -jar /usr/gitc/picard.jar \ | ||
| + MergeVcfs \ | ||
| + INPUT=${sep=' INPUT=' input_vcfs} \ | ||
| + OUTPUT=${vcf_name} | ||
| + } | ||
| + | ||
| + runtime { | ||
| + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | ||
| + memory: mem_size | ||
| + disks: "local-disk " + disk_size + " HDD" | ||
| + } | ||
| + | ||
| + output { | ||
| + File output_vcf = "${vcf_name}" | ||
| + File output_vcf_index = "${vcf_index}" | ||
| + } | ||
| +} | ||
| + | ||
| +workflow GenotypeGVCFsScatterWf { | ||
| + File ref_fasta | ||
| + File ref_fasta_index | ||
| + File ref_dict | ||
| + Array[File] input_gvcfs | ||
| + Array[File] input_gvcf_indices | ||
| + String cohort_vcf_name | ||
| + File scattered_calling_intervals_list | ||
| + | ||
| + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) | ||
| + | ||
| + # Unzip GVCFs in parallel | ||
| + scatter (input_gvcf in input_gvcfs) { | ||
| + | ||
| + # Unzip block-compressed VCFs with .gz extension because GenotypeGVCFs | ||
| + # currently does not handle compressed files. | ||
| + call UnzipGVCF { | ||
| + input: | ||
| + gzipped_gvcf = input_gvcf, | ||
| + unzipped_basename = "temp_unzipped" | ||
| + } | ||
| + } | ||
| + | ||
| + # Joint-call variants in parallel over WGS calling intervals | ||
| + scatter (subInterval in scattered_calling_intervals) { | ||
| + | ||
| + # Perform joint genotyping per interval | ||
| + call GenotypeGVCFs { | ||
| + input: | ||
| + gvcfs = UnzipGVCF.unzipped_gvcf, | ||
| + gvcf_indices = UnzipGVCF.gvcf_index, | ||
| + vcf_basename = cohort_vcf_name, | ||
| + ref_dict = ref_dict, | ||
| + ref_fasta = ref_fasta, | ||
| + ref_fasta_index = ref_fasta_index, | ||
| + interval_list = subInterval | ||
| + } | ||
| + } | ||
| + | ||
| + # Merge per-interval VCFs into a single cohort VCF file | ||
| + call MergeVCFs { | ||
| + input: | ||
| + input_vcfs = GenotypeGVCFs.genotyped_vcf, | ||
| + input_vcfs_indices = GenotypeGVCFs.genotyped_index, | ||
| + vcf_name = cohort_vcf_name + ".vcf.gz", | ||
| + vcf_index = cohort_vcf_name + ".vcf.gz.tbi" | ||
| + } | ||
| + | ||
| + # Outputs that will be retained when execution is complete | ||
| + output { | ||
| + File output_merged_vcf = MergeVCFs.output_vcf | ||
| + File output_merged_vcf_index = MergeVCFs.output_vcf_index | ||
| + } | ||
| +} |
| @@ -0,0 +1,17 @@ | ||
| +{ | ||
| + "HaplotypeCallerGvcfScatterWf.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bam", | ||
| + "HaplotypeCallerGvcfScatterWf.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bai", | ||
| + | ||
| + "HaplotypeCallerGvcfScatterWf.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict", | ||
| + "HaplotypeCallerGvcfScatterWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", | ||
| + "HaplotypeCallerGvcfScatterWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", | ||
| + | ||
| + "HaplotypeCallerGvcfScatterWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt", | ||
| + | ||
| + "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.interval_padding": 100, | ||
| + | ||
| + "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.disk_size": 100, | ||
| + "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.mem_size": "10 GB", | ||
| + "HaplotypeCallerGvcfScatterWf.MergeVCFs.disk_size": 100, | ||
| + "HaplotypeCallerGvcfScatterWf.MergeVCFs.mem_size": "3 GB" | ||
| +} |
| @@ -0,0 +1,147 @@ | ||
| +## Copyright Broad Institute, 2017 | ||
| +## | ||
| +## This WDL workflow runs HaplotypeCaller in GVCF mode on a single sample, scattered | ||
| +## across intervals. | ||
| +## | ||
| +## Requirements/expectations : | ||
| +## - One analysis-ready BAM file for a single sample (as identified in RG:SM) | ||
| +## - Set of variant calling intervals lists for the scatter, provided in a file | ||
| +## | ||
| +## Outputs : | ||
| +## - One GVCF file and its index | ||
| +## | ||
| +## Cromwell version support | ||
| +## - Successfully tested on v24 | ||
| +## - Does not work on versions < v23 due to output syntax | ||
| +## | ||
| +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | ||
| +## For program versions, see docker containers. | ||
| +## | ||
| +## LICENSING : | ||
| +## This script is released under the WDL source code license (BSD-3) (see LICENSE in | ||
| +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | ||
| +## be subject to different licenses. Users are responsible for checking that they are | ||
| +## authorized to run all programs before running this script. Please see the docker | ||
| +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | ||
| +## licensing information pertaining to the included programs. | ||
| + | ||
| +# TASK DEFINITIONS | ||
| + | ||
| +# HaplotypeCaller per-sample in GVCF mode | ||
| +task HaplotypeCaller { | ||
| + File input_bam | ||
| + File input_bam_index | ||
| + String gvcf_name | ||
| + String gvcf_index | ||
| + File ref_dict | ||
| + File ref_fasta | ||
| + File ref_fasta_index | ||
| + File interval_list | ||
| + Int? interval_padding | ||
| + Int disk_size | ||
| + String mem_size | ||
| + | ||
| + command { | ||
| + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx800m \ | ||
| + -jar /usr/gitc/GATK36.jar \ | ||
| + -T HaplotypeCaller \ | ||
| + -R ${ref_fasta} \ | ||
| + -I ${input_bam} \ | ||
| + -o ${gvcf_name} \ | ||
| + -L ${interval_list} \ | ||
| + -ip ${default=100 interval_padding} \ | ||
| + -ERC GVCF | ||
| + } | ||
| + | ||
| + runtime { | ||
| + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | ||
| + cpu: "1" | ||
| + memory: mem_size | ||
| + disks: "local-disk " + disk_size + " HDD" | ||
| + } | ||
| + | ||
| + output { | ||
| + File output_gvcf = "${gvcf_name}" | ||
| + File output_gvcf_index = "${gvcf_index}" | ||
| + } | ||
| +} | ||
| + | ||
| +# Merge GVCFs generated per-interval for the same sample | ||
| +task MergeVCFs { | ||
| + Array [File] input_vcfs | ||
| + Array [File] input_vcfs_indices | ||
| + String vcf_name | ||
| + String vcf_index | ||
| + Int disk_size | ||
| + String mem_size | ||
| + | ||
| + command { | ||
| + java -Xmx2g -jar /usr/gitc/picard.jar \ | ||
| + MergeVcfs \ | ||
| + INPUT=${sep=' INPUT=' input_vcfs} \ | ||
| + OUTPUT=${vcf_name} | ||
| + } | ||
| + | ||
| + runtime { | ||
| + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | ||
| + memory: mem_size | ||
| + disks: "local-disk " + disk_size + " HDD" | ||
| +} | ||
| + | ||
| + output { | ||
| + File output_vcf = "${vcf_name}" | ||
| + File output_vcf_index = "${vcf_index}" | ||
| + } | ||
| +} | ||
| + | ||
| +# WORKFLOW DEFINITION | ||
| +workflow HaplotypeCallerGvcfScatterWf { | ||
| + File input_bam | ||
| + File input_bam_index | ||
| + File ref_dict | ||
| + File ref_fasta | ||
| + File ref_fasta_index | ||
| + File scattered_calling_intervals_list | ||
| + | ||
| + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) | ||
| + | ||
| + String sub_strip_path = "gs://.*/" | ||
| + String sub_strip_suffix = ".bam$" | ||
| + | ||
| + String sample_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") | ||
| + | ||
| + String gvcf_name = sample_basename + ".g.vcf.gz" | ||
| + String gvcf_index = sample_basename + ".g.vcf.gz.tbi" | ||
| + | ||
| + # Call variants in parallel over grouped calling intervals | ||
| + scatter (interval_file in scattered_calling_intervals) { | ||
| + | ||
| + # Generate GVCF by interval | ||
| + call HaplotypeCaller { | ||
| + input: | ||
| + input_bam = input_bam, | ||
| + input_bam_index = input_bam_index, | ||
| + interval_list = interval_file, | ||
| + gvcf_name = gvcf_name, | ||
| + gvcf_index = gvcf_index, | ||
| + ref_dict = ref_dict, | ||
| + ref_fasta = ref_fasta, | ||
| + ref_fasta_index = ref_fasta_index | ||
| + } | ||
| + } | ||
| + | ||
| + # Merge per-interval GVCFs | ||
| + call MergeVCFs { | ||
| + input: | ||
| + input_vcfs = HaplotypeCaller.output_gvcf, | ||
| + input_vcfs_indices = HaplotypeCaller.output_gvcf_index, | ||
| + vcf_name = gvcf_name, | ||
| + vcf_index = gvcf_index | ||
| + } | ||
| + | ||
| + # Outputs that will be retained when execution is complete | ||
| + output { | ||
| + File output_merged_gvcf = MergeVCFs.output_vcf | ||
| + File output_merged_gvcf_index = MergeVCFs.output_vcf_index | ||
| + } | ||
| +} |
Oops, something went wrong.
0 comments on commit
2c56658