Standalone variant calling steps + joint discovery #88

Merged
merged 1 commit into from Mar 6, 2017
Jump to file or symbol
Failed to load files and symbols.
+1,087 −0
Split
@@ -0,0 +1,28 @@
+{
+ "GenotypeGVCFsScatterWf.cohort_vcf_name": "PlatinumTrio_b37",
+
+ "GenotypeGVCFsScatterWf.input_gvcfs": [
+ "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz",
+ "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz",
+ "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz"
+ ],
+
+ "GenotypeGVCFsScatterWf.input_gvcf_indices": [
+ "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz.tbi",
+ "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz.tbi",
+ "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz.tbi"
+ ],
+
+ "GenotypeGVCFsScatterWf.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict",
+ "GenotypeGVCFsScatterWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta",
+ "GenotypeGVCFsScatterWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai",
+
+ "GenotypeGVCFsScatterWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt",
+
+ "GenotypeGVCFsScatterWf.UnzipGVCF.disk_size": 100,
+ "GenotypeGVCFsScatterWf.UnzipGVCF.mem_size": "10 GB",
+ "GenotypeGVCFsScatterWf.GenotypeGVCFs.disk_size": 100,
+ "GenotypeGVCFsScatterWf.GenotypeGVCFs.mem_size": "10 GB",
+ "GenotypeGVCFsScatterWf.MergeVCFs.disk_size": 100,
+ "GenotypeGVCFsScatterWf.MergeVCFs.mem_size": "10 GB"
+}
@@ -0,0 +1,175 @@
+## Copyright Broad Institute, 2017
+##
+## This WDL workflow runs GenotypeGVCFs on a set of GVCFs to joint-call multiple
+## samples, scattered across intervals.
+##
+## Requirements/expectations :
+## - One or more GVCFs produced by HaplotypeCaller in GVCF mode
+##
+## Outputs :
+## - A VCF file and its index, with genotypes for all samples represented in the
+## GVCF inputs.
+##
+## Cromwell version support
+## - Successfully tested on v24
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
@sooheelee

sooheelee Mar 6, 2017

Collaborator

Here, are programs tools?

@vdauwera

vdauwera Mar 6, 2017

Collaborator

correct

@sooheelee

sooheelee Mar 7, 2017

Collaborator

I'm going to start using programs instead of tools too then.

@vdauwera

vdauwera Mar 7, 2017

Collaborator

Actually my "correct" comment was an oversimplification. I should have clarified that I use these terms to distinguish two levels: programs are GATK, Picard, BWA, Cromwell, and tools, such as HaplotypeCaller, are what we invoke in programs that are toolkits, like GATK or Picard. So they're not interchangeable.

+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# TASK DEFINITIONS
+
+# Unzip GVCFs
+task UnzipGVCF {
+ File gzipped_gvcf
+ String unzipped_basename
+ Int disk_size
+ String mem_size
+
+ # HACK ALERT! Using .gvcf extension here to force IndexFeatureFile to make the right
@sooheelee

sooheelee Mar 6, 2017

Collaborator

Why can't we name the gunzipped output to ${unzipped_basename}.g.vcf instead to start with? Then we can avoid the renaming.

@vdauwera

vdauwera Mar 6, 2017

Collaborator

Mainly because this should be a very temporary situation, so when it's fixed we can just change this bit. In fact I should check whether it's still even the case.

+ # kind of index, but afterward we need to change to .g.vcf which is the correct
+ # for GVCFs.
+ command <<<
+ gunzip -c ${gzipped_gvcf} > ${unzipped_basename}.gvcf
+ java -Xmx2g -jar /usr/gitc/GATK4.jar IndexFeatureFile -F ${unzipped_basename}.gvcf
+ mv ${unzipped_basename}.gvcf ${unzipped_basename}.g.vcf
+ mv ${unzipped_basename}.gvcf.idx ${unzipped_basename}.g.vcf.idx
+ >>>
+
+ runtime {
+ docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+ memory: mem_size
+ disks: "local-disk " + disk_size + " HDD"
+ }
+
+ output {
+ File unzipped_gvcf = "${unzipped_basename}.g.vcf"
+ File gvcf_index = "${unzipped_basename}.g.vcf.idx"
+ }
+}
+
+# Perform joint-genotyping
+task GenotypeGVCFs {
+ Array[File] gvcfs
+ Array[File] gvcf_indices
+ String vcf_basename
+ File ref_dict
+ File ref_fasta
+ File ref_fasta_index
+ File interval_list
+ Int disk_size
+ String mem_size
+
+ command {
+ java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \
+ -jar /usr/gitc/GATK36.jar \
+ -T GenotypeGVCFs \
+ -R ${ref_fasta} \
+ --variant ${sep=' --variant ' gvcfs} \
+ -L ${interval_list} \
+ -o ${vcf_basename}.vcf.gz
+ }
+
+ output {
+ File genotyped_vcf = "${vcf_basename}.vcf.gz"
+ File genotyped_index = "${vcf_basename}.vcf.gz.tbi"
+ }
+
+ runtime {
+ docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+ memory: mem_size
+ cpu: "1"
+ disks: "local-disk " + disk_size + " HDD"
+ }
+}
+
+# Combine multiple VCFs
+task MergeVCFs {
+ Array [File] input_vcfs
+ Array [File] input_vcfs_indices
+ String vcf_name
+ String vcf_index
+ Int disk_size
+ String mem_size
+
+ command {
+ java -Xmx2g -jar /usr/gitc/picard.jar \
+ MergeVcfs \
+ INPUT=${sep=' INPUT=' input_vcfs} \
+ OUTPUT=${vcf_name}
+ }
+
+ runtime {
+ docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+ memory: mem_size
+ disks: "local-disk " + disk_size + " HDD"
+ }
+
+ output {
+ File output_vcf = "${vcf_name}"
+ File output_vcf_index = "${vcf_index}"
+ }
+}
+
+workflow GenotypeGVCFsScatterWf {
+ File ref_fasta
+ File ref_fasta_index
+ File ref_dict
+ Array[File] input_gvcfs
+ Array[File] input_gvcf_indices
+ String cohort_vcf_name
+ File scattered_calling_intervals_list
+
+ Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)
+
+ # Unzip GVCFs in parallel
+ scatter (input_gvcf in input_gvcfs) {
+
+ # Unzip block-compressed VCFs with .gz extension because GenotypeGVCFs
+ # currently does not handle compressed files.
+ call UnzipGVCF {
+ input:
+ gzipped_gvcf = input_gvcf,
+ unzipped_basename = "temp_unzipped"
+ }
+ }
+
+ # Joint-call variants in parallel over WGS calling intervals
+ scatter (subInterval in scattered_calling_intervals) {
+
+ # Perform joint genotyping per interval
+ call GenotypeGVCFs {
+ input:
+ gvcfs = UnzipGVCF.unzipped_gvcf,
+ gvcf_indices = UnzipGVCF.gvcf_index,
+ vcf_basename = cohort_vcf_name,
+ ref_dict = ref_dict,
+ ref_fasta = ref_fasta,
+ ref_fasta_index = ref_fasta_index,
+ interval_list = subInterval
+ }
+ }
+
+ # Merge per-interval VCFs into a single cohort VCF file
+ call MergeVCFs {
+ input:
+ input_vcfs = GenotypeGVCFs.genotyped_vcf,
+ input_vcfs_indices = GenotypeGVCFs.genotyped_index,
+ vcf_name = cohort_vcf_name + ".vcf.gz",
+ vcf_index = cohort_vcf_name + ".vcf.gz.tbi"
+ }
+
+ # Outputs that will be retained when execution is complete
@sooheelee

sooheelee Mar 6, 2017

Collaborator

Instead of "execution", use "workflow".

@vdauwera

vdauwera Mar 6, 2017

Collaborator

I'd rather continue to distinguish the workflow (the thing that gets executed) from its execution.

@sooheelee

sooheelee Mar 7, 2017

Collaborator

Ok Oliver.

@vdauwera

vdauwera Mar 7, 2017

Collaborator

Hey :)

+ output {
+ File output_merged_vcf = MergeVCFs.output_vcf
+ File output_merged_vcf_index = MergeVCFs.output_vcf_index
+ }
+}
@@ -0,0 +1,17 @@
+{
+ "HaplotypeCallerGvcfScatterWf.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bam",
+ "HaplotypeCallerGvcfScatterWf.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bai",
+
+ "HaplotypeCallerGvcfScatterWf.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict",
+ "HaplotypeCallerGvcfScatterWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta",
+ "HaplotypeCallerGvcfScatterWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai",
+
+ "HaplotypeCallerGvcfScatterWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt",
+
+ "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.interval_padding": 100,
+
+ "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.disk_size": 100,
+ "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.mem_size": "10 GB",
+ "HaplotypeCallerGvcfScatterWf.MergeVCFs.disk_size": 100,
+ "HaplotypeCallerGvcfScatterWf.MergeVCFs.mem_size": "3 GB"
+}
@@ -0,0 +1,147 @@
+## Copyright Broad Institute, 2017
+##
+## This WDL workflow runs HaplotypeCaller in GVCF mode on a single sample, scattered
+## across intervals.
+##
+## Requirements/expectations :
+## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
+## - Set of variant calling intervals lists for the scatter, provided in a file
+##
+## Outputs :
+## - One GVCF file and its index
+##
+## Cromwell version support
+## - Successfully tested on v24
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+## For program versions, see docker containers.
+##
+## LICENSING :
+## This script is released under the WDL source code license (BSD-3) (see LICENSE in
+## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
+## be subject to different licenses. Users are responsible for checking that they are
+## authorized to run all programs before running this script. Please see the docker
+## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
+## licensing information pertaining to the included programs.
+
+# TASK DEFINITIONS
+
+# HaplotypeCaller per-sample in GVCF mode
@sooheelee

sooheelee Mar 6, 2017

Collaborator

A link to our documentation explaining GVCF mode may be helpful here.

@vdauwera

vdauwera Mar 6, 2017

Collaborator

Eh, that's kind of a slippery slope... By the same logic we could link every single tool call to docs. I don't have the stomach to take that on right now (though for the future it would be nice).

@sooheelee

sooheelee Mar 7, 2017

Collaborator

Ok

+task HaplotypeCaller {
+ File input_bam
+ File input_bam_index
+ String gvcf_name
+ String gvcf_index
+ File ref_dict
+ File ref_fasta
+ File ref_fasta_index
+ File interval_list
+ Int? interval_padding
+ Int disk_size
+ String mem_size
+
+ command {
+ java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx800m \
+ -jar /usr/gitc/GATK36.jar \
+ -T HaplotypeCaller \
+ -R ${ref_fasta} \
+ -I ${input_bam} \
+ -o ${gvcf_name} \
+ -L ${interval_list} \
+ -ip ${default=100 interval_padding} \
+ -ERC GVCF
+ }
+
+ runtime {
+ docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+ cpu: "1"
+ memory: mem_size
+ disks: "local-disk " + disk_size + " HDD"
+ }
+
+ output {
+ File output_gvcf = "${gvcf_name}"
+ File output_gvcf_index = "${gvcf_index}"
+ }
+}
+
+# Merge GVCFs generated per-interval for the same sample
+task MergeVCFs {
+ Array [File] input_vcfs
+ Array [File] input_vcfs_indices
+ String vcf_name
+ String vcf_index
+ Int disk_size
+ String mem_size
+
+ command {
+ java -Xmx2g -jar /usr/gitc/picard.jar \
+ MergeVcfs \
+ INPUT=${sep=' INPUT=' input_vcfs} \
+ OUTPUT=${vcf_name}
+ }
+
+ runtime {
+ docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+ memory: mem_size
+ disks: "local-disk " + disk_size + " HDD"
+}
+
+ output {
+ File output_vcf = "${vcf_name}"
+ File output_vcf_index = "${vcf_index}"
+ }
+}
+
+# WORKFLOW DEFINITION
+workflow HaplotypeCallerGvcfScatterWf {
+ File input_bam
+ File input_bam_index
+ File ref_dict
+ File ref_fasta
+ File ref_fasta_index
+ File scattered_calling_intervals_list
+
+ Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list)
+
+ String sub_strip_path = "gs://.*/"
+ String sub_strip_suffix = ".bam$"
+
+ String sample_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "")
+
+ String gvcf_name = sample_basename + ".g.vcf.gz"
+ String gvcf_index = sample_basename + ".g.vcf.gz.tbi"
+
+ # Call variants in parallel over grouped calling intervals
+ scatter (interval_file in scattered_calling_intervals) {
+
+ # Generate GVCF by interval
+ call HaplotypeCaller {
+ input:
+ input_bam = input_bam,
+ input_bam_index = input_bam_index,
+ interval_list = interval_file,
+ gvcf_name = gvcf_name,
+ gvcf_index = gvcf_index,
+ ref_dict = ref_dict,
+ ref_fasta = ref_fasta,
+ ref_fasta_index = ref_fasta_index
+ }
+ }
+
+ # Merge per-interval GVCFs
+ call MergeVCFs {
+ input:
+ input_vcfs = HaplotypeCaller.output_gvcf,
+ input_vcfs_indices = HaplotypeCaller.output_gvcf_index,
+ vcf_name = gvcf_name,
+ vcf_index = gvcf_index
+ }
+
+ # Outputs that will be retained when execution is complete
+ output {
+ File output_merged_gvcf = MergeVCFs.output_vcf
+ File output_merged_gvcf_index = MergeVCFs.output_vcf_index
+ }
+}
Oops, something went wrong.