From 05fc537152a5d8e2f073b8e6748bc6df57a3d149 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Thu, 1 Sep 2016 21:30:19 -0400 Subject: [PATCH 1/4] Add a WDL that extracts headers from bam files --- scripts/other/GrabSamHeaderFromBams.inputs.json | 7 +++ scripts/other/GrabSamHeaderFromBams.options.json | 6 +++ scripts/other/GrabSamHeaderFromBams.wdl | 63 ++++++++++++++++++++++++ 3 files changed, 76 insertions(+) create mode 100644 scripts/other/GrabSamHeaderFromBams.inputs.json create mode 100644 scripts/other/GrabSamHeaderFromBams.options.json create mode 100644 scripts/other/GrabSamHeaderFromBams.wdl diff --git a/scripts/other/GrabSamHeaderFromBams.inputs.json b/scripts/other/GrabSamHeaderFromBams.inputs.json new file mode 100644 index 0000000..2a54524 --- /dev/null +++ b/scripts/other/GrabSamHeaderFromBams.inputs.json @@ -0,0 +1,7 @@ +{ + "GrabSamHeaderFromBams.bam_list": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ] +} diff --git a/scripts/other/GrabSamHeaderFromBams.options.json b/scripts/other/GrabSamHeaderFromBams.options.json new file mode 100644 index 0000000..84dbaa3 --- /dev/null +++ b/scripts/other/GrabSamHeaderFromBams.options.json @@ -0,0 +1,6 @@ +{ + "read_from_cache":false, + "defaultRuntimeOptions": { + "zones": "us-central1-b us-central1-c" + } +} \ No newline at end of file diff --git a/scripts/other/GrabSamHeaderFromBams.wdl b/scripts/other/GrabSamHeaderFromBams.wdl new file mode 100644 index 0000000..fc40791 --- /dev/null +++ b/scripts/other/GrabSamHeaderFromBams.wdl @@ -0,0 +1,63 @@ +## Copyright Broad Institute, 2016 +## +## This WDL grabs the headers from a list of BAMs +## +## Requirements/expectations : +## - List of valid BAM files +## +## Runtime 
parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Merge original input uBAM file with BWA-aligned BAM file +task GrabSAMHeader { + File bam_file + String output_basename + + command { + samtools view -H ${bam_file} > ${output_basename}.txt + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "1 GB" + cpu: "1" + disks: "local-disk " + 200 + " HDD" + } + output { + File output_bam = "${output_basename}.txt" + } +} + +# WORKFLOW DEFINITION +workflow GrabSamHeaderFromBams { + Array[File] bam_list + + # Convert multiple pairs of input fastqs in parallel + scatter (input_bam in bam_list) { + + String sub_strip_path = "gs://.*/" + String sub_strip_suffix = ".bam$" + + # Convert pair of FASTQs to uBAM + call GrabSAMHeader { + input: + bam_file = input_bam, + output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".header" + } + } + + # Outputs that will be retained when execution is complete + output { + GrabSAMHeader.* + } +} + From 762cb476bfa2687554715bb788195b34cb0a9ee8 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Fri, 2 Sep 2016 01:58:22 -0400 Subject: [PATCH 2/4] refine header grabber + add draft of fastq from bam converter --- ...on => GrabSamHeaderFromBams_160901.inputs.json} | 0 ...n => GrabSamHeaderFromBams_160901.options.json} | 0 ...omBams.wdl => 
GrabSamHeaderFromBams_160901.wdl} | 2 +- .../other/PairedFastQFromBams_160901.inputs.json | 5 ++ .../other/PairedFastQFromBams_160901.options.json | 6 ++ scripts/other/PairedFastQFromBams_160901.wdl | 68 ++++++++++++++++++++++ 6 files changed, 80 insertions(+), 1 deletion(-) rename scripts/other/{GrabSamHeaderFromBams.inputs.json => GrabSamHeaderFromBams_160901.inputs.json} (100%) rename scripts/other/{GrabSamHeaderFromBams.options.json => GrabSamHeaderFromBams_160901.options.json} (100%) rename scripts/other/{GrabSamHeaderFromBams.wdl => GrabSamHeaderFromBams_160901.wdl} (96%) create mode 100644 scripts/other/PairedFastQFromBams_160901.inputs.json create mode 100644 scripts/other/PairedFastQFromBams_160901.options.json create mode 100644 scripts/other/PairedFastQFromBams_160901.wdl diff --git a/scripts/other/GrabSamHeaderFromBams.inputs.json b/scripts/other/GrabSamHeaderFromBams_160901.inputs.json similarity index 100% rename from scripts/other/GrabSamHeaderFromBams.inputs.json rename to scripts/other/GrabSamHeaderFromBams_160901.inputs.json diff --git a/scripts/other/GrabSamHeaderFromBams.options.json b/scripts/other/GrabSamHeaderFromBams_160901.options.json similarity index 100% rename from scripts/other/GrabSamHeaderFromBams.options.json rename to scripts/other/GrabSamHeaderFromBams_160901.options.json diff --git a/scripts/other/GrabSamHeaderFromBams.wdl b/scripts/other/GrabSamHeaderFromBams_160901.wdl similarity index 96% rename from scripts/other/GrabSamHeaderFromBams.wdl rename to scripts/other/GrabSamHeaderFromBams_160901.wdl index fc40791..a04f170 100644 --- a/scripts/other/GrabSamHeaderFromBams.wdl +++ b/scripts/other/GrabSamHeaderFromBams_160901.wdl @@ -18,7 +18,7 @@ # TASK DEFINITIONS -# Merge original input uBAM file with BWA-aligned BAM file +# Extract the header from a BAM using samtools task GrabSAMHeader { File bam_file String output_basename diff --git a/scripts/other/PairedFastQFromBams_160901.inputs.json 
b/scripts/other/PairedFastQFromBams_160901.inputs.json new file mode 100644 index 0000000..43e8bba --- /dev/null +++ b/scripts/other/PairedFastQFromBams_160901.inputs.json @@ -0,0 +1,5 @@ +{ + "PairedFastQFromBams.bam_list": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam" + ] +} diff --git a/scripts/other/PairedFastQFromBams_160901.options.json b/scripts/other/PairedFastQFromBams_160901.options.json new file mode 100644 index 0000000..84dbaa3 --- /dev/null +++ b/scripts/other/PairedFastQFromBams_160901.options.json @@ -0,0 +1,6 @@ +{ + "read_from_cache":false, + "defaultRuntimeOptions": { + "zones": "us-central1-b us-central1-c" + } +} \ No newline at end of file diff --git a/scripts/other/PairedFastQFromBams_160901.wdl b/scripts/other/PairedFastQFromBams_160901.wdl new file mode 100644 index 0000000..aa1cfbd --- /dev/null +++ b/scripts/other/PairedFastQFromBams_160901.wdl @@ -0,0 +1,68 @@ +## Copyright Broad Institute, 2016 +## +## This WDL converts a list of BAMs to pairs of FASTQs +## +## Requirements/expectations : +## - List of valid BAM files +## - Max one readgroup per BAM files. If there are more, the distinctions will be lost. +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+ +# TASK DEFINITIONS + +# Run SamToFASTQ to revert the bam +task PairedFastQFromBam { + File bam_file + String fastq_1 + String fastq_2 + String unpaired + + command { + java -Xmx3000m -jar /usr/gitc/picard.jar \ + SamToFastq \ + I=${bam_file} \ + FASTQ=${fastq_1} \ + SECOND_END_FASTQ=${fastq_2} \ + UNPAIRED_FASTQ=${unpaired} \ + INCLUDE_NON_PRIMARY_ALIGNMENTS=true \ + INCLUDE_NON_PF_READS=true \ + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3500 MB" + cpu: "1" + disks: "local-disk " + 200 + " HDD" + } +} + +# WORKFLOW DEFINITION +workflow PairedFastQFromBams { + Array[File] bam_list + + # Convert multiple pairs of input fastqs in parallel + scatter (input_bam in bam_list) { + + String sub_strip_path = "gs://.*/" + String sub_strip_suffix = ".bam$" + String output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + + # Convert pair of FASTQs to uBAM + call PairedFastQFromBam { + input: + bam_file = input_bam, + fastq_1 = output_basename + "_1.fastq", + fastq_2 = output_basename + "_2.fastq", + unpaired = output_basename + "_up.fastq" + } + } +} + From 0b88bb62e2b6e3d1f8ab3c93c45dadc0c34a0e0c Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Wed, 7 Sep 2016 18:10:43 -0400 Subject: [PATCH 3/4] added a bunch of new others -- don't merge yet --- .../other/PairedFastQFromBams_160901.inputs.json | 4 +- .../other/PairedFastQFromBams_160901.options.json | 6 - scripts/other/PairedFastQFromBams_160901.wdl | 14 +- scripts/other/ValidateBams_160902.inputs.json | 7 + scripts/other/ValidateBams_160902.wdl | 67 ++ ...GS_PE_SingleSample_LegacyRef_160901.inputs.json | 98 +++ .../other/WGS_PE_SingleSample_LegacyRef_160901.wdl | 748 +++++++++++++++++++++ ...ms_160901.options.json => generic.options.json} | 0 .../other/uBamFromPairedFastQ_160902.inputs.json | 30 + scripts/other/uBamFromPairedFastQ_160902.wdl | 90 +++ scripts/utilities/create_scatter_intervals.py | 145 ++++ 11 files changed, 1200 
insertions(+), 9 deletions(-) delete mode 100644 scripts/other/PairedFastQFromBams_160901.options.json create mode 100644 scripts/other/ValidateBams_160902.inputs.json create mode 100644 scripts/other/ValidateBams_160902.wdl create mode 100644 scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json create mode 100644 scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl rename scripts/other/{GrabSamHeaderFromBams_160901.options.json => generic.options.json} (100%) create mode 100644 scripts/other/uBamFromPairedFastQ_160902.inputs.json create mode 100644 scripts/other/uBamFromPairedFastQ_160902.wdl create mode 100644 scripts/utilities/create_scatter_intervals.py diff --git a/scripts/other/PairedFastQFromBams_160901.inputs.json b/scripts/other/PairedFastQFromBams_160901.inputs.json index 43e8bba..226e076 100644 --- a/scripts/other/PairedFastQFromBams_160901.inputs.json +++ b/scripts/other/PairedFastQFromBams_160901.inputs.json @@ -1,5 +1,7 @@ { "PairedFastQFromBams.bam_list": [ - "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam" + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" ] } diff --git a/scripts/other/PairedFastQFromBams_160901.options.json b/scripts/other/PairedFastQFromBams_160901.options.json deleted file mode 100644 index 84dbaa3..0000000 --- a/scripts/other/PairedFastQFromBams_160901.options.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "read_from_cache":false, - "defaultRuntimeOptions": { - "zones": "us-central1-b us-central1-c" - } -} \ No newline at end of file diff --git a/scripts/other/PairedFastQFromBams_160901.wdl b/scripts/other/PairedFastQFromBams_160901.wdl index aa1cfbd..7fc919d 100644 --- 
a/scripts/other/PairedFastQFromBams_160901.wdl +++ b/scripts/other/PairedFastQFromBams_160901.wdl @@ -34,7 +34,7 @@ task PairedFastQFromBam { SECOND_END_FASTQ=${fastq_2} \ UNPAIRED_FASTQ=${unpaired} \ INCLUDE_NON_PRIMARY_ALIGNMENTS=true \ - INCLUDE_NON_PF_READS=true \ + INCLUDE_NON_PF_READS=true } runtime { docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" @@ -42,6 +42,11 @@ task PairedFastQFromBam { cpu: "1" disks: "local-disk " + 200 + " HDD" } + output { + File out_fastq_1 = "${fastq_1}" + File out_fastq_2 = "${fastq_2}" + File out_unpaired = "${unpaired}" + } } # WORKFLOW DEFINITION @@ -53,7 +58,7 @@ workflow PairedFastQFromBams { String sub_strip_path = "gs://.*/" String sub_strip_suffix = ".bam$" - String output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + File output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") # Convert pair of FASTQs to uBAM call PairedFastQFromBam { @@ -64,5 +69,10 @@ workflow PairedFastQFromBams { unpaired = output_basename + "_up.fastq" } } + + # Outputs that will be retained when execution is complete + output { + PairedFastQFromBam.* + } } diff --git a/scripts/other/ValidateBams_160902.inputs.json b/scripts/other/ValidateBams_160902.inputs.json new file mode 100644 index 0000000..cf22b16 --- /dev/null +++ b/scripts/other/ValidateBams_160902.inputs.json @@ -0,0 +1,7 @@ +{ + "ValidateBAMs.bam_list": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ] +} diff --git a/scripts/other/ValidateBams_160902.wdl b/scripts/other/ValidateBams_160902.wdl new file mode 100644 index 0000000..325165b --- /dev/null +++ b/scripts/other/ValidateBams_160902.wdl @@ -0,0 +1,67 @@ +## Copyright Broad Institute, 2016 
+## +## This WDL validates a list of BAMs in SUMMARY mode +## +## Requirements/expectations : +## - List of BAM files to validate +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Extract the header from a BAM using samtools +task ValidateBAM { + File bam_file + String output_basename + + command { + java -Xmx3000m -jar /usr/gitc/picard.jar \ + ValidateSamFile \ + I=${bam_file} \ + OUTPUT=${output_basename}.txt \ + MODE=SUMMARY + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "1 GB" + cpu: "1" + disks: "local-disk " + 200 + " HDD" + } + output { + File output_bam = "${output_basename}.txt" + } +} + +# WORKFLOW DEFINITION +workflow ValidateBAMs { + Array[File] bam_list + + # Convert multiple pairs of input fastqs in parallel + scatter (input_bam in bam_list) { + + String sub_strip_path = "gs://.*/" + String sub_strip_suffix = ".bam$" + + # Convert pair of FASTQs to uBAM + call ValidateBAM { + input: + bam_file = input_bam, + output_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".validation" + } + } + + # Outputs that will be retained when execution is complete + output { + ValidateBAM.* + } +} + diff --git a/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json new file mode 100644 index 
0000000..0485791 --- /dev/null +++ b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json @@ -0,0 +1,98 @@ +{ + "##_COMMENT1": "SAMPLE NAME AND UNMAPPED BAMS", + "PairedEndSingleSampleWorkflow.sample_name": "NA12878", + "PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": [ + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", + "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" + ], + "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz", + "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".bam", + + "##_COMMENT2": "INTERVALS", + "PairedEndSingleSampleWorkflow.scattered_calling_intervals": [ + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0001_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0002_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0003_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0004_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0005_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0006_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0007_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0008_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0009_of_50/scattered.interval_list", + 
"gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0010_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0011_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0012_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0013_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0014_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0015_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0016_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0017_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0018_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0019_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0020_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0021_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0022_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0023_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0024_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0025_of_50/scattered.interval_list", + 
"gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0026_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0027_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0028_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0029_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0030_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0031_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0032_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0033_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0034_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0035_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0036_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0037_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0038_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0039_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0040_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0041_of_50/scattered.interval_list", + 
"gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0042_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0043_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0044_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0045_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0046_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0047_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0048_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0049_of_50/scattered.interval_list", + "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0050_of_50/scattered.interval_list" + ], + "PairedEndSingleSampleWorkflow.wgs_calling_interval_list": "gs://genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", + + "##_COMMENT2": "OPTIONAL ARGUMENTS", + "PairedEndSingleSampleWorkflow.HaplotypeCaller.contamination": 0, + + "##_COMMENT3": "REFERENCE FILES", + "PairedEndSingleSampleWorkflow.ref_dict": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.dict.gz", + "PairedEndSingleSampleWorkflow.ref_fasta": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.gz", + "PairedEndSingleSampleWorkflow.ref_fasta_index": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai.gz", + "PairedEndSingleSampleWorkflow.ref_sa": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.sa", + "PairedEndSingleSampleWorkflow.ref_amb": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.amb", + 
"PairedEndSingleSampleWorkflow.ref_bwt": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.bwt", + "PairedEndSingleSampleWorkflow.ref_ann": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.ann", + "PairedEndSingleSampleWorkflow.ref_pac": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.pac", + + "##_COMMENT4": "KNOWN SITES RESOURCES", + "PairedEndSingleSampleWorkflow.dbSNP_vcf": "gs://vdauwera-legacy-bundles/b37/dbsnp_138.b37.vcf.gz", + "PairedEndSingleSampleWorkflow.dbSNP_vcf_index": "gs://vdauwera-legacy-bundles/b37/dbsnp_138.b37.vcf.idx.gz", + "PairedEndSingleSampleWorkflow.known_indels_sites_VCFs": [ + "gs://vdauwera-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.gz" + ], + "PairedEndSingleSampleWorkflow.known_indels_sites_indices": [ + "gs://vdauwera-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.idx.gz" + ], + + "##_COMMENT5": "DISK SIZES + PREEMPTIBLES", + "PairedEndSingleSampleWorkflow.agg_small_disk": 200, + "PairedEndSingleSampleWorkflow.agg_medium_disk": 300, + "PairedEndSingleSampleWorkflow.agg_large_disk": 400, + "PairedEndSingleSampleWorkflow.agg_preemptible_tries": 3, + "PairedEndSingleSampleWorkflow.flowcell_small_disk": 200, + "PairedEndSingleSampleWorkflow.flowcell_medium_disk": 300, + "PairedEndSingleSampleWorkflow.preemptible_tries": 3 +} diff --git a/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl new file mode 100644 index 0000000..28f0e3a --- /dev/null +++ b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl @@ -0,0 +1,748 @@ +## Copyright Broad Institute, 2016 +## +## This WDL pipeline implements data pre-processing and initial variant calling (GVCF +## generation) according to the GATK Best Practices (June 2016) for germline SNP and +## Indel discovery in human whole-genome sequencing (WGS) data. 
+## +## Requirements/expectations : +## - Whole-genome pair-end sequencing data in unmapped BAM (uBAM) format +## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) +## - Input uBAM files must additionally comply with the following requirements: +## - - filenames all have the same suffix (we use ".unmapped.bam") +## - - files must pass validation by ValidateSamFile +## - - reads are provided in query-sorted order +## - - all reads must have an RG tag +## - Reference genome does NOT have ALT contigs +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+ +# TASK DEFINITIONS + +# Get version of BWA +task GetBwaVersion { + command { + /usr/gitc/bwa 2>&1 | \ + grep -e '^Version' | \ + sed 's/Version: //' + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "1 GB" + } + output { + String version = read_string(stdout()) + } +} + +# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment +task SamToFastqAndBwaMem { + File input_bam + String bwa_commandline + String output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + File ref_sa + Int disk_size + Int preemptible_tries + + command <<< + set -o pipefail + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + # assume no ALT contigs + java -Xmx3000m -jar /usr/gitc/picard.jar \ + SamToFastq \ + INPUT=${input_bam} \ + FASTQ=/dev/stdout \ + INTERLEAVE=true \ + NON_PF=true | \ + /usr/gitc/${bwa_commandline} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \ + samtools view -1 - > ${output_bam_basename}.bam + >>> + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "14 GB" + cpu: "16" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log" + } +} + +# Merge original input uBAM file with BWA-aligned BAM file +task MergeBamAlignment { + File unmapped_bam + String bwa_commandline + String bwa_version + File aligned_bam + String output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + Int disk_size + Int preemptible_tries + + command { + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + java -Xmx3000m -jar /usr/gitc/picard.jar \ + MergeBamAlignment \ + VALIDATION_STRINGENCY=SILENT \ + EXPECTED_ORIENTATIONS=FR \ + ATTRIBUTES_TO_RETAIN=X0 \ + 
ALIGNED_BAM=${aligned_bam} \ + UNMAPPED_BAM=${unmapped_bam} \ + OUTPUT=${output_bam_basename}.bam \ + REFERENCE_SEQUENCE=${ref_fasta} \ + PAIRED_RUN=true \ + SORT_ORDER="unsorted" \ + IS_BISULFITE_SEQUENCE=false \ + ALIGNED_READS_ONLY=false \ + CLIP_ADAPTERS=false \ + MAX_RECORDS_IN_RAM=2000000 \ + ADD_MATE_CIGAR=true \ + MAX_INSERTIONS_OR_DELETIONS=-1 \ + PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ + PROGRAM_RECORD_ID="bwamem" \ + PROGRAM_GROUP_VERSION="${bwa_version}" \ + PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \ + PROGRAM_GROUP_NAME="bwamem" \ + UNMAP_CONTAMINANT_READS=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3500 MB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + } +} + +# Sort BAM file by coordinate order and fix tag values for NM and UQ +task SortAndFixTags { + File input_bam + String output_bam_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + Int preemptible_tries + + command { + java -Xmx4000m -jar /usr/gitc/picard.jar \ + SortSam \ + INPUT=${input_bam} \ + OUTPUT=/dev/stdout \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=false \ + CREATE_MD5_FILE=false | \ + java -Xmx500m -jar /usr/gitc/picard.jar \ + SetNmAndUqTags \ + INPUT=/dev/stdin \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + REFERENCE_SEQUENCE=${ref_fasta} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + disks: "local-disk " + disk_size + " HDD" + cpu: "1" + memory: "5000 MB" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + } +} + +# Mark duplicate reads to avoid counting non-independent observations +task MarkDuplicates { + Array[File] input_bams + String 
output_bam_basename + String metrics_filename + Int disk_size + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly + # This works because the output of BWA is query-grouped, and thus so is the output of MergeBamAlignment. + # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + command { + java -Xmx4000m -jar /usr/gitc/picard.jar \ + MarkDuplicates \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_basename}.bam \ + METRICS_FILE=${metrics_filename} \ + VALIDATION_STRINGENCY=SILENT \ + OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ + ASSUME_SORT_ORDER="queryname" + CREATE_MD5_FILE=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "7 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File duplicate_metrics = "${metrics_filename}" + } +} + +# Generate sets of intervals for scatter-gathering over chromosomes +task CreateSequenceGroupingTSV { + File ref_dict + Int preemptible_tries + + # Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. It outputs to stdout + # where it is parsed into a wdl Array[Array[String]] + # e.g. 
[["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] + command <<< + python <>> + runtime { + docker: "python:2.7" + memory: "2 GB" + preemptible: preemptible_tries + } + output { + Array[Array[String]] sequence_grouping = read_tsv(stdout()) + } +} + +# Generate Base Quality Score Recalibration (BQSR) model +task BaseRecalibrator { + File input_bam + File input_bam_index + String recalibration_report_filename + Array[String] sequence_group_interval + File dbSNP_vcf + File dbSNP_vcf_index + Array[File] known_indels_sites_VCFs + Array[File] known_indels_sites_indices + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + Int preemptible_tries + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ + -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ + -Xloggc:gc_log.log -Dsamjdk.use_async_io=false -Xmx4000m \ + -jar /usr/gitc/GATK4.jar \ + BaseRecalibrator \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -O ${recalibration_report_filename} \ + -knownSites ${dbSNP_vcf} \ + -knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "6 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File recalibration_report = "${recalibration_report_filename}" + #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team + #File gc_logs = "gc_log.log" + } +} + +# Apply Base Quality Score Recalibration (BQSR) model +task ApplyBQSR { + File input_bam + File input_bam_index + String output_bam_basename + File recalibration_report + Array[String] sequence_group_interval + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + Int preemptible_tries + + command { + java -XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ + -XX:+PrintGCDetails -Xloggc:gc_log.log 
-Dsamjdk.use_async_io=false \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx3000m \ + -jar /usr/gitc/GATK4.jar \ + ApplyBQSR \ + --createOutputBamMD5 \ + --addOutputSAMProgramRecord \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -O ${output_bam_basename}.bam \ + -bqsr ${recalibration_report} \ + -SQQ 10 -SQQ 20 -SQQ 30 -SQQ 40 \ + --emit_original_quals \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File recalibrated_bam = "${output_bam_basename}.bam" + File recalibrated_bam_checksum = "${output_bam_basename}.bam.md5" + #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team + #File gc_logs = "gc_log.log" + } +} + +# Combine multiple recalibration tables from scattered BaseRecalibrator runs +task GatherBqsrReports { + Array[File] input_bqsr_reports + String output_report_filename + Int disk_size + Int preemptible_tries + + command { + java -Xmx3000m -jar /usr/gitc/GATK4.jar \ + GatherBQSRReports \ + -I ${sep=' -I ' input_bqsr_reports} \ + -O ${output_report_filename} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bqsr_report = "${output_report_filename}" + } +} + +# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs +task GatherBamFiles { + Array[File] input_bams + File input_unmapped_reads_bam + String output_bam_basename + Int disk_size + Int preemptible_tries + + command { + java -Xmx2000m -jar /usr/gitc/picard.jar \ + GatherBamFiles \ + INPUT=${sep=' INPUT=' input_bams} \ + INPUT=${input_unmapped_reads_bam} \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true + + } + runtime { + docker: 
"broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + } +} + +# Call variants on a single sample with HaplotypeCaller to produce a GVCF +task HaplotypeCaller { + File input_bam + File input_bam_index + File interval_list + String gvcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Int disk_size + Int preemptible_tries + + # tried to find lowest memory variable where it would still work, might change once tested on JES + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK35.jar \ + -T HaplotypeCaller \ + -R ${ref_fasta} \ + -o ${gvcf_basename}.vcf.gz \ + -I ${input_bam} \ + -L ${interval_list} \ + -ERC GVCF \ + --max_alternate_alleles 3 \ + -variant_index_parameter 128000 \ + -variant_index_type LINEAR \ + -contamination ${default=0 contamination} \ + --read_filter OverclippedRead + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "10 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_gvcf = "${gvcf_basename}.vcf.gz" + File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi" + } +} + +# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs +task GatherVCFs { + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_vcf_name + Int disk_size + Int preemptible_tries + + # using MergeVcfs instead of GatherVcfs so we can create indices + # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. 
+ command { + java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${output_vcf_name} + } + output { + File output_vcf = "${output_vcf_name}" + File output_vcf_index = "${output_vcf_name}.tbi" + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } +} + +# Convert BAM file to CRAM format +task ConvertToCram { + File input_bam + File ref_fasta + File ref_fasta_index + String output_basename + Int disk_size + + # Note that we are not activating pre-emptible instances for this step yet, + # but we should if it ends up being fairly quick + command <<< + samtools view -C -T ${ref_fasta} ${input_bam} | \ + tee ${output_basename}.cram | \ + md5sum > ${output_basename}.cram.md5 && \ + samtools index ${output_basename}.cram && \ + mv ${output_basename}.cram.crai ${output_basename}.crai + >>> + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_cram = "${output_basename}.cram" + File output_cram_index = "${output_basename}.crai" + File output_cram_md5 = "${output_basename}.cram.md5" + } +} + +# WORKFLOW DEFINITION +workflow WGS_PE_SingleSample_LR_Workflow { + + String sample_name + String final_gvcf_name + Array[File] flowcell_unmapped_bams + String unmapped_bam_suffix + + Array[File] scattered_calling_intervals + File wgs_calling_interval_list + + File ref_fasta + File ref_fasta_index + File ref_dict + File ref_alt + File ref_bwt + File ref_sa + File ref_amb + File ref_ann + File ref_pac + + File dbSNP_vcf + File dbSNP_vcf_index + Array[File] known_indels_sites_VCFs + Array[File] known_indels_sites_indices + + Int flowcell_small_disk + Int flowcell_medium_disk + Int agg_small_disk + Int agg_medium_disk + Int agg_large_disk + Int preemptible_tries + Int agg_preemptible_tries + + 
String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 $bash_ref_fasta" + + String recalibrated_bam_basename = sample_name + ".aligned.duplicates_marked.recalibrated" + + # Get the version of BWA to include in the PG record in the header of the BAM produced + # by MergeBamAlignment. + call GetBwaVersion + + # Align flowcell-level unmapped input bams in parallel + scatter (unmapped_bam in flowcell_unmapped_bams) { + + # Because of a wdl/cromwell bug this is not currently valid so we have to sub(sub()) in each task + # String base_name = sub(sub(unmapped_bam, "gs://.*/", ""), unmapped_bam_suffix + "$", "") + + String sub_strip_path = "gs://.*/" + String sub_strip_unmapped = unmapped_bam_suffix + "$" + + # Map reads to reference + call SamToFastqAndBwaMem { + input: + input_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmerged", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + ref_alt = ref_alt, + ref_bwt = ref_bwt, + ref_amb = ref_amb, + ref_ann = ref_ann, + ref_pac = ref_pac, + ref_sa = ref_sa, + disk_size = flowcell_medium_disk, + preemptible_tries = preemptible_tries + + } + + # Merge original uBAM and BWA-aligned BAM + call MergeBamAlignment { + input: + unmapped_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + bwa_version = GetBwaVersion.version, + aligned_bam = SamToFastqAndBwaMem.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".aligned.unsorted", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = flowcell_medium_disk, + preemptible_tries = preemptible_tries + } + + # Sort and fix tags in the merged BAM + call SortAndFixTags as SortAndFixReadGroupBam { + input: + input_bam = MergeBamAlignment.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + 
".sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = flowcell_medium_disk, + preemptible_tries = preemptible_tries + } + + } + + # Aggregate aligned+merged flowcell BAM files and mark duplicates + call MarkDuplicates { + input: + input_bams = MergeBamAlignment.output_bam, + output_bam_basename = sample_name + ".aligned.unsorted.duplicates_marked", + metrics_filename = sample_name + ".duplicate_metrics", + disk_size = agg_large_disk + } + + # Sort aggregated+deduped BAM file and fix tags + call SortAndFixTags as SortAndFixSampleBam { + input: + input_bam = MarkDuplicates.output_bam, + output_bam_basename = sample_name + ".aligned.duplicate_marked.sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_large_disk, + preemptible_tries = 0 + } + + # Create list of sequences for scatter-gather parallelization + call CreateSequenceGroupingTSV { + input: + ref_dict = ref_dict, + preemptible_tries = preemptible_tries + } + + # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { + # Generate the recalibration model by interval + call BaseRecalibrator { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + recalibration_report_filename = sample_name + ".recal_data.csv", + sequence_group_interval = subgroup, + dbSNP_vcf = dbSNP_vcf, + dbSNP_vcf_index = dbSNP_vcf_index, + known_indels_sites_VCFs = known_indels_sites_VCFs, + known_indels_sites_indices = known_indels_sites_indices, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + # Apply the recalibration model by interval + call ApplyBQSR { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = 
SortAndFixSampleBam.output_bam_index, + output_bam_basename = recalibrated_bam_basename, + recalibration_report = GatherBqsrReports.output_bqsr_report, + sequence_group_interval = subgroup, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + } + + # Merge the recalibration reports resulting from by-interval recalibration + call GatherBqsrReports{ + input: + input_bqsr_reports = BaseRecalibrator.recalibration_report, + output_report_filename = sample_name + ".recal_data.csv", + disk_size = flowcell_small_disk, + preemptible_tries = preemptible_tries + } + + # Do an additional round of recalibration on the unmapped reads (which would otherwise + # be left behind because they're not accounted for in the scatter intervals). This is + # done by running ApplyBQSR with "-L unmapped". + Array[String] unmapped_group_interval = ["unmapped"] + call ApplyBQSR as ApplyBQSRToUnmappedReads { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + output_bam_basename = recalibrated_bam_basename, + recalibration_report = GatherBqsrReports.output_bqsr_report, + sequence_group_interval = unmapped_group_interval, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + + # Merge the recalibrated BAM files resulting from by-interval recalibration + # TODO: when we have capability of adding elements to arrays, can just have one array + # as an input and add the output of the above task to the scattered printreads bams + call GatherBamFiles { + input: + input_bams = ApplyBQSR.recalibrated_bam, + input_unmapped_reads_bam = ApplyBQSRToUnmappedReads.recalibrated_bam, + output_bam_basename = sample_name, + disk_size = agg_large_disk, + preemptible_tries = agg_preemptible_tries + } + + # Convert the final merged 
recalibrated BAM file to CRAM format + call ConvertToCram { + input: + input_bam = GatherBamFiles.output_bam, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + output_basename = sample_name, + disk_size = agg_medium_disk + } + + # Call variants in parallel over WGS calling intervals + scatter (subInterval in scattered_calling_intervals) { + + # Generate GVCF by interval + call HaplotypeCaller { + input: + input_bam = GatherBamFiles.output_bam, + input_bam_index = GatherBamFiles.output_bam_index, + interval_list = subInterval, + gvcf_basename = sample_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + } + + # Combine by-interval GVCFs into a single sample GVCF file + call GatherVCFs { + input: + input_vcfs = HaplotypeCaller.output_gvcf, + input_vcfs_indexes = HaplotypeCaller.output_gvcf_index, + output_vcf_name = final_gvcf_name, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + + # Outputs that will be retained when execution is complete + + output { + MarkDuplicates.duplicate_metrics + GatherBqsrReports.* + ConvertToCram.* + GatherVCFs.* + } + +} diff --git a/scripts/other/GrabSamHeaderFromBams_160901.options.json b/scripts/other/generic.options.json similarity index 100% rename from scripts/other/GrabSamHeaderFromBams_160901.options.json rename to scripts/other/generic.options.json diff --git a/scripts/other/uBamFromPairedFastQ_160902.inputs.json b/scripts/other/uBamFromPairedFastQ_160902.inputs.json new file mode 100644 index 0000000..3d16142 --- /dev/null +++ b/scripts/other/uBamFromPairedFastQ_160902.inputs.json @@ -0,0 +1,30 @@ +{ + "uBamFromPairedFastQ.readgroup_list": [ + "NA12878_A", "NA12878_B", "NA12878_C" + ], + "uBamFromPairedFastQ.metadata": { + "NA12878_A": [ + "NA12878", "Solexa-NA12878", "H06HDADXX130110.2.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" + ], + "NA12878_B": [ + 
"NA12878", "Solexa-NA12878", "H06HDADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" + ], + "NA12878_C": [ + "NA12878", "Solexa-NA12878", "H06JUADXX130110.1.ATCACGAT", "2016-09-01T02:00:00+0200", "illumina", "BI" + ] + }, + "uBamFromPairedFastQ.fastq_pairs": { + "NA12878_A": [ + "gs://dsde-comms-resources/fastq/H06HDADXX130110.1.ATCACGAT.20k_reads_1.fastq", + "gs://dsde-comms-resources/fastq/H06HDADXX130110.1.ATCACGAT.20k_reads_2.fastq" + ], + "NA12878_B": [ + "gs://dsde-comms-resources/fastq/H06HDADXX130110.2.ATCACGAT.20k_reads_1.fastq", + "gs://dsde-comms-resources/fastq/H06HDADXX130110.2.ATCACGAT.20k_reads_2.fastq" + ], + "NA12878_C": [ + "gs://dsde-comms-resources/fastq/H06JUADXX130110.1.ATCACGAT.20k_reads_1.fastq", + "gs://dsde-comms-resources/fastq/H06JUADXX130110.1.ATCACGAT.20k_reads_2.fastq" + ] + } +} diff --git a/scripts/other/uBamFromPairedFastQ_160902.wdl b/scripts/other/uBamFromPairedFastQ_160902.wdl new file mode 100644 index 0000000..f28e857 --- /dev/null +++ b/scripts/other/uBamFromPairedFastQ_160902.wdl @@ -0,0 +1,90 @@ +## Copyright Broad Institute, 2016 +## +## This WDL converts FASTQ to uBAM and adds read group information using FastqToSam +## +## Requirements/expectations : +## - Whole-genome pair-end sequencing data in FASTQ format (one file per orientation) +## - One or more read groups, one per pair of FASTQ files +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. 
Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Merge original input uBAM file with BWA-aligned BAM file +task FastqToSam { + # pair-specific + File fastq_1 + File fastq_2 + String readgroup_name + String sample_name + String library_name + String platform_unit + String run_date + String platform_name + String sequencing_center + + command { + java -Xmx3000m -jar /usr/gitc/picard.jar \ + FastqToSam \ + FASTQ=${fastq_1} \ + FASTQ2=${fastq_2} \ + OUTPUT=${readgroup_name}.bam \ + READ_GROUP_NAME=${readgroup_name} \ + SAMPLE_NAME=${sample_name} \ + LIBRARY_NAME=${library_name} \ + PLATFORM_UNIT=${platform_unit} \ + RUN_DATE=${run_date} \ + PLATFORM=${platform_name} \ + SEQUENCING_CENTER=${sequencing_center} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "3500 MB" + cpu: "1" + disks: "local-disk " + 400 + " HDD" + } + output { + File output_bam = "${readgroup_name}.bam" + } +} + +# WORKFLOW DEFINITION +workflow uBamFromPairedFastQ { + Array[String] readgroup_list + Map[String, Array[File]] fastq_pairs + Map[String, Array[String]] metadata + + # Convert multiple pairs of input fastqs in parallel + scatter (readgroup in readgroup_list) { + + # Convert pair of FASTQs to uBAM + call FastqToSam { + input: + # pair-specific + fastq_1 = fastq_pairs[readgroup][0], + fastq_2 = fastq_pairs[readgroup][1], + readgroup_name = readgroup, + sample_name = metadata[readgroup][0], + library_name = metadata[readgroup][1], + platform_unit = metadata[readgroup][2], + run_date = metadata[readgroup][3], + platform_name = metadata[readgroup][4], + sequencing_center = metadata[readgroup][5] + } + } + + # Outputs that will be retained when execution is complete + output { + FastqToSam.* + } +} + diff --git a/scripts/utilities/create_scatter_intervals.py b/scripts/utilities/create_scatter_intervals.py new 
file mode 100644
index 0000000..c271ebb
--- /dev/null
+++ b/scripts/utilities/create_scatter_intervals.py
@@ -0,0 +1,145 @@
+#!/usr/bin/python
+
+######################################################################################
+# This script creates interval subset lists from a master list for scattering N-ways #
+######################################################################################
+
+import os
+import sys
+
+# CLI arguments
+master_list_file = sys.argv[1]
+desired_N = int(sys.argv[2])
+wiggle_factor = int(sys.argv[3])
+dir_name = sys.argv[4]
+comment = "@CO\t"+sys.argv[5]+"\n"
+
+# Read in the master list file contents:
+with open(master_list_file, "r") as master_list:
+
+    header_lines = []
+    intervals_list = []
+    longest_interval = 0
+
+    for line in master_list:
+        # store the header lines (starting with @) to serve as output stub
+        if line.startswith("@"):
+            header_lines.append(line)
+        else:
+            line_split = line.split("\t")
+            length = int(line_split[2])-int(line_split[1])
+            intervals_list.append((line, length))
+
+            # keep track of what is the longest interval
+            if length > longest_interval:
+                longest_interval = length
+
+print "Number of intervals: "+str(len(intervals_list))
+print "Longest interval was: "+str(longest_interval)
+
+# Determine what is the total territory covered by intervals
+total_length = 0
+for interval in intervals_list:
+    total_length = total_length + interval[1]
+
+print "Total length of covered territory: "+str(total_length)
+
+# Determine what should be the theoretical maximum territory per subset
+# based on the desired N
+max_length_per_subset = total_length / desired_N
+
+print "Theoretical max subset length: "+str(max_length_per_subset)
+
+# Distribute intervals to separate files
+
+interval_count = 0
+batch_count = 0
+current_batch = []
+current_length = 0
+length_so_far = 0
+batches_list = []
+
+print "Processing..."
+ +def dump_batch(msg): + + global batch_count + global current_batch + global current_length + global length_so_far + global interval_count + global batches_list + + # increment appropriate counters + batch_count +=1 + length_so_far = length_so_far + current_length + # report batch stats + print "\t"+str(batch_count)+". \tBatch of "+str(len(current_batch))+"\t| "+str(current_length)+" \t|"+msg+" \t| "+str(interval_count)+" \t| So far "+str(length_so_far)+" \t| Remains "+str(total_length-length_so_far) + # store batch + batches_list.append(current_batch) + # reset everything + current_batch = [] + current_length = 0 + +for interval in intervals_list: + + interval_count +=1 + #print interval_count + + # Is this new interval above the length limit by itself? + if interval[1] > max_length_per_subset: + dump_batch("close-out") + current_batch.append(interval) + current_length = current_length + interval[1] + dump_batch("godzilla") + + # Is this new interval putting us above the length limit when added to the batch? + elif current_length + interval[1] > max_length_per_subset+max_length_per_subset/wiggle_factor: + dump_batch("normal") + current_batch.append(interval) + current_length = current_length + interval[1] + + else: + current_batch.append(interval) + current_length = current_length + interval[1] + +dump_batch("finalize") + +print "Done.\nGrouped intervals into "+str(len(batches_list))+" batches." 
+ +# Write batches to files and compose a JSON stub +counter = 0 +json_stub = ["{", "\t\"workflow.scattered_calling_intervals\": ["] +os.mkdir(dir_name) +for batch in batches_list: + counter +=1 + path = dir_name+"/temp_"+str(counter)+"_of_"+str(len(batches_list)) + os.mkdir(path) + with open(path+"/scattered.interval_list", "w") as intervals_file: + # Write out the header copied from the original + for line in header_lines: + intervals_file.write("%s" % line) + # Add a comment to the header + intervals_file.write("%s" % comment) + # Write out the intervals + for interval in batch: + intervals_file.write("%s" % interval[0]) + + # add the json line + json_stub.append("\t\t\"gs://bucket/dir/"+path+"/scattered.interval_list\",") +json_stub.append("\t]") +json_stub.append("}") + +print "Wrote "+str(counter)+" interval files to \""+dir_name+"/temp_n_of_N/scattered.interval_list\"" + +# Write out the json stub +with open("scattered_intervals.json", "w") as json_file: + for line in json_stub: + json_file.write("%s\n" % line) + +print "Wrote a JSON stub to \"scattered_intervals.json\"" + + + + + From 50b4dd09314c2c89177b39f92ab7bcfa280b39e0 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Thu, 15 Sep 2016 14:48:40 -0400 Subject: [PATCH 4/4] more wdls... 
--- scripts/other/ValidateBams_160902.inputs.json | 4 +- scripts/other/ValidateBams_160902.wdl | 4 +- scripts/other/WGS_HC_GVCF_160901.inputs.json | 71 +++++++ scripts/other/WGS_HC_GVCF_160901.wdl | 133 +++++++++++++ .../other/WGS_Joint_Analysis_160909.inputs.json | 70 +++++++ scripts/other/WGS_Joint_Analysis_160909.wdl | 182 ++++++++++++++++++ ...GS_PE_SingleSample_LegacyRef_160901.inputs.json | 156 +++++++-------- .../other/WGS_PE_SingleSample_LegacyRef_160901.wdl | 32 ++- scripts/other/WGS_VQSR_160909.inputs.json | 47 +++++ scripts/other/WGS_VQSR_160909.wdl | 214 +++++++++++++++++++++ scripts/other/uBamFromPairedFastQ_160902.wdl | 4 +- scripts/utilities/create_scatter_intervals.py | 4 + 12 files changed, 822 insertions(+), 99 deletions(-) create mode 100644 scripts/other/WGS_HC_GVCF_160901.inputs.json create mode 100644 scripts/other/WGS_HC_GVCF_160901.wdl create mode 100644 scripts/other/WGS_Joint_Analysis_160909.inputs.json create mode 100644 scripts/other/WGS_Joint_Analysis_160909.wdl create mode 100644 scripts/other/WGS_VQSR_160909.inputs.json create mode 100644 scripts/other/WGS_VQSR_160909.wdl diff --git a/scripts/other/ValidateBams_160902.inputs.json b/scripts/other/ValidateBams_160902.inputs.json index cf22b16..c96a30c 100644 --- a/scripts/other/ValidateBams_160902.inputs.json +++ b/scripts/other/ValidateBams_160902.inputs.json @@ -3,5 +3,7 @@ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" - ] + ], + + "ValidateBAMs.disk_size": 200 } diff --git a/scripts/other/ValidateBams_160902.wdl b/scripts/other/ValidateBams_160902.wdl index 325165b..f7b077e 100644 --- a/scripts/other/ValidateBams_160902.wdl +++ b/scripts/other/ValidateBams_160902.wdl @@ -22,6 +22,7 @@ task ValidateBAM { File bam_file 
String output_basename + Int disk_size command { java -Xmx3000m -jar /usr/gitc/picard.jar \ @@ -34,7 +35,7 @@ task ValidateBAM { docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" memory: "1 GB" cpu: "1" - disks: "local-disk " + 200 + " HDD" + disks: "local-disk " + disk_size + " HDD" } output { File output_bam = "${output_basename}.txt" @@ -44,6 +45,7 @@ task ValidateBAM { # WORKFLOW DEFINITION workflow ValidateBAMs { Array[File] bam_list + Int disk_size # Convert multiple pairs of input fastqs in parallel scatter (input_bam in bam_list) { diff --git a/scripts/other/WGS_HC_GVCF_160901.inputs.json b/scripts/other/WGS_HC_GVCF_160901.inputs.json new file mode 100644 index 0000000..8bd1399 --- /dev/null +++ b/scripts/other/WGS_HC_GVCF_160901.inputs.json @@ -0,0 +1,71 @@ +{ + "ScatterHaplotypeCaller.sample_basename": "NA12878", + "ScatterHaplotypeCaller.input_bam": "gs://dsde-comms-resources/bams_wgs/NA12878.bam", + "ScatterHaplotypeCaller.input_bam_index": "gs://dsde-comms-resources/bams_wgs/NA12878.bai", + + "ScatterHaplotypeCaller.ref_dict": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.dict", + "ScatterHaplotypeCaller.ref_fasta": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta", + "ScatterHaplotypeCaller.ref_fasta_index": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "ScatterHaplotypeCaller.scattered_intervals": [ + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0001_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0002_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0003_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0004_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0005_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0006_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0007_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0008_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0009_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0010_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0011_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0012_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0013_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0014_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0015_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0016_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0017_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0018_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0019_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0020_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0021_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0022_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0023_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0024_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0025_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0026_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0027_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0028_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0029_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0030_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0031_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0032_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0033_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0034_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0035_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0036_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0037_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0038_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0039_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0040_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0041_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0042_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0043_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0044_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0045_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0046_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0047_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0048_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0049_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0050_of_50/scattered.interval_list" + ], + + "##_COMMENT5": "DISK SIZES + PREEMPTIBLES", + "ScatterHaplotypeCaller.agg_small_disk": 200, + "ScatterHaplotypeCaller.agg_medium_disk": 300, + "ScatterHaplotypeCaller.agg_large_disk": 400, + "ScatterHaplotypeCaller.agg_preemptible_tries": 0, + "ScatterHaplotypeCaller.flowcell_small_disk": 200, + 
"ScatterHaplotypeCaller.flowcell_medium_disk": 300, + "ScatterHaplotypeCaller.preemptible_tries": 0 +} \ No newline at end of file diff --git a/scripts/other/WGS_HC_GVCF_160901.wdl b/scripts/other/WGS_HC_GVCF_160901.wdl new file mode 100644 index 0000000..d9d4330 --- /dev/null +++ b/scripts/other/WGS_HC_GVCF_160901.wdl @@ -0,0 +1,133 @@ +## Copyright Broad Institute, 2016 +## +## This WDL pipeline implements HaplotypeCaller GVCF calling according to the +## GATK Best Practices (June 2016) for germline SNP and Indel discovery in human +## whole-genome sequencing (WGS) data. +## +## Requirements/expectations : +## - Analysis-ready BAM produced according to Best Practices +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+ +# TASK DEFINITIONS + +# HaplotypeCaller per-sample in GVCF mode +task HaplotypeCaller { + File input_bam + File bam_index + String sample_basename + File ref_dict + File ref_fasta + File ref_fasta_index + File intervals_file + Int disk_size + Int preemptible_tries + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx800m \ + -jar /usr/gitc/GATK36.jar \ + -T HaplotypeCaller \ + -R ${ref_fasta} \ + -o ${sample_basename}.g.vcf.gz \ + -I ${input_bam} \ + -L ${intervals_file} \ + -ERC GVCF + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + memory: "10 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + + output { + File output_gvcf = "${sample_basename}.g.vcf.gz" + File output_gvcf_index = "${sample_basename}.g.vcf.gz.tbi" + } +} + +task MergeVCFs { + File ref_dict + Array [File] input_vcfs + Array [File] input_vcfs_indices + String vcf_basename + Int disk_size + Int preemptible_tries + + command { + java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${vcf_basename}.g.vcf.gz + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries +} + + output { + File output_vcf = "${vcf_basename}.g.vcf.gz" + File output_vcf_index = "${vcf_basename}.g.vcf.gz.tbi" + } +} + +# WORKFLOW DEFINITION +workflow ScatterHaplotypeCaller { + File input_bam + File input_bam_index + File ref_dict + File ref_fasta + File ref_fasta_index + String sample_basename + Array[File] scattered_intervals + Int agg_preemptible_tries + Int agg_small_disk + + # Call variants in parallel over grouped calling intervals + scatter (interval_file in scattered_intervals) { + + # Generate GVCF by interval + call HaplotypeCaller { + input: + input_bam = input_bam, + bam_index = input_bam_index, + intervals_file = interval_file, + sample_basename = 
sample_basename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + } + + # Merge per-interval GVCFs + call MergeVCFs { + input: + ref_dict = ref_dict, + input_vcfs = HaplotypeCaller.output_gvcf, + input_vcfs_indices = HaplotypeCaller.output_gvcf_index, + vcf_basename = sample_basename, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + + # Outputs that will be retained when execution is complete + output { + MergeVCFs.* + } +} \ No newline at end of file diff --git a/scripts/other/WGS_Joint_Analysis_160909.inputs.json b/scripts/other/WGS_Joint_Analysis_160909.inputs.json new file mode 100644 index 0000000..68d2ee0 --- /dev/null +++ b/scripts/other/WGS_Joint_Analysis_160909.inputs.json @@ -0,0 +1,70 @@ +{ + "JointAnalysis.ref_fasta": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta", + "JointAnalysis.ref_dict": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.dict", + "JointAnalysis.ref_fasta_index": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "JointAnalysis.cohort_vcf_name": "Platinum_NA12882_WGS", + + "JointAnalysis.input_gvcfs": [ + "gs://dsde-comms-resources/gvcfs_wgs/Platinum_b37/NA12882.g.vcf.gz" + ], + "JointAnalysis.input_gvcf_indices": [ + "gs://dsde-comms-resources/gvcfs_wgs/Platinum_b37/NA12882.g.vcf.gz.tbi" + ], + + "JointAnalysis.scattered_calling_intervals": [ + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0001_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0002_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0003_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0004_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0005_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0006_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0007_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0008_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0009_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0010_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0011_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0012_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0013_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0014_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0015_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0016_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0017_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0018_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0019_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0020_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0021_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0022_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0023_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0024_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0025_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0026_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0027_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0028_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0029_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0030_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0031_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0032_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0033_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0034_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0035_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0036_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0037_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0038_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0039_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0040_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0041_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0042_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0043_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0044_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0045_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0046_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0047_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0048_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0049_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0050_of_50/scattered.interval_list" + ], + + "JointAnalysis.preemptible_tries": 0, + "JointAnalysis.small_disk": 200 +} \ No newline at end of file diff --git a/scripts/other/WGS_Joint_Analysis_160909.wdl b/scripts/other/WGS_Joint_Analysis_160909.wdl new file mode 100644 index 0000000..d0c0a9d --- /dev/null +++ b/scripts/other/WGS_Joint_Analysis_160909.wdl @@ -0,0 +1,182 @@ +## 
Copyright Broad Institute, 2016 +## +## This WDL pipeline implements joint analysis (joint genotyping, variant filtering +## and genotype refinement) according to the GATK Best Practices (June 2016) for +## germline SNP and Indel discovery in human whole-genome sequencing (WGS) data. +## +## DEV NOTE: STILL NEED TO ADD GENOTYPE REFINEMENT +## +## Requirements/expectations : +## - GVCFs produced by HaplotypeCaller in GVCF mode from WGS data +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Unzip GVCFs because GenotypeGVCFs is picky +task UnzipGVCF { + File gzipped_gvcf + String unzipped_basename + File ref_dict + Int disk_size + Int preemptible_tries + + # HACK ALERT! Using .gvcf extension here to force IndexFeatureFile to make the right + # kind of index, but afterward we need to change to .g.vcf which is the correct + # for GVCFs. 
+ command <<< + gunzip -c ${gzipped_gvcf} > ${unzipped_basename}.gvcf + java -Xmx2g -jar /usr/gitc/GATK4.jar IndexFeatureFile -F ${unzipped_basename}.gvcf + mv ${unzipped_basename}.gvcf ${unzipped_basename}.g.vcf + mv ${unzipped_basename}.gvcf.idx ${unzipped_basename}.g.vcf.idx + >>> + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + + output { + File unzipped_gvcf = "${unzipped_basename}.g.vcf" + File gvcf_index = "${unzipped_basename}.g.vcf.idx" + } +} + +# Perform joint-genotyping +task GenotypeGVCFs { + Array[File] gvcfs + Array[File] gvcf_indices + String vcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + File interval_list + Int disk_size + Int preemptible_tries + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T GenotypeGVCFs \ + -R ${ref_fasta} \ + --variant ${sep=' --variant ' gvcfs} \ + -L ${interval_list} \ + -o ${vcf_basename}.gz + } + + output { + File genotyped_vcf = "${vcf_basename}.gz" + File genotyped_index = "${vcf_basename}.gz.tbi" + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "10 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } +} + +# Combine multiple VCFs from scattered GenotypeGVCFs runs +task MergeVCFs { + File ref_dict + Array [File] input_vcfs + Array [File] input_vcfs_indices + String cohort_vcf_name + Int disk_size + Int preemptible_tries + + command { + java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${cohort_vcf_name}.vcf.gz + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + + output { + File output_vcf = "${cohort_vcf_name}.vcf.gz" + File output_vcf_index = 
"${cohort_vcf_name}.vcf.gz.tbi" + } +} + +workflow JointAnalysis { + String cohort_vcf_name + File ref_fasta + File ref_fasta_index + File ref_dict + Array[File] input_gvcfs + Array[File] input_gvcf_indices + Array[File] scattered_calling_intervals + # NOTE: removed duplicate declaration of cohort_vcf_name (already declared above) + Int preemptible_tries + Int small_disk + + # Unzip GVCFs + scatter (input_gvcf in input_gvcfs) { + + call UnzipGVCF { + input: + gzipped_gvcf = input_gvcf, + unzipped_basename = "temp_unzipped", + ref_dict = ref_dict, + disk_size = small_disk, + preemptible_tries = preemptible_tries + } + } + + # Joint-genotype variants in parallel over WGS calling intervals + scatter (subInterval in scattered_calling_intervals) { + + # Perform joint genotyping per interval + call GenotypeGVCFs { + input: + gvcfs = UnzipGVCF.unzipped_gvcf, + gvcf_indices = UnzipGVCF.gvcf_index, + vcf_basename = cohort_vcf_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + interval_list = subInterval, + disk_size = small_disk, + preemptible_tries = preemptible_tries + } + } + + # Merge per-interval VCFs into a single cohort VCF file + call MergeVCFs { + input: + ref_dict = ref_dict, + input_vcfs = GenotypeGVCFs.genotyped_vcf, + input_vcfs_indices = GenotypeGVCFs.genotyped_index, + cohort_vcf_name = cohort_vcf_name, + disk_size = small_disk, + preemptible_tries = preemptible_tries + } + + ### ADD GENOTYPE REFINEMENT HERE + + # Outputs that will be retained when execution is complete + output { + MergeVCFs.* + } +} + diff --git a/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json index 0485791..c75f6de 100644 --- a/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json +++ b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.inputs.json @@ -1,98 +1,98 @@ { "##_COMMENT1": "SAMPLE NAME AND UNMAPPED BAMS", - "PairedEndSingleSampleWorkflow.sample_name": "NA12878", - 
"PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": [ + "WGS_PE_SingleSample_LR_Workflow.sample_name": "NA12878", + "WGS_PE_SingleSample_LR_Workflow.flowcell_unmapped_bams": [ "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.1.ATCACGAT.20k_reads.bam", "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06HDADXX130110.2.ATCACGAT.20k_reads.bam", "gs://genomics-public-data/test-data/dna/wgs/hiseq2500/NA12878/H06JUADXX130110.1.ATCACGAT.20k_reads.bam" ], - "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz", - "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".bam", + "WGS_PE_SingleSample_LR_Workflow.final_gvcf_name": "NA12878.g.vcf.gz", + "WGS_PE_SingleSample_LR_Workflow.unmapped_bam_suffix": ".bam", "##_COMMENT2": "INTERVALS", - "PairedEndSingleSampleWorkflow.scattered_calling_intervals": [ - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0001_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0002_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0003_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0004_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0005_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0006_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0007_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0008_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0009_of_50/scattered.interval_list", - 
"gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0010_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0011_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0012_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0013_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0014_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0015_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0016_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0017_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0018_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0019_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0020_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0021_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0022_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0023_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0024_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0025_of_50/scattered.interval_list", - 
"gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0026_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0027_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0028_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0029_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0030_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0031_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0032_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0033_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0034_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0035_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0036_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0037_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0038_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0039_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0040_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0041_of_50/scattered.interval_list", - 
"gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0042_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0043_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0044_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0045_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0046_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0047_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0048_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0049_of_50/scattered.interval_list", - "gs://genomics-public-data/resources/broad/hg38/v0/scattered_calling_intervals/temp_0050_of_50/scattered.interval_list" + "WGS_PE_SingleSample_LR_Workflow.scattered_calling_intervals": [ + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0001_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0002_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0003_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0004_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0005_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0006_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0007_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0008_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0009_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0010_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0011_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0012_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0013_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0014_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0015_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0016_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0017_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0018_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0019_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0020_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0021_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0022_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0023_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0024_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0025_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0026_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0027_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0028_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0029_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0030_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0031_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0032_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0033_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0034_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0035_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0036_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0037_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0038_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0039_of_50/scattered.interval_list", + 
"gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0040_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0041_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0042_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0043_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0044_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0045_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0046_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0047_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0048_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0049_of_50/scattered.interval_list", + "gs://dsde-comms-resources/legacy_bundles/b37/scattered_wgs_intervals_b37/temp_0050_of_50/scattered.interval_list" ], - "PairedEndSingleSampleWorkflow.wgs_calling_interval_list": "gs://genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list", + "WGS_PE_SingleSample_LR_Workflow.wgs_calling_interval_list": "gs://dsde-comms-resources/legacy_bundles/b37/wgs_calling_regions.v1.interval_list", "##_COMMENT2": "OPTIONAL ARGUMENTS", - "PairedEndSingleSampleWorkflow.HaplotypeCaller.contamination": 0, + "WGS_PE_SingleSample_LR_Workflow.HaplotypeCaller.contamination": 0, "##_COMMENT3": "REFERENCE FILES", - "PairedEndSingleSampleWorkflow.ref_dict": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.dict.gz", - "PairedEndSingleSampleWorkflow.ref_fasta": 
"gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.gz", - "PairedEndSingleSampleWorkflow.ref_fasta_index": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai.gz", - "PairedEndSingleSampleWorkflow.ref_sa": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.sa", - "PairedEndSingleSampleWorkflow.ref_amb": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.amb", - "PairedEndSingleSampleWorkflow.ref_bwt": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.bwt", - "PairedEndSingleSampleWorkflow.ref_ann": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.ann", - "PairedEndSingleSampleWorkflow.ref_pac": "gs://vdauwera-legacy-bundles/b37/human_g1k_v37_decoy.fasta.pac", + "WGS_PE_SingleSample_LR_Workflow.ref_dict": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.dict", + "WGS_PE_SingleSample_LR_Workflow.ref_fasta": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta", + "WGS_PE_SingleSample_LR_Workflow.ref_fasta_index": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.fai", + "WGS_PE_SingleSample_LR_Workflow.ref_sa": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.sa", + "WGS_PE_SingleSample_LR_Workflow.ref_amb": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.amb", + "WGS_PE_SingleSample_LR_Workflow.ref_bwt": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.bwt", + "WGS_PE_SingleSample_LR_Workflow.ref_ann": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.ann", + "WGS_PE_SingleSample_LR_Workflow.ref_pac": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.pac", "##_COMMENT4": "KNOWN SITES RESOURCES", - "PairedEndSingleSampleWorkflow.dbSNP_vcf": "gs://vdauwera-legacy-bundles/b37/dbsnp_138.b37.vcf.gz", - "PairedEndSingleSampleWorkflow.dbSNP_vcf_index": "gs://vdauwera-legacy-bundles/b37/dbsnp_138.b37.vcf.idx.gz", - 
"PairedEndSingleSampleWorkflow.known_indels_sites_VCFs": [ - "gs://vdauwera-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.gz" + "WGS_PE_SingleSample_LR_Workflow.dbSNP_vcf": "gs://dsde-comms-resources/legacy_bundles/b37/dbsnp_138.b37.vcf", + "WGS_PE_SingleSample_LR_Workflow.dbSNP_vcf_index": "gs://dsde-comms-resources/legacy_bundles/b37/dbsnp_138.b37.vcf.idx", + "WGS_PE_SingleSample_LR_Workflow.known_indels_sites_VCFs": [ + "gs://dsde-comms-resources/legacy_bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" ], - "PairedEndSingleSampleWorkflow.known_indels_sites_indices": [ - "gs://vdauwera-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.idx.gz" + "WGS_PE_SingleSample_LR_Workflow.known_indels_sites_indices": [ + "gs://dsde-comms-resources/legacy_bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.idx" ], "##_COMMENT5": "DISK SIZES + PREEMPTIBLES", - "PairedEndSingleSampleWorkflow.agg_small_disk": 200, - "PairedEndSingleSampleWorkflow.agg_medium_disk": 300, - "PairedEndSingleSampleWorkflow.agg_large_disk": 400, - "PairedEndSingleSampleWorkflow.agg_preemptible_tries": 3, - "PairedEndSingleSampleWorkflow.flowcell_small_disk": 200, - "PairedEndSingleSampleWorkflow.flowcell_medium_disk": 300, - "PairedEndSingleSampleWorkflow.preemptible_tries": 3 + "WGS_PE_SingleSample_LR_Workflow.agg_small_disk": 200, + "WGS_PE_SingleSample_LR_Workflow.agg_medium_disk": 300, + "WGS_PE_SingleSample_LR_Workflow.agg_large_disk": 400, + "WGS_PE_SingleSample_LR_Workflow.agg_preemptible_tries": 3, + "WGS_PE_SingleSample_LR_Workflow.flowcell_small_disk": 200, + "WGS_PE_SingleSample_LR_Workflow.flowcell_medium_disk": 300, + "WGS_PE_SingleSample_LR_Workflow.preemptible_tries": 3 } diff --git a/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl index 28f0e3a..d7c55ef 100644 --- a/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl +++ 
b/scripts/other/WGS_PE_SingleSample_LegacyRef_160901.wdl @@ -35,7 +35,7 @@ task GetBwaVersion { sed 's/Version: //' } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "1 GB" } output { @@ -74,9 +74,9 @@ task SamToFastqAndBwaMem { samtools view -1 - > ${output_bam_basename}.bam >>> runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" - memory: "14 GB" - cpu: "16" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "20 GB" + cpu: "32" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries } @@ -127,7 +127,7 @@ task MergeBamAlignment { UNMAP_CONTAMINANT_READS=true } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "3500 MB" cpu: "1" disks: "local-disk " + disk_size + " HDD" @@ -165,7 +165,7 @@ task SortAndFixTags { REFERENCE_SEQUENCE=${ref_fasta} } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" disks: "local-disk " + disk_size + " HDD" cpu: "1" memory: "5000 MB" @@ -200,7 +200,7 @@ task MarkDuplicates { CREATE_MD5_FILE=true } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "7 GB" disks: "local-disk " + disk_size + " HDD" } @@ -285,7 +285,7 @@ task BaseRecalibrator { -L ${sep=" -L " sequence_group_interval} } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "6 GB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries @@ -328,7 +328,7 @@ task ApplyBQSR { -L ${sep=" -L " sequence_group_interval} } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: 
"broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "3500 MB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries @@ -355,7 +355,7 @@ task GatherBqsrReports { -O ${output_report_filename} } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "3500 MB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries @@ -384,7 +384,7 @@ task GatherBamFiles { } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "3 GB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries @@ -426,7 +426,7 @@ task HaplotypeCaller { --read_filter OverclippedRead } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "10 GB" cpu: "1" disks: "local-disk " + disk_size + " HDD" @@ -459,7 +459,7 @@ task GatherVCFs { File output_vcf_index = "${output_vcf_name}.tbi" } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "3 GB" disks: "local-disk " + disk_size + " HDD" preemptible: preemptible_tries @@ -484,7 +484,7 @@ task ConvertToCram { mv ${output_basename}.cram.crai ${output_basename}.crai >>> runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" memory: "3 GB" cpu: "1" disks: "local-disk " + disk_size + " HDD" @@ -510,7 +510,6 @@ workflow WGS_PE_SingleSample_LR_Workflow { File ref_fasta File ref_fasta_index File ref_dict - File ref_alt File ref_bwt File ref_sa File ref_amb @@ -530,7 +529,7 @@ workflow WGS_PE_SingleSample_LR_Workflow { Int preemptible_tries Int agg_preemptible_tries - String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 $bash_ref_fasta" + String 
bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 32 $bash_ref_fasta" String recalibrated_bam_basename = sample_name + ".aligned.duplicates_marked.recalibrated" @@ -556,7 +555,6 @@ workflow WGS_PE_SingleSample_LR_Workflow { ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, - ref_alt = ref_alt, ref_bwt = ref_bwt, ref_amb = ref_amb, ref_ann = ref_ann, diff --git a/scripts/other/WGS_VQSR_160909.inputs.json b/scripts/other/WGS_VQSR_160909.inputs.json new file mode 100644 index 0000000..e3838e0 --- /dev/null +++ b/scripts/other/WGS_VQSR_160909.inputs.json @@ -0,0 +1,47 @@ +{ + "JustVQSR.ref_dict": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.dict", + "JustVQSR.ref_fasta": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta", + "JustVQSR.ref_fasta_index": "gs://dsde-comms-resources/legacy_bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "JustVQSR.small_disk": 200, + "JustVQSR.preemptible_tries": 0, + + "JustVQSR.cohort_vcf_name": "Test_trio", + + "JustVQSR.input_vcf": "gs://dsde-comms-resources/joint_vcfs/Test_trio.vcf.gz", + "JustVQSR.input_vcf_index": "gs://dsde-comms-resources/joint_vcfs/Test_trio.vcf.gz.tbi", + + "JustVQSR.SNP_annotations": ["DP", "QD", "FS", "SOR", "MQ", "MQRankSum", "ReadPosRankSum"], + "JustVQSR.INDEL_annotations": ["QD", "FS", "SOR", "MQRankSum", "ReadPosRankSum"], + + "JustVQSR.SNP_tranches": [100.0, 99.99, 99.95, 99.9, 99.8, 99.7, 99.6, 99.5, 99.4, 99.3, 99.2, 99.1, 99.0, 98.0, 97.0, 96.0, 95.0, 90.0], + "JustVQSR.INDEL_tranches": [100.0, 99.99, 99.95, 99.9, 99.8, 99.7, 99.6, 99.5, 99.0, 98.0, 97.0, 96.0, 95.0, 94.0, 93.0, 92.0, 91.0, 90.0], + + "JustVQSR.SNP_filter_level": 99.7, + "JustVQSR.INDEL_filter_level": 99.7, + + "JustVQSR.SNP_resources": [ + "hapmap,known=false,training=true,truth=true,prior=15.0 dsde-comms-resources/legacy_bundles/b37/hapmap_3.3.b37.vcf", + "omni,known=false,training=true,truth=true,prior=12.0 
dsde-comms-resources/legacy_bundles/b37/1000G_omni2.5.b37.vcf", + "1000G,known=false,training=true,truth=false,prior=10.0 dsde-comms-resources/legacy_bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf", + "dbsnp,known=true,training=false,truth=false,prior=2.0 dsde-comms-resources/legacy_bundles/b37/dbsnp_138.b37.vcf" + ], + "JustVQSR.INDEL_resources": [ + "mills,known=false,training=true,truth=true,prior=12.0 dsde-comms-resources/legacy_bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf", + "dbsnp,known=true,training=false,truth=false,prior=2.0 dsde-comms-resources/legacy_bundles/b37/dbsnp_138.b37.vcf" + ], + "JustVQSR.resource_files": [ + "gs://dsde-comms-resources/legacy_bundles/b37/hapmap_3.3.b37.vcf", + "gs://dsde-comms-resources/legacy_bundles/b37/1000G_omni2.5.b37.vcf", + "gs://dsde-comms-resources/legacy_bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf", + "gs://dsde-comms-resources/legacy_bundles/b37/dbsnp_138.b37.vcf", + "gs://dsde-comms-resources/legacy_bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" + ], + "JustVQSR.resource_indices": [ + "gs://dsde-comms-resources/legacy_bundles/b37/hapmap_3.3.b37.vcf.idx", + "gs://dsde-comms-resources/legacy_bundles/b37/1000G_omni2.5.b37.vcf.idx", + "gs://dsde-comms-resources/legacy_bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf.idx", + "gs://dsde-comms-resources/legacy_bundles/b37/dbsnp_138.b37.vcf.idx", + "gs://dsde-comms-resources/legacy_bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.idx" + ] +} \ No newline at end of file diff --git a/scripts/other/WGS_VQSR_160909.wdl b/scripts/other/WGS_VQSR_160909.wdl new file mode 100644 index 0000000..2cd1a75 --- /dev/null +++ b/scripts/other/WGS_VQSR_160909.wdl @@ -0,0 +1,214 @@ +## Copyright Broad Institute, 2016 +## +## This WDL pipeline implements VQSR filtering according to the GATK Best Practices +## (June 2016) for germline SNP and Indel discovery in human whole-genome sequencing +## (WGS) data. 
+## +## Requirements/expectations : +## - Cohort VCF produced by GenotypeGVCFs from WGS data +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Build VQSR model +task BuildVQSRModel { + File ref_dict + File ref_fasta + File ref_fasta_index + File cohort_vcf + File cohort_vcf_index + String output_basename + String mode + Array[String] annotations + Array[Float] tranches + Array[String] resources + Array[File] resource_files + Array[File] resource_indices + Int disk_size + Int preemptible_tries + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T VariantRecalibrator \ + -R ${ref_fasta} \ + -input ${cohort_vcf} \ + -resource:${sep=' -resource:' resources} \ + -an ${sep=' -an ' annotations} \ + -mode ${mode} \ + -tranche ${sep=' -tranche ' tranches} \ + -recalFile ${output_basename}.${mode}.recal \ + -tranchesFile ${output_basename}.${mode}.tranches \ + -rscriptFile ${output_basename}.${mode}.plots.R + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "2 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + + output { + File recal_file = "${output_basename}.${mode}.recal" + File recal_file_index = "${output_basename}.${mode}.recal.idx" + File tranches_file = "${output_basename}.${mode}.tranches" + File rscript_file = 
"${output_basename}.${mode}.plots.R" + } +} + +# Apply recalibration +task ApplyRecalibrationFilter { + File ref_dict + File ref_fasta + File ref_fasta_index + File cohort_vcf + File cohort_vcf_index + File recal_file + File recal_file_index + String output_basename + String mode + File tranches_file + Float filter_level + Int disk_size + Int preemptible_tries + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T ApplyRecalibration \ + -R ${ref_fasta} \ + -input ${cohort_vcf} \ + -mode ${mode} \ + --ts_filter_level ${filter_level} \ + -recalFile ${recal_file} \ + -tranchesFile ${tranches_file} \ + -o ${output_basename}.vcf.gz + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "2 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + + output { + File recalibrated_vcf = "${output_basename}.vcf.gz" + File recalibrated_vcf_index = "${output_basename}.vcf.gz.tbi" + } +} + +workflow JustVQSR { + File ref_fasta + File ref_fasta_index + File ref_dict + File input_vcf + File input_vcf_index + Array[String] SNP_annotations + Array[String] INDEL_annotations + Array[Float] SNP_tranches + Array[Float] INDEL_tranches + Array[String] SNP_resources + Array[String] INDEL_resources + Array[File] resource_files + Array[File] resource_indices + String cohort_vcf_name + Float SNP_filter_level + Float INDEL_filter_level + Int preemptible_tries + Int small_disk + + # Build SNP model + call BuildVQSRModel as BuildVQSRModelForSNPs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = input_vcf, + cohort_vcf_index = input_vcf_index, + output_basename = cohort_vcf_name, + annotations = SNP_annotations, + mode = "SNP", + tranches = SNP_tranches, + resources = SNP_resources, + resource_files = resource_files, + resource_indices = resource_indices, + disk_size = small_disk, + preemptible_tries = 
preemptible_tries + } + + # Build INDEL model + call BuildVQSRModel as BuildVQSRModelForINDELs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = input_vcf, + cohort_vcf_index = input_vcf_index, + output_basename = cohort_vcf_name, + annotations = INDEL_annotations, + mode = "INDEL", + tranches = INDEL_tranches, + resources = INDEL_resources, + resource_files = resource_files, + resource_indices = resource_indices, + disk_size = small_disk, + preemptible_tries = preemptible_tries + } + + # Apply SNP filter + call ApplyRecalibrationFilter as ApplyRecalibrationFilterForSNPs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = input_vcf, + cohort_vcf_index = input_vcf_index, + output_basename = cohort_vcf_name + ".recalibrated.SNP", + mode = "SNP", + recal_file = BuildVQSRModelForSNPs.recal_file, + recal_file_index = BuildVQSRModelForSNPs.recal_file_index, + tranches_file = BuildVQSRModelForSNPs.tranches_file, + filter_level = SNP_filter_level, + disk_size = small_disk, + preemptible_tries = preemptible_tries + } + + # Apply INDEL filter + call ApplyRecalibrationFilter as ApplyRecalibrationFilterForINDELs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = ApplyRecalibrationFilterForSNPs.recalibrated_vcf, + cohort_vcf_index = ApplyRecalibrationFilterForSNPs.recalibrated_vcf_index, + output_basename = cohort_vcf_name + ".recalibrated.SNP.INDEL", + mode = "INDEL", + recal_file = BuildVQSRModelForINDELs.recal_file, + recal_file_index = BuildVQSRModelForINDELs.recal_file_index, + tranches_file = BuildVQSRModelForINDELs.tranches_file, + filter_level = INDEL_filter_level, + disk_size = small_disk, + preemptible_tries = preemptible_tries + } + + # Outputs that will be retained when execution is complete + output { + BuildVQSRModelForSNPs.* + BuildVQSRModelForINDELs.* + 
ApplyRecalibrationFilterForSNPs.* + ApplyRecalibrationFilterForINDELs.* + } +} + diff --git a/scripts/other/uBamFromPairedFastQ_160902.wdl b/scripts/other/uBamFromPairedFastQ_160902.wdl index f28e857..003444c 100644 --- a/scripts/other/uBamFromPairedFastQ_160902.wdl +++ b/scripts/other/uBamFromPairedFastQ_160902.wdl @@ -47,8 +47,8 @@ task FastqToSam { SEQUENCING_CENTER=${sequencing_center} } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" - memory: "3500 MB" + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: "10 GB" cpu: "1" disks: "local-disk " + 400 + " HDD" } diff --git a/scripts/utilities/create_scatter_intervals.py b/scripts/utilities/create_scatter_intervals.py index c271ebb..25e2869 100644 --- a/scripts/utilities/create_scatter_intervals.py +++ b/scripts/utilities/create_scatter_intervals.py @@ -2,6 +2,10 @@ ###################################################################################### # This script creates interval subset lists from a master list for scattering N-ways # +# # +# Usage: # +# python create_scatter_intervals.py \ # +# master.interval_list 50 4 wgs_intervals "WGS intervals scattered 50-ways" # ###################################################################################### import os