From 1309f4abe35ac2e0bd0f530ac81b21dd97b8bee3 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Tue, 27 Feb 2018 14:34:43 -0500 Subject: [PATCH 01/35] skip blank lines in samples-*.txt files (#794) --- pipes/rules/common.rules | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipes/rules/common.rules b/pipes/rules/common.rules index 9527f1b05..92ec54338 100644 --- a/pipes/rules/common.rules +++ b/pipes/rules/common.rules @@ -31,6 +31,8 @@ def read_tab_file(fname): with util.file.open_or_gzopen(fname, 'rU') as inf: header = [item.strip() for item in inf.readline().strip().rstrip('\n').split('\t')] for line in inf: + if len(line.strip())==0: + continue row = [item.strip() for item in line.rstrip('\n').split('\t')] if len(row) > len(header): # truncate the row to the header length, and only include extra items if they are not spaces @@ -44,6 +46,8 @@ def read_samples_file(fname, number_of_chromosomes=1, append_chrom_num=False): return [] with util.file.open_or_gzopen(fname, 'rU') as inf: for line in inf: + if len(line.strip())==0: + continue if not append_chrom_num: yield line.strip() else: @@ -56,6 +60,8 @@ def read_accessions_file(fname): return [] with util.file.open_or_gzopen(fname, 'rU') as inf: for line in inf: + if len(line.strip())==0: + continue yield line.strip() def download_file(uriToGet, dest, destFileName=None): @@ -102,7 +108,7 @@ def strip_protocol(uri, relative=False): return uri -import botocore +import botocore.session def objectify_remote(file_address, *args, **kwargs): if file_address is None: From 909c7751a968b94f7ca391db82cb1d1ccb35b794 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 1 Mar 2018 09:26:35 -0500 Subject: [PATCH 02/35] add deplete-and-spades workflow --- pipes/WDL/workflows/contigs.wdl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 pipes/WDL/workflows/contigs.wdl diff --git a/pipes/WDL/workflows/contigs.wdl b/pipes/WDL/workflows/contigs.wdl new file mode 100644 index 000000000..baa9d8fcf --- /dev/null +++ b/pipes/WDL/workflows/contigs.wdl @@ -0,0 +1,17 @@ +import "tasks/metagenomics.wdl" as metagenomics +import "tasks/taxon_filter.wdl" as taxon_filter +import "tasks/assembly.wdl" as assembly + +workflow contigs { + + call taxon_filter.deplete_taxa as deplete + + call assembly.assemble as spades { + input: + assembler = "spades", + reads_unmapped_bam = deplete.cleaned_bam + } + + # TO DO: taxonomic classification of contigs + +} From 1bd140fdf468ec332ac42aaebb48a2389920f46e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 1 Mar 2018 10:27:21 -0500 Subject: [PATCH 03/35] add default dbs for contigs workflow --- pipes/WDL/dx-defaults-contigs.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 pipes/WDL/dx-defaults-contigs.json diff --git a/pipes/WDL/dx-defaults-contigs.json b/pipes/WDL/dx-defaults-contigs.json new file mode 100644 index 000000000..ba193eff8 --- /dev/null +++ b/pipes/WDL/dx-defaults-contigs.json @@ -0,0 +1,14 @@ +{ + "contigs.deplete.bwaDbs": [ + "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P" + ], + "contigs.deplete.blastDbs": [ + "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9", + "dx://file-F8BjgXj09y3gkfZGPPQZbZkK", + "dx://file-F8B3Pp809y3jBpXq7xjxbq94", + "dx://file-F8B3B6809y3kK1JP5X8Pg361" + ], + + "contigs.spades.trim_clip_db": + "dx://file-BXF0vYQ0QyBF509G9J12g927" +} From ca5b92670a2e10fe11361dd63e5cb08d229a20a1 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 1 Mar 2018 12:47:43 -0500 Subject: [PATCH 04/35] convert default dnanexus depletion databases from hg19 bmtagger to hg19 bwa --- pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json | 4 ++-- pipes/WDL/dx-defaults-demux_plus.json | 4 ++-- pipes/WDL/dx-defaults-deplete_only.json | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json b/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json index 887ad1ba5..118d1fe5e 100644 --- a/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json +++ b/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json @@ -1,6 +1,6 @@ { - "assemble_denovo_with_deplete.deplete_taxa.bmtaggerDbs": [ - "dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK" + "assemble_denovo_with_deplete.deplete_taxa.bwaDbs": [ + "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P" ], "assemble_denovo_with_deplete.deplete_taxa.blastDbs": [ "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9", diff --git a/pipes/WDL/dx-defaults-demux_plus.json b/pipes/WDL/dx-defaults-demux_plus.json index b532f8c81..cd3d53ef8 100644 --- a/pipes/WDL/dx-defaults-demux_plus.json +++ b/pipes/WDL/dx-defaults-demux_plus.json @@ -2,8 +2,8 @@ "demux_plus.spikein.spikein_db": "dx://file-F6PXkF00Yqp3zVXq14fF98Kz", - "demux_plus.deplete.bmtaggerDbs": [ - "dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK" + "demux_plus.deplete.bwaDbs": [ + "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P" ], "demux_plus.deplete.blastDbs": [ "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9", diff --git a/pipes/WDL/dx-defaults-deplete_only.json b/pipes/WDL/dx-defaults-deplete_only.json index a3c962261..cf065c8ef 100644 --- a/pipes/WDL/dx-defaults-deplete_only.json +++ b/pipes/WDL/dx-defaults-deplete_only.json @@ -1,6 +1,6 @@ { - "deplete_only.deplete_taxa.bmtaggerDbs": [ - "dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK" + "deplete_only.deplete_taxa.bwaDbs": [ + "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P" ], "deplete_only.deplete_taxa.blastDbs": [ "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9", From 24c84115258076901127fe7c20ed30e495d4fb3c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 1 Mar 2018 14:39:37 -0500 Subject: [PATCH 05/35] shape up ncbi.wdl tasks a bit --- pipes/WDL/workflows/tasks/ncbi.wdl | 58 ++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index bc9bc4400..73408a283 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -24,6 +24,9 @@ task download_reference_genome { } runtime { docker: "quay.io/broadinstitute/viral-ngs" + memory: "3 GB" + cpu: 2 + dx_instance_type: "mem1_ssd1_x2" } } @@ -49,23 +52,61 @@ task download_lastal_sources { } runtime { docker: "quay.io/broadinstitute/viral-ngs" + memory: "3 GB" + cpu: 2 + dx_instance_type: "mem1_ssd1_x2" } } task build_lastal_db { - File sequences # fasta file + File sequences_fasta + + String db_name = basename(sequences_fasta, ".fasta") command { - taxon_filter.py lastal_build_db \ - "${sequences}" \ - "./" + set -ex -o pipefail + taxon_filter.py lastal_build_db ${sequences_fasta} ./ --loglevel=DEBUG + tar -c ${db_name}* | lz4 -9 > ${db_name}.tar.lz4 } output { - Array[File] lastalDbFiles = glob("lastal.*") + File lastal_db = "${db_name}.tar.lz4" + } + + runtime { + docker: "quay.io/broadinstitute/viral-ngs" + memory: "7 GB" + cpu: 2 + dx_instance_type: "mem1_ssd1_x4" } +} + +task download_annotation { + String referenceName + Array[String] accessions + String emailAddress + + command { + set -ex -o pipefail + ncbi.py fetch_feature_tables \ + ${emailAddress} \ + ./ \ + ${sep=' ' accessions} \ + --combinedFilePrefix ${referenceName} \ + --loglevel DEBUG + } + + output { + File referenceFasta = "${referenceName}.fasta" + File featureTable = "${referenceName}.tbl" + Array[File] featureTables = glob("*.tbl") + } + runtime { docker: "quay.io/broadinstitute/viral-ngs" + memory: "3 GB" + cpu: 2 + dx_instance_type: "mem1_ssd1_x2" } } @@ -89,8 +130,10 @@ task annot_transfer { Array[File] featureTables = glob(".tbl") } runtime { - memory: "4GB" docker: "quay.io/broadinstitute/viral-ngs" + memory: "3 GB" + cpu: 2 + dx_instance_type: "mem1_ssd1_x2" } } @@ -122,5 +165,8 @@ task prepare_genbank { } runtime { docker: "quay.io/broadinstitute/viral-ngs" + memory: "3 GB" + cpu: 2 + dx_instance_type: "mem1_ssd1_x2" } } \ No newline at end of file From f4908f22b96f69899309a832ee8c66a8aff30d8f Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Mar 2018 14:19:02 -0500 Subject: [PATCH 06/35] wdl updates --- pipes/WDL/workflows/tasks/interhost.wdl | 15 +++++++-------- pipes/WDL/workflows/tasks/ncbi.wdl | 1 - 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl index 5324385ad..57fa55cee 100644 --- a/pipes/WDL/workflows/tasks/interhost.wdl +++ b/pipes/WDL/workflows/tasks/interhost.wdl @@ -79,23 +79,22 @@ task multi_align_mafft { Array[File] inputAssemblies # fasta files, one per sample File referenceGenome # fasta - Int? threads Int? maxIters Int? ep command { interhost.py multichr_mafft \ - "${referenceGenome}" \ - "${sep=' ' inputAssemblies+}" \ - "./" \ - "${'--ep' + ep}" \ - "${'--maxiters' + maxIters}" \ + ${referenceGenome} \ + ${sep=' ' inputAssemblies+} \ + ./ \ + ${'--ep' + ep} \ + ${'--maxiters' + maxIters} \ --preservecase \ --localpair \ --outFilePrefix aligned \ - --sampleNameListFile "sampleNameList.txt" \ - "${'--threads' + threads}" + --sampleNameListFile sampleNameList.txt \ + --loglevel DEBUG } output { diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index 73408a283..1d5b02f25 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -97,7 +97,6 @@ task download_annotation { } output { - File referenceFasta = "${referenceName}.fasta" File featureTable = "${referenceName}.tbl" Array[File] featureTables = glob("*.tbl") } From ccdb5e321a1564ac0b9ccb224ecaead30c4b8dcc Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Mar 2018 14:21:26 -0500 Subject: [PATCH 07/35] add spikein-only workflow --- pipes/WDL/dx-defaults-spikein.json | 4 ++++ pipes/WDL/workflows/spikein.wdl | 7 +++++++ 2 files changed, 11 insertions(+) create mode 100644 pipes/WDL/dx-defaults-spikein.json create mode 100644 pipes/WDL/workflows/spikein.wdl diff --git a/pipes/WDL/dx-defaults-spikein.json b/pipes/WDL/dx-defaults-spikein.json new file mode 100644 index 000000000..ce3f58951 --- /dev/null +++ b/pipes/WDL/dx-defaults-spikein.json @@ -0,0 +1,4 @@ +{ + "demux_plus.spikein.spikein_db": + "dx://file-F6PXkF00Yqp3zVXq14fF98Kz" +} diff --git a/pipes/WDL/workflows/spikein.wdl b/pipes/WDL/workflows/spikein.wdl new file mode 100644 index 000000000..583d364c7 --- /dev/null +++ b/pipes/WDL/workflows/spikein.wdl @@ -0,0 +1,7 @@ +import "tasks/reports.wdl" as reports + +workflow spikein { + + call reports.spikein_report as spikein + +} From 53432ab749000e685f88c9052659c7bd55bc044a Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 2 Mar 2018 14:31:34 -0500 Subject: [PATCH 08/35] oops rename default param --- pipes/WDL/dx-defaults-spikein.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/dx-defaults-spikein.json b/pipes/WDL/dx-defaults-spikein.json index ce3f58951..6c1c6b406 100644 --- a/pipes/WDL/dx-defaults-spikein.json +++ b/pipes/WDL/dx-defaults-spikein.json @@ -1,4 +1,4 @@ { - "demux_plus.spikein.spikein_db": + "spikein.spikein.spikein_db": "dx://file-F6PXkF00Yqp3zVXq14fF98Kz" } From c75db830c376a0ddc9e6656ebd1558ca4e38d5c3 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 09:01:13 -0500 Subject: [PATCH 09/35] change filename of scaffold.scaffold_fasta from samplename.scaffold.fasta to samplename.scaffolded_imputed.fasta in order to make clear which fasta has imputed bases. add a String output for scaffolding_chosen_ref_name on top of existing fasta output. change mean_coverage from Int to Float. --- pipes/WDL/workflows/tasks/assembly.wdl | 26 +++++++++++++++----------- pipes/WDL/workflows/tasks/reports.wdl | 5 +++-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/pipes/WDL/workflows/tasks/assembly.wdl b/pipes/WDL/workflows/tasks/assembly.wdl index 8389b6347..f073fc111 100644 --- a/pipes/WDL/workflows/tasks/assembly.wdl +++ b/pipes/WDL/workflows/tasks/assembly.wdl @@ -118,6 +118,8 @@ task scaffold { --outAlternateContigs ${sample_name}.scaffolding_alt_contigs.fasta \ --loglevel=DEBUG + grep '^>' ${sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ${sample_name}.scaffolding_chosen_ref.txt + assembly.py gapfill_gap2seq \ ${sample_name}.intermediate_scaffold.fasta \ ${reads_bam} \ @@ -132,7 +134,7 @@ task scaffold { assembly.py impute_from_reference \ ${sample_name}.intermediate_gapfill.fasta \ ${sample_name}.scaffolding_chosen_ref.fasta \ - ${sample_name}.scaffold.fasta \ + ${sample_name}.scaffolded_imputed.fasta \ --newName ${sample_name} \ ${'--replaceLength=' + replace_length} \ ${'--minLengthFraction=' + min_length_fraction} \ @@ -142,14 +144,15 @@ task scaffold { } output { - File scaffold_fasta = "${sample_name}.scaffold.fasta" - File intermediate_scaffold_fasta = "${sample_name}.intermediate_scaffold.fasta" - File intermediate_gapfill_fasta = "${sample_name}.intermediate_gapfill.fasta" - Int assembly_preimpute_length = read_int("assembly_preimpute_length") - Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous") - File scaffolding_chosen_ref = "${sample_name}.scaffolding_chosen_ref.fasta" - File scaffolding_stats = "${sample_name}.scaffolding_stats.txt" - File scaffolding_alt_contigs = "${sample_name}.scaffolding_alt_contigs.fasta" + File scaffold_fasta = "${sample_name}.scaffolded_imputed.fasta" + File intermediate_scaffold_fasta = "${sample_name}.intermediate_scaffold.fasta" + File intermediate_gapfill_fasta = "${sample_name}.intermediate_gapfill.fasta" + Int assembly_preimpute_length = read_int("assembly_preimpute_length") + Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous") + String scaffolding_chosen_ref_name = read_string("${sample_name}.scaffolding_chosen_ref.txt") + File scaffolding_chosen_ref = "${sample_name}.scaffolding_chosen_ref.fasta" + File scaffolding_stats = "${sample_name}.scaffolding_stats.txt" + File scaffolding_alt_contigs = "${sample_name}.scaffolding_alt_contigs.fasta" } runtime { @@ -307,7 +310,8 @@ task refine_2x_and_plot { samtools flagstat ${sample_name}.all.bam | tee ${sample_name}.all.bam.flagstat.txt grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned - echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage + #echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage + python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage # fastqc mapped bam reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html @@ -342,7 +346,7 @@ task refine_2x_and_plot { Int reads_aligned = read_int("reads_aligned") Int read_pairs_aligned = read_int("read_pairs_aligned") Int bases_aligned = read_int("bases_aligned") - Int mean_coverage = read_int("mean_coverage") + Float mean_coverage = read_float("mean_coverage") } runtime { diff --git a/pipes/WDL/workflows/tasks/reports.wdl b/pipes/WDL/workflows/tasks/reports.wdl index 8a9121fbb..22b57cfbc 100644 --- a/pipes/WDL/workflows/tasks/reports.wdl +++ b/pipes/WDL/workflows/tasks/reports.wdl @@ -50,7 +50,8 @@ task plot_coverage { samtools flagstat ${sample_name}.bam | tee ${sample_name}.bam.flagstat.txt grep properly ${sample_name}.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned - echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage + #echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage + python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage # fastqc mapped bam reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html @@ -81,7 +82,7 @@ task plot_coverage { Int reads_aligned = read_int("reads_aligned") Int read_pairs_aligned = read_int("read_pairs_aligned") Int bases_aligned = read_int("bases_aligned") - Int mean_coverage = read_int("mean_coverage") + Float mean_coverage = read_float("mean_coverage") } runtime { From 38a2bd57ea882e3521878f17e9f5b641b586a345 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 16:43:52 -0500 Subject: [PATCH 10/35] tune up ncbi steps --- ncbi.py | 2 +- pipes/WDL/workflows/align_and_annot.wdl | 29 ++++++++++++ pipes/WDL/workflows/tasks/interhost.wdl | 63 +++++++++++++++++++------ pipes/WDL/workflows/tasks/ncbi.wdl | 62 ++++++++++++------------ util/genbank.py | 7 +-- 5 files changed, 113 insertions(+), 50 deletions(-) create mode 100644 pipes/WDL/workflows/align_and_annot.wdl diff --git a/ncbi.py b/ncbi.py index af96da9f3..08e8db299 100755 --- a/ncbi.py +++ b/ncbi.py @@ -176,7 +176,7 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o # identify the correct feature table as the one that has an ID that is # part of the ref seq ID fileAccession = util.genbank.get_feature_table_id(tblFilename) - if fileAccession in matchingRefSeq.id: + if fileAccession == matchingRefSeq.id.split('|')[0]: ref_tbl = tblFilename break if ref_tbl == "": diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl new file mode 100644 index 000000000..aa8e618e4 --- /dev/null +++ b/pipes/WDL/workflows/align_and_annot.wdl @@ -0,0 +1,29 @@ +import "tasks/interhost.wdl" as interhost +import "tasks/ncbi.wdl" as ncbi +import "tasks/reports.wdl" as reports + + +workflow align_and_annot { + + File reference_fasta + Array[File] assemblies_fasta + + call interhost.multi_align_mafft_ref as mafft { + input: + reference_fasta = reference_fasta, + assemblies_fasta = assemblies_fasta + } + + scatter(mutli_aln_fasta in mafft.alignments_by_chr) { + call ncbi.annot_transfer as annot { + input: + chr_mutli_aln_fasta = mutli_aln_fasta, + reference_fasta = reference_fasta + } + call ncbi.prepare_genbank as genbank { + input: + assemblies_fasta = assemblies_fasta, + annotations_tbl = annot.featureTables # I'm worried that the order got messed up + } + } +} diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl index 57fa55cee..636ff55b9 100644 --- a/pipes/WDL/workflows/tasks/interhost.wdl +++ b/pipes/WDL/workflows/tasks/interhost.wdl @@ -75,35 +75,68 @@ task ref_guided_consensus_aligned_with_dups { # } #} -task multi_align_mafft { - Array[File] inputAssemblies # fasta files, one per sample - File referenceGenome # fasta +task multi_align_mafft_ref { + File reference_fasta + Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay + String? out_prefix = basename(reference_fasta, '.fasta') + Int? mafft_maxIters + Int? mafft_ep + + command { + interhost.py multichr_mafft \ + ${reference_fasta} ${sep=' ' assemblies_fasta} \ + . \ + ${'--ep' + mafft_ep} \ + ${'--maxiters' + mafft_maxIters} \ + --outFilePrefix ${out_prefix} \ + --preservecase \ + --localpair \ + --sampleNameListFile ${out_prefix}-sample_names.txt \ + --loglevel DEBUG + } + + output { + File sampleNamesFile = "${out_prefix}-sample_names.txt" + Array[File] alignments_by_chr = glob("${out_prefix}*.fasta") + } - Int? maxIters - Int? ep + runtime { + docker: "quay.io/broadinstitute/viral-ngs" + memory: "7 GB" + cpu: 4 + dx_instance_type: "mem1_ssd1_x4" + } +} +task multi_align_mafft { + Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay + String? out_prefix = basename(select_first(assemblies_fasta), '.fasta') + Int? mafft_maxIters + Int? mafft_ep command { interhost.py multichr_mafft \ - ${referenceGenome} \ - ${sep=' ' inputAssemblies+} \ - ./ \ - ${'--ep' + ep} \ - ${'--maxiters' + maxIters} \ + ${sep=' ' assemblies_fasta} \ + . \ + ${'--ep' + mafft_ep} \ + ${'--maxiters' + mafft_maxIters} \ + --outFilePrefix ${out_prefix} \ --preservecase \ --localpair \ - --outFilePrefix aligned \ - --sampleNameListFile sampleNameList.txt \ + --sampleNameListFile ${out_prefix}-sample_names.txt \ --loglevel DEBUG } output { - File sampleNamesFile = "sampleNamesList.txt" - Array[File] chrAlignedFiles = glob("aligned_*.fasta") + File sampleNamesFile = "${out_prefix}-sample_names.txt" + Array[File] alignments_by_chr = glob("${out_prefix}*.fasta") } + runtime { - memory: "8 GB" docker: "quay.io/broadinstitute/viral-ngs" + memory: "7 GB" + cpu: 4 + dx_instance_type: "mem1_ssd1_x4" } } diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index 1d5b02f25..35008ff6c 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -110,23 +110,22 @@ task download_annotation { } task annot_transfer { - # TODO: Iterate over chr-specifc MSAs in workflow rather than in task - - File chrMultipleAlignment # fasta; multiple alignments of sample sequences - File referenceFeatureTable # feature table corresponding to the chr in the alignment - File referenceGenome # fasta + File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences + File reference_fasta # fasta + File reference_feature_table # feature table corresponding to the chr in the alignment command { ncbi.py tbl_transfer_prealigned \ - "${chrMultipleAlignment}" \ - "${referenceGenome}" \ - "${referenceFeatureTable}" \ - "./" \ - --oob_clip + ${chr_mutli_aln_fasta} \ + ${reference_fasta} \ + ${reference_feature_table} \ + . \ + --oob_clip \ + --loglevel DEBUG } output { - Array[File] featureTables = glob(".tbl") + Array[File] featureTables = glob("*.tbl") } runtime { docker: "quay.io/broadinstitute/viral-ngs" @@ -137,31 +136,36 @@ task annot_transfer { } task prepare_genbank { - Array[File] fastaFiles - File assemblySummary # summary.assembly.txt - File featureTableDir - - String genbankTemplate - String genbankSourceTable - String biosampleMap - String sequencingTech - String comment + Array[File]+ assemblies_fasta + Array[File]+ annotations_tbl + File authors_sbt + File assemblySummary # summary.assembly.txt + File genbankSourceTable + File biosampleMap + String sequencingTech + String comment command { + cp ${sep=' ' annotations_tbl} . ncbi.py prep_genbank_files \ - "${genbankTemplate}" \ - "${sep=' ' fastaFiles}" \ - "${featureTableDir}" \ - --master_source_table "${genbankSourceTable}" \ - --sequencing_tech "${sequencingTech}" \ - --biosample_map "${biosampleMap}" \ - --coverage_table "${assemblySummary}" \ - --comment "${comment}" + ${authors_sbt} \ + ${sep=' ' assemblies_fasta} \ + . \ + --master_source_table ${genbankSourceTable} \ + --sequencing_tech ${sequencingTech} \ + --biosample_map ${biosampleMap} \ + --coverage_table ${assemblySummary} \ + --comment ${comment} \ + --loglevel DEBUG + tar -czpvf ncbi_package.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl } output { - File errorSummary = "${featureTableDir}/errorsummary.val" + Array[File] sequin_files = glob("*.sqn") + File ncbi_package = "ncbi_package.tar.gz" + File errorSummary = "errorsummary.val" } + runtime { docker: "quay.io/broadinstitute/viral-ngs" memory: "3 GB" diff --git a/util/genbank.py b/util/genbank.py index 453a780e3..6ea381c4d 100644 --- a/util/genbank.py +++ b/util/genbank.py @@ -23,7 +23,6 @@ def parse_accession_str(chr_ref): return chr_ref def get_feature_table_id(featureTableFile): - seqid = "" with open(featureTableFile, 'rt') as inf: for line in inf: line = line.rstrip('\r\n') @@ -37,10 +36,8 @@ def get_feature_table_id(featureTableFile): if not ( (seqid.startswith('gb|') or seqid.startswith('ref|')) and seqid.endswith('|') and len(seqid) > 4): raise Exception("reference annotation does not refer to a GenBank or RefSeq accession") - seqid = seqid[seqid.find("|") + 1:-1] - if len(seqid) > 0: - return seqid - + seqid = '|'.join(seqid.split('|')[1:-1]) + return seqid def _seq_chunks(seq, n): # http://stackoverflow.com/a/312464/190597 (Ned Batchelder) From 735330454c31c53474a92c1c5c7e0374ae0d6598 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 21:04:54 -0500 Subject: [PATCH 11/35] fix some linking of input files --- pipes/WDL/workflows/align_and_annot.wdl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl index aa8e618e4..666cf2632 100644 --- a/pipes/WDL/workflows/align_and_annot.wdl +++ b/pipes/WDL/workflows/align_and_annot.wdl @@ -1,12 +1,11 @@ import "tasks/interhost.wdl" as interhost import "tasks/ncbi.wdl" as ncbi -import "tasks/reports.wdl" as reports - workflow align_and_annot { - File reference_fasta - Array[File] assemblies_fasta + File reference_fasta + Array[File]+ assemblies_fasta + Array[File]+ annotations_tbl call interhost.multi_align_mafft_ref as mafft { input: @@ -14,16 +13,17 @@ workflow align_and_annot { assemblies_fasta = assemblies_fasta } - scatter(mutli_aln_fasta in mafft.alignments_by_chr) { + scatter(chr_num in range(len(mafft.alignments_by_chr))) { call ncbi.annot_transfer as annot { input: - chr_mutli_aln_fasta = mutli_aln_fasta, - reference_fasta = reference_fasta + chr_mutli_aln_fasta = mafft.alignments_by_chr[chr_num], + reference_fasta = reference_fasta, + reference_feature_table = annotations_tbl[chr_num] } call ncbi.prepare_genbank as genbank { input: assemblies_fasta = assemblies_fasta, - annotations_tbl = annot.featureTables # I'm worried that the order got messed up + annotations_tbl = annot.featureTables # I'm worried that the order got messed up and we'll have to remap? } } } From 7a9141d84c6a9219bd3a720bb2ce76d60549e7ca Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 21:40:35 -0500 Subject: [PATCH 12/35] add bam index files to output of refine_2x_and_plot and plot_coverage. Make plot_coverage more consistent with refine_2x_and_plot. --- pipes/WDL/workflows/tasks/assembly.wdl | 2 ++ pipes/WDL/workflows/tasks/ncbi.wdl | 2 +- pipes/WDL/workflows/tasks/reports.wdl | 28 ++++++++++++++------------ 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/pipes/WDL/workflows/tasks/assembly.wdl b/pipes/WDL/workflows/tasks/assembly.wdl index 8389b6347..8dfa4e73f 100644 --- a/pipes/WDL/workflows/tasks/assembly.wdl +++ b/pipes/WDL/workflows/tasks/assembly.wdl @@ -333,8 +333,10 @@ task refine_2x_and_plot { File refine2_sites_vcf_gz = "${sample_name}.refine2.pre_fasta.vcf.gz" File final_assembly_fasta = "${sample_name}.fasta" File aligned_bam = "${sample_name}.all.bam" + File aligned_bam_idx = "${sample_name}.all.bai" File aligned_bam_flagstat = "${sample_name}.all.bam.flagstat.txt" File aligned_only_reads_bam = "${sample_name}.mapped.bam" + File aligned_only_reads_bam_idx = "${sample_name}.mapped.bai" File aligned_only_reads_fastqc = "${sample_name}.mapped_fastqc.html" File coverage_plot = "${sample_name}.coverage_plot.pdf" Int assembly_length = read_int("assembly_length") diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index 35008ff6c..d58f10586 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -87,7 +87,6 @@ task download_annotation { String emailAddress command { - set -ex -o pipefail ncbi.py fetch_feature_tables \ ${emailAddress} \ ./ \ @@ -146,6 +145,7 @@ task prepare_genbank { String comment command { + set -ex -o pipefail cp ${sep=' ' annotations_tbl} . ncbi.py prep_genbank_files \ ${authors_sbt} \ diff --git a/pipes/WDL/workflows/tasks/reports.wdl b/pipes/WDL/workflows/tasks/reports.wdl index 8a9121fbb..e811fcf34 100644 --- a/pipes/WDL/workflows/tasks/reports.wdl +++ b/pipes/WDL/workflows/tasks/reports.wdl @@ -33,7 +33,7 @@ task plot_coverage { read_utils.py align_and_fix \ ${reads_unmapped_bam} \ assembly.fasta \ - --outBamAll ${sample_name}.bam \ + --outBamAll ${sample_name}.all.bam \ --outBamFiltered ${sample_name}.mapped.bam \ --GATK_PATH gatk/ \ --aligner ${aligner} \ @@ -66,22 +66,24 @@ task plot_coverage { --plotDPI 100 \ --loglevel=DEBUG else - touch ${sample_name}.coverage_plot.pdf ${sample_name}.mapped_fastqc.html + touch ${sample_name}.coverage_plot.pdf fi } output { - File reads_bam = "${sample_name}.bam" - File reads_bam_flagstat = "${sample_name}.bam.flagstat.txt" - File mapped_reads_bam = "${sample_name}.mapped.bam" - File mapped_reads_fastqc = "${sample_name}.mapped_fastqc.html" - File coverage_plot = "${sample_name}.coverage_plot.pdf" - Int assembly_length = read_int("assembly_length") - Int assembly_length_unambiguous = read_int("assembly_length_unambiguous") - Int reads_aligned = read_int("reads_aligned") - Int read_pairs_aligned = read_int("read_pairs_aligned") - Int bases_aligned = read_int("bases_aligned") - Int mean_coverage = read_int("mean_coverage") + File aligned_bam = "${sample_name}.all.bam" + File aligned_bam_idx = "${sample_name}.all.bai" + File aligned_bam_flagstat = "${sample_name}.all.bam.flagstat.txt" + File aligned_only_reads_bam = "${sample_name}.mapped.bam" + File aligned_only_reads_bam_idx = "${sample_name}.mapped.bai" + File aligned_only_reads_fastqc = "${sample_name}.mapped_fastqc.html" + File coverage_plot = "${sample_name}.coverage_plot.pdf" + Int assembly_length = read_int("assembly_length") + Int assembly_length_unambiguous = read_int("assembly_length_unambiguous") + Int reads_aligned = read_int("reads_aligned") + Int read_pairs_aligned = read_int("read_pairs_aligned") + Int bases_aligned = read_int("bases_aligned") + Int mean_coverage = read_int("mean_coverage") } runtime { From 8053789cf986374ac48e19949b6dd2cab436e5b5 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 22:24:58 -0500 Subject: [PATCH 13/35] rework the scatter in align_and_annot, a few other fixes --- ncbi.py | 2 +- pipes/WDL/workflows/align_and_annot.wdl | 9 +- pipes/WDL/workflows/download_annotations.wdl | 7 ++ pipes/WDL/workflows/tasks/ncbi.wdl | 106 +++++-------------- pipes/WDL/workflows/tasks/taxon_filter.wdl | 21 ++++ 5 files changed, 62 insertions(+), 83 deletions(-) create mode 100644 pipes/WDL/workflows/download_annotations.wdl diff --git a/ncbi.py b/ncbi.py index 08e8db299..6f40589fd 100755 --- a/ncbi.py +++ b/ncbi.py @@ -51,7 +51,7 @@ def tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip=False): if not ((refID.startswith('gb|') or refID.startswith('ref|')) and refID.endswith('|') and len(refID) > 4): raise Exception("reference annotation does not refer to a GenBank or RefSeq accession") - refID = refID[refID.find("|") + 1:-1] + refID = '|'.join(refID.split('|')[1:-1]) refSeqID = [x for x in cmap.keys() if refID in x][0] #altid = cmap.mapChr(refSeqID, altid) altid = list(set(cmap.keys()) - set([refSeqID]))[0] # cmap.mapChr(refSeqID, altid) diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl index 666cf2632..5fbd2f57c 100644 --- a/pipes/WDL/workflows/align_and_annot.wdl +++ b/pipes/WDL/workflows/align_and_annot.wdl @@ -13,17 +13,18 @@ workflow align_and_annot { assemblies_fasta = assemblies_fasta } - scatter(chr_num in range(len(mafft.alignments_by_chr))) { + scatter(aln_by_chr, ref_annot_by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) { call ncbi.annot_transfer as annot { input: - chr_mutli_aln_fasta = mafft.alignments_by_chr[chr_num], + chr_mutli_aln_fasta = aln_by_chr, reference_fasta = reference_fasta, - reference_feature_table = annotations_tbl[chr_num] + reference_feature_table = ref_annot_by_chr } call ncbi.prepare_genbank as genbank { input: assemblies_fasta = assemblies_fasta, - annotations_tbl = annot.featureTables # I'm worried that the order got messed up and we'll have to remap? + annotations_tbl = annot.transferred_feature_tables, + out_prefix = basename(annotations_tbl, '.tbl') } } } diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/download_annotations.wdl new file mode 100644 index 000000000..a3ef7c6f2 --- /dev/null +++ b/pipes/WDL/workflows/download_annotations.wdl @@ -0,0 +1,7 @@ +import "tasks/ncbi.wdl" as ncbi + +workflow download_annotations { + + call ncbi.download_annotations + +} diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index d58f10586..3a2668e21 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -1,54 +1,19 @@ -task download_reference_genome { - String referenceName - Array[String] accessions # NCBI accessions to include in the reference - String emailAddress - command { - ncbi.py fetch_fastas \ - "${emailAddress}" \ - "./" \ - "${sep=' ' accessions}" \ - --combinedFilePrefix "${referenceName}" \ - --removeSeparateFiles \ - --forceOverwrite - ncbi.py fetch_feature_tables \ - "${emailAddress}" \ - "./" \ - "${sep=' ' accessions}" \ - --forceOverwrite - } - - output { - File referenceFasta = "${referenceName}.fasta" - Array[File] featureTables = glob("*.tbl") - } - runtime { - docker: "quay.io/broadinstitute/viral-ngs" - memory: "3 GB" - cpu: 2 - dx_instance_type: "mem1_ssd1_x2" - } -} - -task download_lastal_sources { - String referenceName - Array[String] accessions # NCBI accessions to include in the lastal db - String emailAddress +task download_fasta { + String out_prefix + Array[String]+ accessions + String emailAddress command { ncbi.py fetch_fastas \ - "${emailAddress}" \ - "./" \ - "${sep=' ' accessions}" \ - --combinedFilePrefix lastal \ - --removeSeparateFiles \ - --forceOverwrite \ - --chunkSize 300 + ${emailAddress} \ + . \ + ${sep=' ' accessions} \ + --combinedFilePrefix ${out_prefix} \ } output { - File referenceFasta = "lastal.fasta" - Array[File] featureTables = glob("*.tbl") + File sequences_fasta = "${out_prefix}.fasta" } runtime { docker: "quay.io/broadinstitute/viral-ngs" @@ -58,46 +23,30 @@ task download_lastal_sources { } } -task build_lastal_db { - File sequences_fasta - - String db_name = basename(sequences_fasta, ".fasta") +task download_annotations { + Array[String]+ accessions + String emailAddress + String combined_fasta command { set -ex -o pipefail - taxon_filter.py lastal_build_db ${sequences_fasta} ./ --loglevel=DEBUG - tar -c ${db_name}* | lz4 -9 > ${db_name}.tar.lz4 - } - - output { - File lastal_db = "${db_name}.tar.lz4" - } - - runtime { - docker: "quay.io/broadinstitute/viral-ngs" - memory: "7 GB" - cpu: 2 - dx_instance_type: "mem1_ssd1_x4" - } -} - -task download_annotation { - String referenceName - Array[String] accessions - String emailAddress - - command { ncbi.py fetch_feature_tables \ ${emailAddress} \ ./ \ ${sep=' ' accessions} \ - --combinedFilePrefix ${referenceName} \ + --loglevel DEBUG + ncbi.py fetch_fastas \ + ${emailAddress} \ + ./ \ + ${sep=' ' accessions} \ + --combinedFilePrefix "${combined_fasta}" \ --loglevel DEBUG } output { - File featureTable = "${referenceName}.tbl" - Array[File] featureTables = glob("*.tbl") + File combined_fasta = "${combined_fasta}.fasta" + Array[File] genomes_fasta = glob("*.fasta") + Array[File] features_tbl = glob("*.tbl") } runtime { @@ -109,8 +58,8 @@ task download_annotation { } task annot_transfer { - File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences - File reference_fasta # fasta + File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences for a single chr + File reference_fasta # fasta (may contain multiple chrs, only one with the same name as reference_feature_table will be used) File reference_feature_table # feature table corresponding to the chr in the alignment command { @@ -124,7 +73,7 @@ task annot_transfer { } output { - Array[File] featureTables = glob("*.tbl") + Array[File] transferred_feature_tables = glob("*.tbl") } runtime { docker: "quay.io/broadinstitute/viral-ngs" @@ -143,6 +92,7 @@ task prepare_genbank { File biosampleMap String sequencingTech String comment + String out_prefix = "ncbi_package" command { set -ex -o pipefail @@ -157,12 +107,12 @@ task prepare_genbank { --coverage_table ${assemblySummary} \ --comment ${comment} \ --loglevel DEBUG - tar -czpvf ncbi_package.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl + tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl } output { Array[File] sequin_files = glob("*.sqn") - File ncbi_package = "ncbi_package.tar.gz" + File ncbi_package = "${out_prefix}.tar.gz" File errorSummary = "errorsummary.val" } diff --git a/pipes/WDL/workflows/tasks/taxon_filter.wdl b/pipes/WDL/workflows/tasks/taxon_filter.wdl index 97b5ffe58..7028784c4 100644 --- a/pipes/WDL/workflows/tasks/taxon_filter.wdl +++ b/pipes/WDL/workflows/tasks/taxon_filter.wdl @@ -117,6 +117,27 @@ task filter_to_taxon { } } +task build_lastal_db { + File sequences_fasta + String db_name = basename(sequences_fasta, ".fasta") + + command { + set -ex -o pipefail + taxon_filter.py lastal_build_db ${sequences_fasta} ./ --loglevel=DEBUG + tar -c ${db_name}* | lz4 -9 > ${db_name}.tar.lz4 + } + + output { + File lastal_db = "${db_name}.tar.lz4" + } + + runtime { + docker: "quay.io/broadinstitute/viral-ngs" + memory: "7 GB" + cpu: 2 + dx_instance_type: "mem1_ssd1_x4" + } +} task merge_one_per_sample { String out_bam_basename From a42ed0de500d82cf822db670089dcf8b02e1627c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 22:37:59 -0500 Subject: [PATCH 14/35] proper indexing of WDL Pair --- pipes/WDL/workflows/align_and_annot.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl index 5fbd2f57c..f4725bfc7 100644 --- a/pipes/WDL/workflows/align_and_annot.wdl +++ b/pipes/WDL/workflows/align_and_annot.wdl @@ -13,12 +13,12 @@ workflow align_and_annot { assemblies_fasta = assemblies_fasta } - scatter(aln_by_chr, ref_annot_by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) { + scatter(by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) { call ncbi.annot_transfer as annot { input: - chr_mutli_aln_fasta = aln_by_chr, + chr_mutli_aln_fasta = by_chr.left, reference_fasta = reference_fasta, - reference_feature_table = ref_annot_by_chr + reference_feature_table = by_chr.right } call ncbi.prepare_genbank as genbank { input: From c5aa557de231f1edcd3966d9704cc8616beb4baa Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 22:47:14 -0500 Subject: [PATCH 15/35] fix WDL bug --- pipes/WDL/workflows/tasks/ncbi.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index 3a2668e21..63381b69b 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -26,7 +26,7 @@ task download_fasta { task download_annotations { Array[String]+ accessions String emailAddress - String combined_fasta + String combined_out_prefix command { set -ex -o pipefail @@ -39,12 +39,12 @@ task download_annotations { ${emailAddress} \ ./ \ ${sep=' ' accessions} \ - --combinedFilePrefix "${combined_fasta}" \ + --combinedFilePrefix "${combined_out_prefix}" \ --loglevel DEBUG } output { - File combined_fasta = "${combined_fasta}.fasta" + File combined_fasta = "${combined_out_prefix}.fasta" Array[File] genomes_fasta = glob("*.fasta") Array[File] features_tbl = glob("*.tbl") } From 07d235a58a0a3a29b0243a69a90d0650d7314e9c Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 22:59:43 -0500 Subject: [PATCH 16/35] fix input bindings in align_and_annot workflow, make optional params optional in prep_genbank_files task --- pipes/WDL/workflows/align_and_annot.wdl | 2 +- pipes/WDL/workflows/tasks/ncbi.wdl | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl index f4725bfc7..d26b9741f 100644 --- a/pipes/WDL/workflows/align_and_annot.wdl +++ b/pipes/WDL/workflows/align_and_annot.wdl @@ -24,7 +24,7 @@ workflow align_and_annot { input: assemblies_fasta = assemblies_fasta, annotations_tbl = annot.transferred_feature_tables, - out_prefix = basename(annotations_tbl, '.tbl') + out_prefix = basename(by_chr.right, '.tbl') } } } diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index 63381b69b..a83f1cc06 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -87,11 +87,11 @@ task prepare_genbank { Array[File]+ assemblies_fasta Array[File]+ annotations_tbl File authors_sbt - File assemblySummary # summary.assembly.txt - File genbankSourceTable - File biosampleMap - String sequencingTech - String comment + File? coverage_table # summary.assembly.txt + File? genbankSourceTable + File? biosampleMap + String? sequencingTech + String? comment String out_prefix = "ncbi_package" command { @@ -101,11 +101,11 @@ task prepare_genbank { ${authors_sbt} \ ${sep=' ' assemblies_fasta} \ . \ - --master_source_table ${genbankSourceTable} \ - --sequencing_tech ${sequencingTech} \ - --biosample_map ${biosampleMap} \ - --coverage_table ${assemblySummary} \ - --comment ${comment} \ + ${'--master_source_table=' + genbankSourceTable} \ + ${'--sequencing_tech=' + sequencingTech} \ + ${'--biosample_map=' + biosampleMap} \ + ${'--coverage_table=' + coverage_table} \ + ${'--comment=' + comment} \ --loglevel DEBUG tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl } From 4a3053bf8f01b11c626fa4a9b7ae5f93a2e88c09 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 23:17:35 -0500 Subject: [PATCH 17/35] dummy commit for travis --- pipes/WDL/workflows/tasks/ncbi.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl index a83f1cc06..2005e05e2 100644 --- a/pipes/WDL/workflows/tasks/ncbi.wdl +++ b/pipes/WDL/workflows/tasks/ncbi.wdl @@ -87,7 +87,7 @@ task prepare_genbank { Array[File]+ assemblies_fasta Array[File]+ annotations_tbl File authors_sbt - File? coverage_table # summary.assembly.txt + File? coverage_table # summary.assembly.txt (from Snakemake) File? genbankSourceTable File? biosampleMap String? sequencingTech From c39120b8925f908c3a4177f69d51c5c880945f0a Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 8 Mar 2018 23:44:54 -0500 Subject: [PATCH 18/35] bump dxWDL version fro 0.59 to 0.60.2 --- travis/install-wdl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/install-wdl.sh b/travis/install-wdl.sh index 9fda7f316..40d9617ce 100755 --- a/travis/install-wdl.sh +++ b/travis/install-wdl.sh @@ -19,7 +19,7 @@ cached_fetch_jar_from_github () { cached_fetch_jar_from_github broadinstitute cromwell womtool 30.2 cached_fetch_jar_from_github broadinstitute cromwell cromwell 30.2 -cached_fetch_jar_from_github dnanexus dxWDL dxWDL 0.59 +cached_fetch_jar_from_github dnanexus dxWDL dxWDL 0.60.2 TGZ=dx-toolkit-v0.240.1-ubuntu-14.04-amd64.tar.gz if [ ! -f $CACHE_DIR/$TGZ ]; then From 9d6a5afbf94e7f59c5f52b7b3e01c8222dd09574 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 00:07:01 -0500 Subject: [PATCH 19/35] add new required -imports param for dxWDL --- travis/build-dx.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/travis/build-dx.sh b/travis/build-dx.sh index 3426dd74c..88ba4a67e 100755 --- a/travis/build-dx.sh +++ b/travis/build-dx.sh @@ -42,6 +42,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do dx_id=$(java -jar dxWDL.jar compile \ $workflow $CMD_INPUT $CMD_DEFAULTS -f \ + -imports pipes/WDL/workflows/ \ -destination /build/$VERSION/$workflow_name) echo "Succeeded: $workflow_name = $dx_id" echo -e "$workflow_name\t$dx_id" >> $COMPILE_SUCCESS From 5efc81f7523b4f468baa308da26b1eca3886eee3 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 00:36:31 -0500 Subject: [PATCH 20/35] use -imports and --imports params to dxWDL and cromwell invocations and remove "tasks/" from all WDL import statements. comment out align_and_annot for now to get compile working again --- pipes/WDL/workflows/align_and_annot.wdl | 6 ++++-- pipes/WDL/workflows/align_and_plot.wdl | 2 +- pipes/WDL/workflows/assemble_denovo.wdl | 4 ++-- pipes/WDL/workflows/assemble_denovo_with_deplete.wdl | 4 ++-- pipes/WDL/workflows/assemble_refbased.wdl | 4 ++-- pipes/WDL/workflows/classify_kraken.wdl | 2 +- pipes/WDL/workflows/contigs.wdl | 6 +++--- pipes/WDL/workflows/demux_metag.wdl | 10 +++++----- pipes/WDL/workflows/demux_only.wdl | 4 ++-- pipes/WDL/workflows/demux_plus.wdl | 10 +++++----- pipes/WDL/workflows/deplete_only.wdl | 4 ++-- pipes/WDL/workflows/download_annotations.wdl | 2 +- pipes/WDL/workflows/scaffold_and_refine.wdl | 4 ++-- pipes/WDL/workflows/spikein.wdl | 2 +- pipes/WDL/workflows/tasks/interhost.wdl | 2 +- travis/build-dx.sh | 2 +- travis/tests-cromwell.sh | 1 + 17 files changed, 36 insertions(+), 33 deletions(-) diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl index d26b9741f..14939bf48 100644 --- a/pipes/WDL/workflows/align_and_annot.wdl +++ b/pipes/WDL/workflows/align_and_annot.wdl @@ -1,5 +1,7 @@ -import "tasks/interhost.wdl" as interhost -import "tasks/ncbi.wdl" as ncbi +import "interhost.wdl" as interhost +import "ncbi.wdl" as ncbi + +# DX_SKIP_WORKFLOW workflow align_and_annot { diff --git a/pipes/WDL/workflows/align_and_plot.wdl b/pipes/WDL/workflows/align_and_plot.wdl index a8c8e2862..fd43049d8 100644 --- a/pipes/WDL/workflows/align_and_plot.wdl +++ b/pipes/WDL/workflows/align_and_plot.wdl @@ -1,4 +1,4 @@ -import "tasks/reports.wdl" as reports +import "reports.wdl" as reports workflow align_and_plot { call reports.plot_coverage { diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl index acd90aba7..2317cbd92 100644 --- a/pipes/WDL/workflows/assemble_denovo.wdl +++ b/pipes/WDL/workflows/assemble_denovo.wdl @@ -1,5 +1,5 @@ -import "tasks/taxon_filter.wdl" as taxon_filter -import "tasks/assembly.wdl" as assembly +import "taxon_filter.wdl" as taxon_filter +import "assembly.wdl" as assembly workflow assemble_denovo { File reads_unmapped_bam diff --git a/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl b/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl index cf7419d90..e28cbda20 100644 --- a/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl +++ b/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl @@ -1,5 +1,5 @@ -import "tasks/taxon_filter.wdl" as taxon_filter -import "tasks/assembly.wdl" as assembly +import "taxon_filter.wdl" as taxon_filter +import "assembly.wdl" as assembly workflow assemble_denovo_with_deplete { diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl index 3dc563af7..3a8c00476 100644 --- a/pipes/WDL/workflows/assemble_refbased.wdl +++ b/pipes/WDL/workflows/assemble_refbased.wdl @@ -1,5 +1,5 @@ -import "tasks/assembly.wdl" as assembly +import "assembly.wdl" as assembly workflow assemble_refbased { call assembly.refine_2x_and_plot -} \ No newline at end of file +} diff --git a/pipes/WDL/workflows/classify_kraken.wdl b/pipes/WDL/workflows/classify_kraken.wdl index a1af021e9..abbd40d4d 100644 --- a/pipes/WDL/workflows/classify_kraken.wdl +++ b/pipes/WDL/workflows/classify_kraken.wdl @@ -1,4 +1,4 @@ -import "tasks/metagenomics.wdl" as metagenomics +import "metagenomics.wdl" as metagenomics workflow classify_kraken { call metagenomics.kraken diff --git a/pipes/WDL/workflows/contigs.wdl b/pipes/WDL/workflows/contigs.wdl index baa9d8fcf..f71be3152 100644 --- a/pipes/WDL/workflows/contigs.wdl +++ b/pipes/WDL/workflows/contigs.wdl @@ -1,6 +1,6 @@ -import "tasks/metagenomics.wdl" as metagenomics -import "tasks/taxon_filter.wdl" as taxon_filter -import "tasks/assembly.wdl" as assembly +import "metagenomics.wdl" as metagenomics +import "taxon_filter.wdl" as taxon_filter +import "assembly.wdl" as assembly workflow contigs { diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl index f48b74ff6..de2e467fa 100644 --- a/pipes/WDL/workflows/demux_metag.wdl +++ b/pipes/WDL/workflows/demux_metag.wdl @@ -1,10 +1,10 @@ #DX_SKIP_WORKFLOW -import "tasks/demux.wdl" as demux -import "tasks/metagenomics.wdl" as metagenomics -import "tasks/taxon_filter.wdl" as taxon_filter -import "tasks/assembly.wdl" as assembly -import "tasks/reports.wdl" as reports +import "demux.wdl" as demux +import "metagenomics.wdl" as metagenomics +import "taxon_filter.wdl" as taxon_filter +import "assembly.wdl" as assembly +import "reports.wdl" as reports workflow demux_metag { File krona_taxonomy_db_tgz diff --git a/pipes/WDL/workflows/demux_only.wdl b/pipes/WDL/workflows/demux_only.wdl index 745eabe60..4fcf8971d 100644 --- a/pipes/WDL/workflows/demux_only.wdl +++ b/pipes/WDL/workflows/demux_only.wdl @@ -1,5 +1,5 @@ -import "tasks/demux.wdl" as tasks_demux +import "demux.wdl" as tasks_demux workflow demux_only { call tasks_demux.illumina_demux -} \ No newline at end of file +} diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl index ef37579f3..2582aef98 100644 --- a/pipes/WDL/workflows/demux_plus.wdl +++ b/pipes/WDL/workflows/demux_plus.wdl @@ -1,8 +1,8 @@ -import "tasks/demux.wdl" as demux -import "tasks/metagenomics.wdl" as metagenomics -import "tasks/taxon_filter.wdl" as taxon_filter -import "tasks/assembly.wdl" as assembly -import "tasks/reports.wdl" as reports +import "demux.wdl" as demux +import "metagenomics.wdl" as metagenomics +import "taxon_filter.wdl" as taxon_filter +import "assembly.wdl" as assembly +import "reports.wdl" as reports workflow demux_plus { diff --git a/pipes/WDL/workflows/deplete_only.wdl b/pipes/WDL/workflows/deplete_only.wdl index 67aa73bb5..36914fdfa 100644 --- a/pipes/WDL/workflows/deplete_only.wdl +++ b/pipes/WDL/workflows/deplete_only.wdl @@ -1,5 +1,5 @@ -import "tasks/taxon_filter.wdl" as taxon_filter +import "taxon_filter.wdl" as taxon_filter workflow deplete_only { call taxon_filter.deplete_taxa -} \ No newline at end of file +} diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/download_annotations.wdl index a3ef7c6f2..296673624 100644 --- a/pipes/WDL/workflows/download_annotations.wdl +++ b/pipes/WDL/workflows/download_annotations.wdl @@ -1,4 +1,4 @@ -import "tasks/ncbi.wdl" as ncbi +import "ncbi.wdl" as ncbi workflow download_annotations { diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl index b6ff3745c..44916720a 100644 --- a/pipes/WDL/workflows/scaffold_and_refine.wdl +++ b/pipes/WDL/workflows/scaffold_and_refine.wdl @@ -1,4 +1,4 @@ -import "tasks/assembly.wdl" as assembly +import "assembly.wdl" as assembly workflow scaffold_and_refine { File reads_unmapped_bam @@ -13,4 +13,4 @@ workflow scaffold_and_refine { assembly_fasta = scaffold.scaffold_fasta, reads_unmapped_bam = reads_unmapped_bam } -} \ No newline at end of file +} diff --git a/pipes/WDL/workflows/spikein.wdl b/pipes/WDL/workflows/spikein.wdl index 583d364c7..1ecedb63a 100644 --- a/pipes/WDL/workflows/spikein.wdl +++ b/pipes/WDL/workflows/spikein.wdl @@ -1,4 +1,4 @@ -import "tasks/reports.wdl" as reports +import "reports.wdl" as reports workflow spikein { diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl index 636ff55b9..f4e1ce423 100644 --- a/pipes/WDL/workflows/tasks/interhost.wdl +++ b/pipes/WDL/workflows/tasks/interhost.wdl @@ -110,7 +110,7 @@ task multi_align_mafft_ref { task multi_align_mafft { Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay - String? out_prefix = basename(select_first(assemblies_fasta), '.fasta') + String out_prefix Int? mafft_maxIters Int? mafft_ep diff --git a/travis/build-dx.sh b/travis/build-dx.sh index 88ba4a67e..c10207ba5 100755 --- a/travis/build-dx.sh +++ b/travis/build-dx.sh @@ -42,7 +42,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do dx_id=$(java -jar dxWDL.jar compile \ $workflow $CMD_INPUT $CMD_DEFAULTS -f \ - -imports pipes/WDL/workflows/ \ + -imports pipes/WDL/workflows/tasks/ \ -destination /build/$VERSION/$workflow_name) echo "Succeeded: $workflow_name = $dx_id" echo -e "$workflow_name\t$dx_id" >> $COMPILE_SUCCESS diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh index 0ae71ae60..13e92c1b5 100755 --- a/travis/tests-cromwell.sh +++ b/travis/tests-cromwell.sh @@ -13,6 +13,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do # the "cat" is to allow a pipe failure (otherwise it halts because of set -e) java -jar cromwell.jar run \ workflows/$workflow_name.wdl \ + --imports tasks \ -i $input_json | tee cromwell.out if [ ${PIPESTATUS[0]} -gt 0 ]; then echo "error running $workflow_name" From f3555bb7ca5d152e0e866d9aac7685adbb5c727a Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 08:55:47 -0500 Subject: [PATCH 21/35] fix validate wdl script --- travis/validate-wdl.sh | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/travis/validate-wdl.sh b/travis/validate-wdl.sh index 91e4be947..cdf78d3ec 100755 --- a/travis/validate-wdl.sh +++ b/travis/validate-wdl.sh @@ -1,9 +1,23 @@ #!/bin/bash set -e -o pipefail -ln -s pipes/WDL/workflows/tasks . -for workflow in pipes/WDL/workflows/*.wdl; do +# validate each imported library of tasks on its own +for tasks in pipes/WDL/workflows/tasks/*.wdl; do + echo "validating tasks $tasks" + java -jar womtool.jar validate $tasks +done + +# validate the workflow files +# unfortunately, dxWDL now requires the -imports parameter and cromwell supports +# it as well but womtool validate does not yet support it! so we have to copy +# everything to a temp dir +mkdir wdl_validate_test +cd wdl_validate_test +cp ../pipes/WDL/workflows/tasks/*.wdl ../pipes/WDL/workflows/*.wdl . +for workflow in ../pipes/WDL/workflows/*.wdl; do + workflow=`basename $workflow` echo "validating $workflow" java -jar womtool.jar validate $workflow done -rm tasks +cd - +rm -r wdl_validate_test From 71bc09397337fc9dfd46b081464a5b75a927012f Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 12:53:43 -0500 Subject: [PATCH 22/35] updates to tbl2asn invocations --- ncbi.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ncbi.py b/ncbi.py index 6f40589fd..39914d9f5 100755 --- a/ncbi.py +++ b/ncbi.py @@ -396,7 +396,7 @@ def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=N def prep_genbank_files(templateFile, fasta_files, annotDir, master_source_table=None, comment=None, sequencing_tech=None, - coverage_table=None, biosample_map=None): + coverage_table=None, biosample_map=None, organism=None): ''' Prepare genbank submission files. Requires .fasta and .tbl files as input, as well as numerous other metadata files for the submission. Creates a directory full of files (.sqn in particular) that can be sent to GenBank. @@ -451,7 +451,11 @@ def prep_genbank_files(templateFile, fasta_files, annotDir, # run tbl2asn (relies on filesnames matching by prefix) tbl2asn = tools.tbl2asn.Tbl2AsnTool() - tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True) + source_quals = [] + if organism: + source_quals = [('organism', organism)] + tbl2asn.execute(templateFile, annotDir, comment=comment, + per_genome_comment=True, source_quals=source_quals) def parser_prep_genbank_files(parser=argparse.ArgumentParser()): @@ -462,6 +466,7 @@ def parser_prep_genbank_files(parser=argparse.ArgumentParser()): parser.add_argument('--comment', default=None, help='comment field') parser.add_argument('--sequencing_tech', default=None, help='sequencing technology (e.g. Illumina HiSeq 2500)') parser.add_argument('--master_source_table', default=None, help='source modifier table') + parser.add_argument('--organism', default=None, help='species name') parser.add_argument("--biosample_map", help="""A file with two columns and a header: sample and BioSample. This file may refer to samples that are not included in this submission.""") From b8cf6ea42514f5405d35fad990058a0a2b21fb0e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 12:58:45 -0500 Subject: [PATCH 23/35] oops --- travis/validate-wdl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/validate-wdl.sh b/travis/validate-wdl.sh index cdf78d3ec..ce27c7bfe 100755 --- a/travis/validate-wdl.sh +++ b/travis/validate-wdl.sh @@ -17,7 +17,7 @@ cp ../pipes/WDL/workflows/tasks/*.wdl ../pipes/WDL/workflows/*.wdl . for workflow in ../pipes/WDL/workflows/*.wdl; do workflow=`basename $workflow` echo "validating $workflow" - java -jar womtool.jar validate $workflow + java -jar ../womtool.jar validate $workflow done cd - rm -r wdl_validate_test From 82bda37f47091ba952e425128529939439c7df0e Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 13:42:59 -0500 Subject: [PATCH 24/35] change deprecated dx instance --- pipes/WDL/workflows/tasks/taxon_filter.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/workflows/tasks/taxon_filter.wdl b/pipes/WDL/workflows/tasks/taxon_filter.wdl index 7028784c4..1736af871 100644 --- a/pipes/WDL/workflows/tasks/taxon_filter.wdl +++ b/pipes/WDL/workflows/tasks/taxon_filter.wdl @@ -175,6 +175,6 @@ task merge_one_per_sample { memory: "7 GB" cpu: 4 docker: "quay.io/broadinstitute/viral-ngs" - dx_instance_type: "mem2_hdd2_x4" + dx_instance_type: "mem1_hdd2_x8" } } From 291d10467af9ed94dc7ce3c7956e80506e2ccf06 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 14:04:10 -0500 Subject: [PATCH 25/35] dx is deprecating all hdd instances? --- pipes/WDL/workflows/tasks/metagenomics.wdl | 2 +- pipes/WDL/workflows/tasks/taxon_filter.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/workflows/tasks/metagenomics.wdl b/pipes/WDL/workflows/tasks/metagenomics.wdl index a0a6faba9..d3b31a25c 100644 --- a/pipes/WDL/workflows/tasks/metagenomics.wdl +++ b/pipes/WDL/workflows/tasks/metagenomics.wdl @@ -117,7 +117,7 @@ task krona { docker: "quay.io/broadinstitute/viral-ngs" memory: "4 GB" cpu: 1 - dx_instance_type: "mem2_hdd2_x2" + dx_instance_type: "mem1_ssd2_x2" } } diff --git a/pipes/WDL/workflows/tasks/taxon_filter.wdl b/pipes/WDL/workflows/tasks/taxon_filter.wdl index 1736af871..3ce406c0b 100644 --- a/pipes/WDL/workflows/tasks/taxon_filter.wdl +++ b/pipes/WDL/workflows/tasks/taxon_filter.wdl @@ -175,6 +175,6 @@ task merge_one_per_sample { memory: "7 GB" cpu: 4 docker: "quay.io/broadinstitute/viral-ngs" - dx_instance_type: "mem1_hdd2_x8" + dx_instance_type: "mem1_ssd2_x4" } } From 67403501dea415c71252263bc5a8acac48602961 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 14:24:43 -0500 Subject: [PATCH 26/35] avoid workflow applet namespace clashes --- pipes/WDL/dx-defaults-spikein.json | 2 +- pipes/WDL/workflows/download_annotations.wdl | 2 +- pipes/WDL/workflows/spikein.wdl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipes/WDL/dx-defaults-spikein.json b/pipes/WDL/dx-defaults-spikein.json index 6c1c6b406..667d48095 100644 --- a/pipes/WDL/dx-defaults-spikein.json +++ b/pipes/WDL/dx-defaults-spikein.json @@ -1,4 +1,4 @@ { - "spikein.spikein.spikein_db": + "spikein.spikein_report.spikein_db": "dx://file-F6PXkF00Yqp3zVXq14fF98Kz" } diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/download_annotations.wdl index 296673624..c723b0e6b 100644 --- a/pipes/WDL/workflows/download_annotations.wdl +++ b/pipes/WDL/workflows/download_annotations.wdl @@ -2,6 +2,6 @@ import "ncbi.wdl" as ncbi workflow download_annotations { - call ncbi.download_annotations + call ncbi.download_annotations as download } diff --git a/pipes/WDL/workflows/spikein.wdl b/pipes/WDL/workflows/spikein.wdl index 1ecedb63a..4eb0d285c 100644 --- a/pipes/WDL/workflows/spikein.wdl +++ b/pipes/WDL/workflows/spikein.wdl @@ -2,6 +2,6 @@ import "reports.wdl" as reports workflow spikein { - call reports.spikein_report as spikein + call reports.spikein_report as spikein_report } From bab7e362dd18d360f88b87e49c1f44c5b6e1f719 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 15:09:00 -0500 Subject: [PATCH 27/35] rename workflow to avoid clash in namespace --- .../{download_annotations.wdl => fetch_annotations.wdl} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename pipes/WDL/workflows/{download_annotations.wdl => fetch_annotations.wdl} (70%) diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/fetch_annotations.wdl similarity index 70% rename from pipes/WDL/workflows/download_annotations.wdl rename to pipes/WDL/workflows/fetch_annotations.wdl index c723b0e6b..c53e46cab 100644 --- a/pipes/WDL/workflows/download_annotations.wdl +++ b/pipes/WDL/workflows/fetch_annotations.wdl @@ -1,6 +1,6 @@ import "ncbi.wdl" as ncbi -workflow download_annotations { +workflow fetch_annotations { call ncbi.download_annotations as download From fd7f35646f4f5b7d0dc08a3e2da0169f371c7939 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 15:45:12 -0500 Subject: [PATCH 28/35] cromwell doesnt like the --imports from a directory, even though it says it can take that.. boo --- travis/tests-cromwell.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh index 13e92c1b5..5437ac908 100755 --- a/travis/tests-cromwell.sh +++ b/travis/tests-cromwell.sh @@ -2,7 +2,8 @@ set -e # intentionally allow for pipe failures below ln -s $GATK_PATH/GenomeAnalysisTK.jar . -ln -s pipes/WDL/workflows pipes/WDL/workflows/tasks . +mkdir -p workflows +ln -s pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows for workflow in pipes/WDL/workflows/*.wdl; do workflow_name=$(basename $workflow .wdl) @@ -13,7 +14,6 @@ for workflow in pipes/WDL/workflows/*.wdl; do # the "cat" is to allow a pipe failure (otherwise it halts because of set -e) java -jar cromwell.jar run \ workflows/$workflow_name.wdl \ - --imports tasks \ -i $input_json | tee cromwell.out if [ ${PIPESTATUS[0]} -gt 0 ]; then echo "error running $workflow_name" From 7bba86e851e21ef88256cb2ef50f901681ad30c9 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 16:18:06 -0500 Subject: [PATCH 29/35] fix invocation of cromwell for tests --- travis/tests-cromwell.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh index 5437ac908..7f312df74 100755 --- a/travis/tests-cromwell.sh +++ b/travis/tests-cromwell.sh @@ -4,8 +4,9 @@ set -e # intentionally allow for pipe failures below ln -s $GATK_PATH/GenomeAnalysisTK.jar . mkdir -p workflows ln -s pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows +cd workflows -for workflow in pipes/WDL/workflows/*.wdl; do +for workflow in ../pipes/WDL/workflows/*.wdl; do workflow_name=$(basename $workflow .wdl) input_json="test/input/WDL/test_inputs-$workflow_name-local.json" if [ -f $input_json ]; then @@ -13,7 +14,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do echo "Executing $workflow_name using Cromwell on local instance" # the "cat" is to allow a pipe failure (otherwise it halts because of set -e) java -jar cromwell.jar run \ - workflows/$workflow_name.wdl \ + $workflow_name.wdl \ -i $input_json | tee cromwell.out if [ ${PIPESTATUS[0]} -gt 0 ]; then echo "error running $workflow_name" @@ -29,5 +30,6 @@ for workflow in pipes/WDL/workflows/*.wdl; do fi done +cd - date echo "note: there is no testing of output correctness yet..." From 35570f64aa2d4e0373278e104cf592a32ed89a79 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 16:28:02 -0500 Subject: [PATCH 30/35] add moltype --- ncbi.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ncbi.py b/ncbi.py index 39914d9f5..6df2d032a 100755 --- a/ncbi.py +++ b/ncbi.py @@ -396,7 +396,7 @@ def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=N def prep_genbank_files(templateFile, fasta_files, annotDir, master_source_table=None, comment=None, sequencing_tech=None, - coverage_table=None, biosample_map=None, organism=None): + coverage_table=None, biosample_map=None, organism=None, mol_type=None): ''' Prepare genbank submission files. Requires .fasta and .tbl files as input, as well as numerous other metadata files for the submission. Creates a directory full of files (.sqn in particular) that can be sent to GenBank. @@ -453,7 +453,9 @@ def prep_genbank_files(templateFile, fasta_files, annotDir, tbl2asn = tools.tbl2asn.Tbl2AsnTool() source_quals = [] if organism: - source_quals = [('organism', organism)] + source_quals.append(('organism', organism)) + if mol_type: + source_quals.append(('mol_type', mol_type)) tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True, source_quals=source_quals) @@ -467,6 +469,7 @@ def parser_prep_genbank_files(parser=argparse.ArgumentParser()): parser.add_argument('--sequencing_tech', default=None, help='sequencing technology (e.g. Illumina HiSeq 2500)') parser.add_argument('--master_source_table', default=None, help='source modifier table') parser.add_argument('--organism', default=None, help='species name') + parser.add_argument('--mol_type', default=None, help='molecule type') parser.add_argument("--biosample_map", help="""A file with two columns and a header: sample and BioSample. This file may refer to samples that are not included in this submission.""") From e01434cf987fe984ed5d67da5fec859451bfe79d Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 16:56:17 -0500 Subject: [PATCH 31/35] fix src stuff --- ncbi.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/ncbi.py b/ncbi.py index 6df2d032a..56ed48558 100755 --- a/ncbi.py +++ b/ncbi.py @@ -436,13 +436,22 @@ def prep_genbank_files(templateFile, fasta_files, annotDir, Bio.SeqIO.write(seq_obj, out_chr_fasta, "fasta") # make .fsa files - fasta2fsa(out_file_name, annotDir, biosample=biosample.get(sample)) + fasta2fsa(out_file_name, annotDir, biosample=biosample.get(sample_base)) # remove the .fasta file os.unlink(out_file_name) # make .src files if master_source_table: - shutil.copy(master_source_table, os.path.join(annotDir, sample + '.src')) + out_src_fname = os.path.join(annotDir, sample + '.src') + with open(master_source_table, 'rt') as inf: + with open(out_src_fname, 'wt') as outf: + outf.write(inf.readline()) + for line in inf: + row = line.rsrtrip('\n').split('\t') + if row[0] == sample_base: + row[0] = sample + outf.write('\t'.join(row) + '\n') + # make .cmt files make_structured_comment_file(os.path.join(annotDir, sample + '.cmt'), name=sample, From 11cbd1229eecf96958587176ffe27cb737fe3df5 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Fri, 9 Mar 2018 16:56:33 -0500 Subject: [PATCH 32/35] fix src stuff again --- ncbi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ncbi.py b/ncbi.py index 56ed48558..8db5bc229 100755 --- a/ncbi.py +++ b/ncbi.py @@ -447,7 +447,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir, with open(out_src_fname, 'wt') as outf: outf.write(inf.readline()) for line in inf: - row = line.rsrtrip('\n').split('\t') + row = line.rstrip('\n').split('\t') if row[0] == sample_base: row[0] = sample outf.write('\t'.join(row) + '\n') From d39d10982757a8dc0943206e09a3577c98a15274 Mon Sep 17 00:00:00 2001 From: Ilya Shlyakhter Date: Tue, 20 Mar 2018 21:44:16 -0400 Subject: [PATCH 33/35] fixed tests of cromwell on local instance (#798) * fixed tests of cromwell on local instance --- travis/tests-cromwell.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh index 7f312df74..03790b6ea 100755 --- a/travis/tests-cromwell.sh +++ b/travis/tests-cromwell.sh @@ -3,7 +3,8 @@ set -e # intentionally allow for pipe failures below ln -s $GATK_PATH/GenomeAnalysisTK.jar . mkdir -p workflows -ln -s pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows +cp *.jar pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows +cp -r test workflows/ cd workflows for workflow in ../pipes/WDL/workflows/*.wdl; do From beb91a3a93f818b31314802bc1f1f320bf68d591 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Fri, 23 Mar 2018 15:46:27 -0400 Subject: [PATCH 34/35] pass sample name in WDL call to plot_coverage (#799) * pass sample name in WDL call to plot_coverage pass sample name in WDL call to plot_coverage so the same name shows up in the coverage plot PDF * pass sample name in second WDL call to plot_coverage within refine_2x_and_plot --- pipes/WDL/workflows/tasks/assembly.wdl | 1 + pipes/WDL/workflows/tasks/reports.wdl | 1 + 2 files changed, 2 insertions(+) diff --git a/pipes/WDL/workflows/tasks/assembly.wdl b/pipes/WDL/workflows/tasks/assembly.wdl index 12aa15f64..fe943b0a1 100644 --- a/pipes/WDL/workflows/tasks/assembly.wdl +++ b/pipes/WDL/workflows/tasks/assembly.wdl @@ -325,6 +325,7 @@ task refine_2x_and_plot { --plotWidth 1100 \ --plotHeight 850 \ --plotDPI 100 \ + --plotTitle "${sample_name} coverage plot" \ --loglevel=DEBUG else touch ${sample_name}.coverage_plot.pdf diff --git a/pipes/WDL/workflows/tasks/reports.wdl b/pipes/WDL/workflows/tasks/reports.wdl index 22652dbc6..1f6c7f3cc 100644 --- a/pipes/WDL/workflows/tasks/reports.wdl +++ b/pipes/WDL/workflows/tasks/reports.wdl @@ -65,6 +65,7 @@ task plot_coverage { --plotWidth 1100 \ --plotHeight 850 \ --plotDPI 100 \ + --plotTitle "${sample_name} coverage plot" \ --loglevel=DEBUG else touch ${sample_name}.coverage_plot.pdf From 2e2e05af036c1e840380c098cb75ce8135024832 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Mon, 26 Mar 2018 13:31:25 -0400 Subject: [PATCH 35/35] bump viral-baseimage 0.1.8 to 0.1.9 (#802) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2cefcd13d..599ee8e89 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/broadinstitute/viral-baseimage:0.1.8 +FROM quay.io/broadinstitute/viral-baseimage:0.1.9 LABEL maintainer "viral-ngs@broadinstitute.org"