From 1309f4abe35ac2e0bd0f530ac81b21dd97b8bee3 Mon Sep 17 00:00:00 2001
From: Chris Tomkins-Tinch <tomkinsc@gmail.com>
Date: Tue, 27 Feb 2018 14:34:43 -0500
Subject: [PATCH 01/35] skip blank lines in samples-*.txt files (#794)

---
 pipes/rules/common.rules | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pipes/rules/common.rules b/pipes/rules/common.rules
index 9527f1b05..92ec54338 100644
--- a/pipes/rules/common.rules
+++ b/pipes/rules/common.rules
@@ -31,6 +31,8 @@ def read_tab_file(fname):
     with util.file.open_or_gzopen(fname, 'rU') as inf:
         header = [item.strip() for item in inf.readline().strip().rstrip('\n').split('\t')]
         for line in inf:
+            if len(line.strip())==0:
+                continue
             row = [item.strip() for item in line.rstrip('\n').split('\t')]
             if len(row) > len(header):
                 # truncate the row to the header length, and only include extra items if they are not spaces 
@@ -44,6 +46,8 @@ def read_samples_file(fname, number_of_chromosomes=1, append_chrom_num=False):
         return []
     with util.file.open_or_gzopen(fname, 'rU') as inf:
         for line in inf:
+            if len(line.strip())==0:
+                continue
             if not append_chrom_num:
                 yield line.strip()
             else:
@@ -56,6 +60,8 @@ def read_accessions_file(fname):
         return []
     with util.file.open_or_gzopen(fname, 'rU') as inf:
         for line in inf:
+            if len(line.strip())==0:
+                continue
             yield line.strip()
 
 def download_file(uriToGet, dest, destFileName=None):
@@ -102,7 +108,7 @@ def strip_protocol(uri, relative=False):
         return uri
 
 
-import botocore
+import botocore.session
 
 def objectify_remote(file_address, *args, **kwargs):
     if file_address is None:

From 909c7751a968b94f7ca391db82cb1d1ccb35b794 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 1 Mar 2018 09:26:35 -0500
Subject: [PATCH 02/35] add deplete-and-spades workflow

---
 pipes/WDL/workflows/contigs.wdl | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 pipes/WDL/workflows/contigs.wdl

diff --git a/pipes/WDL/workflows/contigs.wdl b/pipes/WDL/workflows/contigs.wdl
new file mode 100644
index 000000000..baa9d8fcf
--- /dev/null
+++ b/pipes/WDL/workflows/contigs.wdl
@@ -0,0 +1,17 @@
+import "tasks/metagenomics.wdl" as metagenomics
+import "tasks/taxon_filter.wdl" as taxon_filter
+import "tasks/assembly.wdl" as assembly
+
+workflow contigs {
+
+  call taxon_filter.deplete_taxa as deplete
+
+  call assembly.assemble as spades {
+    input:
+      assembler = "spades",
+      reads_unmapped_bam = deplete.cleaned_bam
+  }
+
+  # TO DO: taxonomic classification of contigs
+
+}

From 1bd140fdf468ec332ac42aaebb48a2389920f46e Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 1 Mar 2018 10:27:21 -0500
Subject: [PATCH 03/35] add default dbs for contigs workflow

---
 pipes/WDL/dx-defaults-contigs.json | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 pipes/WDL/dx-defaults-contigs.json

diff --git a/pipes/WDL/dx-defaults-contigs.json b/pipes/WDL/dx-defaults-contigs.json
new file mode 100644
index 000000000..ba193eff8
--- /dev/null
+++ b/pipes/WDL/dx-defaults-contigs.json
@@ -0,0 +1,14 @@
+{
+  "contigs.deplete.bwaDbs": [
+    "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
+  ],
+  "contigs.deplete.blastDbs": [
+    "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",
+    "dx://file-F8BjgXj09y3gkfZGPPQZbZkK",
+    "dx://file-F8B3Pp809y3jBpXq7xjxbq94",
+    "dx://file-F8B3B6809y3kK1JP5X8Pg361"
+  ],
+
+  "contigs.spades.trim_clip_db":
+    "dx://file-BXF0vYQ0QyBF509G9J12g927"
+}

From ca5b92670a2e10fe11361dd63e5cb08d229a20a1 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 1 Mar 2018 12:47:43 -0500
Subject: [PATCH 04/35] convert default dnanexus depletion databases from hg19
 bmtagger to hg19 bwa

---
 pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json | 4 ++--
 pipes/WDL/dx-defaults-demux_plus.json                   | 4 ++--
 pipes/WDL/dx-defaults-deplete_only.json                 | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json b/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json
index 887ad1ba5..118d1fe5e 100644
--- a/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json
+++ b/pipes/WDL/dx-defaults-assemble_denovo_with_deplete.json
@@ -1,6 +1,6 @@
 {
-  "assemble_denovo_with_deplete.deplete_taxa.bmtaggerDbs": [
-    "dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK"
+  "assemble_denovo_with_deplete.deplete_taxa.bwaDbs": [
+    "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
   ],
   "assemble_denovo_with_deplete.deplete_taxa.blastDbs": [
     "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",
diff --git a/pipes/WDL/dx-defaults-demux_plus.json b/pipes/WDL/dx-defaults-demux_plus.json
index b532f8c81..cd3d53ef8 100644
--- a/pipes/WDL/dx-defaults-demux_plus.json
+++ b/pipes/WDL/dx-defaults-demux_plus.json
@@ -2,8 +2,8 @@
   "demux_plus.spikein.spikein_db":
     "dx://file-F6PXkF00Yqp3zVXq14fF98Kz",
 
-  "demux_plus.deplete.bmtaggerDbs": [
-    "dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK"
+  "demux_plus.deplete.bwaDbs": [
+    "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
   ],
   "demux_plus.deplete.blastDbs": [
     "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",
diff --git a/pipes/WDL/dx-defaults-deplete_only.json b/pipes/WDL/dx-defaults-deplete_only.json
index a3c962261..cf065c8ef 100644
--- a/pipes/WDL/dx-defaults-deplete_only.json
+++ b/pipes/WDL/dx-defaults-deplete_only.json
@@ -1,6 +1,6 @@
 {
-  "deplete_only.deplete_taxa.bmtaggerDbs": [
-    "dx://file-BYF8y0Q06PJ7G1fPvkB9q3fK"
+  "deplete_only.deplete_taxa.bwaDbs": [
+    "dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
   ],
   "deplete_only.deplete_taxa.blastDbs": [
     "dx://file-F8B3B6Q09y3bZg3j1FqK7bJ9",

From 24c84115258076901127fe7c20ed30e495d4fb3c Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 1 Mar 2018 14:39:37 -0500
Subject: [PATCH 05/35] shape up ncbi.wdl tasks a bit

---
 pipes/WDL/workflows/tasks/ncbi.wdl | 58 ++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index bc9bc4400..73408a283 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -24,6 +24,9 @@ task download_reference_genome {
   }
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "3 GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_x2"
   }
 }
 
@@ -49,23 +52,61 @@ task download_lastal_sources {
   }
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "3 GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_x2"
   }
 }
 
 task build_lastal_db {
-  File sequences # fasta file
+  File    sequences_fasta
+
+  String  db_name = basename(sequences_fasta, ".fasta")
 
   command {
-    taxon_filter.py lastal_build_db \
-        "${sequences}" \
-        "./"
+    set -ex -o pipefail
+    taxon_filter.py lastal_build_db ${sequences_fasta} ./ --loglevel=DEBUG
+    tar -c ${db_name}* | lz4 -9 > ${db_name}.tar.lz4
   }
 
   output {
-        Array[File] lastalDbFiles = glob("lastal.*")
+    File lastal_db = "${db_name}.tar.lz4"
+  }
+
+  runtime {
+    docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "7 GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_x4"
   }
+}
+
+task download_annotation {
+  String        referenceName
+  Array[String] accessions
+  String        emailAddress
+
+  command {
+    set -ex -o pipefail
+    ncbi.py fetch_feature_tables \
+        ${emailAddress} \
+        ./ \
+        ${sep=' ' accessions} \
+        --combinedFilePrefix ${referenceName} \
+        --loglevel DEBUG
+  }
+
+  output {
+    File        referenceFasta  = "${referenceName}.fasta"
+    File        featureTable    = "${referenceName}.tbl"
+    Array[File] featureTables   = glob("*.tbl")
+  }
+
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "3 GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_x2"
   }
 }
 
@@ -89,8 +130,10 @@ task annot_transfer {
     Array[File] featureTables = glob(".tbl")
   }
   runtime {
-    memory: "4GB"
     docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "3 GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_x2"
   }
 }
 
@@ -122,5 +165,8 @@ task prepare_genbank {
   }
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "3 GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_x2"
   }
 }
\ No newline at end of file

From f4908f22b96f69899309a832ee8c66a8aff30d8f Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Mar 2018 14:19:02 -0500
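Subject: [PATCH 06/35] wdl updates: unquote multi_align_mafft args and drop
 its threads option; drop referenceFasta output from download_annotation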

---
 pipes/WDL/workflows/tasks/interhost.wdl | 15 +++++++--------
 pipes/WDL/workflows/tasks/ncbi.wdl      |  1 -
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl
index 5324385ad..57fa55cee 100644
--- a/pipes/WDL/workflows/tasks/interhost.wdl
+++ b/pipes/WDL/workflows/tasks/interhost.wdl
@@ -79,23 +79,22 @@ task multi_align_mafft {
   Array[File] inputAssemblies # fasta files, one per sample
   File referenceGenome # fasta
 
-  Int? threads
   Int? maxIters
   Int? ep
 
 
   command {
     interhost.py multichr_mafft \
-      "${referenceGenome}" \
-      "${sep=' ' inputAssemblies+}" \
-      "./" \
-      "${'--ep' + ep}" \
-      "${'--maxiters' + maxIters}" \
+      ${referenceGenome} \
+      ${sep=' ' inputAssemblies+} \
+      ./ \
+      ${'--ep' + ep} \
+      ${'--maxiters' + maxIters} \
       --preservecase \
       --localpair \
       --outFilePrefix aligned \
-      --sampleNameListFile "sampleNameList.txt" \
-      "${'--threads' + threads}"
+      --sampleNameListFile sampleNameList.txt \
+      --loglevel DEBUG
   }
 
   output {
diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index 73408a283..1d5b02f25 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -97,7 +97,6 @@ task download_annotation {
   }
 
   output {
-    File        referenceFasta  = "${referenceName}.fasta"
     File        featureTable    = "${referenceName}.tbl"
     Array[File] featureTables   = glob("*.tbl")
   }

From ccdb5e321a1564ac0b9ccb224ecaead30c4b8dcc Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Mar 2018 14:21:26 -0500
Subject: [PATCH 07/35] add spikein-only workflow

---
 pipes/WDL/dx-defaults-spikein.json | 4 ++++
 pipes/WDL/workflows/spikein.wdl    | 7 +++++++
 2 files changed, 11 insertions(+)
 create mode 100644 pipes/WDL/dx-defaults-spikein.json
 create mode 100644 pipes/WDL/workflows/spikein.wdl

diff --git a/pipes/WDL/dx-defaults-spikein.json b/pipes/WDL/dx-defaults-spikein.json
new file mode 100644
index 000000000..ce3f58951
--- /dev/null
+++ b/pipes/WDL/dx-defaults-spikein.json
@@ -0,0 +1,4 @@
+{
+  "demux_plus.spikein.spikein_db":
+    "dx://file-F6PXkF00Yqp3zVXq14fF98Kz"
+}
diff --git a/pipes/WDL/workflows/spikein.wdl b/pipes/WDL/workflows/spikein.wdl
new file mode 100644
index 000000000..583d364c7
--- /dev/null
+++ b/pipes/WDL/workflows/spikein.wdl
@@ -0,0 +1,7 @@
+import "tasks/reports.wdl" as reports
+
+workflow spikein {
+
+  call reports.spikein_report as spikein
+
+}

From 53432ab749000e685f88c9052659c7bd55bc044a Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 2 Mar 2018 14:31:34 -0500
Subject: [PATCH 08/35] rename spikein default param key from
 demux_plus.spikein.spikein_db to spikein.spikein.spikein_db

---
 pipes/WDL/dx-defaults-spikein.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/dx-defaults-spikein.json b/pipes/WDL/dx-defaults-spikein.json
index ce3f58951..6c1c6b406 100644
--- a/pipes/WDL/dx-defaults-spikein.json
+++ b/pipes/WDL/dx-defaults-spikein.json
@@ -1,4 +1,4 @@
 {
-  "demux_plus.spikein.spikein_db":
+  "spikein.spikein.spikein_db":
     "dx://file-F6PXkF00Yqp3zVXq14fF98Kz"
 }

From c75db830c376a0ddc9e6656ebd1558ca4e38d5c3 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 09:01:13 -0500
Subject: [PATCH 09/35] change filename of scaffold.scaffold_fasta from
 samplename.scaffold.fasta to samplename.scaffolded_imputed.fasta in order to
 make clear which fasta has imputed bases. add a String output for
 scaffolding_chosen_ref_name on top of existing fasta output. change
 mean_coverage from Int to Float.

---
 pipes/WDL/workflows/tasks/assembly.wdl | 26 +++++++++++++++-----------
 pipes/WDL/workflows/tasks/reports.wdl  |  5 +++--
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/pipes/WDL/workflows/tasks/assembly.wdl b/pipes/WDL/workflows/tasks/assembly.wdl
index 8389b6347..f073fc111 100644
--- a/pipes/WDL/workflows/tasks/assembly.wdl
+++ b/pipes/WDL/workflows/tasks/assembly.wdl
@@ -118,6 +118,8 @@ task scaffold {
       --outAlternateContigs ${sample_name}.scaffolding_alt_contigs.fasta \
       --loglevel=DEBUG
 
+    grep '^>' ${sample_name}.scaffolding_chosen_ref.fasta | cut -c 2- | tr '\n' '\t' > ${sample_name}.scaffolding_chosen_ref.txt
+
     assembly.py gapfill_gap2seq \
       ${sample_name}.intermediate_scaffold.fasta \
       ${reads_bam} \
@@ -132,7 +134,7 @@ task scaffold {
     assembly.py impute_from_reference \
       ${sample_name}.intermediate_gapfill.fasta \
       ${sample_name}.scaffolding_chosen_ref.fasta \
-      ${sample_name}.scaffold.fasta \
+      ${sample_name}.scaffolded_imputed.fasta \
       --newName ${sample_name} \
       ${'--replaceLength=' + replace_length} \
       ${'--minLengthFraction=' + min_length_fraction} \
@@ -142,14 +144,15 @@ task scaffold {
   }
 
   output {
-    File scaffold_fasta              = "${sample_name}.scaffold.fasta"
-    File intermediate_scaffold_fasta = "${sample_name}.intermediate_scaffold.fasta"
-    File intermediate_gapfill_fasta  = "${sample_name}.intermediate_gapfill.fasta"
-    Int  assembly_preimpute_length             = read_int("assembly_preimpute_length")
-    Int  assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
-    File scaffolding_chosen_ref      = "${sample_name}.scaffolding_chosen_ref.fasta"
-    File scaffolding_stats           = "${sample_name}.scaffolding_stats.txt"
-    File scaffolding_alt_contigs     = "${sample_name}.scaffolding_alt_contigs.fasta"
+    File   scaffold_fasta              = "${sample_name}.scaffolded_imputed.fasta"
+    File   intermediate_scaffold_fasta = "${sample_name}.intermediate_scaffold.fasta"
+    File   intermediate_gapfill_fasta  = "${sample_name}.intermediate_gapfill.fasta"
+    Int    assembly_preimpute_length             = read_int("assembly_preimpute_length")
+    Int    assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
+    String scaffolding_chosen_ref_name = read_string("${sample_name}.scaffolding_chosen_ref.txt")
+    File   scaffolding_chosen_ref      = "${sample_name}.scaffolding_chosen_ref.fasta"
+    File   scaffolding_stats           = "${sample_name}.scaffolding_stats.txt"
+    File   scaffolding_alt_contigs     = "${sample_name}.scaffolding_alt_contigs.fasta"
   }
 
   runtime {
@@ -307,7 +310,8 @@ task refine_2x_and_plot {
     samtools flagstat ${sample_name}.all.bam | tee ${sample_name}.all.bam.flagstat.txt
     grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
     samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-    echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
+    #echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
+    python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
 
     # fastqc mapped bam
     reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html
@@ -342,7 +346,7 @@ task refine_2x_and_plot {
     Int  reads_aligned               = read_int("reads_aligned")
     Int  read_pairs_aligned          = read_int("read_pairs_aligned")
     Int  bases_aligned               = read_int("bases_aligned")
-    Int  mean_coverage               = read_int("mean_coverage")
+    Float mean_coverage              = read_float("mean_coverage")
   }
 
   runtime {
diff --git a/pipes/WDL/workflows/tasks/reports.wdl b/pipes/WDL/workflows/tasks/reports.wdl
index 8a9121fbb..22b57cfbc 100644
--- a/pipes/WDL/workflows/tasks/reports.wdl
+++ b/pipes/WDL/workflows/tasks/reports.wdl
@@ -50,7 +50,8 @@ task plot_coverage {
     samtools flagstat ${sample_name}.bam | tee ${sample_name}.bam.flagstat.txt
     grep properly ${sample_name}.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
     samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
-    echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
+    #echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage
+    python -c "print (float("`cat bases_aligned`")/"`cat assembly_length`") if "`cat assembly_length`">0 else 0" > mean_coverage
 
     # fastqc mapped bam
     reports.py fastqc ${sample_name}.mapped.bam ${sample_name}.mapped_fastqc.html
@@ -81,7 +82,7 @@ task plot_coverage {
     Int reads_aligned               = read_int("reads_aligned")
     Int read_pairs_aligned          = read_int("read_pairs_aligned")
     Int bases_aligned               = read_int("bases_aligned")
-    Int mean_coverage               = read_int("mean_coverage")
+    Float mean_coverage             = read_float("mean_coverage")
   }
 
   runtime {

From 38a2bd57ea882e3521878f17e9f5b641b586a345 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 16:43:52 -0500
Subject: [PATCH 10/35] tune up ncbi steps

---
 ncbi.py                                 |  2 +-
 pipes/WDL/workflows/align_and_annot.wdl | 29 ++++++++++++
 pipes/WDL/workflows/tasks/interhost.wdl | 63 +++++++++++++++++++------
 pipes/WDL/workflows/tasks/ncbi.wdl      | 62 ++++++++++++------------
 util/genbank.py                         |  7 +--
 5 files changed, 113 insertions(+), 50 deletions(-)
 create mode 100644 pipes/WDL/workflows/align_and_annot.wdl

diff --git a/ncbi.py b/ncbi.py
index af96da9f3..08e8db299 100755
--- a/ncbi.py
+++ b/ncbi.py
@@ -176,7 +176,7 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o
         # identify the correct feature table as the one that has an ID that is
         # part of the ref seq ID
         fileAccession = util.genbank.get_feature_table_id(tblFilename)
-        if fileAccession in matchingRefSeq.id:
+        if fileAccession == matchingRefSeq.id.split('|')[0]:
             ref_tbl = tblFilename
             break
     if ref_tbl == "":
diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
new file mode 100644
index 000000000..aa8e618e4
--- /dev/null
+++ b/pipes/WDL/workflows/align_and_annot.wdl
@@ -0,0 +1,29 @@
+import "tasks/interhost.wdl" as interhost
+import "tasks/ncbi.wdl" as ncbi
+import "tasks/reports.wdl" as reports
+
+
+workflow align_and_annot {
+
+  File        reference_fasta
+  Array[File] assemblies_fasta
+
+  call interhost.multi_align_mafft_ref as mafft {
+    input:
+      reference_fasta = reference_fasta,
+      assemblies_fasta = assemblies_fasta
+  }
+
+  scatter(mutli_aln_fasta in mafft.alignments_by_chr) {
+    call ncbi.annot_transfer as annot {
+      input:
+        chr_mutli_aln_fasta = mutli_aln_fasta,
+        reference_fasta = reference_fasta
+    }
+    call ncbi.prepare_genbank as genbank {
+      input:
+        assemblies_fasta = assemblies_fasta,
+        annotations_tbl = annot.featureTables # I'm worried that the order got messed up
+    }
+  }
+}
diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl
index 57fa55cee..636ff55b9 100644
--- a/pipes/WDL/workflows/tasks/interhost.wdl
+++ b/pipes/WDL/workflows/tasks/interhost.wdl
@@ -75,35 +75,68 @@ task ref_guided_consensus_aligned_with_dups {
 #  }
 #}
 
-task multi_align_mafft {
-  Array[File] inputAssemblies # fasta files, one per sample
-  File referenceGenome # fasta
+task multi_align_mafft_ref {
+  File           reference_fasta
+  Array[File]+   assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
+  String?        out_prefix = basename(reference_fasta, '.fasta')
+  Int?           mafft_maxIters
+  Int?           mafft_ep
+
+  command {
+    interhost.py multichr_mafft \
+      ${reference_fasta} ${sep=' ' assemblies_fasta} \
+      . \
+      ${'--ep' + mafft_ep} \
+      ${'--maxiters' + mafft_maxIters} \
+      --outFilePrefix ${out_prefix} \
+      --preservecase \
+      --localpair \
+      --sampleNameListFile ${out_prefix}-sample_names.txt \
+      --loglevel DEBUG
+  }
+
+  output {
+    File        sampleNamesFile   = "${out_prefix}-sample_names.txt"
+    Array[File] alignments_by_chr = glob("${out_prefix}*.fasta")
+  }
 
-  Int? maxIters
-  Int? ep
+  runtime {
+    docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "7 GB"
+    cpu: 4
+    dx_instance_type: "mem1_ssd1_x4"
+  }
+}
 
+task multi_align_mafft {
+  Array[File]+   assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
+  String?        out_prefix = basename(select_first(assemblies_fasta), '.fasta')
+  Int?           mafft_maxIters
+  Int?           mafft_ep
 
   command {
     interhost.py multichr_mafft \
-      ${referenceGenome} \
-      ${sep=' ' inputAssemblies+} \
-      ./ \
-      ${'--ep' + ep} \
-      ${'--maxiters' + maxIters} \
+      ${sep=' ' assemblies_fasta} \
+      . \
+      ${'--ep' + mafft_ep} \
+      ${'--maxiters' + mafft_maxIters} \
+      --outFilePrefix ${out_prefix} \
       --preservecase \
       --localpair \
-      --outFilePrefix aligned \
-      --sampleNameListFile sampleNameList.txt \
+      --sampleNameListFile ${out_prefix}-sample_names.txt \
       --loglevel DEBUG
   }
 
   output {
-    File sampleNamesFile = "sampleNamesList.txt"
-    Array[File] chrAlignedFiles = glob("aligned_*.fasta")
+    File        sampleNamesFile   = "${out_prefix}-sample_names.txt"
+    Array[File] alignments_by_chr = glob("${out_prefix}*.fasta")
   }
+
   runtime {
-    memory: "8 GB"
     docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "7 GB"
+    cpu: 4
+    dx_instance_type: "mem1_ssd1_x4"
   }
 }
 
diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index 1d5b02f25..35008ff6c 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -110,23 +110,22 @@ task download_annotation {
 }
 
 task annot_transfer {
-  # TODO: Iterate over chr-specifc MSAs in workflow rather than in task
-
-  File chrMultipleAlignment # fasta; multiple alignments of sample sequences
-  File referenceFeatureTable # feature table corresponding to the chr in the alignment
-  File referenceGenome # fasta
+  File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences
+  File reference_fasta # fasta
+  File reference_feature_table # feature table corresponding to the chr in the alignment
 
   command {
     ncbi.py tbl_transfer_prealigned \
-        "${chrMultipleAlignment}" \
-        "${referenceGenome}" \
-        "${referenceFeatureTable}" \
-        "./" \
-        --oob_clip
+        ${chr_mutli_aln_fasta} \
+        ${reference_fasta} \
+        ${reference_feature_table} \
+        . \
+        --oob_clip \
+        --loglevel DEBUG
   }
 
   output {
-    Array[File] featureTables = glob(".tbl")
+    Array[File] featureTables = glob("*.tbl")
   }
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
@@ -137,31 +136,36 @@ task annot_transfer {
 }
 
 task prepare_genbank {
-  Array[File] fastaFiles
-  File assemblySummary # summary.assembly.txt
-  File featureTableDir
-
-  String genbankTemplate
-  String genbankSourceTable
-  String biosampleMap
-  String sequencingTech
-  String comment
+  Array[File]+ assemblies_fasta
+  Array[File]+ annotations_tbl
+  File         authors_sbt
+  File         assemblySummary # summary.assembly.txt
+  File         genbankSourceTable
+  File         biosampleMap
+  String       sequencingTech
+  String       comment
 
   command {
+    cp ${sep=' ' annotations_tbl} .
     ncbi.py prep_genbank_files \
-        "${genbankTemplate}" \
-        "${sep=' ' fastaFiles}" \
-        "${featureTableDir}" \
-        --master_source_table "${genbankSourceTable}" \
-        --sequencing_tech "${sequencingTech}" \
-        --biosample_map "${biosampleMap}" \
-        --coverage_table "${assemblySummary}" \
-        --comment "${comment}"
+        ${authors_sbt} \
+        ${sep=' ' assemblies_fasta} \
+        . \
+        --master_source_table ${genbankSourceTable} \
+        --sequencing_tech ${sequencingTech} \
+        --biosample_map ${biosampleMap} \
+        --coverage_table ${assemblySummary} \
+        --comment ${comment} \
+        --loglevel DEBUG
+    tar -czpvf ncbi_package.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl
   }
 
   output {
-    File errorSummary = "${featureTableDir}/errorsummary.val"
+    Array[File] sequin_files = glob("*.sqn")
+    File        ncbi_package = "ncbi_package.tar.gz"
+    File        errorSummary = "errorsummary.val"
   }
+
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
     memory: "3 GB"
diff --git a/util/genbank.py b/util/genbank.py
index 453a780e3..6ea381c4d 100644
--- a/util/genbank.py
+++ b/util/genbank.py
@@ -23,7 +23,6 @@ def parse_accession_str(chr_ref):
     return chr_ref
 
 def get_feature_table_id(featureTableFile):
-    seqid = ""
     with open(featureTableFile, 'rt') as inf:
         for line in inf:
             line = line.rstrip('\r\n')
@@ -37,10 +36,8 @@ def get_feature_table_id(featureTableFile):
                 if not (
                     (seqid.startswith('gb|') or seqid.startswith('ref|')) and seqid.endswith('|') and len(seqid) > 4):
                     raise Exception("reference annotation does not refer to a GenBank or RefSeq accession")
-                seqid = seqid[seqid.find("|") + 1:-1]
-    if len(seqid) > 0:
-        return seqid
-
+                seqid = '|'.join(seqid.split('|')[1:-1])
+                return seqid
 
 def _seq_chunks(seq, n):
     # http://stackoverflow.com/a/312464/190597 (Ned Batchelder)

From 735330454c31c53474a92c1c5c7e0374ae0d6598 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 21:04:54 -0500
Subject: [PATCH 11/35] fix some linking of input files

---
 pipes/WDL/workflows/align_and_annot.wdl | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
index aa8e618e4..666cf2632 100644
--- a/pipes/WDL/workflows/align_and_annot.wdl
+++ b/pipes/WDL/workflows/align_and_annot.wdl
@@ -1,12 +1,11 @@
 import "tasks/interhost.wdl" as interhost
 import "tasks/ncbi.wdl" as ncbi
-import "tasks/reports.wdl" as reports
-
 
 workflow align_and_annot {
 
-  File        reference_fasta
-  Array[File] assemblies_fasta
+  File          reference_fasta
+  Array[File]+  assemblies_fasta
+  Array[File]+  annotations_tbl
 
   call interhost.multi_align_mafft_ref as mafft {
     input:
@@ -14,16 +13,17 @@ workflow align_and_annot {
       assemblies_fasta = assemblies_fasta
   }
 
-  scatter(mutli_aln_fasta in mafft.alignments_by_chr) {
+  scatter(chr_num in range(len(mafft.alignments_by_chr))) {
     call ncbi.annot_transfer as annot {
       input:
-        chr_mutli_aln_fasta = mutli_aln_fasta,
-        reference_fasta = reference_fasta
+        chr_mutli_aln_fasta = mafft.alignments_by_chr[chr_num],
+        reference_fasta = reference_fasta,
+        reference_feature_table = annotations_tbl[chr_num]
     }
     call ncbi.prepare_genbank as genbank {
       input:
         assemblies_fasta = assemblies_fasta,
-        annotations_tbl = annot.featureTables # I'm worried that the order got messed up
+        annotations_tbl = annot.featureTables # I'm worried that the order got messed up and we'll have to remap?
     }
   }
 }

From 7a9141d84c6a9219bd3a720bb2ce76d60549e7ca Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 21:40:35 -0500
Subject: [PATCH 12/35] add bam index files to output of refine_2x_and_plot and
 plot_coverage. Make plot_coverage more consistent with refine_2x_and_plot.

---
 pipes/WDL/workflows/tasks/assembly.wdl |  2 ++
 pipes/WDL/workflows/tasks/ncbi.wdl     |  2 +-
 pipes/WDL/workflows/tasks/reports.wdl  | 28 ++++++++++++++------------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/pipes/WDL/workflows/tasks/assembly.wdl b/pipes/WDL/workflows/tasks/assembly.wdl
index 8389b6347..8dfa4e73f 100644
--- a/pipes/WDL/workflows/tasks/assembly.wdl
+++ b/pipes/WDL/workflows/tasks/assembly.wdl
@@ -333,8 +333,10 @@ task refine_2x_and_plot {
     File refine2_sites_vcf_gz        = "${sample_name}.refine2.pre_fasta.vcf.gz"
     File final_assembly_fasta        = "${sample_name}.fasta"
     File aligned_bam                 = "${sample_name}.all.bam"
+    File aligned_bam_idx             = "${sample_name}.all.bai"
     File aligned_bam_flagstat        = "${sample_name}.all.bam.flagstat.txt"
     File aligned_only_reads_bam      = "${sample_name}.mapped.bam"
+    File aligned_only_reads_bam_idx  = "${sample_name}.mapped.bai"
     File aligned_only_reads_fastqc   = "${sample_name}.mapped_fastqc.html"
     File coverage_plot               = "${sample_name}.coverage_plot.pdf"
     Int  assembly_length             = read_int("assembly_length")
diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index 35008ff6c..d58f10586 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -87,7 +87,6 @@ task download_annotation {
   String        emailAddress
 
   command {
-    set -ex -o pipefail
     ncbi.py fetch_feature_tables \
         ${emailAddress} \
         ./ \
@@ -146,6 +145,7 @@ task prepare_genbank {
   String       comment
 
   command {
+    set -ex -o pipefail
     cp ${sep=' ' annotations_tbl} .
     ncbi.py prep_genbank_files \
         ${authors_sbt} \
diff --git a/pipes/WDL/workflows/tasks/reports.wdl b/pipes/WDL/workflows/tasks/reports.wdl
index 8a9121fbb..e811fcf34 100644
--- a/pipes/WDL/workflows/tasks/reports.wdl
+++ b/pipes/WDL/workflows/tasks/reports.wdl
@@ -33,7 +33,7 @@ task plot_coverage {
     read_utils.py align_and_fix \
       ${reads_unmapped_bam} \
       assembly.fasta \
-      --outBamAll ${sample_name}.bam \
+      --outBamAll ${sample_name}.all.bam \
       --outBamFiltered ${sample_name}.mapped.bam \
       --GATK_PATH gatk/ \
       --aligner ${aligner} \
@@ -66,22 +66,24 @@ task plot_coverage {
         --plotDPI 100 \
         --loglevel=DEBUG
     else
-      touch ${sample_name}.coverage_plot.pdf ${sample_name}.mapped_fastqc.html
+      touch ${sample_name}.coverage_plot.pdf
     fi
   }
 
   output {
-    File reads_bam                  = "${sample_name}.bam"
-    File reads_bam_flagstat         = "${sample_name}.bam.flagstat.txt"
-    File mapped_reads_bam           = "${sample_name}.mapped.bam"
-    File mapped_reads_fastqc        = "${sample_name}.mapped_fastqc.html"
-    File coverage_plot              = "${sample_name}.coverage_plot.pdf"
-    Int assembly_length             = read_int("assembly_length")
-    Int assembly_length_unambiguous = read_int("assembly_length_unambiguous")
-    Int reads_aligned               = read_int("reads_aligned")
-    Int read_pairs_aligned          = read_int("read_pairs_aligned")
-    Int bases_aligned               = read_int("bases_aligned")
-    Int mean_coverage               = read_int("mean_coverage")
+    File aligned_bam                 = "${sample_name}.all.bam"
+    File aligned_bam_idx             = "${sample_name}.all.bai"
+    File aligned_bam_flagstat        = "${sample_name}.all.bam.flagstat.txt"
+    File aligned_only_reads_bam      = "${sample_name}.mapped.bam"
+    File aligned_only_reads_bam_idx  = "${sample_name}.mapped.bai"
+    File aligned_only_reads_fastqc   = "${sample_name}.mapped_fastqc.html"
+    File coverage_plot               = "${sample_name}.coverage_plot.pdf"
+    Int  assembly_length             = read_int("assembly_length")
+    Int  assembly_length_unambiguous = read_int("assembly_length_unambiguous")
+    Int  reads_aligned               = read_int("reads_aligned")
+    Int  read_pairs_aligned          = read_int("read_pairs_aligned")
+    Int  bases_aligned               = read_int("bases_aligned")
+    Int  mean_coverage               = read_int("mean_coverage")
   }
 
   runtime {

From 8053789cf986374ac48e19949b6dd2cab436e5b5 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 22:24:58 -0500
Subject: [PATCH 13/35] rework the scatter in align_and_annot, a few other
 fixes

---
 ncbi.py                                      |   2 +-
 pipes/WDL/workflows/align_and_annot.wdl      |   9 +-
 pipes/WDL/workflows/download_annotations.wdl |   7 ++
 pipes/WDL/workflows/tasks/ncbi.wdl           | 106 +++++--------------
 pipes/WDL/workflows/tasks/taxon_filter.wdl   |  21 ++++
 5 files changed, 62 insertions(+), 83 deletions(-)
 create mode 100644 pipes/WDL/workflows/download_annotations.wdl

diff --git a/ncbi.py b/ncbi.py
index 08e8db299..6f40589fd 100755
--- a/ncbi.py
+++ b/ncbi.py
@@ -51,7 +51,7 @@ def tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip=False):
                     if not ((refID.startswith('gb|') or refID.startswith('ref|')) and refID.endswith('|') and
                                 len(refID) > 4):
                         raise Exception("reference annotation does not refer to a GenBank or RefSeq accession")
-                    refID = refID[refID.find("|") + 1:-1]
+                    refID = '|'.join(refID.split('|')[1:-1])
                     refSeqID = [x for x in cmap.keys() if refID in x][0]
                     #altid = cmap.mapChr(refSeqID, altid)
                     altid = list(set(cmap.keys()) - set([refSeqID]))[0]  # cmap.mapChr(refSeqID, altid)
diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
index 666cf2632..5fbd2f57c 100644
--- a/pipes/WDL/workflows/align_and_annot.wdl
+++ b/pipes/WDL/workflows/align_and_annot.wdl
@@ -13,17 +13,18 @@ workflow align_and_annot {
       assemblies_fasta = assemblies_fasta
   }
 
-  scatter(chr_num in range(len(mafft.alignments_by_chr))) {
+  scatter(aln_by_chr, ref_annot_by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) {
     call ncbi.annot_transfer as annot {
       input:
-        chr_mutli_aln_fasta = mafft.alignments_by_chr[chr_num],
+        chr_mutli_aln_fasta = aln_by_chr,
         reference_fasta = reference_fasta,
-        reference_feature_table = annotations_tbl[chr_num]
+        reference_feature_table = ref_annot_by_chr
     }
     call ncbi.prepare_genbank as genbank {
       input:
         assemblies_fasta = assemblies_fasta,
-        annotations_tbl = annot.featureTables # I'm worried that the order got messed up and we'll have to remap?
+        annotations_tbl = annot.transferred_feature_tables,
+        out_prefix = basename(annotations_tbl, '.tbl')
     }
   }
 }
diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/download_annotations.wdl
new file mode 100644
index 000000000..a3ef7c6f2
--- /dev/null
+++ b/pipes/WDL/workflows/download_annotations.wdl
@@ -0,0 +1,7 @@
+import "tasks/ncbi.wdl" as ncbi
+
+workflow download_annotations {
+
+  call ncbi.download_annotations
+
+}
diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index d58f10586..3a2668e21 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -1,54 +1,19 @@
-task download_reference_genome {
-  String referenceName
-  Array[String] accessions # NCBI accessions to include in the reference
-  String emailAddress
 
-  command {
-    ncbi.py fetch_fastas \
-        "${emailAddress}" \
-        "./" \
-        "${sep=' ' accessions}" \
-        --combinedFilePrefix "${referenceName}" \
-        --removeSeparateFiles \
-        --forceOverwrite
-    ncbi.py fetch_feature_tables \
-        "${emailAddress}" \
-        "./" \
-        "${sep=' ' accessions}" \
-        --forceOverwrite
-  }
-
-  output {
-    File referenceFasta = "${referenceName}.fasta"
-    Array[File] featureTables = glob("*.tbl")
-  }
-  runtime {
-    docker: "quay.io/broadinstitute/viral-ngs"
-    memory: "3 GB"
-    cpu: 2
-    dx_instance_type: "mem1_ssd1_x2"
-  }
-}
-
-task download_lastal_sources {
-  String referenceName
-  Array[String] accessions # NCBI accessions to include in the lastal db
-  String emailAddress
+task download_fasta {
+  String         out_prefix
+  Array[String]+ accessions
+  String         emailAddress
 
   command {
     ncbi.py fetch_fastas \
-        "${emailAddress}" \
-        "./" \
-        "${sep=' ' accessions}" \
-        --combinedFilePrefix lastal \
-        --removeSeparateFiles \
-        --forceOverwrite \
-        --chunkSize 300
+        ${emailAddress} \
+        . \
+        ${sep=' ' accessions} \
+        --combinedFilePrefix ${out_prefix} \
   }
 
   output {
-    File referenceFasta = "lastal.fasta"
-    Array[File] featureTables = glob("*.tbl")
+    File sequences_fasta = "${out_prefix}.fasta"
   }
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
@@ -58,46 +23,30 @@ task download_lastal_sources {
   }
 }
 
-task build_lastal_db {
-  File    sequences_fasta
-
-  String  db_name = basename(sequences_fasta, ".fasta")
+task download_annotations {
+  Array[String]+ accessions
+  String         emailAddress
+  String         combined_fasta
 
   command {
     set -ex -o pipefail
-    taxon_filter.py lastal_build_db ${sequences_fasta} ./ --loglevel=DEBUG
-    tar -c ${db_name}* | lz4 -9 > ${db_name}.tar.lz4
-  }
-
-  output {
-    File lastal_db = "${db_name}.tar.lz4"
-  }
-
-  runtime {
-    docker: "quay.io/broadinstitute/viral-ngs"
-    memory: "7 GB"
-    cpu: 2
-    dx_instance_type: "mem1_ssd1_x4"
-  }
-}
-
-task download_annotation {
-  String        referenceName
-  Array[String] accessions
-  String        emailAddress
-
-  command {
     ncbi.py fetch_feature_tables \
         ${emailAddress} \
         ./ \
         ${sep=' ' accessions} \
-        --combinedFilePrefix ${referenceName} \
+        --loglevel DEBUG
+    ncbi.py fetch_fastas \
+        ${emailAddress} \
+        ./ \
+        ${sep=' ' accessions} \
+        --combinedFilePrefix "${combined_fasta}" \
         --loglevel DEBUG
   }
 
   output {
-    File        featureTable    = "${referenceName}.tbl"
-    Array[File] featureTables   = glob("*.tbl")
+    File        combined_fasta = "${combined_fasta}.fasta"
+    Array[File] genomes_fasta  = glob("*.fasta")
+    Array[File] features_tbl   = glob("*.tbl")
   }
 
   runtime {
@@ -109,8 +58,8 @@ task download_annotation {
 }
 
 task annot_transfer {
-  File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences
-  File reference_fasta # fasta
+  File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences for a single chr
+  File reference_fasta # fasta (may contain multiple chrs, only one with the same name as reference_feature_table will be used)
   File reference_feature_table # feature table corresponding to the chr in the alignment
 
   command {
@@ -124,7 +73,7 @@ task annot_transfer {
   }
 
   output {
-    Array[File] featureTables = glob("*.tbl")
+    Array[File] transferred_feature_tables = glob("*.tbl")
   }
   runtime {
     docker: "quay.io/broadinstitute/viral-ngs"
@@ -143,6 +92,7 @@ task prepare_genbank {
   File         biosampleMap
   String       sequencingTech
   String       comment
+  String       out_prefix = "ncbi_package"
 
   command {
     set -ex -o pipefail
@@ -157,12 +107,12 @@ task prepare_genbank {
         --coverage_table ${assemblySummary} \
         --comment ${comment} \
         --loglevel DEBUG
-    tar -czpvf ncbi_package.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl
+    tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl
   }
 
   output {
     Array[File] sequin_files = glob("*.sqn")
-    File        ncbi_package = "ncbi_package.tar.gz"
+    File        ncbi_package = "${out_prefix}.tar.gz"
     File        errorSummary = "errorsummary.val"
   }
 
diff --git a/pipes/WDL/workflows/tasks/taxon_filter.wdl b/pipes/WDL/workflows/tasks/taxon_filter.wdl
index 97b5ffe58..7028784c4 100644
--- a/pipes/WDL/workflows/tasks/taxon_filter.wdl
+++ b/pipes/WDL/workflows/tasks/taxon_filter.wdl
@@ -117,6 +117,27 @@ task filter_to_taxon {
   }
 }
 
+task build_lastal_db {
+  File    sequences_fasta
+  String  db_name = basename(sequences_fasta, ".fasta")
+
+  command {
+    set -ex -o pipefail
+    taxon_filter.py lastal_build_db ${sequences_fasta} ./ --loglevel=DEBUG
+    tar -c ${db_name}* | lz4 -9 > ${db_name}.tar.lz4
+  }
+
+  output {
+    File lastal_db = "${db_name}.tar.lz4"
+  }
+
+  runtime {
+    docker: "quay.io/broadinstitute/viral-ngs"
+    memory: "7 GB"
+    cpu: 2
+    dx_instance_type: "mem1_ssd1_x4"
+  }
+}
 
 task merge_one_per_sample {
   String       out_bam_basename

From a42ed0de500d82cf822db670089dcf8b02e1627c Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 22:37:59 -0500
Subject: [PATCH 14/35] proper indexing of WDL Pair

---
 pipes/WDL/workflows/align_and_annot.wdl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
index 5fbd2f57c..f4725bfc7 100644
--- a/pipes/WDL/workflows/align_and_annot.wdl
+++ b/pipes/WDL/workflows/align_and_annot.wdl
@@ -13,12 +13,12 @@ workflow align_and_annot {
       assemblies_fasta = assemblies_fasta
   }
 
-  scatter(aln_by_chr, ref_annot_by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) {
+  scatter(by_chr in zip(mafft.alignments_by_chr, annotations_tbl)) {
     call ncbi.annot_transfer as annot {
       input:
-        chr_mutli_aln_fasta = aln_by_chr,
+        chr_mutli_aln_fasta = by_chr.left,
         reference_fasta = reference_fasta,
-        reference_feature_table = ref_annot_by_chr
+        reference_feature_table = by_chr.right
     }
     call ncbi.prepare_genbank as genbank {
       input:

From c5aa557de231f1edcd3966d9704cc8616beb4baa Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 22:47:14 -0500
Subject: [PATCH 15/35] fix WDL bug

---
 pipes/WDL/workflows/tasks/ncbi.wdl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index 3a2668e21..63381b69b 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -26,7 +26,7 @@ task download_fasta {
 task download_annotations {
   Array[String]+ accessions
   String         emailAddress
-  String         combined_fasta
+  String         combined_out_prefix
 
   command {
     set -ex -o pipefail
@@ -39,12 +39,12 @@ task download_annotations {
         ${emailAddress} \
         ./ \
         ${sep=' ' accessions} \
-        --combinedFilePrefix "${combined_fasta}" \
+        --combinedFilePrefix "${combined_out_prefix}" \
         --loglevel DEBUG
   }
 
   output {
-    File        combined_fasta = "${combined_fasta}.fasta"
+    File        combined_fasta = "${combined_out_prefix}.fasta"
     Array[File] genomes_fasta  = glob("*.fasta")
     Array[File] features_tbl   = glob("*.tbl")
   }

From 07d235a58a0a3a29b0243a69a90d0650d7314e9c Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 22:59:43 -0500
Subject: [PATCH 16/35] fix input bindings in align_and_annot workflow, make
 optional params optional in prep_genbank_files task

---
 pipes/WDL/workflows/align_and_annot.wdl |  2 +-
 pipes/WDL/workflows/tasks/ncbi.wdl      | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
index f4725bfc7..d26b9741f 100644
--- a/pipes/WDL/workflows/align_and_annot.wdl
+++ b/pipes/WDL/workflows/align_and_annot.wdl
@@ -24,7 +24,7 @@ workflow align_and_annot {
       input:
         assemblies_fasta = assemblies_fasta,
         annotations_tbl = annot.transferred_feature_tables,
-        out_prefix = basename(annotations_tbl, '.tbl')
+        out_prefix = basename(by_chr.right, '.tbl')
     }
   }
 }
diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index 63381b69b..a83f1cc06 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -87,11 +87,11 @@ task prepare_genbank {
   Array[File]+ assemblies_fasta
   Array[File]+ annotations_tbl
   File         authors_sbt
-  File         assemblySummary # summary.assembly.txt
-  File         genbankSourceTable
-  File         biosampleMap
-  String       sequencingTech
-  String       comment
+  File?        coverage_table # summary.assembly.txt
+  File?        genbankSourceTable
+  File?        biosampleMap
+  String?      sequencingTech
+  String?      comment
   String       out_prefix = "ncbi_package"
 
   command {
@@ -101,11 +101,11 @@ task prepare_genbank {
         ${authors_sbt} \
         ${sep=' ' assemblies_fasta} \
         . \
-        --master_source_table ${genbankSourceTable} \
-        --sequencing_tech ${sequencingTech} \
-        --biosample_map ${biosampleMap} \
-        --coverage_table ${assemblySummary} \
-        --comment ${comment} \
+        ${'--master_source_table=' + genbankSourceTable} \
+        ${'--sequencing_tech=' + sequencingTech} \
+        ${'--biosample_map=' + biosampleMap} \
+        ${'--coverage_table=' + coverage_table} \
+        ${'--comment=' + comment} \
         --loglevel DEBUG
     tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl
   }

From 4a3053bf8f01b11c626fa4a9b7ae5f93a2e88c09 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 23:17:35 -0500
Subject: [PATCH 17/35] dummy commit for travis

---
 pipes/WDL/workflows/tasks/ncbi.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
index a83f1cc06..2005e05e2 100644
--- a/pipes/WDL/workflows/tasks/ncbi.wdl
+++ b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -87,7 +87,7 @@ task prepare_genbank {
   Array[File]+ assemblies_fasta
   Array[File]+ annotations_tbl
   File         authors_sbt
-  File?        coverage_table # summary.assembly.txt
+  File?        coverage_table # summary.assembly.txt (from Snakemake)
   File?        genbankSourceTable
   File?        biosampleMap
   String?      sequencingTech

From c39120b8925f908c3a4177f69d51c5c880945f0a Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 8 Mar 2018 23:44:54 -0500
Subject: [PATCH 18/35] bump dxWDL version from 0.59 to 0.60.2

---
 travis/install-wdl.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/install-wdl.sh b/travis/install-wdl.sh
index 9fda7f316..40d9617ce 100755
--- a/travis/install-wdl.sh
+++ b/travis/install-wdl.sh
@@ -19,7 +19,7 @@ cached_fetch_jar_from_github () {
 
 cached_fetch_jar_from_github broadinstitute cromwell womtool 30.2
 cached_fetch_jar_from_github broadinstitute cromwell cromwell 30.2
-cached_fetch_jar_from_github dnanexus dxWDL dxWDL 0.59
+cached_fetch_jar_from_github dnanexus dxWDL dxWDL 0.60.2
 
 TGZ=dx-toolkit-v0.240.1-ubuntu-14.04-amd64.tar.gz
 if [ ! -f $CACHE_DIR/$TGZ ]; then

From 9d6a5afbf94e7f59c5f52b7b3e01c8222dd09574 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 00:07:01 -0500
Subject: [PATCH 19/35] add new required -imports param for dxWDL

---
 travis/build-dx.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/travis/build-dx.sh b/travis/build-dx.sh
index 3426dd74c..88ba4a67e 100755
--- a/travis/build-dx.sh
+++ b/travis/build-dx.sh
@@ -42,6 +42,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do
 
 	  dx_id=$(java -jar dxWDL.jar compile \
       $workflow $CMD_INPUT $CMD_DEFAULTS -f \
+      -imports pipes/WDL/workflows/ \
       -destination /build/$VERSION/$workflow_name)
 	  echo "Succeeded: $workflow_name = $dx_id"
     echo -e "$workflow_name\t$dx_id" >> $COMPILE_SUCCESS

From 5efc81f7523b4f468baa308da26b1eca3886eee3 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 00:36:31 -0500
Subject: [PATCH 20/35] use -imports and --imports params to dxWDL and cromwell
 invocations and remove "tasks/" from all WDL import statements. comment out
 align_and_annot for now to get compile working again

---
 pipes/WDL/workflows/align_and_annot.wdl              |  6 ++++--
 pipes/WDL/workflows/align_and_plot.wdl               |  2 +-
 pipes/WDL/workflows/assemble_denovo.wdl              |  4 ++--
 pipes/WDL/workflows/assemble_denovo_with_deplete.wdl |  4 ++--
 pipes/WDL/workflows/assemble_refbased.wdl            |  4 ++--
 pipes/WDL/workflows/classify_kraken.wdl              |  2 +-
 pipes/WDL/workflows/contigs.wdl                      |  6 +++---
 pipes/WDL/workflows/demux_metag.wdl                  | 10 +++++-----
 pipes/WDL/workflows/demux_only.wdl                   |  4 ++--
 pipes/WDL/workflows/demux_plus.wdl                   | 10 +++++-----
 pipes/WDL/workflows/deplete_only.wdl                 |  4 ++--
 pipes/WDL/workflows/download_annotations.wdl         |  2 +-
 pipes/WDL/workflows/scaffold_and_refine.wdl          |  4 ++--
 pipes/WDL/workflows/spikein.wdl                      |  2 +-
 pipes/WDL/workflows/tasks/interhost.wdl              |  2 +-
 travis/build-dx.sh                                   |  2 +-
 travis/tests-cromwell.sh                             |  1 +
 17 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
index d26b9741f..14939bf48 100644
--- a/pipes/WDL/workflows/align_and_annot.wdl
+++ b/pipes/WDL/workflows/align_and_annot.wdl
@@ -1,5 +1,7 @@
-import "tasks/interhost.wdl" as interhost
-import "tasks/ncbi.wdl" as ncbi
+import "interhost.wdl" as interhost
+import "ncbi.wdl" as ncbi
+
+# DX_SKIP_WORKFLOW
 
 workflow align_and_annot {
 
diff --git a/pipes/WDL/workflows/align_and_plot.wdl b/pipes/WDL/workflows/align_and_plot.wdl
index a8c8e2862..fd43049d8 100644
--- a/pipes/WDL/workflows/align_and_plot.wdl
+++ b/pipes/WDL/workflows/align_and_plot.wdl
@@ -1,4 +1,4 @@
-import "tasks/reports.wdl" as reports
+import "reports.wdl" as reports
 
 workflow align_and_plot {
   call reports.plot_coverage {
diff --git a/pipes/WDL/workflows/assemble_denovo.wdl b/pipes/WDL/workflows/assemble_denovo.wdl
index acd90aba7..2317cbd92 100644
--- a/pipes/WDL/workflows/assemble_denovo.wdl
+++ b/pipes/WDL/workflows/assemble_denovo.wdl
@@ -1,5 +1,5 @@
-import "tasks/taxon_filter.wdl" as taxon_filter
-import "tasks/assembly.wdl" as assembly
+import "taxon_filter.wdl" as taxon_filter
+import "assembly.wdl" as assembly
 
 workflow assemble_denovo {
   File reads_unmapped_bam
diff --git a/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl b/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl
index cf7419d90..e28cbda20 100644
--- a/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl
+++ b/pipes/WDL/workflows/assemble_denovo_with_deplete.wdl
@@ -1,5 +1,5 @@
-import "tasks/taxon_filter.wdl" as taxon_filter
-import "tasks/assembly.wdl" as assembly
+import "taxon_filter.wdl" as taxon_filter
+import "assembly.wdl" as assembly
 
 workflow assemble_denovo_with_deplete {
 
diff --git a/pipes/WDL/workflows/assemble_refbased.wdl b/pipes/WDL/workflows/assemble_refbased.wdl
index 3dc563af7..3a8c00476 100644
--- a/pipes/WDL/workflows/assemble_refbased.wdl
+++ b/pipes/WDL/workflows/assemble_refbased.wdl
@@ -1,5 +1,5 @@
-import "tasks/assembly.wdl" as assembly
+import "assembly.wdl" as assembly
 
 workflow assemble_refbased {
   call assembly.refine_2x_and_plot
-}
\ No newline at end of file
+}
diff --git a/pipes/WDL/workflows/classify_kraken.wdl b/pipes/WDL/workflows/classify_kraken.wdl
index a1af021e9..abbd40d4d 100644
--- a/pipes/WDL/workflows/classify_kraken.wdl
+++ b/pipes/WDL/workflows/classify_kraken.wdl
@@ -1,4 +1,4 @@
-import "tasks/metagenomics.wdl" as metagenomics
+import "metagenomics.wdl" as metagenomics
 
 workflow classify_kraken {
   call metagenomics.kraken
diff --git a/pipes/WDL/workflows/contigs.wdl b/pipes/WDL/workflows/contigs.wdl
index baa9d8fcf..f71be3152 100644
--- a/pipes/WDL/workflows/contigs.wdl
+++ b/pipes/WDL/workflows/contigs.wdl
@@ -1,6 +1,6 @@
-import "tasks/metagenomics.wdl" as metagenomics
-import "tasks/taxon_filter.wdl" as taxon_filter
-import "tasks/assembly.wdl" as assembly
+import "metagenomics.wdl" as metagenomics
+import "taxon_filter.wdl" as taxon_filter
+import "assembly.wdl" as assembly
 
 workflow contigs {
 
diff --git a/pipes/WDL/workflows/demux_metag.wdl b/pipes/WDL/workflows/demux_metag.wdl
index f48b74ff6..de2e467fa 100644
--- a/pipes/WDL/workflows/demux_metag.wdl
+++ b/pipes/WDL/workflows/demux_metag.wdl
@@ -1,10 +1,10 @@
 #DX_SKIP_WORKFLOW
 
-import "tasks/demux.wdl" as demux
-import "tasks/metagenomics.wdl" as metagenomics
-import "tasks/taxon_filter.wdl" as taxon_filter
-import "tasks/assembly.wdl" as assembly
-import "tasks/reports.wdl" as reports
+import "demux.wdl" as demux
+import "metagenomics.wdl" as metagenomics
+import "taxon_filter.wdl" as taxon_filter
+import "assembly.wdl" as assembly
+import "reports.wdl" as reports
 
 workflow demux_metag {
   File krona_taxonomy_db_tgz
diff --git a/pipes/WDL/workflows/demux_only.wdl b/pipes/WDL/workflows/demux_only.wdl
index 745eabe60..4fcf8971d 100644
--- a/pipes/WDL/workflows/demux_only.wdl
+++ b/pipes/WDL/workflows/demux_only.wdl
@@ -1,5 +1,5 @@
-import "tasks/demux.wdl" as tasks_demux
+import "demux.wdl" as tasks_demux
 
 workflow demux_only {
   call tasks_demux.illumina_demux
-}
\ No newline at end of file
+}
diff --git a/pipes/WDL/workflows/demux_plus.wdl b/pipes/WDL/workflows/demux_plus.wdl
index ef37579f3..2582aef98 100644
--- a/pipes/WDL/workflows/demux_plus.wdl
+++ b/pipes/WDL/workflows/demux_plus.wdl
@@ -1,8 +1,8 @@
-import "tasks/demux.wdl" as demux
-import "tasks/metagenomics.wdl" as metagenomics
-import "tasks/taxon_filter.wdl" as taxon_filter
-import "tasks/assembly.wdl" as assembly
-import "tasks/reports.wdl" as reports
+import "demux.wdl" as demux
+import "metagenomics.wdl" as metagenomics
+import "taxon_filter.wdl" as taxon_filter
+import "assembly.wdl" as assembly
+import "reports.wdl" as reports
 
 workflow demux_plus {
 
diff --git a/pipes/WDL/workflows/deplete_only.wdl b/pipes/WDL/workflows/deplete_only.wdl
index 67aa73bb5..36914fdfa 100644
--- a/pipes/WDL/workflows/deplete_only.wdl
+++ b/pipes/WDL/workflows/deplete_only.wdl
@@ -1,5 +1,5 @@
-import "tasks/taxon_filter.wdl" as taxon_filter
+import "taxon_filter.wdl" as taxon_filter
 
 workflow deplete_only {
   call taxon_filter.deplete_taxa
-}
\ No newline at end of file
+}
diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/download_annotations.wdl
index a3ef7c6f2..296673624 100644
--- a/pipes/WDL/workflows/download_annotations.wdl
+++ b/pipes/WDL/workflows/download_annotations.wdl
@@ -1,4 +1,4 @@
-import "tasks/ncbi.wdl" as ncbi
+import "ncbi.wdl" as ncbi
 
 workflow download_annotations {
 
diff --git a/pipes/WDL/workflows/scaffold_and_refine.wdl b/pipes/WDL/workflows/scaffold_and_refine.wdl
index b6ff3745c..44916720a 100644
--- a/pipes/WDL/workflows/scaffold_and_refine.wdl
+++ b/pipes/WDL/workflows/scaffold_and_refine.wdl
@@ -1,4 +1,4 @@
-import "tasks/assembly.wdl" as assembly
+import "assembly.wdl" as assembly
 
 workflow scaffold_and_refine {
   File reads_unmapped_bam
@@ -13,4 +13,4 @@ workflow scaffold_and_refine {
       assembly_fasta = scaffold.scaffold_fasta,
       reads_unmapped_bam = reads_unmapped_bam
   }
-}
\ No newline at end of file
+}
diff --git a/pipes/WDL/workflows/spikein.wdl b/pipes/WDL/workflows/spikein.wdl
index 583d364c7..1ecedb63a 100644
--- a/pipes/WDL/workflows/spikein.wdl
+++ b/pipes/WDL/workflows/spikein.wdl
@@ -1,4 +1,4 @@
-import "tasks/reports.wdl" as reports
+import "reports.wdl" as reports
 
 workflow spikein {
 
diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl
index 636ff55b9..f4e1ce423 100644
--- a/pipes/WDL/workflows/tasks/interhost.wdl
+++ b/pipes/WDL/workflows/tasks/interhost.wdl
@@ -110,7 +110,7 @@ task multi_align_mafft_ref {
 
 task multi_align_mafft {
   Array[File]+   assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
-  String?        out_prefix = basename(select_first(assemblies_fasta), '.fasta')
+  String         out_prefix
   Int?           mafft_maxIters
   Int?           mafft_ep
 
diff --git a/travis/build-dx.sh b/travis/build-dx.sh
index 88ba4a67e..c10207ba5 100755
--- a/travis/build-dx.sh
+++ b/travis/build-dx.sh
@@ -42,7 +42,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do
 
 	  dx_id=$(java -jar dxWDL.jar compile \
       $workflow $CMD_INPUT $CMD_DEFAULTS -f \
-      -imports pipes/WDL/workflows/ \
+      -imports pipes/WDL/workflows/tasks/ \
       -destination /build/$VERSION/$workflow_name)
 	  echo "Succeeded: $workflow_name = $dx_id"
     echo -e "$workflow_name\t$dx_id" >> $COMPILE_SUCCESS
diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh
index 0ae71ae60..13e92c1b5 100755
--- a/travis/tests-cromwell.sh
+++ b/travis/tests-cromwell.sh
@@ -13,6 +13,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do
 		# the "cat" is to allow a pipe failure (otherwise it halts because of set -e)
 		java -jar cromwell.jar run \
 			workflows/$workflow_name.wdl \
+			--imports tasks \
 			-i $input_json | tee cromwell.out
 		if [ ${PIPESTATUS[0]} -gt 0 ]; then
 			echo "error running $workflow_name"

From f3555bb7ca5d152e0e866d9aac7685adbb5c727a Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 08:55:47 -0500
Subject: [PATCH 21/35] fix validate wdl script

---
 travis/validate-wdl.sh | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/travis/validate-wdl.sh b/travis/validate-wdl.sh
index 91e4be947..cdf78d3ec 100755
--- a/travis/validate-wdl.sh
+++ b/travis/validate-wdl.sh
@@ -1,9 +1,23 @@
 #!/bin/bash
 set -e -o pipefail
 
-ln -s pipes/WDL/workflows/tasks .
-for workflow in pipes/WDL/workflows/*.wdl; do
+# validate each imported library of tasks on its own
+for tasks in pipes/WDL/workflows/tasks/*.wdl; do
+  echo "validating tasks $tasks"
+  java -jar womtool.jar validate $tasks
+done
+
+# validate the workflow files
+# unfortunately, dxWDL now requires the -imports parameter and cromwell supports
+# it as well but womtool validate does not yet support it! so we have to copy
+# everything to a temp dir
+mkdir wdl_validate_test
+cd wdl_validate_test
+cp ../pipes/WDL/workflows/tasks/*.wdl ../pipes/WDL/workflows/*.wdl .
+for workflow in ../pipes/WDL/workflows/*.wdl; do
+  workflow=`basename $workflow`
   echo "validating $workflow"
   java -jar womtool.jar validate $workflow
 done
-rm tasks
+cd -
+rm -r wdl_validate_test

From 71bc09397337fc9dfd46b081464a5b75a927012f Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 12:53:43 -0500
Subject: [PATCH 22/35] updates to tbl2asn invocations

---
 ncbi.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ncbi.py b/ncbi.py
index 6f40589fd..39914d9f5 100755
--- a/ncbi.py
+++ b/ncbi.py
@@ -396,7 +396,7 @@ def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=N
 
 def prep_genbank_files(templateFile, fasta_files, annotDir,
                        master_source_table=None, comment=None, sequencing_tech=None,
-                       coverage_table=None, biosample_map=None):
+                       coverage_table=None, biosample_map=None, organism=None):
     ''' Prepare genbank submission files.  Requires .fasta and .tbl files as input,
         as well as numerous other metadata files for the submission.  Creates a
         directory full of files (.sqn in particular) that can be sent to GenBank.
@@ -451,7 +451,11 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
 
     # run tbl2asn (relies on filesnames matching by prefix)
     tbl2asn = tools.tbl2asn.Tbl2AsnTool()
-    tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True)
+    source_quals = []
+    if organism:
+        source_quals = [('organism', organism)]
+    tbl2asn.execute(templateFile, annotDir, comment=comment,
+        per_genome_comment=True, source_quals=source_quals)
 
 
 def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
@@ -462,6 +466,7 @@ def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
     parser.add_argument('--comment', default=None, help='comment field')
     parser.add_argument('--sequencing_tech', default=None, help='sequencing technology (e.g. Illumina HiSeq 2500)')
     parser.add_argument('--master_source_table', default=None, help='source modifier table')
+    parser.add_argument('--organism', default=None, help='species name')
     parser.add_argument("--biosample_map",
                         help="""A file with two columns and a header: sample and BioSample.
         This file may refer to samples that are not included in this submission.""")

From b8cf6ea42514f5405d35fad990058a0a2b21fb0e Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 12:58:45 -0500
Subject: [PATCH 23/35] oops

---
 travis/validate-wdl.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/validate-wdl.sh b/travis/validate-wdl.sh
index cdf78d3ec..ce27c7bfe 100755
--- a/travis/validate-wdl.sh
+++ b/travis/validate-wdl.sh
@@ -17,7 +17,7 @@ cp ../pipes/WDL/workflows/tasks/*.wdl ../pipes/WDL/workflows/*.wdl .
 for workflow in ../pipes/WDL/workflows/*.wdl; do
   workflow=`basename $workflow`
   echo "validating $workflow"
-  java -jar womtool.jar validate $workflow
+  java -jar ../womtool.jar validate $workflow
 done
 cd -
 rm -r wdl_validate_test

From 82bda37f47091ba952e425128529939439c7df0e Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 13:42:59 -0500
Subject: [PATCH 24/35] change deprecated dx instance

---
 pipes/WDL/workflows/tasks/taxon_filter.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/workflows/tasks/taxon_filter.wdl b/pipes/WDL/workflows/tasks/taxon_filter.wdl
index 7028784c4..1736af871 100644
--- a/pipes/WDL/workflows/tasks/taxon_filter.wdl
+++ b/pipes/WDL/workflows/tasks/taxon_filter.wdl
@@ -175,6 +175,6 @@ task merge_one_per_sample {
     memory: "7 GB"
     cpu: 4
     docker: "quay.io/broadinstitute/viral-ngs"
-    dx_instance_type: "mem2_hdd2_x4"
+    dx_instance_type: "mem1_hdd2_x8"
   }
 }

From 291d10467af9ed94dc7ce3c7956e80506e2ccf06 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 14:04:10 -0500
Subject: [PATCH 25/35] dx is deprecating all hdd instances?

---
 pipes/WDL/workflows/tasks/metagenomics.wdl | 2 +-
 pipes/WDL/workflows/tasks/taxon_filter.wdl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipes/WDL/workflows/tasks/metagenomics.wdl b/pipes/WDL/workflows/tasks/metagenomics.wdl
index a0a6faba9..d3b31a25c 100644
--- a/pipes/WDL/workflows/tasks/metagenomics.wdl
+++ b/pipes/WDL/workflows/tasks/metagenomics.wdl
@@ -117,7 +117,7 @@ task krona {
     docker: "quay.io/broadinstitute/viral-ngs"
     memory: "4 GB"
     cpu: 1
-    dx_instance_type: "mem2_hdd2_x2"
+    dx_instance_type: "mem1_ssd2_x2"
   }
 }
 
diff --git a/pipes/WDL/workflows/tasks/taxon_filter.wdl b/pipes/WDL/workflows/tasks/taxon_filter.wdl
index 1736af871..3ce406c0b 100644
--- a/pipes/WDL/workflows/tasks/taxon_filter.wdl
+++ b/pipes/WDL/workflows/tasks/taxon_filter.wdl
@@ -175,6 +175,6 @@ task merge_one_per_sample {
     memory: "7 GB"
     cpu: 4
     docker: "quay.io/broadinstitute/viral-ngs"
-    dx_instance_type: "mem1_hdd2_x8"
+    dx_instance_type: "mem1_ssd2_x4"
   }
 }

From 67403501dea415c71252263bc5a8acac48602961 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 14:24:43 -0500
Subject: [PATCH 26/35] avoid workflow applet namespace clashes

---
 pipes/WDL/dx-defaults-spikein.json           | 2 +-
 pipes/WDL/workflows/download_annotations.wdl | 2 +-
 pipes/WDL/workflows/spikein.wdl              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipes/WDL/dx-defaults-spikein.json b/pipes/WDL/dx-defaults-spikein.json
index 6c1c6b406..667d48095 100644
--- a/pipes/WDL/dx-defaults-spikein.json
+++ b/pipes/WDL/dx-defaults-spikein.json
@@ -1,4 +1,4 @@
 {
-  "spikein.spikein.spikein_db":
+  "spikein.spikein_report.spikein_db":
     "dx://file-F6PXkF00Yqp3zVXq14fF98Kz"
 }
diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/download_annotations.wdl
index 296673624..c723b0e6b 100644
--- a/pipes/WDL/workflows/download_annotations.wdl
+++ b/pipes/WDL/workflows/download_annotations.wdl
@@ -2,6 +2,6 @@ import "ncbi.wdl" as ncbi
 
 workflow download_annotations {
 
-  call ncbi.download_annotations
+  call ncbi.download_annotations as download
 
 }
diff --git a/pipes/WDL/workflows/spikein.wdl b/pipes/WDL/workflows/spikein.wdl
index 1ecedb63a..4eb0d285c 100644
--- a/pipes/WDL/workflows/spikein.wdl
+++ b/pipes/WDL/workflows/spikein.wdl
@@ -2,6 +2,6 @@ import "reports.wdl" as reports
 
 workflow spikein {
 
-  call reports.spikein_report as spikein
+  call reports.spikein_report as spikein_report
 
 }

From bab7e362dd18d360f88b87e49c1f44c5b6e1f719 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 15:09:00 -0500
Subject: [PATCH 27/35] rename workflow to avoid clash in namespace

---
 .../{download_annotations.wdl => fetch_annotations.wdl}         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename pipes/WDL/workflows/{download_annotations.wdl => fetch_annotations.wdl} (70%)

diff --git a/pipes/WDL/workflows/download_annotations.wdl b/pipes/WDL/workflows/fetch_annotations.wdl
similarity index 70%
rename from pipes/WDL/workflows/download_annotations.wdl
rename to pipes/WDL/workflows/fetch_annotations.wdl
index c723b0e6b..c53e46cab 100644
--- a/pipes/WDL/workflows/download_annotations.wdl
+++ b/pipes/WDL/workflows/fetch_annotations.wdl
@@ -1,6 +1,6 @@
 import "ncbi.wdl" as ncbi
 
-workflow download_annotations {
+workflow fetch_annotations {
 
   call ncbi.download_annotations as download
 

From fd7f35646f4f5b7d0dc08a3e2da0169f371c7939 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 15:45:12 -0500
Subject: [PATCH 28/35] cromwell doesn't like the --imports from a directory,
 even though it says it can take that.. boo

---
 travis/tests-cromwell.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh
index 13e92c1b5..5437ac908 100755
--- a/travis/tests-cromwell.sh
+++ b/travis/tests-cromwell.sh
@@ -2,7 +2,8 @@
 set -e  # intentionally allow for pipe failures below
 
 ln -s $GATK_PATH/GenomeAnalysisTK.jar .
-ln -s pipes/WDL/workflows pipes/WDL/workflows/tasks .
+mkdir -p workflows
+ln -s pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows
 
 for workflow in pipes/WDL/workflows/*.wdl; do
 	workflow_name=$(basename $workflow .wdl)
@@ -13,7 +14,6 @@ for workflow in pipes/WDL/workflows/*.wdl; do
 		# the "cat" is to allow a pipe failure (otherwise it halts because of set -e)
 		java -jar cromwell.jar run \
 			workflows/$workflow_name.wdl \
-			--imports tasks \
 			-i $input_json | tee cromwell.out
 		if [ ${PIPESTATUS[0]} -gt 0 ]; then
 			echo "error running $workflow_name"

From 7bba86e851e21ef88256cb2ef50f901681ad30c9 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 16:18:06 -0500
Subject: [PATCH 29/35] fix invocation of cromwell for tests

---
 travis/tests-cromwell.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh
index 5437ac908..7f312df74 100755
--- a/travis/tests-cromwell.sh
+++ b/travis/tests-cromwell.sh
@@ -4,8 +4,9 @@ set -e  # intentionally allow for pipe failures below
 ln -s $GATK_PATH/GenomeAnalysisTK.jar .
 mkdir -p workflows
 ln -s pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows
+cd workflows
 
-for workflow in pipes/WDL/workflows/*.wdl; do
+for workflow in ../pipes/WDL/workflows/*.wdl; do
 	workflow_name=$(basename $workflow .wdl)
 	input_json="test/input/WDL/test_inputs-$workflow_name-local.json"
 	if [ -f $input_json ]; then
@@ -13,7 +14,7 @@ for workflow in pipes/WDL/workflows/*.wdl; do
 		echo "Executing $workflow_name using Cromwell on local instance"
 		# the "cat" is to allow a pipe failure (otherwise it halts because of set -e)
 		java -jar cromwell.jar run \
-			workflows/$workflow_name.wdl \
+			$workflow_name.wdl \
 			-i $input_json | tee cromwell.out
 		if [ ${PIPESTATUS[0]} -gt 0 ]; then
 			echo "error running $workflow_name"
@@ -29,5 +30,6 @@ for workflow in pipes/WDL/workflows/*.wdl; do
     fi
 done
 
+cd -
 date
 echo "note: there is no testing of output correctness yet..."

From 35570f64aa2d4e0373278e104cf592a32ed89a79 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 16:28:02 -0500
Subject: [PATCH 30/35] add moltype

---
 ncbi.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ncbi.py b/ncbi.py
index 39914d9f5..6df2d032a 100755
--- a/ncbi.py
+++ b/ncbi.py
@@ -396,7 +396,7 @@ def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=N
 
 def prep_genbank_files(templateFile, fasta_files, annotDir,
                        master_source_table=None, comment=None, sequencing_tech=None,
-                       coverage_table=None, biosample_map=None, organism=None):
+                       coverage_table=None, biosample_map=None, organism=None, mol_type=None):
     ''' Prepare genbank submission files.  Requires .fasta and .tbl files as input,
         as well as numerous other metadata files for the submission.  Creates a
         directory full of files (.sqn in particular) that can be sent to GenBank.
@@ -453,7 +453,9 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
     tbl2asn = tools.tbl2asn.Tbl2AsnTool()
     source_quals = []
     if organism:
-        source_quals = [('organism', organism)]
+        source_quals.append(('organism', organism))
+    if mol_type:
+        source_quals.append(('mol_type', mol_type))
     tbl2asn.execute(templateFile, annotDir, comment=comment,
         per_genome_comment=True, source_quals=source_quals)
 
@@ -467,6 +469,7 @@ def parser_prep_genbank_files(parser=argparse.ArgumentParser()):
     parser.add_argument('--sequencing_tech', default=None, help='sequencing technology (e.g. Illumina HiSeq 2500)')
     parser.add_argument('--master_source_table', default=None, help='source modifier table')
     parser.add_argument('--organism', default=None, help='species name')
+    parser.add_argument('--mol_type', default=None, help='molecule type')
     parser.add_argument("--biosample_map",
                         help="""A file with two columns and a header: sample and BioSample.
         This file may refer to samples that are not included in this submission.""")

From e01434cf987fe984ed5d67da5fec859451bfe79d Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 16:56:17 -0500
Subject: [PATCH 31/35] fix src stuff

---
 ncbi.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ncbi.py b/ncbi.py
index 6df2d032a..56ed48558 100755
--- a/ncbi.py
+++ b/ncbi.py
@@ -436,13 +436,22 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
                     Bio.SeqIO.write(seq_obj, out_chr_fasta, "fasta")
 
                 # make .fsa files
-                fasta2fsa(out_file_name, annotDir, biosample=biosample.get(sample))
+                fasta2fsa(out_file_name, annotDir, biosample=biosample.get(sample_base))
                 # remove the .fasta file
                 os.unlink(out_file_name)
 
                 # make .src files
                 if master_source_table:
-                    shutil.copy(master_source_table, os.path.join(annotDir, sample + '.src'))
+                    out_src_fname = os.path.join(annotDir, sample + '.src')
+                    with open(master_source_table, 'rt') as inf:
+                        with open(out_src_fname, 'wt') as outf:
+                            outf.write(inf.readline())
+                            for line in inf:
+                                row = line.rsrtrip('\n').split('\t')
+                                if row[0] == sample_base:
+                                    row[0] = sample
+                                    outf.write('\t'.join(row) + '\n')
+
                 # make .cmt files
                 make_structured_comment_file(os.path.join(annotDir, sample + '.cmt'),
                                              name=sample,

From 11cbd1229eecf96958587176ffe27cb737fe3df5 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Fri, 9 Mar 2018 16:56:33 -0500
Subject: [PATCH 32/35] fix src stuff again (rsrtrip -> rstrip typo)

---
 ncbi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ncbi.py b/ncbi.py
index 56ed48558..8db5bc229 100755
--- a/ncbi.py
+++ b/ncbi.py
@@ -447,7 +447,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
                         with open(out_src_fname, 'wt') as outf:
                             outf.write(inf.readline())
                             for line in inf:
-                                row = line.rsrtrip('\n').split('\t')
+                                row = line.rstrip('\n').split('\t')
                                 if row[0] == sample_base:
                                     row[0] = sample
                                     outf.write('\t'.join(row) + '\n')

From d39d10982757a8dc0943206e09a3577c98a15274 Mon Sep 17 00:00:00 2001
From: Ilya Shlyakhter <ilya_shl@alum.mit.edu>
Date: Tue, 20 Mar 2018 21:44:16 -0400
Subject: [PATCH 33/35] fixed tests of cromwell on local instance (#798)

* fixed tests of cromwell on local instance
---
 travis/tests-cromwell.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/travis/tests-cromwell.sh b/travis/tests-cromwell.sh
index 7f312df74..03790b6ea 100755
--- a/travis/tests-cromwell.sh
+++ b/travis/tests-cromwell.sh
@@ -3,7 +3,8 @@ set -e  # intentionally allow for pipe failures below
 
 ln -s $GATK_PATH/GenomeAnalysisTK.jar .
 mkdir -p workflows
-ln -s pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows
+cp *.jar pipes/WDL/workflows/*.wdl pipes/WDL/workflows/tasks/*.wdl workflows
+cp -r test workflows/
 cd workflows
 
 for workflow in ../pipes/WDL/workflows/*.wdl; do

From beb91a3a93f818b31314802bc1f1f320bf68d591 Mon Sep 17 00:00:00 2001
From: Chris Tomkins-Tinch <tomkinsc@gmail.com>
Date: Fri, 23 Mar 2018 15:46:27 -0400
Subject: [PATCH 34/35] pass sample name in WDL call to plot_coverage (#799)

* pass sample name in WDL call to plot_coverage

pass sample name in WDL call to plot_coverage so the same name shows up in the coverage plot PDF

* pass sample name in second WDL call to plot_coverage

within refine_2x_and_plot
---
 pipes/WDL/workflows/tasks/assembly.wdl | 1 +
 pipes/WDL/workflows/tasks/reports.wdl  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/pipes/WDL/workflows/tasks/assembly.wdl b/pipes/WDL/workflows/tasks/assembly.wdl
index 12aa15f64..fe943b0a1 100644
--- a/pipes/WDL/workflows/tasks/assembly.wdl
+++ b/pipes/WDL/workflows/tasks/assembly.wdl
@@ -325,6 +325,7 @@ task refine_2x_and_plot {
         --plotWidth 1100 \
         --plotHeight 850 \
         --plotDPI 100 \
+        --plotTitle "${sample_name} coverage plot" \
         --loglevel=DEBUG
     else
       touch ${sample_name}.coverage_plot.pdf
diff --git a/pipes/WDL/workflows/tasks/reports.wdl b/pipes/WDL/workflows/tasks/reports.wdl
index 22652dbc6..1f6c7f3cc 100644
--- a/pipes/WDL/workflows/tasks/reports.wdl
+++ b/pipes/WDL/workflows/tasks/reports.wdl
@@ -65,6 +65,7 @@ task plot_coverage {
         --plotWidth 1100 \
         --plotHeight 850 \
         --plotDPI 100 \
+        --plotTitle "${sample_name} coverage plot" \
         --loglevel=DEBUG
     else
       touch ${sample_name}.coverage_plot.pdf

From 2e2e05af036c1e840380c098cb75ce8135024832 Mon Sep 17 00:00:00 2001
From: Daniel Park <dpark@broadinstitute.org>
Date: Mon, 26 Mar 2018 13:31:25 -0400
Subject: [PATCH 35/35] bump viral-baseimage 0.1.8 to 0.1.9 (#802)

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 2cefcd13d..599ee8e89 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/broadinstitute/viral-baseimage:0.1.8
+FROM quay.io/broadinstitute/viral-baseimage:0.1.9
 
 LABEL maintainer "viral-ngs@broadinstitute.org"