Merge remote-tracking branch 'origin/master' into is-add-metaspades
notestaff committed Oct 8, 2019
2 parents 3bc174e + 9f5b2c0 commit d45c9d8
Showing 57 changed files with 687 additions and 239 deletions.
13 changes: 8 additions & 5 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM quay.io/broadinstitute/viral-baseimage:0.1.14
FROM quay.io/broadinstitute/viral-baseimage:0.1.15

LABEL maintainer "viral-ngs@broadinstitute.org"

@@ -20,17 +20,20 @@ LABEL maintainer "viral-ngs@broadinstitute.org"
ENV \
INSTALL_PATH="/opt/viral-ngs" \
VIRAL_NGS_PATH="/opt/viral-ngs/source" \
MINICONDA_PATH="/opt/miniconda"
MINICONDA_PATH="/opt/miniconda" \
CONDA_DEFAULT_ENV=viral-ngs-env
ENV \
PATH="$VIRAL_NGS_PATH:$MINICONDA_PATH/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \
CONDA_DEFAULT_ENV=$MINICONDA_PATH \
CONDA_PREFIX=$MINICONDA_PATH \
PATH="$VIRAL_NGS_PATH:$MINICONDA_PATH/envs/$CONDA_DEFAULT_ENV/bin:$MINICONDA_PATH/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" \
CONDA_PREFIX=$MINICONDA_PATH/envs/$CONDA_DEFAULT_ENV \
JAVA_HOME=$MINICONDA_PATH

# Prepare viral-ngs user and installation directory
# Set it up so that this slow & heavy build layer is cached
# unless the requirements* files or the install scripts actually change
WORKDIR $INSTALL_PATH
RUN conda create -n $CONDA_DEFAULT_ENV python=3.6
RUN echo "source activate $CONDA_DEFAULT_ENV" > ~/.bashrc
RUN hash -r
COPY docker/install-viral-ngs.sh $VIRAL_NGS_PATH/docker/
COPY requirements-minimal.txt $VIRAL_NGS_PATH/
RUN $VIRAL_NGS_PATH/docker/install-viral-ngs.sh minimal
Empty file modified conftest.py 100644 → 100755
Empty file.
21 changes: 11 additions & 10 deletions docker/install-viral-ngs.sh
@@ -15,15 +15,14 @@

set -e -o pipefail

CONDA_CHANNEL_STRING="--override-channels -c broad-viral -c conda-forge -c bioconda -c defaults"
echo "PATH: ${PATH}"
echo "INSTALL_PATH: ${INSTALL_PATH}"
echo "CONDA_PREFIX: ${CONDA_PREFIX}"
echo "VIRAL_NGS_PATH: ${VIRAL_NGS_PATH}"
echo "MINICONDA_PATH: ${MINICONDA_PATH}"
echo "CONDA_DEFAULT_ENV: ${CONDA_DEFAULT_ENV}"

mkdir -p $INSTALL_PATH/viral-ngs-etc
if [ ! -f $INSTALL_PATH/viral-ngs-etc/viral-ngs ]; then
ln -s $VIRAL_NGS_PATH $INSTALL_PATH/viral-ngs-etc/viral-ngs
fi
if [ ! -f $INSTALL_PATH/viral-ngs-etc/conda-env ]; then
ln -s $CONDA_DEFAULT_ENV $INSTALL_PATH/viral-ngs-etc/conda-env
fi
CONDA_CHANNEL_STRING="--override-channels -c broad-viral -c conda-forge -c bioconda -c defaults"

# setup/install viral-ngs directory tree and conda dependencies
sync
@@ -33,13 +32,15 @@ if [[ "$1" == "minimal" ]]; then
# a more minimal set of tools (smaller docker image?)
conda install -y \
-q $CONDA_CHANNEL_STRING \
--file "$VIRAL_NGS_PATH/requirements-minimal.txt"
--file "$VIRAL_NGS_PATH/requirements-minimal.txt" \
-p "${CONDA_PREFIX}"
else
conda install -y \
-q $CONDA_CHANNEL_STRING \
--file "$VIRAL_NGS_PATH/requirements-py3.txt" \
--file "$VIRAL_NGS_PATH/requirements-conda.txt" \
--file "$VIRAL_NGS_PATH/requirements-conda-tests.txt"
--file "$VIRAL_NGS_PATH/requirements-conda-tests.txt" \
-p "${CONDA_PREFIX}"
fi

# clean up
2 changes: 1 addition & 1 deletion docker/rundocker.sh
@@ -4,7 +4,7 @@
# The following paths have to be modified according to end-user environment
NOVOALIGN_PATH="/opt/novocraft" # Directory where the novoalign.lic license file can be found
GATK_PATH="/opt/GenomeAnalysisTK-3.8" # Directory where the correct GATK jar file can be found
IMAGE_HASH_OR_TAG="local/viral-ngs:1.16.0" # This can be found by running this command 'docker images'
IMAGE_HASH_OR_TAG="quay.io/broadinstitute/viral-ngs:latest" # This can be found by running 'docker images'
DATA_DIR="$1"; shift
GID=$(id -g $USER)

15 changes: 9 additions & 6 deletions docs/install.rst
@@ -24,6 +24,9 @@ the pipeline using graphical and command-line interfaces. Instructions
for the cloud analysis pipeline are available at
https://github.com/dnanexus/viral-ngs/wiki

The latest versions of viral-ngs are available on DNAnexus in the following project:
https://platform.dnanexus.com/projects/F8PQ6380xf5bK0Qk0YPjB17P/data/build/quay.io/broadinstitute/viral-ngs


Google Cloud Platform: deploy to GCE VM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -45,12 +48,12 @@ All of the command line functions in viral-ngs are accessible from the docker image

Here is an example invocation of ``illumina.py illumina_demux`` (replace the project with your GCP project, and the input, output-recursive, and logging parameters with URIs within your GCS buckets)::

dsub --project gcid-viral-seq --zones "us-central1-*" \
dsub --project my-google-project-id --zones "us-central1-*" \
--image quay.io/broadinstitute/viral-ngs \
--name illumina_demux-test \
--logging gs://viral-temp-30d/dpark/test-demux/logs \
--input FC_TGZ=gs://viral-sequencing/flowcells/broad-walkup/160907_M04004_0066_000000000-AJH8U.tar.gz \
--output-recursive OUTDIR=gs://viral-temp-30d/dpark/test-demux \
--name illumina_demux \
--logging gs://mybucket/logs \
--input FC_TGZ=gs://mybucket/flowcells/160907_M04004_0066_000000000-AJH8U.tar.gz \
--output-recursive OUTDIR=gs://mybucket/demux \
--command 'illumina.py illumina_demux ${FC_TGZ} 1 ${OUTDIR}' \
--min-ram 30 \
--min-cores 8 \
@@ -97,7 +100,7 @@ In order to finish installing viral-ngs, you will need to activate its conda env

source activate viral-ngs-env

Due to license restrictions, the viral-ngs conda package cannot distribute and install GATK directly. To fully install GATK, you must download a licensed copy of GATK v3.6 `from the Broad Institute <https://software.broadinstitute.org/gatk/download/archive>`_, and call "gatk3-register," which will copy GATK into your viral-ngs conda environment::
Due to license restrictions, the viral-ngs conda package cannot distribute and install GATK directly. To fully install GATK, you must download a licensed copy of GATK v3.8 `from the Broad Institute <https://software.broadinstitute.org/gatk/download/archive>`_, and call "gatk3-register," which will copy GATK into your viral-ngs conda environment::

mkdir -p /path/to/gatk_dir
wget -O - 'https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.8-0-ge9d806836' | tar -xjvC /path/to/gatk_dir
7 changes: 7 additions & 0 deletions errors.py
@@ -0,0 +1,7 @@
#!/usr/bin/env python

class QCError(RuntimeError):
'''Indicates a failure at a QC step.'''

def __init__(self, reason):
super(QCError, self).__init__(reason)
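The new `QCError` subclass gives callers a way to distinguish QC failures from other runtime errors. A minimal usage sketch (the `check_mean_base_quality` gate and its threshold are hypothetical, not part of this commit):

```python
class QCError(RuntimeError):
    '''Indicates a failure at a QC step.'''

    def __init__(self, reason):
        super(QCError, self).__init__(reason)

def check_mean_base_quality(mean_q, threshold=25):
    # hypothetical QC gate, for illustration only
    if mean_q < threshold:
        raise QCError("mean base quality {} below threshold {}".format(mean_q, threshold))

try:
    check_mean_base_quality(18)
except QCError as e:
    # QC failures can be caught separately from other RuntimeErrors
    print("QC failed: {}".format(e))
```

Because `QCError` subclasses `RuntimeError`, existing broad `except RuntimeError` handlers still catch it, while new call sites can handle QC failures specifically.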
6 changes: 5 additions & 1 deletion illumina.py
@@ -722,7 +722,11 @@ def _detect_and_load_sheet(self, infile):
header = None
miseq_skip = False
row_num = 0
for line in inf:
for line_no, line in enumerate(inf):
if line_no==0:
# remove BOM, if present
line = line.replace('\ufeff','')

# if this is a blank line, skip parsing and continue to the next line...
if len(line.rstrip('\r\n').strip()) == 0:
continue
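The first-line BOM strip above handles sample sheets saved with a UTF-8 byte-order mark (common when exported from Excel). A sketch of the same behavior, plus an alternative using Python's `utf-8-sig` codec (sample-sheet data here is illustrative):

```python
import io

raw = '\ufeffSample_ID,Sample_Name\nS1,foo\n'  # illustrative sheet with a BOM

# approach taken in the diff: strip the BOM from the first line only
lines = raw.splitlines(True)
lines[0] = lines[0].replace('\ufeff', '')
assert lines[0].startswith('Sample_ID')

# alternative: open with encoding='utf-8-sig' and let the codec drop the BOM
buf = io.TextIOWrapper(io.BytesIO(raw.encode('utf-8')), encoding='utf-8-sig')
assert buf.readline().startswith('Sample_ID')
```

The in-loop replace keeps the existing file-opening code unchanged, which is why the diff takes that route rather than switching codecs.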
9 changes: 7 additions & 2 deletions metagenomics.py
@@ -1172,7 +1172,7 @@ def indent_len(in_string):
should_process = False
indent_of_selection = -1
currently_being_processed = ""
for line in inf:
for lineno, line in enumerate(inf):
if len(line.rstrip('\r\n').strip()) == 0 or (report_type is not None and (line.startswith("#") or line.startswith("%"))):
continue

@@ -1210,7 +1210,12 @@ def indent_len(in_string):

row = next(csv.DictReader([line.strip().rstrip('\n')], fieldnames=fieldnames, dialect="kraken_report"))

indent_of_line = indent_len(row["sci_name"])
try:
    indent_of_line = indent_len(row["sci_name"])
except AttributeError:
    log.warning("Report type: '{}'".format(report_type))
    log.warning("Issue with line {}: '{}'".format(lineno, line.strip().rstrip('\n')))
    log.warning("From file: {}".format(f))
    raise
# remove leading/trailing whitespace from each item
row = { k:v.strip() for k, v in row.items()}

2 changes: 1 addition & 1 deletion packaging/conda-recipe/render-recipe.py
@@ -105,7 +105,7 @@ def __str__(self):
"+{0}".format("".join(str(x) for x in self.version_re.match(self.v).group("local")))
)
except:
raise argparse.ArgumentTypeError("String '%s' does not match required PEP440 format"%(self.v,))
raise argparse.ArgumentTypeError("String '%s' does not match required PEP440 format" % (self.v,))

return "".join(parts)

7 changes: 5 additions & 2 deletions packaging/conda-recipe/viral-ngs-template/meta.yaml
@@ -19,15 +19,18 @@ build:

requirements:
build:
- {{ "{{ compiler('c') }}" }}
# - gcc # [not osx]
# - llvm # [osx]

host:
- python
- openjdk >=8
- perl
{% for item in build_requirements %}
{{ item }}
{%- endfor %}
# C lib or compilation-related
- gcc # [not osx]
- llvm # [osx]

run:
- python
2 changes: 1 addition & 1 deletion pipes/WDL/dx-defaults-demux_plus.json
@@ -1,6 +1,6 @@
{
"demux_plus.spikein_db":
"dx://file-F6PXkF00Yqp3zVXq14fF98Kz",
"dx://file-FZY2v7Q0xf5VBy5FFY3z5fz7",

"demux_plus.bwaDbs": [
"dx://file-F9k7Bx00Z3ybJjvY3ZVj7Z9P"
2 changes: 1 addition & 1 deletion pipes/WDL/dx-defaults-spikein.json
@@ -1,4 +1,4 @@
{
"spikein.spikein_report.spikein_db":
"dx://file-F6PXkF00Yqp3zVXq14fF98Kz"
"dx://file-FZY2v7Q0xf5VBy5FFY3z5fz7"
}
11 changes: 11 additions & 0 deletions pipes/WDL/dx-extras.json
@@ -0,0 +1,11 @@
{
"default_task_dx_attributes" : {
"runSpec": {
"timeoutPolicy": {
"*": {
"hours": 9
}
}
}
}
}
24 changes: 19 additions & 5 deletions pipes/WDL/dx-launcher/demux_launcher.yml
@@ -97,36 +97,50 @@ runSpec:
demux_instance_type="mem1_ssd1_x4"
demux_threads=$(echo "$demux_instance_type" | cut -dx -f2)
min_base_quality=25
max_reads_in_ram_per_tile=1000000
max_records_in_ram=2000000
if [ "$total_tile_count" -le 50 ]; then
tar_consolidation_instance_size="mem1_ssd1_x4"
demux_instance_type="mem1_ssd1_x4"
min_base_quality=25
demux_threads=$(echo "$demux_instance_type" | cut -dx -f2)
echo "Detected $total_tile_count tiles, interpreting as MiSeq run, executing on a $demux_instance_type machine."
elif [ "$total_tile_count" -le 150 ]; then
tar_consolidation_instance_size="mem1_ssd2_x4"
demux_instance_type="mem1_ssd2_x4"
min_base_quality=25
demux_threads=$(echo "$demux_instance_type" | cut -dx -f2)
echo "Detected $total_tile_count tiles, interpreting as HiSeq2k run, executing on a $demux_instance_type machine."
elif [ "$total_tile_count" -le 288 ]; then
# increase the number of reads in ram per-tile for NextSeq, since the tiles are larger
# without this setting, reads will spill to disk and may reach the limit
# on the number of files that can be opened
max_reads_in_ram_per_tile=1500000
max_records_in_ram=2000000
echo "Detected $total_tile_count tiles, interpreting as NextSeq (mid-output) run."
elif [ "$total_tile_count" -le 864 ]; then
# reduce the number of reads in RAM per tile for high-output runs, since there are so many tiles;
# without tuning this setting, reads will spill to disk and may reach the limit
# on the number of files that can be opened
max_reads_in_ram_per_tile=200000
max_records_in_ram=1500000
echo "Detected $total_tile_count tiles, interpreting as NextSeq (high-output) run."
elif [ "$total_tile_count" -le 896 ]; then
tar_consolidation_instance_size="mem1_ssd1_x32"
demux_instance_type="mem1_ssd1_x32"
min_base_quality=25
demux_threads=$(echo "$demux_instance_type" | cut -dx -f2)
echo "Detected $total_tile_count tiles, interpreting as HiSeq4k run, executing on a $demux_instance_type machine."
elif [ "$total_tile_count" -le 1408 ]; then
tar_consolidation_instance_size="mem1_ssd2_x36"
demux_instance_type="mem1_ssd2_x36"
min_base_quality=20
max_reads_in_ram_per_tile=750000
demux_threads=20 # with NovaSeq-size output, OOM errors can sporadically occur with higher thread counts
echo "Detected $total_tile_count tiles, interpreting as NovaSeq run, executing on a $demux_instance_type machine."
echo " **Note: Q20 threshold used since NovaSeq with RTA3 writes only four Q-score values: 2, 12, 23, and 37.**"
echo " See: https://www.illumina.com/content/dam/illumina-marketing/documents/products/appnotes/novaseq-hiseq-q30-app-note-770-2017-010.pdf"
elif [ "$total_tile_count" -gt 1408 ]; then
tar_consolidation_instance_size="mem1_ssd2_x36"
demux_instance_type="mem1_ssd2_x36"
min_base_quality=25
demux_threads=$(echo "$demux_instance_type" | cut -dx -f2)
echo "Tile count: $total_tile_count tiles, (unknown instrument type), executing on a $demux_instance_type machine."
fi
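The branching above keys instance sizing and demultiplexing parameters off the flowcell's total tile count. A condensed sketch of just the tile-count-to-instrument heuristic (thresholds taken from the shell branches; the function name is illustrative):

```python
# Condensed sketch of the tile-count heuristic used in the launcher script.
def classify_run(total_tile_count):
    if total_tile_count <= 50:
        return "MiSeq"
    elif total_tile_count <= 150:
        return "HiSeq2k"
    elif total_tile_count <= 288:
        return "NextSeq (mid-output)"
    elif total_tile_count <= 864:
        return "NextSeq (high-output)"
    elif total_tile_count <= 896:
        return "HiSeq4k"
    elif total_tile_count <= 1408:
        return "NovaSeq"
    else:
        return "unknown instrument type"

print(classify_run(38))  # MiSeq
```

Each branch then sets memory-related knobs (`max_reads_in_ram_per_tile`, `max_records_in_ram`, thread count) appropriate to that instrument's tile size and count.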
@@ -163,7 +177,7 @@ runSpec:
fi
for i in $(seq "$lane_count"); do
folder2=$(printf "%s/%s/reads/L%d" "$folder" "$run_id" $i)
runcmd="dx run $demux_workflow_id -i stage-1.flowcell_tgz=$run_tarball -i illumina_demux.lane=$i -i illumina_demux.minimumBaseQuality=$min_base_quality -i illumina_demux.threads=$demux_threads $sequencing_center_input --folder $folder2 --instance-type illumina_demux=$demux_instance_type --name demux:$run_id:L$i -y --brief"
runcmd="dx run $demux_workflow_id -i stage-1.flowcell_tgz=$run_tarball -i illumina_demux.lane=$i -i illumina_demux.maxReadsInRamPerTile=$max_reads_in_ram_per_tile -i illumina_demux.maxRecordsInRam=$max_records_in_ram -i illumina_demux.minimumBaseQuality=$min_base_quality -i illumina_demux.threads=$demux_threads $sequencing_center_input --folder $folder2 --instance-type illumina_demux=$demux_instance_type --name demux:$run_id:L$i -y --brief"
echo "$runcmd"
set +x
if [ -n "$api_token" ]; then
6 changes: 6 additions & 0 deletions pipes/WDL/workflows/classify_krakenuniq.wdl
@@ -1,5 +1,11 @@
import "tasks_metagenomics.wdl" as metagenomics
import "tasks_reports.wdl" as reports

workflow classify_krakenuniq {
call metagenomics.krakenuniq

call reports.aggregate_metagenomics_reports as metag_summary_report {
input:
kraken_summary_reports = krakenuniq.krakenuniq_summary_reports
}
}
10 changes: 8 additions & 2 deletions pipes/WDL/workflows/demux_metag.wdl
@@ -7,8 +7,6 @@ import "tasks_assembly.wdl" as assembly
import "tasks_reports.wdl" as reports

workflow demux_metag {
File krona_taxonomy_db_tgz

call demux.illumina_demux as illumina_demux

scatter(raw_reads in illumina_demux.raw_reads_unaligned_bams) {
@@ -31,6 +29,14 @@ workflow demux_metag {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
}
call reports.aggregate_metagenomics_reports as metag_summary_report {
input:
kraken_summary_reports = kraken.krakenuniq_summary_reports
}
call reports.spikein_summary as spike_summary {
input:
spikein_count_txt = spikein.report
}
call metagenomics.kaiju as kaiju {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams,
10 changes: 10 additions & 0 deletions pipes/WDL/workflows/demux_plus.wdl
@@ -39,4 +39,14 @@ workflow demux_plus {
input:
reads_unmapped_bam = illumina_demux.raw_reads_unaligned_bams
}

call reports.spikein_summary as spike_summary {
input:
spikein_count_txt = spikein.report
}

call reports.aggregate_metagenomics_reports as metag_summary_report {
input:
kraken_summary_reports = krakenuniq.krakenuniq_summary_reports
}
}
2 changes: 1 addition & 1 deletion pipes/WDL/workflows/tasks/tasks_assembly.wdl
@@ -104,7 +104,7 @@ task scaffold {
Int? nucmer_max_gap
Int? nucmer_min_match
Int? nucmer_min_cluster
Int? scaffold_min_pct_contig_aligned
Float? scaffold_min_pct_contig_aligned

# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(basename(contigs_fasta, ".fasta"), ".assembly1-trinity"), ".assembly1-spades")
20 changes: 18 additions & 2 deletions pipes/WDL/workflows/tasks/tasks_demux.wdl
@@ -36,8 +36,8 @@ task illumina_demux {
String? sequencingCenter

String? flowcell
Int? minimumBaseQuality = 10
Int? maxMismatches = 1
Int? minimumBaseQuality = 25
Int? maxMismatches = 0
Int? minMismatchDelta
Int? maxNoCalls
String? readStructure
@@ -105,11 +105,27 @@
echo "Detected $total_tile_count tiles, interpreting as MiSeq run."
elif [ "$total_tile_count" -le 150 ]; then
echo "Detected $total_tile_count tiles, interpreting as HiSeq2k run."
elif [ "$total_tile_count" -le 288 ]; then
# increase the number of reads in ram per-tile for NextSeq, since the tiles are larger
# without this setting, reads will spill to disk and may reach the limit
# on the number of files that can be opened
max_reads_in_ram_per_tile=1500000
max_records_in_ram=2000000
echo "Detected $total_tile_count tiles, interpreting as NextSeq (mid-output) run."
elif [ "$total_tile_count" -le 864 ]; then
# increase the number of reads in ram per-tile for NextSeq, since the tiles are larger;
# without this setting, reads will spill to disk and may reach the limit
# on the number of files that can be opened
max_reads_in_ram_per_tile=1500000
max_records_in_ram=2500000
echo "Detected $total_tile_count tiles, interpreting as NextSeq (high-output) run."
elif [ "$total_tile_count" -le 896 ]; then
echo "Detected $total_tile_count tiles, interpreting as HiSeq4k run."
elif [ "$total_tile_count" -le 1408 ]; then
mem_in_mb=$(/opt/viral-ngs/source/docker/calc_mem.py mb 80)
demux_threads=20 # with NovaSeq-size output, OOM errors can sporadically occur with higher thread counts
max_reads_in_ram_per_tile=200000 # reduce the number of reads per tile since the NovaSeq has so many
max_records_in_ram=1500000
echo "Detected $total_tile_count tiles, interpreting as NovaSeq run."
echo " **Note: Q20 threshold used since NovaSeq with RTA3 writes only four Q-score values: 2, 12, 23, and 37.**"
echo " See: https://www.illumina.com/content/dam/illumina-marketing/documents/products/appnotes/novaseq-hiseq-q30-app-note-770-2017-010.pdf"
