Merge branch 'master' into sy-faster-kraken-tests
yesimon committed Aug 16, 2017
2 parents c074b47 + 1bcaf91 commit 3071be1
Showing 100 changed files with 2,843 additions and 2,116 deletions.
2 changes: 2 additions & 0 deletions .coveragerc
@@ -3,6 +3,8 @@
[run]
branch = True
omit = tools/conda-tools/*
disable_warnings =
    module-not-imported

[report]

9 changes: 5 additions & 4 deletions .gitignore
@@ -58,13 +58,14 @@ coverage.xml
test/input/TestVPhaser2/in.bam.bti

easy-deploy/data/Snakefile
easy-deploy-virtualized/data/Snakefile

easy-deploy/data/config.json
easy-deploy/data/config.yaml
easy-deploy-virtualized/data/config.json
easy-deploy-virtualized/data/config.yaml

easy-deploy/data/viral-ngs/
easy-deploy-virtualized/data/viral-ngs/

easy-deploy/.vagrant/
**/.vagrant/

tools/build/
tools/conda-cache/
24 changes: 20 additions & 4 deletions .travis.yml
@@ -1,15 +1,14 @@
language: python
sudo: false

matrix:
  fast_finish: true
  include:
    - os: linux
      sudo: false
      python: 2.7
    - os: linux
      sudo: false
      python: 3.4
    - os: linux
      sudo: false
      python: 3.5
#    - os: osx
#      language: generic
@@ -30,7 +29,12 @@ env:
- PIP_DIR="$HOME/virtualenv"
- GATK_PATH="$CACHE_DIR/GenomeAnalysisTK-3.6"
- PYTHONIOENCODING=UTF8
# $BUNDLE_SECRET for decrypting tarball of third-party tools
- secure: KX7DwKRD85S7NgspxevgbulTtV+jHQIiM6NBus2/Ur/P0RMdpt0EQQ2wDq79qGN70bvvkw901N7EjSYd+GWCAM7StXtaxnLRrrZ3XI1gX7KMk8E3QzPf0zualLDs7cuQmL6l6WiElUAEqumLc7WGpLZZLdSPzNqFSg+CBKCmTI8=
# $ANACONDA_TOKEN for uploading builds to anaconda.org ("broad-viral" channel)
- secure: SLPB86BpMIiNncMioxVk9cLrqaSNt8F1QDtxkrdLq9j7wXzFqGa7cipG6UJ6Om7GvoF49DpACfGPTA4ycr+T4cH3pWXpBHrBhV8TyKJb23cOmg5+7zqJQTzuwNqKOT7t9rnBkf1uzVXBcgqKaD6XW/nEvNFK00I0cvjlCp8vgxE=
# $TRAVIS_ACCESS_TOKEN_FOR_OTHER_REPO (viral-ngs-deploy)
- secure: ChB0K3gPr5HknxYA41xCrpgChHDmLkqc79p1NABB/tbqOEnrPzDPqE+FU4/QlmeV96jMYn4uyLVauJpzVXyBIVoOa8guqoF5VdiKlAhaUwh9UQJ75i3SKQtGBrqaTXSDVI1vJARMiGabduCrcNJxVsxV9Bm+YzTq6tuhWyqR4fs=

git:
depth: 3
@@ -44,8 +48,20 @@ install:
- travis/install-pip.sh

script:
- travis/tests-unit.sh
- travis/tests-long.sh
- travis/tests-unit.sh

after_success:
- coveralls

before_deploy:
- source travis/install-conda.sh
- conda install -y jinja2 # needed to render conda recipe

deploy:
  provider: script
  skip_cleanup: true # retain build artifacts, including dependencies
  script: travis/deploy.sh $TRAVIS_TAG
  on:
    tags: true
#    all_branches: true
8 changes: 4 additions & 4 deletions DEVELOPMENT_NOTES.md
@@ -7,13 +7,13 @@ This page lists information for developers working on viral-ngs.
When Python and binary dependencies for viral-ngs are installed by conda, they can end up in several locations. The default and preferred method of installation assumes a conda environment is active in the current shell, complete with [environment variables we can access to specify the path of the active environment](https://github.com/broadinstitute/viral-ngs/blob/master/tools/__init__.py#L240). In this case, conda packages are installed in the active conda environment. If conda is installed and available on the path but no environment is currently active, viral-ngs dependencies are installed in isolation within `viral-ngs/tools/build/conda-tools/{default}` (unless this location is overridden in the CondaPackage() constructor). For tools without a conda recipe (as may be the case on certain platforms, like Mac OSX), or where conda install fails, custom install methods are used to download and build some tools.
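The resolution order described above can be sketched as a small helper. This is a hypothetical illustration of the logic, not the actual `tools/__init__.py` implementation; the function name and fallback path are assumptions for the example:

```python
import os

def resolve_conda_prefix(env=None, isolated_default="tools/build/conda-tools/default"):
    """Sketch of the install-location logic described above: prefer the
    active conda environment (conda sets CONDA_PREFIX on activation),
    otherwise fall back to an isolated prefix under viral-ngs/tools/build/.
    Illustrative helper only -- not the code viral-ngs actually uses."""
    env = os.environ if env is None else env
    active = env.get("CONDA_PREFIX") or env.get("CONDA_ENV_PATH")
    return active or isolated_default

# an active environment wins...
print(resolve_conda_prefix({"CONDA_PREFIX": "/opt/conda/envs/viral-ngs"}))
# ...and with no environment active, the isolated build prefix is used
print(resolve_conda_prefix({}))
```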

#### Adding a new tool or dependency
When adding a new tool or dependency to viral-ngs, check to see if a conda package is already available either on the default channel (`conda search <package_name>`), or on the bioconda channel (`conda search -c bioconda <package_name>`). If so, it will need to be added to the [conda recipe for viral-ngs](https://github.com/bioconda/bioconda-recipes/tree/master/recipes/viral-ngs). If no conda package exists, a recipe for it will first need to be added to a particular conda channel. [Bioconda](https://github.com/bioconda/bioconda-recipes) is used by default.
When adding a new tool or dependency to viral-ngs, check to see if a conda package is already available either on the default channel (`conda search <package_name>`), or on the bioconda channel (`conda search -c bioconda <package_name>`). If so, it will need to be added to the conda recipe template for viral-ngs. If no conda package exists, a recipe for it will first need to be added to a particular conda channel. [Bioconda](https://github.com/bioconda/bioconda-recipes) is used by default.

#### Changing dependency versions
The viral-ngs package installed by `conda install viral-ngs` from the [bioconda channel](https://github.com/bioconda/bioconda-recipes) depends on a conda build recipe distributed separately from this repository. The [recipe files](https://github.com/bioconda/bioconda-recipes/tree/master/recipes/viral-ngs) list the various Python and binary dependencies of viral-ngs as conda packages, including version numbers. **When the version of a tool changes in the viral-ngs repository, the version number must be changed manually to match in the conda recipe. This requires a PR to bioconda.**
The viral-ngs package installed by `conda install viral-ngs` from the [broad-viral channel](https://anaconda.org/broad-viral/viral-ngs) depends on a conda build recipe distributed in this repository. The recipe files source the various Python and binary dependencies of viral-ngs as conda packages, including version numbers, from the `requirements-*.txt` files within this repository.

#### Manual deployment and release actions
When a new tagged version of viral-ngs is [released](https://github.com/broadinstitute/viral-ngs/releases), the [conda recipe](https://github.com/bioconda/bioconda-recipes/tree/master/recipes/viral-ngs) needs to be updated manually to reflect the new version number and source archive. This requires a PR to bioconda.
#### Automated deployment and release actions
When a new tagged version of viral-ngs is [released](https://github.com/broadinstitute/viral-ngs/releases), the conda package will be updated automatically by a TravisCI deploy hook in the test build for the tagged branch. First the recipe will be updated to reflect the new version number, source archive, and dependency list. Next TravisCI will build a package for the recipe via `conda build` and upload it to the [broad-viral channel](https://anaconda.org/broad-viral/viral-ngs) of anaconda.org. If the `conda build` is successful, a remote build will be triggered on TravisCI for the [broadinstitute/viral-ngs-deploy](https://github.com/broadinstitute/viral-ngs-deploy) repository in order to build and upload a Docker image to the [broadinstitute/viral-ngs](https://hub.docker.com/r/broadinstitute/viral-ngs/) repository on Docker Hub.

### (Automated) testing
[Travis CI](https://travis-ci.org/broadinstitute/viral-ngs) performs automated unit and integration tests for viral-ngs on each branch and pull request. Unit tests are run on each new branch commit, and longer integration tests are performed on pull requests to help ensure the stability of the `master` branch. Pull requests are gated to ensure merging to `master` is allowed only if all tests pass. The Travis configuration is specified in `.travis.yml`, and relies on files stored within `viral-ngs/travis/`.
4 changes: 2 additions & 2 deletions README.md
@@ -2,8 +2,8 @@
[![Build Status](https://travis-ci.org/broadinstitute/viral-ngs.svg?branch=master)](https://travis-ci.org/broadinstitute/viral-ngs)
[![Coverage Status](https://coveralls.io/repos/broadinstitute/viral-ngs/badge.png)](https://coveralls.io/r/broadinstitute/viral-ngs)
[![Code Health](https://landscape.io/github/broadinstitute/viral-ngs/master/landscape.svg?style=flat)](https://landscape.io/github/broadinstitute/viral-ngs)
[![Documentation Status](https://readthedocs.org/projects/viral-ngs/badge/?version=latest)](https://readthedocs.org/projects/viral-ngs/?badge=latest)
[![DOI](https://zenodo.org/badge/doi/10.5281/zenodo.17560.svg)](http://dx.doi.org/10.5281/zenodo.17560)
[![Documentation Status](https://readthedocs.org/projects/viral-ngs/badge/?version=latest)](http://viral-ngs.readthedocs.io/en/latest/?badge=latest)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.252549.svg)](https://doi.org/10.5281/zenodo.252549)

viral-ngs
=========
8 changes: 4 additions & 4 deletions assembly.py
@@ -27,12 +27,12 @@
import util.misc
import util.vcf
import read_utils
import taxon_filter
import tools
import tools.picard
import tools.samtools
import tools.gatk
import tools.novoalign
import tools.trimmomatic
import tools.trinity
import tools.mafft
import tools.mummer
@@ -58,7 +58,7 @@ def __init__(self, n_start, n_trimmed, n_rmdup, n_output, n_subsamp, n_unpaired_

def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
    ''' Take reads through Trimmomatic, Prinseq, and subsampling.
        This should probably move over to read_utils or taxon_filter.
        This should probably move over to read_utils.
    '''

    downsamplesam = tools.picard.DownsampleSamTool()
@@ -79,7 +79,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
        for i in range(2):
            shutil.copyfile(infq[i], trimfq[i])
    else:
        taxon_filter.trimmomatic(
        tools.trimmomatic.TrimmomaticTool().execute(
            infq[0],
            infq[1],
            trimfq[0],
@@ -1479,7 +1479,7 @@ def dpdiff(inVcfs, outFile):
    return 0


__commands__.append(('dpdiff', parser_dpdiff))
#__commands__.append(('dpdiff', parser_dpdiff))


def full_parser():
73 changes: 73 additions & 0 deletions conftest.py
@@ -0,0 +1,73 @@
import sys
import time
import pytest
import operator


def timer():
    if sys.version_info < (3, 3):
        return time.time()
    return time.perf_counter()


def pytest_addoption(parser):
    group = parser.getgroup("terminal reporting", "reporting", after="general")
    group.addoption(
        '--fixture-durations',
        action="store",
        type=int,
        default=None,
        metavar="N",
        help="show N slowest fixture durations (N=0 for all)."
    )


def pytest_configure(config):
    reporter = FixtureReporter(config)
    config.pluginmanager.register(reporter, 'fixturereporter')


class FixtureReporter:

    def __init__(self, config):
        import _pytest.config
        self.config = config
        self.stats = {}
        self.writer = _pytest.config.create_terminal_writer(config)
        self.durations = config.option.fixture_durations

    @pytest.hookimpl(hookwrapper=True)
    def pytest_fixture_setup(self, fixturedef, request):
        funcname = request._pyfuncitem.name
        fixname = fixturedef.argname

        fixturedef._timer_start = timer()
        yield
        duration = timer() - fixturedef._timer_start
        fixturedef._timer_duration = duration

        self.stats[(fixname, funcname)] = duration

    def pytest_terminal_summary(self, terminalreporter, exitstatus):
        if self.durations is None:
            return

        writer = terminalreporter.writer

        slowest = sorted(self.stats.items(), key=operator.itemgetter(1), reverse=True)
        if not self.durations:
            writer.sep("=", "slowest fixture durations")
        else:
            writer.sep("=", "slowest %s fixture durations" % self.durations)
            slowest = slowest[:self.durations]

        rows = []
        for (fixname, funcname), duration in slowest:
            row = ['{:.2f}s'.format(duration), fixname, funcname]
            rows.append(row)

        widths = [max(map(len, col)) for col in zip(*rows)]
        for row in rows:
            writer.write(" ".join(val.ljust(width) for val, width in zip(row, widths)))
            writer.line()
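The terminal summary in this plugin boils down to sorting the `(fixture, test) -> duration` stats by value and padding each column to its widest cell. A standalone sketch of that formatting, using made-up fixture names and timings:

```python
import operator

# hypothetical fixture setup timings: (fixture name, test name) -> seconds
stats = {
    ("kraken_db", "test_classify"): 12.40,
    ("tmpdir", "test_assembly"): 1.73,
    ("sam_file", "test_align"): 0.05,
}

# sort by duration, descending, as pytest_terminal_summary does
slowest = sorted(stats.items(), key=operator.itemgetter(1), reverse=True)
rows = [["{:.2f}s".format(d), fix, func] for (fix, func), d in slowest]

# pad each column to its widest cell so the table lines up
widths = [max(map(len, col)) for col in zip(*rows)]
for row in rows:
    print("  ".join(val.ljust(w) for val, w in zip(row, widths)))
```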
5 changes: 3 additions & 2 deletions docs/conf.py
@@ -24,7 +24,8 @@
# -- Mock out the heavyweight pip packages, esp those that require C ----
import mock
MOCK_MODULES = ['scipy', 'pysam', 'Bio', 'Bio.AlignIO', 'Bio.Alphabet',
                'Bio.Alphabet.IUPAC', 'Bio.SeqIO', 'Bio.Data.IUPACData']
                'Bio.Alphabet.IUPAC', 'Bio.SeqIO', 'Bio.Data.IUPACData',
                'pybedtools', 'pybedtools.BedTool']
for mod_name in MOCK_MODULES:
    sys.modules[mod_name] = mock.Mock()

@@ -50,7 +51,7 @@ def _git_version():
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.pngmath', 'sphinxarg.ext',]
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.imgmath', 'sphinxarg.ext',]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
6 changes: 3 additions & 3 deletions docs/install.rst
@@ -20,6 +20,7 @@ Configure Conda

The viral-ngs software and its dependencies are distributed through the bioconda channel for the conda package manager. It is necessary to add this channel to the conda config::

    conda config --add channels broad-viral
    conda config --add channels bioconda
    conda config --add channels r
    conda config --add channels conda-forge
@@ -41,13 +42,12 @@ In order to finish installing viral-ngs, you will need to activate its conda env
Due to license restrictions, the viral-ngs conda package cannot distribute and install GATK directly. To fully install GATK, you must download a licensed copy of GATK `from the Broad Institute <https://www.broadinstitute.org/gatk/download/>`_, and call "gatk-register," which will copy GATK into your viral-ngs conda environment::

    # (download licensed copy of GATK)
    gath-register /path/to/GenomeAnalysisTK.jar
    gatk-register /path/to/GenomeAnalysisTK.jar

The single-threaded version of `Novoalign <http://www.novocraft.com/products/novoalign/>`_ is installed by default. If you have a license for Novoalign to enable multi-threaded operation, viral-ngs will copy it to the viral-ngs conda environment if the ``NOVOALIGN_LICENSE_PATH`` environment variable is set. Alternatively, the conda version of Novoalign can be overridden if the ``NOVOALIGN_PATH`` environment variable is set. If you obtain a Novoalign license after viral-ngs has already been installed, it can be added to the conda environment by calling::

    # obtain a Novoalign license file: novoalign.lic
    novoalign-register-license /path/to/novoalign.lic

    novoalign-license-register /path/to/novoalign.lic

Activating viral-ngs once installed
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
79 changes: 40 additions & 39 deletions docs/pipeuse.rst
@@ -6,52 +6,18 @@ commands called in isolation, it is possible to execute them as a
complete automated pipeline, from processing raw sequencer output to
creating files suitable for GenBank submission. This utilizes Snakemake,
which is documented at:
https://bitbucket.org/snakemake/snakemake/wiki/Home
https://snakemake.readthedocs.io/en/stable/

Here is an overview of the Snakemake rule graph:

.. image:: rulegraph.png

Setting up the Python 3 virtual environment
Installation instructions
-------------------------------------------

Note that Python 3.4 is required to use these tools with Snakemake. It
is recommended to create a virtual environment within which all of the
viral-ngs dependencies can be installed:
It is recommended to install the viral-ngs conda package from the ``broad-viral`` channel, as detailed in the installation section of this documentation.

::

    pyvenv-3.4 venv-viral-ngs
    cd venv-viral-ngs
    source bin/activate

Once the virtual environment has been created and activated, the
viral-ngs dependencies can be installed via ``pip``:

::

    pip install -r requirements.txt
    pip install -r requirements-pipes.txt

Note: To resume normal use of the system installation of python, call
the “deactivate” command in your shell. See the `official venv
documentation <https://docs.python.org/3/library/venv.html>`__ for more
information on Python3 virtual environments.

In addition to the dependencies installed via ``pip``, the pipeline needs
the standard dependencies described in the main viral-ngs installation
section.

*Note:* If running on the Broad Institute UGER cluster environment,
import the following dotkits prior to activating the virtualenv:

::

    use .python-3.4.3
    use .oracle-java-jdk-1.7.0-51-x86-64
    use .bzip2-1.0.6
    use .zlib-1.2.6
    use .gcc-4.5.3
The script ``easy-deploy-viral-ngs.sh`` can be used to install conda and the viral-ngs package on a standard Linux system, as well as to create new project analysis directories. Project directories can also be created manually as described below.

Setting up an analysis directory
--------------------------------
@@ -243,7 +209,7 @@ contaminants from reads:

Pre-built databases for Trimmomatic:

- `contaminants.fasta.tar.gz <https://console.cloud.google.com/m/cloudstorage/b/sabeti-public/o/depletion_dbs/contaminants.fasta.tar.gz>`__ (`*.lz4 <https://console.cloud.google.com/m/cloudstorage/b/sabeti-public/o/depletion_dbs/contaminants.fasta.lz4>`__)
- `contaminants.fasta.tar.gz <https://storage.googleapis.com/sabeti-public/depletion_dbs/contaminants.fasta.tar.gz>`__ (`*.lz4 <https://storage.googleapis.com/sabeti-public/depletion_dbs/contaminants.fasta.lz4>`__)

A FASTA file containing spike-ins to be reported:

@@ -338,3 +304,38 @@ Taxonomic filtration of raw reads

Starting from Illumina BCL directories
--------------------------------------

When starting from Illumina run directories, the viral-ngs Snakemake pipeline can demultiplex raw BCL files,
and merge samples from multiple flowcell lanes or libraries. To use viral-ngs in this way, create the following files:

``flowcells.txt`` (example below): A tab-delimited file describing the flowcells to demultiplex. Its columns give the flowcell ID, the lane to use,
a path to the file listing the barcodes used in the lane, the ``bustard_dir`` (the run directory as written by an Illumina sequencer),
and an optional ``max_mismatches``, which specifies how many bases may differ for a read to be assigned to a particular barcode (default: 0). The ``max_mismatches`` column, including its header, may be omitted entirely.

::

    flowcell   lane  barcode_file           bustard_dir                                     max_mismatches
    H32G3ADXY  1     /path/to/barcodes.txt  /path/to/illumina/run/directory/run_BH32G3ADXY  1
    H32G3ADXY  2     /path/to/barcodes.txt  /path/to/illumina/run/directory/run_BH32G3ADXY  1
    AKJ6R      1     /path/to/barcodes.txt  /path/to/illumina/run/directory/run_AKJ6R       1


``barcodes.txt`` (example below): A tab-delimited file describing the barcodes used for a given sample, along with a library ID.

::

    sample    barcode_1  barcode_2  library_id_per_sample
    41C       TAAGGCGA   TATCCTCT   AP2
    21P       CGTACTAG   TATCCTCT   AP2
    42C       AGGCAGAA   TATCCTCT   AP2
    41P       TCCTGAGC   TATCCTCT   AP2
    42P       GGACTCCT   TATCCTCT   AP2
    61C       TAGGCATG   TATCCTCT   AP2
    61P       CTCTCTAC   AGAGTAGA   AP2
    62C       CAGAGAGG   AGAGTAGA   AP2
    62P       GCTACGCT   AGAGTAGA   AP2
    142C      CGAGGCTG   AGAGTAGA   AP2
    WATERCTL  AAGAGGCA   AGAGTAGA   AP2

``samples-depletion.txt``: the list of sample names to deplete `as described above <#adding-input-data>`__.
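A parser honoring the optional ``max_mismatches`` column of ``flowcells.txt`` might look like the following. This is an illustrative sketch under the file format described above, not the pipeline's actual reader; the function name is hypothetical:

```python
import csv
import io

def read_flowcells(fileobj):
    """Parse tab-delimited flowcells.txt rows; max_mismatches defaults
    to 0 when the column (header included) is absent. Hypothetical
    helper for illustration only."""
    rows = []
    for row in csv.DictReader(fileobj, delimiter="\t"):
        row["max_mismatches"] = int(row.get("max_mismatches") or 0)
        rows.append(row)
    return rows

# a flowcells.txt that omits the optional max_mismatches column
example = (
    "flowcell\tlane\tbarcode_file\tbustard_dir\n"
    "H32G3ADXY\t1\t/path/to/barcodes.txt\t/path/to/run_BH32G3ADXY\n"
)
print(read_flowcells(io.StringIO(example))[0]["max_mismatches"])  # -> 0
```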

1 change: 0 additions & 1 deletion easy-deploy-virtualized/.vagrant/machines/default/aws/id

This file was deleted.
