Merge branch 'master' into ct-ag100-exploration
tomkinsc committed Feb 16, 2018
2 parents 725a4aa + 6f5e93b commit a1e88c7
Showing 9 changed files with 52 additions and 49 deletions.
8 changes: 6 additions & 2 deletions metagenomics.py
@@ -753,13 +753,14 @@ def parser_krona(parser=argparse.ArgumentParser()):
parser.add_argument('outHtml', help='Output html report.')
parser.add_argument('--queryColumn', help='Column of query id. (default %(default)s)', type=int, default=2)
parser.add_argument('--taxidColumn', help='Column of taxonomy id. (default %(default)s)', type=int, default=3)
parser.add_argument('--scoreColumn', help='Column of score. (default %(default)s)', type=int)
parser.add_argument('--scoreColumn', help='Column of score. (default %(default)s)', type=int, default=None)
parser.add_argument('--magnitudeColumn', help='Column of magnitude. (default %(default)s)', type=int, default=None)
parser.add_argument('--noHits', help='Include wedge for no hits.', action='store_true')
parser.add_argument('--noRank', help='Include no rank assignments.', action='store_true')
util.cmd.common_args(parser, (('loglevel', None), ('version', None)))
util.cmd.attach_main(parser, krona, split_args=True)
return parser
def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=None, noHits=None, noRank=None):
def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=None, magnitudeColumn=None, noHits=None, noRank=None):
'''
Create an interactive HTML report from a tabular metagenomic report
'''
@@ -773,6 +774,7 @@ def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=No
to_import = [tmp_tsv]
else:
to_import = [inTsv]
root_name = os.path.basename(inTsv)

krona_tool.import_taxonomy(
db,
@@ -781,6 +783,8 @@ def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None, scoreColumn=No
query_column=queryColumn,
taxid_column=taxidColumn,
score_column=scoreColumn,
magnitude_column=magnitudeColumn,
root_name=root_name,
no_hits=noHits,
no_rank=noRank
)
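For illustration, a minimal sketch of how the updated krona() entry point might be called directly from Python with the newly added magnitudeColumn argument; the file names and column indices below are hypothetical, not part of this change:

    import metagenomics

    # Build a Krona HTML report, treating column 4 of the TSV as the magnitude
    # (e.g. read count) for each query; columns 2 and 3 hold query id and taxid.
    metagenomics.krona(
        'classified_reads.tsv', 'krona_taxonomy_db', 'report.html',
        queryColumn=2, taxidColumn=3, magnitudeColumn=4,
        noHits=True, noRank=True)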
58 changes: 25 additions & 33 deletions pipes/WDL/workflows/tasks/assembly.wdl
@@ -10,7 +10,8 @@ task assemble {
String? assembler="trinity" # trinity, spades, or trinity-spades
String cleaned_assembler = select_first([assembler, ""]) # workaround for https://gatkforums.broadinstitute.org/wdl/discussion/10462/string-type-in-output-section

String sample_name = basename(reads_unmapped_bam, ".bam")
# do this in two steps in case the input doesn't actually have "taxfilt" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")

command {
set -ex -o pipefail
@@ -95,7 +96,8 @@ task scaffold {
Int? nucmer_min_cluster
Int? scaffold_min_pct_contig_aligned

String sample_name = basename(contigs_fasta, ".fasta")
# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
String sample_name = basename(basename(basename(contigs_fasta, ".fasta"), ".assembly1-trinity"), ".assembly1-spades")

command {
set -ex -o pipefail
@@ -169,7 +171,7 @@ task refine {
Float? major_cutoff=0.5
Int? min_coverage=1

String assembly_basename=basename(assembly_fasta, ".fasta")
String assembly_basename=basename(basename(assembly_fasta, ".fasta"), ".scaffold")

command {
set -ex -o pipefail
@@ -241,8 +243,8 @@ task refine_2x_and_plot {

String? plot_coverage_novoalign_options="-r Random -l 40 -g 40 -x 20 -t 100 -k"

String assembly_basename = basename(assembly_fasta, ".fasta")
String sample_name = basename(reads_unmapped_bam, ".bam")
# do this in two steps in case the input doesn't actually have "cleaned" in the name
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")

command {
set -ex -o pipefail
@@ -265,8 +267,8 @@ task refine_2x_and_plot {
assembly.py refine_assembly \
assembly.fasta \
${reads_unmapped_bam} \
${assembly_basename}.refine1.fasta \
--outVcf ${assembly_basename}.refine1.pre_fasta.vcf.gz \
${sample_name}.refine1.fasta \
--outVcf ${sample_name}.refine1.pre_fasta.vcf.gz \
--min_coverage ${refine1_min_coverage} \
--major_cutoff ${refine1_major_cutoff} \
--GATK_PATH gatk/ \
@@ -276,10 +278,10 @@ task refine_2x_and_plot {

# refine 2
assembly.py refine_assembly \
${assembly_basename}.refine1.fasta \
${sample_name}.refine1.fasta \
${reads_unmapped_bam} \
${assembly_basename}.refine2.fasta \
--outVcf ${assembly_basename}.refine2.pre_fasta.vcf.gz \
${sample_name}.fasta \
--outVcf ${sample_name}.refine2.pre_fasta.vcf.gz \
--min_coverage ${refine2_min_coverage} \
--major_cutoff ${refine2_major_cutoff} \
--GATK_PATH gatk/ \
Expand All @@ -290,30 +292,20 @@ task refine_2x_and_plot {
# final alignment
read_utils.py align_and_fix \
${reads_unmapped_bam} \
${assembly_basename}.refine2.fasta \
--outBamAll ${sample_name}.bam \
${sample_name}.fasta \
--outBamAll ${sample_name}.all.bam \
--outBamFiltered ${sample_name}.mapped.bam \
--GATK_PATH gatk/ \
--aligner_options "${plot_coverage_novoalign_options}" \
--JVMmemory "$mem_in_mb"m \
--loglevel=DEBUG

# plot_coverage
reports.py plot_coverage \
${sample_name}.mapped.bam \
${sample_name}.coverage_plot.pdf \
--plotFormat pdf \
--plotWidth 1100 \
--plotHeight 850 \
--plotDPI 100 \
--loglevel=DEBUG

# collect figures of merit
grep -v '^>' assembly.fasta | tr -d '\n' | wc -c | tee assembly_length
grep -v '^>' assembly.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous
grep -v '^>' ${sample_name}.fasta | tr -d '\n' | wc -c | tee assembly_length
grep -v '^>' ${sample_name}.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous
samtools view -c ${sample_name}.mapped.bam | tee reads_aligned
samtools flagstat ${sample_name}.bam | tee ${sample_name}.bam.flagstat.txt
grep properly ${sample_name}.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools flagstat ${sample_name}.all.bam | tee ${sample_name}.all.bam.flagstat.txt
grep properly ${sample_name}.all.bam.flagstat.txt | cut -f 1 -d ' ' | tee read_pairs_aligned
samtools view ${sample_name}.mapped.bam | cut -f10 | tr -d '\n' | wc -c | tee bases_aligned
echo $(( $(cat bases_aligned) / $(cat assembly_length) )) | tee mean_coverage

@@ -331,17 +323,17 @@ task refine_2x_and_plot {
--plotDPI 100 \
--loglevel=DEBUG
else
touch ${sample_name}.coverage_plot.pdf ${sample_name}.mapped_fastqc.html
touch ${sample_name}.coverage_plot.pdf
fi
}

output {
File refine1_sites_vcf_gz = "${assembly_basename}.refine1.pre_fasta.vcf.gz"
File refine1_assembly_fasta = "${assembly_basename}.refine1.fasta"
File refine2_sites_vcf_gz = "${assembly_basename}.refine2.pre_fasta.vcf.gz"
File refine2_assembly_fasta = "${assembly_basename}.refine2.fasta"
File aligned_bam = "${sample_name}.bam"
File aligned_bam_flagstat = "${sample_name}.bam.flagstat.txt"
File refine1_sites_vcf_gz = "${sample_name}.refine1.pre_fasta.vcf.gz"
File refine1_assembly_fasta = "${sample_name}.refine1.fasta"
File refine2_sites_vcf_gz = "${sample_name}.refine2.pre_fasta.vcf.gz"
File final_assembly_fasta = "${sample_name}.fasta"
File aligned_bam = "${sample_name}.all.bam"
File aligned_bam_flagstat = "${sample_name}.all.bam.flagstat.txt"
File aligned_only_reads_bam = "${sample_name}.mapped.bam"
File aligned_only_reads_fastqc = "${sample_name}.mapped_fastqc.html"
File coverage_plot = "${sample_name}.coverage_plot.pdf"
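As a side note, a rough Python equivalent of the figures-of-merit shell pipeline in this task (the grep | tr | wc chain above), assuming a hypothetical single-sample FASTA file name; this is only a sketch of what those two counts measure:

    # Concatenate sequence lines (skipping headers), then count total bases and
    # unambiguous bases (everything that is not N/n).
    with open('sample.fasta') as f:
        seq = ''.join(line.strip() for line in f if not line.startswith('>'))
    assembly_length = len(seq)
    assembly_length_unambiguous = len(seq.replace('N', '').replace('n', ''))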
6 changes: 3 additions & 3 deletions pipes/rules/ncbi.rules
@@ -79,7 +79,7 @@ rule annot_transfer:
LSF = config.get('LSF_queues', {}).get('short', '-W 4:00'),
UGER = config.get('UGER_queues', {}).get('short', '-l h_rt=04:00:00')
run:
output_path_prefix = os.path.dirname(output.feature_tables)
output_path_prefix = os.path.dirname(output.feature_tables[0])
for alignmentFile in expand("{data_dir}/{subdir}/aligned_{chrom}.fasta",
data_dir=config["data_dir"],
subdir=config["subdirs"]["multialign_ref"],
@@ -94,9 +94,9 @@ rule prepare_genbank:
subdir=config["subdirs"]["annot"],
samp=read_samples_file(config["samples_assembly"]),
chrom=range(1, len(config["accessions_for_ref_genome_build"])+1)),
fasta_files=" ".join(expand("{dir}/{subdir}/{sample}.fasta",
fasta_files=expand("{dir}/{subdir}/{sample}.fasta",
dir=[config["data_dir"]], subdir=[config["subdirs"]["assembly"]],
sample=list(read_samples_file(config["samples_assembly"])))),
sample=list(read_samples_file(config["samples_assembly"]))),
output:
config["data_dir"]+'/'+config["subdirs"]["annot"]+'/errorsummary.val'
params:
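A small illustrative sketch of why these two fixes are needed, using made-up paths: Snakemake's expand() yields a Python list, so the directory prefix must be taken from its first element, and passing the list itself (rather than a pre-joined string) lets Snakemake track each FASTA as a separate input:

    import os

    # What an expand(...) call like the ones above might return:
    feature_tables = ['data/annot/sample1-1.tbl', 'data/annot/sample1-2.tbl']
    output_path_prefix = os.path.dirname(feature_tables[0])  # 'data/annot'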
2 changes: 1 addition & 1 deletion requirements-conda.txt
@@ -24,7 +24,7 @@ prinseq=0.20.4
samtools=1.6
snpeff=4.1l
spades=3.11.1
tbl2asn=25.3
tbl2asn=25.6
trimmomatic=0.36
trinity=date.2011_11_26
unzip=6.0
2 changes: 1 addition & 1 deletion test/unit/test_metagenomics.py
@@ -106,7 +106,7 @@ def test_krona_import_taxonomy(self):
noHits=True, noRank=True)
self.mock_krona().import_taxonomy.assert_called_once_with(
self.db, [self.inTsv], out_html, query_column=3, taxid_column=5, score_column=7,
no_hits=True, no_rank=True)
no_hits=True, no_rank=True, magnitude_column=None, root_name=os.path.basename(self.inTsv))


@pytest.fixture
6 changes: 6 additions & 0 deletions tools/krona.py
@@ -35,6 +35,8 @@ def import_taxonomy(self,
query_column=None,
taxid_column=None,
score_column=None,
magnitude_column=None,
root_name=None,
no_hits=None,
no_rank=None):
self.install()
@@ -48,6 +50,10 @@ def import_taxonomy(self,
cmd.extend(['-t', str(taxid_column)])
if score_column is not None:
cmd.extend(['-s', str(score_column)])
if magnitude_column is not None:
cmd.extend(['-m', str(magnitude_column)])
if root_name is not None:
cmd.extend(['-n', root_name])
if no_hits is not None:
cmd.append('-i')
if no_rank is not None:
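For context, a hedged sketch of the argument list the wrapper would now assemble for ktImportTaxonomy when the new options are supplied; column numbers and the root name are illustrative, and only the flags visible in this diff are shown:

    # '-m' carries the magnitude column and '-n' the root node name;
    # '-i' is appended when no_hits is requested.
    cmd = ['ktImportTaxonomy',
           '-q', '2',                     # query id column
           '-t', '3',                     # taxonomy id column
           '-m', '4',                     # magnitude column (new)
           '-n', 'classified_reads.tsv',  # root name (new)
           '-i']                          # include a "no hits" wedge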
4 changes: 2 additions & 2 deletions tools/tbl2asn.py
@@ -74,10 +74,10 @@ def execute(
# See: https://www.ncbi.nlm.nih.gov/IEB/ToolBox/C_DOC/lxr/source/demo/tbl2asn.c#L9674
# We can try to work around this by examining the output for the upgrade message
try:
subprocess.check_output(tool_cmd)
subprocess.check_output(tool_cmd, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
old_version_expected_output = "This copy of tbl2asn is more than a year old. Please download the current version."
if old_version_expected_output in e.output:
if old_version_expected_output in e.output.decode('UTF-8'):
pass
else:
raise
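A self-contained sketch of the pattern this change adopts, with the tool command left as a placeholder: stderr is redirected into check_output's captured output so the outdated-binary notice is actually visible, and the bytes are decoded before the substring test so it works under Python 3:

    import subprocess

    def run_tolerating_version_warning(tool_cmd):
        try:
            # Capture stderr together with stdout so the warning text lands in e.output.
            subprocess.check_output(tool_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            # e.output is bytes under Python 3; decode before searching it.
            if "more than a year old" in e.output.decode('UTF-8'):
                pass  # benign: the tool exits nonzero only to nag about its age
            else:
                raise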
13 changes: 7 additions & 6 deletions travis/install-wdl.sh
@@ -3,22 +3,23 @@ set -e -o pipefail

cached_fetch_jar_from_github () {
_github_org=$1
_tool_name=$2
_jar_version=$3
_repo_name=$2
_tool_name=$3
_jar_version=$4
_jar_fname="$_tool_name-$_jar_version.jar"
if [ ! -f $CACHE_DIR/$_jar_fname ]; then
echo "Fetching $_jar_fname"
wget --quiet https://github.com/$_github_org/$_tool_name/releases/download/$_jar_version/$_jar_fname
wget --quiet https://github.com/$_github_org/$_repo_name/releases/download/$_jar_version/$_jar_fname
mv $_jar_fname $CACHE_DIR
else
echo "Using cached $_jar_fname"
fi
ln -s $CACHE_DIR/$_jar_fname $_tool_name.jar
}

cached_fetch_jar_from_github broadinstitute wdltool 0.14
cached_fetch_jar_from_github broadinstitute cromwell 30.2
cached_fetch_jar_from_github dnanexus dxWDL 0.58.1
cached_fetch_jar_from_github broadinstitute cromwell womtool 30.2
cached_fetch_jar_from_github broadinstitute cromwell cromwell 30.2
cached_fetch_jar_from_github dnanexus dxWDL dxWDL 0.59

TGZ=dx-toolkit-v0.240.1-ubuntu-14.04-amd64.tar.gz
if [ ! -f $CACHE_DIR/$TGZ ]; then
2 changes: 1 addition & 1 deletion travis/validate-wdl.sh
@@ -4,6 +4,6 @@ set -e -o pipefail
ln -s pipes/WDL/workflows/tasks .
for workflow in pipes/WDL/workflows/*.wdl; do
echo "validating $workflow"
java -jar wdltool.jar validate $workflow
java -jar womtool.jar validate $workflow
done
rm tasks
