Merge branch 'master' into is-metadata-recording-1
notestaff committed Mar 29, 2018
2 parents 6dc1197 + f61c71c commit cf70e71
Showing 14 changed files with 420 additions and 146 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM quay.io/broadinstitute/viral-baseimage:0.1.8
FROM quay.io/broadinstitute/viral-baseimage:0.1.9

LABEL maintainer "viral-ngs@broadinstitute.org"

196 changes: 196 additions & 0 deletions metagenomics.py
@@ -23,6 +23,7 @@
import sys
import subprocess
import tempfile
import json

from Bio import SeqIO
from Bio.Seq import Seq
@@ -664,6 +665,8 @@ def rank_code(rank):
return "K"
elif rank == "superkingdom":
return "D"
elif rank == "unclassified":
return "U"
else:
return "-"

@@ -1143,6 +1146,199 @@ class KrakenBuildError(Exception):
'''Error while building kraken database.'''


def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()):
parser.add_argument('summary_files_in', nargs="+", help='Kraken-format summary text file with tab-delimited taxonomic levels.')
parser.add_argument('--jsonOut', dest="json_out", type=argparse.FileType('w'), help='The path to an output json file containing the parsed summary data.')
parser.add_argument('--csvOut', dest="csv_out", type=argparse.FileType('w'), help='The path to a csv file containing sample-specific counts.')
parser.add_argument('--taxHeading', nargs="+", dest="tax_headings", help='The taxonomic heading to analyze (default: %(default)s). More than one can be specified.', default="Viruses")
parser.add_argument('--taxlevelFocus', dest="taxlevel_focus", help='The taxonomic level to summarize (totals by species, genus, etc.) (default: %(default)s).', default="species",
choices=["species", "genus", "family", "order", "class", "phylum", "kingdom", "superkingdom"])
parser.add_argument('--topN', type=int, dest="top_n_entries", help='Only include the top N most abundant taxa by read count (default: %(default)s)', default=100)
parser.add_argument('--countThreshold', type=int, dest="count_threshold", help='Minimum number of reads to be included (default: %(default)s)', default=1)
parser.add_argument('--zeroFill', action='store_true', dest="zero_fill", help='When absent from a sample, write zeroes (rather than leaving blank).')
parser.add_argument('--noHist', action='store_true', dest="no_hist", help='Write out a per-sample report rather than a histogram.')
parser.add_argument('--includeRoot', action='store_true', dest="include_root", help='Include the count of reads at the root level and the unclassified bin.')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, taxlevel_summary, split_args=True)
return parser

def taxlevel_summary(summary_files_in, json_out, csv_out, tax_headings, taxlevel_focus, top_n_entries, count_threshold, no_hist, zero_fill, include_root):
"""
Aggregates taxonomic abundance data from multiple Kraken-format summary files.
It is intended to report information on a particular taxonomic level (--taxlevelFocus; e.g. 'species'),
within a higher-level grouping (--taxHeading; e.g. 'Viruses'). By default, when --taxHeading
is at the same level as --taxlevelFocus, a summary with one line per sample is emitted.
Otherwise, a histogram is returned. If per-sample information is desired, --noHist can be specified.
In per-sample data, the suffix "-pt" indicates percentage, so a value of 0.02 is 0.0002 of the total number of reads for the sample.
If --topN is specified, only the top N most abundant taxa are included in the histogram count or per-sample output.
If a number is specified for --countThreshold, only taxa with that number of reads (or greater) are included.
Full data are returned via --jsonOut (filtered by --topN and --countThreshold), whereas --csvOut returns a summary.
"""

samples = {}
same_level = False

Abundance = collections.namedtuple("Abundance", "percent,count")

def indent_len(in_string):
return len(in_string)-len(in_string.lstrip())

for f in list(summary_files_in):
sample_name, extension = os.path.splitext(f)
sample_summary = {}
sample_root_summary = {}
tax_headings_copy = [s.lower() for s in tax_headings]

with util.file.open_or_gzopen(f, 'rU') as inf:
should_process = False
indent_of_selection = -1
currently_being_processed = ""
for line in inf:
if len(line.rstrip('\r\n').strip()) == 0:
continue
csv.register_dialect('kraken_report', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
fieldnames = ["pct_of_reads","num_reads","reads_exc_children","rank","NCBI_tax_ID","sci_name"]
row = next(csv.DictReader([line.strip().rstrip('\n')], fieldnames=fieldnames, dialect="kraken_report"))

indent_of_line = indent_len(row["sci_name"])
# remove leading/trailing whitespace from each item
row = { k:v.strip() for k, v in row.items()}

# rows are formatted like so:
# 0.00 16 0 D 10239 Viruses
#
# row["pct_of_reads"] Percentage of reads covered by the clade rooted at this taxon
# row["num_reads"] Number of reads covered by the clade rooted at this taxon
# row["reads_exc_children"] Number of reads assigned directly to this taxon
# row["rank"] A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.
# row["NCBI_tax_ID"] NCBI taxonomy ID
# row["sci_name"] indented scientific name

# if the root-level bins (root, unclassified) should be included, do so, but bypass normal
# stateful parsing logic since root does not have a distinct rank level
if row["sci_name"].lower() in ["root","unclassified"] and include_root:
sample_root_summary[row["sci_name"]] = collections.OrderedDict()
sample_root_summary[row["sci_name"]][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]))
continue

if indent_of_line <= indent_of_selection:
should_process = False
indent_of_selection=-1

if indent_of_selection == -1:
if row["sci_name"].lower() in tax_headings_copy:
tax_headings_copy.remove(row["sci_name"].lower())

should_process = True
indent_of_selection = indent_of_line
currently_being_processed = row["sci_name"]
sample_summary[currently_being_processed] = collections.OrderedDict()
if row["rank"] == rank_code(taxlevel_focus):
same_level = True
if row["rank"] == "-":
log.warning("Non-taxonomic parent level selected")

if should_process:
# skip "-" rank levels since they do not occur at the sample level
# otherwise include the taxon row if the rank matches the desired level of focus
if (row["rank"] != "-" and rank_code(taxlevel_focus) == row["rank"]):
if int(row["num_reads"])>=count_threshold:
sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]))


for k,taxa in sample_summary.items():
sample_summary[k] = collections.OrderedDict(sorted(taxa.items(), key=lambda item: (item[1][1]) , reverse=True)[:top_n_entries])

if len(list(sample_summary[k].items()))>0:
log.info("{f}: most abundant among {heading} at the {level} level: "
"\"{name}\" with {reads} reads ({percent:.2%} of total); "
"included since >{threshold} read{plural}".format(
f=f,
heading=k,
level=taxlevel_focus,
name=list(sample_summary[k].items())[0][0],
reads=list(sample_summary[k].items())[0][1].count,
percent=list(sample_summary[k].items())[0][1].percent/100.0,
threshold=count_threshold,
plural="s" if count_threshold>1 else "" )
)

if include_root:
# include root-level bins (root, unclassified) in the returned data
for k,taxa in sample_root_summary.items():
assert (k not in sample_summary), "{k} already in sample summary".format(k=k)
sample_summary[k] = taxa
samples[sample_name] = sample_summary

if json_out != None:
json_summary = json.dumps(samples, sort_keys=True, indent=4, separators=(',', ': '))
json_out.write(json_summary)
json_out.close()


if csv_out != None:

# if we're writing out at the same level as the query header
# write out the fractions and counts
if same_level or no_hist:

fieldnames = set()
for sample, taxa in samples.items():
for heading,taxon in taxa.items():
if len(taxon):
for k in taxon.keys():
fieldnames |= set([k+"-pt",k+"-ct"])

heading_columns = ["sample"]
if include_root:
root_fields = ["root-pt","root-ct","unclassified-pt","unclassified-ct"]
fieldnames -= set(root_fields)
heading_columns += root_fields

writer = csv.DictWriter(csv_out, restval=0 if zero_fill else '', fieldnames=heading_columns+sorted(list(fieldnames)))
writer.writeheader()

for sample, taxa in samples.items():
sample_dict = {}
sample_dict["sample"] = sample
for heading,taxon in taxa.items():
for entry in taxon.keys():
sample_dict[entry+"-pt"] = taxon[entry].percent
sample_dict[entry+"-ct"] = taxon[entry].count
writer.writerow(sample_dict)


csv_out.close()

# otherwise write out a histogram
else:
count = 0
summary_counts = collections.defaultdict(dict)
for sample, totals in samples.items():
for heading,taxa in totals.items():
for taxon in taxa.keys():
if taxon not in summary_counts[heading].keys():
summary_counts[heading][taxon] = 1
else:
summary_counts[heading][taxon] += 1

for k,taxa in summary_counts.items():
summary_counts[k] = collections.OrderedDict(sorted(taxa.items(), key=lambda item: (item[1]) , reverse=True))


fieldnames = ["heading","taxon","num_samples"]
writer = csv.DictWriter(csv_out, restval=0 if zero_fill else '', fieldnames=fieldnames)
writer.writeheader()

for heading,taxa_counts in summary_counts.items():
writer.writerows([{"heading":heading,"taxon":taxon,"num_samples":count} for taxon,count in taxa_counts.items()])

csv_out.close()


__commands__.append(('taxlevel_summary', parser_kraken_taxlevel_summary))
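
A minimal invocation sketch for the new taxlevel_summary subcommand (file names and the count threshold below are illustrative, not taken from this commit):

    metagenomics.py taxlevel_summary \
        sampleA.kraken_summary.txt sampleB.kraken_summary.txt \
        --taxHeading Viruses --taxlevelFocus species \
        --jsonOut taxlevel_summary.json --csvOut taxlevel_summary.csv \
        --topN 100 --countThreshold 5 --zeroFill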


def parser_kraken_build(parser=argparse.ArgumentParser()):
parser.add_argument('db', help='Kraken database output directory.')
parser.add_argument('--library', help='Input library directory of fasta files. If not specified, it will be read from the "library" subdirectory of "db".')
3 changes: 2 additions & 1 deletion ncbi.py
@@ -381,6 +381,7 @@ def fasta2fsa(infname, outdir, biosample=None):
def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
with open(cmt_fname, 'wt') as outf:
outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
# note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(util.version.get_version()))
if name:
outf.write("Assembly Name\t{}\n".format(name))
@@ -448,7 +449,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
outf.write(inf.readline())
for line in inf:
row = line.rstrip('\n').split('\t')
if row[0] == sample_base:
if row[0] == sample_base or row[0] == sample:
row[0] = sample
outf.write('\t'.join(row) + '\n')

32 changes: 0 additions & 32 deletions pipes/WDL/workflows/align_and_annot.wdl

This file was deleted.

5 changes: 5 additions & 0 deletions pipes/WDL/workflows/coverage_table.wdl
@@ -0,0 +1,5 @@
import "reports.wdl" as reports

workflow coverage_table {
call reports.coverage_report
}
29 changes: 29 additions & 0 deletions pipes/WDL/workflows/genbank.wdl
@@ -0,0 +1,29 @@
import "interhost.wdl" as interhost
import "ncbi.wdl" as ncbi

workflow genbank {

File reference_fasta
Array[File]+ assemblies_fasta # one per genome
Array[File]+ ref_annotations_tbl # one per chromosome

call interhost.multi_align_mafft_ref as mafft {
input:
reference_fasta = reference_fasta,
assemblies_fasta = assemblies_fasta
}

call ncbi.annot_transfer as annot {
input:
multi_aln_fasta = mafft.alignments_by_chr,
reference_fasta = reference_fasta,
reference_feature_table = ref_annotations_tbl
}

call ncbi.prepare_genbank as prep_genbank {
input:
assemblies_fasta = assemblies_fasta,
annotations_tbl = annot.transferred_feature_tables
}

}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/mafft.wdl
@@ -0,0 +1,5 @@
import "interhost.wdl" as interhost

workflow mafft {
call interhost.multi_align_mafft
}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/merge_bams.wdl
@@ -0,0 +1,5 @@
import "demux.wdl" as demux

workflow merge_bams {
call demux.merge_and_reheader_bams
}
35 changes: 35 additions & 0 deletions pipes/WDL/workflows/tasks/demux.wdl
@@ -111,3 +111,38 @@ task illumina_demux {
preemptible: 0 # this is the very first operation before scatter, so let's get it done quickly & reliably
}
}

task merge_and_reheader_bams {
Array[File]+ in_bams
File? reheader_table # tsv with 3 cols: field, old value, new value
String out_basename

command {
set -ex -o pipefail

if [ ${length(in_bams)} -gt 1 ]; then
read_utils.py merge_bams ${sep=' ' in_bams} merged.bam --loglevel DEBUG
else
echo "Skipping merge, only one input file"
ln -s ${select_first(in_bams)} merged.bam
fi

if [ -f ${reheader_table} ]; then
read_utils.py reheader_bam merged.bam ${reheader_table} ${out_basename}.bam --loglevel DEBUG
else
echo "Skipping reheader, no mapping table specified"
ln -s merged.bam ${out_basename}.bam
fi
}

output {
File out_bam = "${out_basename}.bam"
}

runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "2000 MB"
cpu: 2
dx_instance_type: "mem1_ssd2_x4"
}
}
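
For reference, a sketch of the three-column reheader_table consumed by the merge_and_reheader_bams task above; the field names and values are illustrative assumptions (BAM read-group fields), not part of this commit:

    LB	illustrative_old_library	illustrative_new_library
    SM	illustrative_old_sample	illustrative_new_sample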
