Merge branch 'master' into is-metadata-recording-1
notestaff committed Mar 29, 2018
2 parents 6dc1197 + f61c71c commit cf70e71
Showing 14 changed files with 420 additions and 146 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
FROM quay.io/broadinstitute/viral-baseimage:0.1.8
FROM quay.io/broadinstitute/viral-baseimage:0.1.9

LABEL maintainer "viral-ngs@broadinstitute.org"

196 changes: 196 additions & 0 deletions metagenomics.py
@@ -23,6 +23,7 @@
import sys
import subprocess
import tempfile
import json

from Bio import SeqIO
from Bio.Seq import Seq
@@ -664,6 +665,8 @@ def rank_code(rank):
return "K"
elif rank == "superkingdom":
return "D"
elif rank == "unclassified":
return "U"
else:
return "-"

@@ -1143,6 +1146,199 @@ class KrakenBuildError(Exception):
'''Error while building kraken database.'''


def parser_kraken_taxlevel_summary(parser=argparse.ArgumentParser()):
parser.add_argument('summary_files_in', nargs="+", help='Kraken-format summary text file with tab-delimited taxonomic levels.')
parser.add_argument('--jsonOut', dest="json_out", type=argparse.FileType('w'), help='The path to an output json file containing the parsed summary data.')
parser.add_argument('--csvOut', dest="csv_out", type=argparse.FileType('w'), help='The path to a csv file containing sample-specific counts.')
parser.add_argument('--taxHeading', nargs="+", dest="tax_headings", help='The taxonomic heading to analyze (default: %(default)s). More than one can be specified.', default="Viruses")
parser.add_argument('--taxlevelFocus', dest="taxlevel_focus", help='The taxonomic level to summarize (totals by species, genus, etc.) (default: %(default)s).', default="species",
choices=["species", "genus", "family", "order", "class", "phylum", "kingdom", "superkingdom"])
parser.add_argument('--topN', type=int, dest="top_n_entries", help='Only include the top N most abundant taxa by read count (default: %(default)s)', default=100)
parser.add_argument('--countThreshold', type=int, dest="count_threshold", help='Minimum number of reads to be included (default: %(default)s)', default=1)
parser.add_argument('--zeroFill', action='store_true', dest="zero_fill", help='When absent from a sample, write zeroes (rather than leaving blank).')
parser.add_argument('--noHist', action='store_true', dest="no_hist", help='Write out a per-sample report rather than a histogram.')
parser.add_argument('--includeRoot', action='store_true', dest="include_root", help='Include the count of reads at the root level and the unclassified bin.')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, taxlevel_summary, split_args=True)
return parser

def taxlevel_summary(summary_files_in, json_out, csv_out, tax_headings, taxlevel_focus, top_n_entries, count_threshold, no_hist, zero_fill, include_root):
"""
Aggregates taxonomic abundance data from multiple Kraken-format summary files.
It is intended to report information on a particular taxonomic level (--taxlevelFocus; e.g. 'species'),
within a higher-level grouping (--taxHeading; e.g. 'Viruses'). By default, when --taxHeading
is at the same level as --taxlevelFocus, a summary with one line per sample is emitted.
Otherwise, a histogram is returned. If per-sample information is desired, --noHist can be specified.
In per-sample data, the suffix "-pt" indicates percentage, so a value of 0.02 is 0.0002 of the total number of reads for the sample.
If --topN is specified, only the top N most abundant taxa are included in the histogram count or per-sample output.
If a number is specified for --countThreshold, only taxa with that number of reads (or greater) are included.
Full data are returned via --jsonOut (filtered by --topN and --countThreshold), whereas --csvOut returns a summary.
"""

samples = {}
same_level = False

Abundance = collections.namedtuple("Abundance", "percent,count")

def indent_len(in_string):
return len(in_string)-len(in_string.lstrip())

for f in list(summary_files_in):
sample_name, extension = os.path.splitext(f)
sample_summary = {}
sample_root_summary = {}
tax_headings_copy = [s.lower() for s in tax_headings]

with util.file.open_or_gzopen(f, 'rU') as inf:
should_process = False
indent_of_selection = -1
currently_being_processed = ""
for line in inf:
if len(line.rstrip('\r\n').strip()) == 0:
continue
csv.register_dialect('kraken_report', quoting=csv.QUOTE_MINIMAL, delimiter="\t")
fieldnames = ["pct_of_reads","num_reads","reads_exc_children","rank","NCBI_tax_ID","sci_name"]
row = next(csv.DictReader([line.strip().rstrip('\n')], fieldnames=fieldnames, dialect="kraken_report"))

indent_of_line = indent_len(row["sci_name"])
# remove leading/trailing whitespace from each item
row = { k:v.strip() for k, v in row.items()}

# rows are formatted like so:
# 0.00 16 0 D 10239 Viruses
#
# row["pct_of_reads"] Percentage of reads covered by the clade rooted at this taxon
# row["num_reads"] Number of reads covered by the clade rooted at this taxon
# row["reads_exc_children"] Number of reads assigned directly to this taxon
# row["rank"] A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. All other ranks are simply '-'.
# row["NCBI_tax_ID"] NCBI taxonomy ID
# row["sci_name"] indented scientific name

# if the root-level bins (root, unclassified) should be included, do so, but bypass normal
# stateful parsing logic since root does not have a distinct rank level
if row["sci_name"].lower() in ["root","unclassified"] and include_root:
sample_root_summary[row["sci_name"]] = collections.OrderedDict()
sample_root_summary[row["sci_name"]][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]))
continue

if indent_of_line <= indent_of_selection:
should_process = False
indent_of_selection=-1

if indent_of_selection == -1:
if row["sci_name"].lower() in tax_headings_copy:
tax_headings_copy.remove(row["sci_name"].lower())

should_process = True
indent_of_selection = indent_of_line
currently_being_processed = row["sci_name"]
sample_summary[currently_being_processed] = collections.OrderedDict()
if row["rank"] == rank_code(taxlevel_focus):
same_level = True
if row["rank"] == "-":
log.warning("Non-taxonomic parent level selected")

if should_process:
# skip "-" rank levels since they do not occur at the sample level
# otherwise include the taxon row if the rank matches the desired level of focus
if (row["rank"] != "-" and rank_code(taxlevel_focus) == row["rank"]):
if int(row["num_reads"])>=count_threshold:
sample_summary[currently_being_processed][row["sci_name"]] = Abundance(float(row["pct_of_reads"]), int(row["num_reads"]))


for k,taxa in sample_summary.items():
sample_summary[k] = collections.OrderedDict(sorted(taxa.items(), key=lambda item: (item[1][1]) , reverse=True)[:top_n_entries])

if len(list(sample_summary[k].items()))>0:
log.info("{f}: most abundant among {heading} at the {level} level: "
"\"{name}\" with {reads} reads ({percent:.2%} of total); "
"included since >{threshold} read{plural}".format(
f=f,
heading=k,
level=taxlevel_focus,
name=list(sample_summary[k].items())[0][0],
reads=list(sample_summary[k].items())[0][1].count,
percent=list(sample_summary[k].items())[0][1].percent/100.0,
threshold=count_threshold,
plural="s" if count_threshold>1 else "" )
)

if include_root:
# include root-level bins (root, unclassified) in the returned data
for k,taxa in sample_root_summary.items():
assert (k not in sample_summary), "{k} already in sample summary".format(k=k)
sample_summary[k] = taxa
samples[sample_name] = sample_summary

if json_out != None:
json_summary = json.dumps(samples, sort_keys=True, indent=4, separators=(',', ': '))
json_out.write(json_summary)
json_out.close()


if csv_out != None:

# if we're writing out at the same level as the query header
# write out the fractions and counts
if same_level or no_hist:

fieldnames = set()
for sample, taxa in samples.items():
for heading,taxon in taxa.items():
if len(taxon):
for k in taxon.keys():
fieldnames |= set([k+"-pt",k+"-ct"])

heading_columns = ["sample"]
if include_root:
root_fields = ["root-pt","root-ct","unclassified-pt","unclassified-ct"]
fieldnames -= set(root_fields)
heading_columns += root_fields

writer = csv.DictWriter(csv_out, restval=0 if zero_fill else '', fieldnames=heading_columns+sorted(list(fieldnames)))
writer.writeheader()

for sample, taxa in samples.items():
sample_dict = {}
sample_dict["sample"] = sample
for heading,taxon in taxa.items():
for entry in taxon.keys():
sample_dict[entry+"-pt"] = taxon[entry].percent
sample_dict[entry+"-ct"] = taxon[entry].count
writer.writerow(sample_dict)


csv_out.close()

# otherwise write out a histogram
else:
count = 0
summary_counts = collections.defaultdict(dict)
for sample, totals in samples.items():
for heading,taxa in totals.items():
for taxon in taxa.keys():
if taxon not in summary_counts[heading].keys():
summary_counts[heading][taxon] = 1
else:
summary_counts[heading][taxon] += 1

for k,taxa in summary_counts.items():
summary_counts[k] = collections.OrderedDict(sorted(taxa.items(), key=lambda item: (item[1]) , reverse=True))


fieldnames = ["heading","taxon","num_samples"]
writer = csv.DictWriter(csv_out, restval=0 if zero_fill else '', fieldnames=fieldnames)
writer.writeheader()

for heading,taxa_counts in summary_counts.items():
writer.writerows([{"heading":heading,"taxon":taxon,"num_samples":count} for taxon,count in taxa_counts.items()])

csv_out.close()


__commands__.append(('taxlevel_summary', parser_kraken_taxlevel_summary))
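
A minimal invocation sketch for the new taxlevel_summary subcommand (file names and the count threshold below are illustrative, not taken from this commit):

    metagenomics.py taxlevel_summary \
        sampleA.kraken_summary.txt sampleB.kraken_summary.txt \
        --taxHeading Viruses --taxlevelFocus species \
        --jsonOut taxlevel_summary.json --csvOut taxlevel_summary.csv \
        --topN 100 --countThreshold 5 --zeroFill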


def parser_kraken_build(parser=argparse.ArgumentParser()):
parser.add_argument('db', help='Kraken database output directory.')
parser.add_argument('--library', help='Input library directory of fasta files. If not specified, it will be read from the "library" subdirectory of "db".')
3 changes: 2 additions & 1 deletion ncbi.py
@@ -381,6 +381,7 @@ def fasta2fsa(infname, outdir, biosample=None):
def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
with open(cmt_fname, 'wt') as outf:
outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
# note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(util.version.get_version()))
if name:
outf.write("Assembly Name\t{}\n".format(name))
@@ -448,7 +449,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
outf.write(inf.readline())
for line in inf:
row = line.rstrip('\n').split('\t')
if row[0] == sample_base:
if row[0] == sample_base or row[0] == sample:
row[0] = sample
outf.write('\t'.join(row) + '\n')

32 changes: 0 additions & 32 deletions pipes/WDL/workflows/align_and_annot.wdl

This file was deleted.

5 changes: 5 additions & 0 deletions pipes/WDL/workflows/coverage_table.wdl
@@ -0,0 +1,5 @@
import "reports.wdl" as reports

workflow coverage_table {
call reports.coverage_report
}
29 changes: 29 additions & 0 deletions pipes/WDL/workflows/genbank.wdl
@@ -0,0 +1,29 @@
import "interhost.wdl" as interhost
import "ncbi.wdl" as ncbi

workflow genbank {

File reference_fasta
Array[File]+ assemblies_fasta # one per genome
Array[File]+ ref_annotations_tbl # one per chromosome

call interhost.multi_align_mafft_ref as mafft {
input:
reference_fasta = reference_fasta,
assemblies_fasta = assemblies_fasta
}

call ncbi.annot_transfer as annot {
input:
multi_aln_fasta = mafft.alignments_by_chr,
reference_fasta = reference_fasta,
reference_feature_table = ref_annotations_tbl
}

call ncbi.prepare_genbank as prep_genbank {
input:
assemblies_fasta = assemblies_fasta,
annotations_tbl = annot.transferred_feature_tables
}

}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/mafft.wdl
@@ -0,0 +1,5 @@
import "interhost.wdl" as interhost

workflow mafft {
call interhost.multi_align_mafft
}
5 changes: 5 additions & 0 deletions pipes/WDL/workflows/merge_bams.wdl
@@ -0,0 +1,5 @@
import "demux.wdl" as demux

workflow merge_bams {
call demux.merge_and_reheader_bams
}
35 changes: 35 additions & 0 deletions pipes/WDL/workflows/tasks/demux.wdl
@@ -111,3 +111,38 @@ task illumina_demux {
preemptible: 0 # this is the very first operation before scatter, so let's get it done quickly & reliably
}
}

task merge_and_reheader_bams {
Array[File]+ in_bams
File? reheader_table # tsv with 3 cols: field, old value, new value
String out_basename

command {
set -ex -o pipefail

if [ ${length(in_bams)} -gt 1 ]; then
read_utils.py merge_bams ${sep=' ' in_bams} merged.bam --loglevel DEBUG
else
echo "Skipping merge, only one input file"
ln -s ${select_first(in_bams)} merged.bam
fi

if [ -f ${reheader_table} ]; then
read_utils.py reheader_bam merged.bam ${reheader_table} ${out_basename}.bam --loglevel DEBUG
else
echo "Skipping reheader, no mapping table specified"
ln -s merged.bam ${out_basename}.bam
fi
}

output {
File out_bam = "${out_basename}.bam"
}

runtime {
docker: "quay.io/broadinstitute/viral-ngs"
memory: "2000 MB"
cpu: 2
dx_instance_type: "mem1_ssd2_x4"
}
}
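
For reference, a sketch of the three-column reheader_table consumed by the merge_and_reheader_bams task above; the field names and values are illustrative assumptions (BAM read-group fields), not part of this commit:

    LB	illustrative_old_library	illustrative_new_library
    SM	illustrative_old_sample	illustrative_new_sample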
