
Merge pull request #342 from broadinstitute/ct-metagenomic-report-merger
add metagenomic_report_merge()
dpark01 committed Jun 16, 2016
2 parents ce459c7 + 1e98a5b commit 7695b5b
Showing 3 changed files with 95 additions and 22 deletions.
108 changes: 89 additions & 19 deletions metagenomics.py
@@ -25,6 +25,7 @@
import tools.krona
import tools.diamond
import tools.picard
import csv

__commands__ = []

@@ -369,7 +370,7 @@ def taxa_hits_from_tsv(f, tax_id_column=2):
    return c


def kraken_dfs_report(db, taxa_hits):
def kraken_dfs_report(db, taxa_hits, prepend_column=True):
    '''Return a kraken compatible DFS report of taxa hits.
    Args:
@@ -379,6 +380,8 @@ def kraken_dfs_report(db, taxa_hits):
    Return:
        []str lines of the report
    '''
    line_prefix = '\t' if prepend_column else ''

    db.children = parents_to_children(db.parents)
    total_hits = sum(taxa_hits.values())
    lines = []
@@ -387,7 +390,7 @@ def kraken_dfs_report(db, taxa_hits):
    unclassified_hits += taxa_hits.get(-1, 0)
    if unclassified_hits > 0:
        percent_covered = '%.2f' % (unclassified_hits / total_hits * 100)
        lines.append('\t'.join([
        lines.append(line_prefix + '\t'.join([
            str(percent_covered), str(unclassified_hits),
            str(unclassified_hits), 'U', '0', 'unclassified'
        ]))
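To make the effect of the new prepend_column flag concrete, here is a minimal, self-contained Python 3 sketch of the line built above, using made-up counts (5 unclassified reads out of 100 total hits); nothing here is part of the commit itself:

    # Hypothetical counts for illustration only
    unclassified_hits, total_hits = 5, 100
    percent_covered = '%.2f' % (unclassified_hits / total_hits * 100)
    for line_prefix in ('', '\t'):  # prepend_column=False vs. prepend_column=True
        print(repr(line_prefix + '\t'.join([
            str(percent_covered), str(unclassified_hits),
            str(unclassified_hits), 'U', '0', 'unclassified'
        ])))
    # prints '5.00\t5\t5\tU\t0\tunclassified', then the same line with a leading '\t'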
@@ -410,6 +413,10 @@ def kraken_dfs(db, lines, taxa_hits, total_hits, taxid, level):

def kraken(inBam, db, outReport=None, outReads=None,
           filterThreshold=None, numThreads=1):
    '''
    Classify reads by taxon using Kraken
    '''

    assert outReads or outReport, (
        'Either --outReads or --outReport must be specified.')
    kraken_tool = tools.kraken.Kraken()
@@ -437,8 +444,27 @@ def kraken(inBam, db, outReport=None, outReads=None,
        kraken_tool.report(tmp_filtered_reads, db, outReport)


def parser_kraken(parser=argparse.ArgumentParser()):
    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
    parser.add_argument('db', help='Kraken database directory.')
    parser.add_argument('--outReport', help='Kraken report output file.')
    parser.add_argument('--outReads', help='Kraken per read output file.')
    parser.add_argument('--filterThreshold',
                        default=0.05,
                        type=float,
                        help='Kraken filter threshold (default %(default)s)')
    parser.add_argument('--numThreads', type=int, default=1, help='Number of threads to run. (default %(default)s)')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None),
                                  ('tmp_dir', None)))
    util.cmd.attach_main(parser, kraken, split_args=True)
    return parser
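For orientation, a hedged sketch of calling the kraken() entry point that this parser wires up; the import assumes metagenomics.py is importable as a module, and every path and filename below is a placeholder rather than part of the commit:

    # Placeholder paths and filenames; assumes Kraken and a built database are available.
    from metagenomics import kraken

    kraken('sample.bam', '/path/to/kraken_db',
           outReport='sample.kraken-report.txt',
           outReads='sample.kraken-reads.txt',
           filterThreshold=0.05,
           numThreads=8)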


def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None,
          scoreColumn=None, noHits=None, noRank=None):
    '''
    Create an interactive HTML report from a tabular metagenomic report
    '''

    krona_tool = tools.krona.Krona()
    if inTsv.endswith('.gz'):
@@ -455,22 +481,6 @@ def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None,
        score_column=scoreColumn, no_hits=noHits, no_rank=noRank)
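Similarly, a hedged sketch of calling krona() directly; paths and filenames are placeholders, and the column numbers follow the Kraken per-read layout used elsewhere in this commit (query ID in column 2, taxon ID in column 3):

    # Placeholder paths; a gzipped input is fine since krona() checks for a .gz suffix above.
    from metagenomics import krona

    krona('merged.krona-input.tsv.gz', '/path/to/krona_taxonomy_db', 'merged.krona.html',
          queryColumn=2, taxidColumn=3, noRank=True)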


def parser_kraken(parser=argparse.ArgumentParser()):
    parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
    parser.add_argument('db', help='Kraken database directory.')
    parser.add_argument('--outReport', help='Kraken report output file.')
    parser.add_argument('--outReads', help='Kraken per read output file.')
    parser.add_argument('--filterThreshold',
                        default=0.05,
                        type=float,
                        help='Kraken filter threshold (default %(default)s)')
    parser.add_argument('--numThreads', type=int, default=1, help='Number of threads to run. (default %(default)s)')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None),
                                  ('tmp_dir', None)))
    util.cmd.attach_main(parser, kraken, split_args=True)
    return parser


def parser_krona(parser=argparse.ArgumentParser()):
    parser.add_argument('inTsv', help='Input tab delimited file.')
    parser.add_argument('db', help='Krona taxonomy database directory.')
@@ -491,6 +501,9 @@ def parser_krona(parser=argparse.ArgumentParser()):


def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    '''
    Classify reads by the taxon of the Lowest Common Ancestor (LCA)
    '''
    tmp_fastq = util.file.mkstempfname('.fastq')
    tmp_fastq2 = util.file.mkstempfname('.fastq')
    picard = tools.picard.SamToFastqTool()
@@ -530,7 +543,7 @@ def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
    with open(tmp_lca_tsv) as f:
        hits = taxa_hits_from_tsv(f)
    with open(outReport, 'w') as f:
        for line in kraken_dfs_report(tax_db, hits):
        for line in kraken_dfs_report(tax_db, hits, prepend_column=True):
            print(line, file=f)


@@ -547,10 +560,67 @@ def parser_diamond(parser=argparse.ArgumentParser()):
    util.cmd.attach_main(parser, diamond, split_args=True)
    return parser

def metagenomic_report_merge(metagenomic_reports, out_kraken_summary, kraken_db, out_krona_input):
    '''
    Merge multiple metagenomic reports into a single metagenomic report.
    Any Krona input file created by this merge is a tab-delimited concatenation
    of the per-read rows of the input reports, suitable as input to the krona command.
    '''
    assert out_kraken_summary or out_krona_input, (
        "Either --outSummaryReport or --outByQueryToTaxonID must be specified")
    assert kraken_db if out_kraken_summary else True, (
        'A Kraken db must be provided via --krakenDB if outSummaryReport is specified')

    # column numbers containing the query (sequence) ID and taxonomic ID
    # these are one-indexed
    # See: http://ccb.jhu.edu/software/kraken/MANUAL.html#output-format
    tool_data_columns = {
        "kraken": (2, 3)
    }

    # if we're creating a Krona input file
    if out_krona_input:
        # open the output file (as gz if necessary)
        with util.file.open_or_gzopen(out_krona_input, "wt") as outf:
            # create a TSV writer for the output file
            output_writer = csv.writer(outf, delimiter='\t', lineterminator='\n')

            if metagenomic_reports:
                # for each Kraken-format metag file specified, pull out its rows
                # and write them to the TSV output
                for metag_file in metagenomic_reports:
                    with util.file.open_or_gzopen(metag_file.name, "rt") as inf:
                        file_reader = csv.reader(inf, delimiter='\t')
                        for row in file_reader:
                            # write the full row; to keep only the two relevant columns, use instead:
                            #   output_writer.writerow([row[c-1] for c in tool_data_columns["kraken"]])
                            output_writer.writerow([f for f in row])

    # create a human-readable summary of the Kraken reports
    # kraken-report can only be used on kraken reports since it depends on queries being in its database
    if out_kraken_summary:
        # create a temporary file to hold the combined kraken reports
        tmp_metag_combined_txt = util.file.mkstempfname('.txt')

        util.file.cat(tmp_metag_combined_txt, [metag_file.name for metag_file in metagenomic_reports])

        kraken_tool = tools.kraken.Kraken()
        kraken_tool.report(tmp_metag_combined_txt, kraken_db.name, out_kraken_summary)
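
To make the one-indexed (2, 3) mapping in tool_data_columns concrete, here is a small runnable example with a hypothetical Kraken per-read line (classified flag, query ID, taxon ID, sequence length, LCA mapping); the read ID and taxon ID are invented:

    # Hypothetical Kraken classification line, per the manual linked in the comment above.
    example_row = 'C\tread_0001\t11234\t150\t0:120 11234:30'.split('\t')
    query_id, taxon_id = (example_row[c - 1] for c in (2, 3))  # one-indexed, as in tool_data_columns
    print(query_id, taxon_id)  # read_0001 11234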


def parser_metagenomic_report_merge(parser=argparse.ArgumentParser()):
    parser.add_argument("metagenomic_reports", help="Input metagenomic reports with the query ID and taxon ID in the 2nd and 3rd columns (Kraken format)", nargs='+', type=argparse.FileType('r'))
    parser.add_argument("--outSummaryReport", dest="out_kraken_summary", help="Path of human-readable metagenomic summary report, created by kraken-report")
    parser.add_argument("--krakenDB", dest="kraken_db", help="Kraken database (needed for outSummaryReport)", type=argparse.FileType('r'))
    parser.add_argument("--outByQueryToTaxonID", dest="out_krona_input", help="Output metagenomic report suitable for Krona input.")
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, metagenomic_report_merge, split_args=True)
    return parser
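
As a usage sketch, the Krona-input path of the merge can be exercised on its own, since it needs no Kraken database; the import assumes metagenomics.py is importable, and all sample names, read IDs, and taxon IDs below are invented for illustration:

    # Minimal exercise of the --outByQueryToTaxonID path; all data are hypothetical.
    import os
    import tempfile

    from metagenomics import metagenomic_report_merge

    tmp_dir = tempfile.mkdtemp()
    report_files = []
    for name, line in [('sampleA.kraken-reads.txt', 'C\treadA\t10239\t150\t10239:120'),
                       ('sampleB.kraken-reads.txt', 'U\treadB\t0\t150\t0:120')]:
        path = os.path.join(tmp_dir, name)
        with open(path, 'w') as f:
            f.write(line + '\n')
        report_files.append(open(path))  # stands in for argparse.FileType('r')

    metagenomic_report_merge(report_files, None, None,
                             os.path.join(tmp_dir, 'merged.krona-input.tsv'))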

__commands__.append(('kraken', parser_kraken))
__commands__.append(('diamond', parser_diamond))
__commands__.append(('krona', parser_krona))
__commands__.append(('report_merge', parser_metagenomic_report_merge))


def full_parser():
5 changes: 4 additions & 1 deletion pipes/rules/metagenomics.rules
@@ -48,7 +48,10 @@ rule krona_import_taxonomy:
                ".".join([wildcards.sample, method_props[wildcards.method]['reads_ext']]))
    output: join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{method,\w+}.krona.html")
    run:
        query_col, taxid_col = method_props[wildcards.method]['krona_columns']
        # the method_props key can be changed to allow for different tools,
        # but with these changes Diamond output should look like Kraken output
        # in terms of the location of the two needed columns
        query_col, taxid_col = method_props["kraken"]['krona_columns']
        shell("""
            {config[bin_dir]}/metagenomics.py krona {input} {config[krona_db]} {output} --noRank \
            --queryColumn {query_col} --taxidColumn {taxid_col}
4 changes: 2 additions & 2 deletions util/file.py
@@ -326,9 +326,9 @@ def replace_in_file(filename, original, new):

def cat(output_file, input_files):
    '''Cat list of input filenames to output filename.'''
    with open(output_file, 'wb') as wfd:
    with open_or_gzopen(output_file, 'wb') as wfd:
        for f in input_files:
            with open(f, 'rb') as fd:
            with open_or_gzopen(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024*1024*10)
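
With cat() now routed through open_or_gzopen, plain and gzip-compressed files should mix transparently, assuming (as elsewhere in this codebase) that open_or_gzopen dispatches on a .gz extension; the filenames below are placeholders:

    # Hypothetical mix of plain and gzipped inputs concatenated into a gzipped output.
    import util.file

    util.file.cat('merged.kraken-reads.txt.gz',
                  ['sampleA.kraken-reads.txt', 'sampleB.kraken-reads.txt.gz'])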


