add metagenomic_report_merge() #342

Merged 5 commits on Jun 16, 2016
108 changes: 89 additions & 19 deletions metagenomics.py
@@ -25,6 +25,7 @@
import tools.krona
import tools.diamond
import tools.picard
import csv

__commands__ = []

@@ -369,7 +370,7 @@ def taxa_hits_from_tsv(f, tax_id_column=2):
return c


def kraken_dfs_report(db, taxa_hits):
def kraken_dfs_report(db, taxa_hits, prepend_column=True):
'''Return a kraken compatible DFS report of taxa hits.

Args:
@@ -379,6 +380,8 @@ def kraken_dfs_report(db, taxa_hits):
Return:
[]str lines of the report
'''
line_prefix = '\t' if prepend_column else ''

db.children = parents_to_children(db.parents)
total_hits = sum(taxa_hits.values())
lines = []
@@ -387,7 +390,7 @@ def kraken_dfs_report(db, taxa_hits):
unclassified_hits += taxa_hits.get(-1, 0)
if unclassified_hits > 0:
percent_covered = '%.2f' % (unclassified_hits / total_hits * 100)
lines.append('\t'.join([
lines.append(line_prefix+'\t'.join([
str(percent_covered), str(unclassified_hits),
str(unclassified_hits), 'U', '0', 'unclassified'
]))
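
For reference, the new prepend_column flag simply emits an empty leading field, shifting the existing report columns one position to the right (shown above for the unclassified line; the per-taxon lines written in kraken_dfs presumably get the same prefix). A minimal illustration, not part of this diff:

    fields = ['0.50', '12', '12', 'U', '0', 'unclassified']
    '\t'.join(fields)            # default: '0.50\t12\t12\tU\t0\tunclassified'
    '\t' + '\t'.join(fields)     # prepend_column=True: '\t0.50\t12\t12\tU\t0\tunclassified'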
@@ -410,6 +413,10 @@ def kraken_dfs(db, lines, taxa_hits, total_hits, taxid, level):

def kraken(inBam, db, outReport=None, outReads=None,
filterThreshold=None, numThreads=1):
'''
Classify reads by taxon using Kraken
'''

assert outReads or outReport, (
'Either --outReads or --outReport must be specified.')
kraken_tool = tools.kraken.Kraken()
@@ -437,8 +444,27 @@ def kraken(inBam, db, outReport=None, outReads=None,
kraken_tool.report(tmp_filtered_reads, db, outReport)


def parser_kraken(parser=argparse.ArgumentParser()):
parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
parser.add_argument('db', help='Kraken database directory.')
parser.add_argument('--outReport', help='Kraken report output file.')
parser.add_argument('--outReads', help='Kraken per read output file.')
parser.add_argument('--filterThreshold',
default=0.05,
type=float,
help='Kraken filter threshold (default %(default)s)')
parser.add_argument('--numThreads', type=int, default=1, help='Number of threads to run. (default %(default)s)')
util.cmd.common_args(parser, (('loglevel', None), ('version', None),
('tmp_dir', None)))
util.cmd.attach_main(parser, kraken, split_args=True)
return parser


def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None,
scoreColumn=None, noHits=None, noRank=None):
'''
Create an interactive HTML report from a tabular metagenomic report
'''

krona_tool = tools.krona.Krona()
if inTsv.endswith('.gz'):
@@ -455,22 +481,6 @@ def krona(inTsv, db, outHtml, queryColumn=None, taxidColumn=None,
score_column=scoreColumn, no_hits=noHits, no_rank=noRank)


def parser_kraken(parser=argparse.ArgumentParser()):
parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
parser.add_argument('db', help='Kraken database directory.')
parser.add_argument('--outReport', help='Kraken report output file.')
parser.add_argument('--outReads', help='Kraken per read output file.')
parser.add_argument('--filterThreshold',
default=0.05,
type=float,
help='Kraken filter threshold (default %(default)s)')
parser.add_argument('--numThreads', type=int, default=1, help='Number of threads to run. (default %(default)s)')
util.cmd.common_args(parser, (('loglevel', None), ('version', None),
('tmp_dir', None)))
util.cmd.attach_main(parser, kraken, split_args=True)
return parser


def parser_krona(parser=argparse.ArgumentParser()):
parser.add_argument('inTsv', help='Input tab delimited file.')
parser.add_argument('db', help='Krona taxonomy database directory.')
@@ -491,6 +501,9 @@ def parser_krona(parser=argparse.ArgumentParser()):


def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
'''
Classify reads by the taxon of the Lowest Common Ancestor (LCA)
'''
tmp_fastq = util.file.mkstempfname('.fastq')
tmp_fastq2 = util.file.mkstempfname('.fastq')
picard = tools.picard.SamToFastqTool()
@@ -530,7 +543,7 @@ def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
with open(tmp_lca_tsv) as f:
hits = taxa_hits_from_tsv(f)
with open(outReport, 'w') as f:
for line in kraken_dfs_report(tax_db, hits):
for line in kraken_dfs_report(tax_db, hits, prepend_column=True):
print(line, file=f)


@@ -547,10 +560,67 @@ def parser_diamond(parser=argparse.ArgumentParser()):
    util.cmd.attach_main(parser, diamond, split_args=True)
    return parser


def metagenomic_report_merge(metagenomic_reports, out_kraken_summary, kraken_db, out_krona_input):
    '''
    Merge multiple metagenomic reports into a single metagenomic report.
    Any Krona input file created by this command contains the merged rows
    of all input reports.
    '''
    assert out_kraken_summary or out_krona_input, (
        "Either --outSummaryReport or --outByQueryToTaxonID must be specified")
    assert kraken_db if out_kraken_summary else True, (
        'A Kraken db must be provided via --krakenDB if outSummaryReport is specified')

    # column numbers containing the query (sequence) ID and taxonomic ID
    # these are one-indexed
    # See: http://ccb.jhu.edu/software/kraken/MANUAL.html#output-format
    tool_data_columns = {
        "kraken": (2, 3)
    }

    # if we're creating a Krona input file
    if out_krona_input:
        # open the output file (as gz if necessary)
        with util.file.open_or_gzopen(out_krona_input, "wt") as outf:
            # create a TSV writer for the output file
            output_writer = csv.writer(outf, delimiter='\t', lineterminator='\n')

            if metagenomic_reports:
                # for each Kraken-format metag file specified, copy its rows
                # to the TSV output
                for metag_file in metagenomic_reports:
                    with util.file.open_or_gzopen(metag_file.name, "rt") as inf:
                        file_reader = csv.reader(inf, delimiter='\t')
                        for row in file_reader:
                            # write the full row (the commented-out line below would
                            # instead keep only the two relevant columns)
                            output_writer.writerow([f for f in row])
                            #output_writer.writerow([row[c-1] for c in tool_data_columns["kraken"]])

    # create a human-readable summary of the Kraken reports
    # kraken-report can only be used on kraken reports since it depends on queries being in its database
    if out_kraken_summary:
        # create a temporary file to hold the combined kraken read-level output
        tmp_metag_combined_txt = util.file.mkstempfname('.txt')

        util.file.cat(tmp_metag_combined_txt, [metag_file.name for metag_file in metagenomic_reports])

        kraken_tool = tools.kraken.Kraken()
        kraken_tool.report(tmp_metag_combined_txt, kraken_db.name, out_kraken_summary)


def parser_metagenomic_report_merge(parser=argparse.ArgumentParser()):
    parser.add_argument("metagenomic_reports", help="Input metagenomic reports with the query ID and taxon ID in the 2nd and 3rd columns (Kraken format)", nargs='+', type=argparse.FileType('r'))
    parser.add_argument("--outSummaryReport", dest="out_kraken_summary", help="Path of human-readable metagenomic summary report, created by kraken-report")
    parser.add_argument("--krakenDB", dest="kraken_db", help="Kraken database (needed for outSummaryReport)", type=argparse.FileType('r'))
    parser.add_argument("--outByQueryToTaxonID", dest="out_krona_input", help="Output metagenomic report suitable for Krona input.")
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
    util.cmd.attach_main(parser, metagenomic_report_merge, split_args=True)
    return parser

__commands__.append(('kraken', parser_kraken))
__commands__.append(('diamond', parser_diamond))
__commands__.append(('krona', parser_krona))
__commands__.append(('report_merge', parser_metagenomic_report_merge))


def full_parser():
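Taken together, the new report_merge command simply concatenates the rows of all input read-level reports. A self-contained sketch of the Krona-input path (illustrative only, not code from this PR; the function name and file names are placeholders):

    import csv
    import gzip

    def merge_krona_input(kraken_read_files, out_path):
        # Open gzipped output transparently, mirroring util.file.open_or_gzopen.
        opener = gzip.open if out_path.endswith('.gz') else open
        with opener(out_path, 'wt') as outf:
            writer = csv.writer(outf, delimiter='\t', lineterminator='\n')
            for path in kraken_read_files:
                in_opener = gzip.open if path.endswith('.gz') else open
                with in_opener(path, 'rt') as inf:
                    for row in csv.reader(inf, delimiter='\t'):
                        # Kraken per-read output keeps the query ID in column 2 and the
                        # taxon ID in column 3 (one-indexed), so the merged file can be
                        # fed straight to the krona command.
                        writer.writerow(row)

    # merge_krona_input(['s1.kraken-reads.txt.gz', 's2.kraken-reads.txt'], 'merged.krona-input.tsv')

The summary path is analogous: the inputs are concatenated with util.file.cat() and the combined read-level file is passed to kraken-report against the same database.
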
5 changes: 4 additions & 1 deletion pipes/rules/metagenomics.rules
@@ -48,7 +48,10 @@ rule krona_import_taxonomy:
".".join([wildcards.sample, method_props[wildcards.method]['reads_ext']]))
output: join(config["data_dir"], config["subdirs"]["metagenomics"], "{sample}.{method,\w+}.krona.html")
run:
query_col, taxid_col = method_props[wildcards.method]['krona_columns']
# the method_props key could be changed to allow for different tools,
# but with these changes Diamond's read-level output matches Kraken's
# in the location of the two needed columns
query_col, taxid_col = method_props["kraken"]['krona_columns']
shell("""
{config[bin_dir]}/metagenomics.py krona {input} {config[krona_db]} {output} --noRank \
--queryColumn {query_col} --taxidColumn {taxid_col}
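For context, method_props is assumed to have roughly the shape sketched below (illustrative only; its real definition lives elsewhere in the rules file and is not shown in this diff). The krona_columns tuple holds the one-indexed query-ID and taxon-ID columns of the read-level report, which is why hard-coding the "kraken" entry is safe once Diamond's read-level output uses the same column layout:

    # Assumed shape of method_props (illustrative; the extension value is hypothetical)
    method_props = {
        "kraken": {"reads_ext": "<read-level output extension>", "krona_columns": (2, 3)},
    }
    query_col, taxid_col = method_props["kraken"]["krona_columns"]   # 2, 3
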
4 changes: 2 additions & 2 deletions util/file.py
@@ -326,9 +326,9 @@ def replace_in_file(filename, original, new):

def cat(output_file, input_files):
'''Cat list of input filenames to output filename.'''
with open(output_file, 'wb') as wfd:
with open_or_gzopen(output_file, 'wb') as wfd:
for f in input_files:
with open(f, 'rb') as fd:
with open_or_gzopen(f, 'rb') as fd:
shutil.copyfileobj(fd, wfd, 1024*1024*10)


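The switch from open() to open_or_gzopen() lets cat() concatenate gzipped and plain inputs transparently, which report_merge relies on when its read-level inputs are stored as .gz files. A sketch of the behavior assumed of open_or_gzopen (an approximation, not its actual source):

    import gzip

    def open_or_gzopen(fname, *opts):
        # Assumed dispatch: .gz files are (de)compressed on the fly,
        # everything else is opened normally.
        return gzip.open(fname, *opts) if fname.endswith('.gz') else open(fname, *opts)

    # With this in place, cat() can mix plain and gzipped inputs:
    # util.file.cat('combined.txt', ['a.txt', 'b.txt.gz'])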