Skip to content

Commit

Permalink
Merge from master
Browse files Browse the repository at this point in the history
Rename outLca to outReads
  • Loading branch information
yesimon committed Sep 8, 2017
1 parent 6013158 commit 2bad405
Show file tree
Hide file tree
Showing 18 changed files with 176 additions and 179 deletions.
123 changes: 73 additions & 50 deletions metagenomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
__author__ = "yesimon@broadinstitute.org"

import argparse
import codecs
import collections
import csv
import gzip
import io
import itertools
import logging
import os.path
Expand All @@ -19,9 +21,12 @@
import re
import shutil
import sys
import subprocess
import tempfile

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pysam

import util.cmd
Expand Down Expand Up @@ -150,7 +155,7 @@ def load_nodes(self, nodes_db):
BlastRecord = collections.namedtuple(
'BlastRecord', [
'query_id', 'subject_id', 'percent_identity', 'aln_length', 'mismatch_count', 'gap_open_count', 'query_start',
'query_end', 'subject_start', 'subject_end', 'e_val', 'bit_score'
'query_end', 'subject_start', 'subject_end', 'e_val', 'bit_score', 'extra'
]
)

Expand All @@ -165,7 +170,11 @@ def blast_records(f):
parts[field] = int(parts[field])
for field in (2, 10, 11):
parts[field] = float(parts[field])
yield BlastRecord(*parts)
args = parts[:12]
extra = parts[12:]
args.append(extra)

yield BlastRecord(*args)


def paired_query_id(record):
Expand All @@ -188,6 +197,10 @@ def translate_gi_to_tax_id(db, record):
return BlastRecord(*rec_list)


def blast_m8_taxids(record):
return [int(record.subject_id)]


def extract_tax_id(sam1):
'''Replace gi headers in subject ids to int taxonomy ids.'''
parts = sam1.reference_name.split('|')
Expand Down Expand Up @@ -300,18 +313,19 @@ def process_sam_hits(db, sam_hits, top_percent):
return coverage_lca(tax_ids, db.parents)


def process_blast_hits(db, blast_hits, top_percent):
def process_blast_hits(db, hits, top_percent):
'''Filter groups of blast hits and perform lca.
Args:
db: (TaxonomyDb) Taxonomy db.
blast_hits: []BlastRecord groups of hits.
hits: []BlastRecord groups of hits.
top_percent: (float) Only consider hits within this percent of top bit score.
Return:
(int) Tax id of LCA.
'''
hits = (translate_gi_to_tax_id(db, hit) for hit in blast_hits)
hits = (translate_gi_to_tax_id(db, hit) for hit in hits)

hits = [hit for hit in hits if hit.subject_id != 0]
if len(hits) == 0:
return
Expand All @@ -323,7 +337,7 @@ def process_blast_hits(db, blast_hits, top_percent):
# Sort requires realized list
valid_hits.sort(key=operator.attrgetter('bit_score'), reverse=True)
if valid_hits:
tax_ids = [hit.subject_id for hit in valid_hits]
tax_ids = tuple(itertools.chain(*(blast_m8_taxids(hit) for hit in valid_hits)))
return coverage_lca(tax_ids, db.parents)


Expand Down Expand Up @@ -554,7 +568,7 @@ def filter_file(path, sep='\t', taxid_column=0, gi_column=None, a2t=False, heade

if not skipAccession:
if stripVersion:
accessions = set(x.strip('.', 1)[0] for x in file_lines(whitelistAccessionFile))
accessions = set(x.strip().split('.', 1)[0] for x in file_lines(whitelistAccessionFile))
accession_column_i = 0
else:
accessions = set(file_lines(whitelistAccessionFile))
Expand Down Expand Up @@ -602,11 +616,11 @@ def rank_code(rank):
return "-"


def taxa_hits_from_tsv(f, tax_id_column=3):
def taxa_hits_from_tsv(f, taxid_column=2):
'''Return a counter of hits from tsv.'''
c = collections.Counter()
for row in csv.reader(f, delimiter='\t'):
tax_id = int(row[tax_id_column - 1])
tax_id = int(row[taxid_column - 1])
c[tax_id] += 1
return c

Expand All @@ -624,10 +638,14 @@ def kraken_dfs_report(db, taxa_hits):

db.children = parents_to_children(db.parents)
total_hits = sum(taxa_hits.values())
if total_hits == 0:
return ['\t'.join(['100.00', '0', '0', 'U', '0', 'unclassified'])]

lines = []
kraken_dfs(db, lines, taxa_hits, total_hits, 1, 0)
unclassified_hits = taxa_hits.get(0, 0)
unclassified_hits += taxa_hits.get(-1, 0)

if unclassified_hits > 0:
percent_covered = '%.2f' % (unclassified_hits / total_hits * 100)
lines.append(
Expand Down Expand Up @@ -721,63 +739,69 @@ def parser_krona(parser=argparse.ArgumentParser()):
return parser


def diamond(inBam, db, taxDb, outReport, outM8=None, outLca=None, numThreads=1):
def diamond(inBam, db, taxDb, outReport, outReads=None, numThreads=1):
'''
Classify reads by the taxon of the Lowest Common Ancestor (LCA)
'''
tmp_fastq = util.file.mkstempfname('.fastq')
tmp_fastq2 = util.file.mkstempfname('.fastq')
# do not convert this to samtools bam2fq unless we can figure out how to replicate
# the clipping functionality of Picard SamToFastq
picard = tools.picard.SamToFastqTool()
picard_opts = {
'CLIPPING_ATTRIBUTE': tools.picard.SamToFastqTool.illumina_clipping_attribute,
'CLIPPING_ACTION': 'X'
}
picard.execute(
s2fq = picard.execute(
inBam,
tmp_fastq,
tmp_fastq2,
picardOptions=tools.picard.PicardTools.dict_to_picard_opts(picard_opts),
JVMmemory=picard.jvmMemDefault
'/dev/stdout',
interleave=True,
illuminaClipping=True,
JVMmemory=picard.jvmMemDefault,
background=True,
stdout=subprocess.PIPE,
)

diamond_tool = tools.diamond.Diamond()
diamond_tool.install()
tmp_alignment = util.file.mkstempfname('.daa')
tmp_m8 = util.file.mkstempfname('.diamond.m8')
diamond_tool.blastx(db, [tmp_fastq, tmp_fastq2], tmp_alignment, options={'--threads': numThreads})
diamond_tool.view(tmp_alignment, tmp_m8, options={'--threads': numThreads})

if outM8:
with open(tmp_m8, 'rb') as f_in:
with gzip.open(outM8, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
taxonmap = join(taxDb, 'accession2taxid', 'prot.accession2taxid.gz')
taxonnodes = join(taxDb, 'nodes.dmp')

tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True, load_gis=True)
tmp_lca_tsv = util.file.mkstempfname('.tsv')
with open(tmp_m8) as m8, open(tmp_lca_tsv, 'w') as lca:
blast_lca(tax_db, m8, lca, paired=True, min_bit_score=50)
cmd = '{} blastx --outfmt 102 --sallseqid'.format(diamond_tool.install_and_get_path())
if numThreads is not None:
cmd += ' --threads {threads}'.format(threads=numThreads)
cmd += ' --db {db} --taxonmap {taxonmap} --taxonnodes {taxonnodes}'.format(
threads=numThreads,
db=db,
taxonmap=taxonmap,
taxonnodes=taxonnodes)

if outLca:
with open(tmp_lca_tsv, 'rb') as f_in:
with gzip.open(outLca, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
if outReads is not None:
cmd += ' | tee >(gzip > {out})'.format(out=outReads)

with open(tmp_lca_tsv) as f:
hits = taxa_hits_from_tsv(f)
diamond_ps = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, executable='/bin/bash')

def f(input_pipe, output_pipe):
output_pipe = codecs.getwriter('ascii')(output_pipe)
SeqIO.write(util.file.join_interleaved_fastq(input_pipe, output_format='fasta', num_n=16),
output_pipe, 'fasta')

util.misc.bind_pipes(s2fq.stdout, diamond_ps.stdin, f)

tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

lca_p = codecs.getreader('ascii')(diamond_ps.stdout)

hits = taxa_hits_from_tsv(lca_p)
with open(outReport, 'w') as f:
for line in kraken_dfs_report(tax_db, hits):
print(line, file=f)

s2fq.wait()
diamond_ps.wait()


def parser_diamond(parser=argparse.ArgumentParser()):
parser.add_argument('inBam', help='Input unaligned reads, BAM format.')
parser.add_argument('db', help='Diamond database directory.')
parser.add_argument('taxDb', help='Taxonomy database directory.')
parser.add_argument('outReport', help='Output taxonomy report.')
parser.add_argument('--outM8', help='Blast m8 formatted output file.')
parser.add_argument('--outLca', help='Output LCA assignments for each read.')
parser.add_argument('--outReads', help='Output LCA assignments for each read.')
parser.add_argument('--numThreads', default=1, help='Number of threads (default: %(default)s)')
util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, diamond, split_args=True)
Expand All @@ -792,7 +816,7 @@ def align_rna_metagenomics(
dupeReport=None,
outBam=None,
dupeLca=None,
outLca=None,
outReads=None,
sensitive=None,
JVMmemory=None,
numThreads=None,
Expand Down Expand Up @@ -831,7 +855,7 @@ def align_rna_metagenomics(
if dupeReport:
aln_bam_sorted = util.file.mkstempfname('.align_namesorted.bam')
samtools.sort(aln_bam, aln_bam_sorted, args=['-n'], threads=numThreads)
sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outLca=dupeLca, unique_only=False)
sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outReads=dupeLca, unique_only=False)
os.unlink(aln_bam_sorted)

aln_bam_deduped = outBam if outBam else util.file.mkstempfname('.align_deduped.bam')
Expand All @@ -844,17 +868,16 @@ def align_rna_metagenomics(
os.unlink(aln_bam)
aln_bam_dd_sorted = util.file.mkstempfname('.bam')
samtools.sort(aln_bam_deduped, aln_bam_dd_sorted, args=['-n'], threads=numThreads)
sam_lca_report(tax_db, aln_bam_dd_sorted, outReport=outReport, outLca=outLca)
sam_lca_report(tax_db, aln_bam_dd_sorted, outReport=outReport, outReads=outReads)

if not outBam:
os.unlink(aln_bam_deduped)


def sam_lca_report(tax_db, bam_aligned, outReport, outLca=None, unique_only=None):
lca_tsv = outLca
def sam_lca_report(tax_db, bam_aligned, outReport, outReads=None, unique_only=None):

if outLca:
lca_tsv = outLca
if outReads:
lca_tsv = outReads
else:
lca_tsv = util.file.mkstempfname('.tsv')

Expand All @@ -880,7 +903,7 @@ def parser_align_rna_metagenomics(parser=argparse.ArgumentParser()):
help='Use sensitive instead of default BWA mem options.'
)
parser.add_argument('--outBam', help='Output aligned, indexed BAM file. Default is to write to temp.')
parser.add_argument('--outLca', help='Output LCA assignments for each read.')
parser.add_argument('--outReads', help='Output LCA assignments for each read.')
parser.add_argument('--dupeLca', help='Output LCA assignments for each read including duplicates.')
parser.add_argument('--numThreads', default=1, help='Number of threads (default: %(default)s)')
parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion pipes/rules/metagenomics.rules
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ rule diamond:
UGER=config.get('UGER_queues', {}).get('long', '-q long')
shell:
"""
{config[bin_dir]}/metagenomics.py diamond {input} {config[diamond_db]} {config[taxonomy_db]} {output.report} --outLca {output.lca} --numThreads {params.numThreads}
{config[bin_dir]}/metagenomics.py diamond {input} {config[diamond_db]} {config[taxonomy_db]} {output.report} --outReads {output.lca} --numThreads {params.numThreads}
"""

rule kraken:
Expand Down
2 changes: 1 addition & 1 deletion requirements-conda.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ bmtagger=3.101
bwa=0.7.15
cd-hit=4.6.8
cd-hit-auxtools=4.6.8
diamond=0.8.36
diamond=0.9.10
gatk=3.6
kraken-all=0.10.6_fork2
krona=2.7
Expand Down
3 changes: 0 additions & 3 deletions test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,3 @@ def assert_none_executable():
testDir = os.path.dirname(__file__)
assert all(not os.access(os.path.join(testDir, filename), os.X_OK) for filename in os.listdir(testDir)
if filename.endswith('.py'))


assert_none_executable()
Binary file not shown.
Binary file added test/input/TestMetagenomicsSimple/zaire_ebola.bam
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
13357204 12253
12408699 138950
22129792 11103
13357204 12253
22129792 11103
27085398 12083
190684443 31646
612340474 12253
743485545 42788
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
1134776119 1
1134776120 1
1134776121 1
1134776122 1
1134776123 1
1134776124 1
32 changes: 1 addition & 31 deletions test/input/TestMetagenomicsViralMix/db/taxonomy/names.dmp
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@
11103 | human hepatitis virus C HCV | | synonym |
11103 | human hepatitis virus HCV | | misnomer |
11103 | post-transfusion hepatitis non A non B virus | | synonym |
11157 | Mononegavirales | | scientific name |
11157 | negative-sense genome single-stranded RNA viruses | | genbank common name |
11266 | Filoviridae | | scientific name |
11266 | Filovirus | | synonym |
12058 | Picormavirus | | misspelling |
12058 | Picornaviridae | | scientific name |
12058 | Picornavirus | | synonym |
Expand All @@ -34,14 +30,14 @@
12083 | HPV-2 | | acronym |
12083 | Human Poliovirus type 2 | | synonym |
12083 | Human poliovirus 2 | | scientific name |
12083 | Poliovirus 2 | | synonym |
12083 | Poliovirus type 2 | | synonym |
12234 | Tobamoviridae | | synonym |
12234 | Tobamovirus | | scientific name |
12253 | Tomato mosaic virus | | scientific name |
31646 | Hepatitis C virus subtype 1a | | scientific name |
31646 | Hepatitis C virus type 1a | | synonym |
35278 | ssRNA positive-strand viruses, no DNA stage | | scientific name |
35301 | ssRNA negative-strand viruses | | scientific name |
41856 | Hepatitis C virus 1 | | synonym |
41856 | Hepatitis C virus genotype 1 | | scientific name |
41856 | Hepatitis C virus type 1 | | synonym |
Expand All @@ -58,32 +54,6 @@
138950 | Enterovirus EV-C | | synonym |
138950 | Human enterovirus C | | synonym |
138950 | Poliovirus | | synonym |
186536 | Ebola-like viruses | | synonym |
186536 | Ebolavirus | | scientific name |
186538 | ZEBOV | | acronym |
186538 | Zaire Ebola virus | | synonym |
186538 | Zaire ebolavirus | | scientific name |
186539 | Ebola virus Reston | | synonym |
186539 | REBOV | | acronym |
186539 | RESTV | | acronym |
186539 | Reston Ebola virus | | synonym |
186539 | Reston ebolavirus | | scientific name |
186540 | Ebolavirus Sudan | | synonym |
186540 | SEBOV | | acronym |
186540 | SUDV | | acronym |
186540 | Sudan Ebola virus | | synonym |
186540 | Sudan ebolavirus | | scientific name |
186541 | CIEBOV | | acronym |
186541 | Cote d'Ivoire Ebola virus | | synonym |
186541 | Cote d'Ivoire ebolavirus | | synonym |
186541 | Ivory Coast ebolavirus | | synonym |
186541 | TAFV | | acronym |
186541 | Tai Forest ebolavirus | | scientific name |
186541 | Tai Forest virus | | synonym |
439488 | ssRNA viruses | | scientific name |
464095 | Picornavirales | | scientific name |
565995 | BDBV | | acronym |
565995 | Bundibugyo ebolavirus | | synonym |
565995 | Bundibugyo virus | | scientific name |
565995 | Ebolavirus bundibugyo | | synonym |
675071 | Virgaviridae | | scientific name |

0 comments on commit 2bad405

Please sign in to comment.