This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 269
/
assign_taxonomy.py
1370 lines (1180 loc) · 55.2 KB
/
assign_taxonomy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
from __future__ import division
__author__ = "Rob Knight, Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Rob Knight", "Greg Caporaso", "Kyle Bittinger",
"Antonio Gonzalez Pena", "David Soergel", "Jai Ram Rideout"]
__license__ = "GPL"
__version__ = "1.9.1-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
import logging
import os
import re
from os import remove
from os.path import abspath, dirname
from itertools import count
from string import strip
from tempfile import NamedTemporaryFile, mkdtemp
from cStringIO import StringIO
from collections import Counter, defaultdict
from shutil import rmtree
from skbio.parse.sequences import parse_fasta
from skbio.util import remove_files
from bfillings.blast import blast_seqs, Blastall, BlastResult
from bfillings.formatdb import build_blast_db_from_fasta_path
from bfillings.uclust import Uclust
from bfillings.sortmerna_v2 import (build_database_sortmerna,
sortmerna_map)
from bfillings import rdp_classifier
from bfillings import mothur
from bfillings import rtax
from qiime.util import FunctionWithParams, get_rdp_jarpath, get_qiime_temp_dir
"""Contains code for assigning taxonomy, using several techniques.
This module has the responsibility for taking a set of sequences and
providing a taxon assignment for each sequence."""
def validate_rdp_version(rdp_jarpath=None):
    """Return the RDP Classifier version parsed from its jar filename.

    Parameters:
    rdp_jarpath: path to the RDP Classifier jar file. If None, the
    path is looked up via get_rdp_jarpath().

    Returns the detected version as a float (e.g. 2.2).

    Raises RuntimeError if the classifier is not installed, if no
    version number can be detected in the jar filename, or if the
    detected version predates 2.2 (older versions use a different
    output format and are not supported by QIIME).
    """
    if rdp_jarpath is None:
        rdp_jarpath = get_rdp_jarpath()
    if rdp_jarpath is None:
        raise RuntimeError(
            "RDP classifier is not installed or not accessible to QIIME. "
            "See install instructions here: "
            "http://qiime.org/install/install.html#rdp-install"
        )
    rdp_jarname = os.path.basename(rdp_jarpath)
    # Raw string and multi-digit groups so filenames like
    # "rdp_classifier-2.10.jar" or "rdp_classifier-10.2.jar" are
    # parsed from the full version token, not a single-digit slice.
    version_match = re.search(r"\d+\.\d+", rdp_jarname)
    if version_match is None:
        raise RuntimeError(
            "Unable to detect RDP Classifier version in file %s" % rdp_jarname
        )
    version = float(version_match.group())
    # Bug fix: the threshold must agree with the error message below.
    # Versions prior to 2.2 are unsupported, so 2.1 must be rejected;
    # the previous check (`version < 2.1`) let 2.1 through.
    if version < 2.2:
        raise RuntimeError(
            "RDP Classifier does not look like version 2.2 or greater."
            "Versions of the software prior to 2.2 have different "
            "formatting conventions and are no longer supported by QIIME. "
            "Detected version %s from file %s" % (version, rdp_jarpath)
        )
    return version
class TaxonAssigner(FunctionWithParams):
    """A TaxonAssigner assigns a taxon to each of a set of sequences.
    This is an abstract class: subclasses should implement the __call__
    method.
    """
    Name = 'TaxonAssigner'

    def __init__(self, params):
        """Return new TaxonAssigner object with specified params.

        Note: expect params to contain both generic and per-method (e.g. for
        RDP classifier w/ Hugenholtz taxonomy) params, so leaving it as a dict
        rather than setting attributes. Some standard entries in params are:
        Taxonomy: taxonomy used (e.g. RDP, Hugenholtz)
        Similarity: similarity threshold for assignment, e.g. 0.97
        Bootstrap: bootstrap support for assignment, e.g. 0.80
        Application: 3rd-party application used, if any, e.g. RDP classifier
        """
        self.Params = params

    def __call__(self, seq_path, result_path=None, log_path=None):
        """Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.

        Parameters:
        seq_path: path to file of sequences
        result_path: path to file of results. If specified, should
        dump the result to the desired path instead of returning it.
        log_path: path to log, which should include dump of params.
        """
        raise NotImplementedError("TaxonAssigner is an abstract class")

    @staticmethod
    def _parse_id_to_taxonomy_file(f):
        """ parse the id_to_taxonomy file into a dict mapping id -> taxonomy

        Blank lines are skipped. Each non-blank line is expected to be
        "<sequence id> <tab> <taxonomy string>"; surrounding whitespace
        is stripped from both fields.
        """
        result = {}
        for line in f:
            line = line.strip()
            if line:
                identifier, taxonomy = map(strip, line.split('\t'))
                result[identifier] = taxonomy
        return result

    def _tax_assignments_to_consensus_assignments(self,
                                                  query_to_assignments):
        """ For each query id and list of assignments,
        call _get_consensus_assigment to compute the
        consensus assignment.

        Parameters
        ----------
        query_to_assignments : dict of list of lists
        The keys in the dict correspond to query IDs and
        the values are a list of lists holding associated
        taxonomies.

        Returns
        -------
        query_to_assignments: dict
        The keys in the dict correspond to query IDs and
        the values carry a single consensus taxonomy
        assignment.

        Note: the input dict is modified in place and also returned;
        only the values of existing keys are replaced.
        """
        for query_id, assignments in query_to_assignments.iteritems():
            consensus_assignment = self._get_consensus_assignment(assignments)
            query_to_assignments[query_id] = consensus_assignment
        return query_to_assignments

    def _get_consensus_assignment(self, assignments):
        """ compute the consensus assignment from a list of assignments
        (method applied to SortMeRNATaxonAssigner and UclustConsensusTaxonAssigner)

        Returns a 3-tuple of (assignment levels as a list of strings,
        consensus fraction at the deepest assigned level, number of
        input assignments considered). Reads the params
        'min_consensus_fraction' and 'unassignable_label' from
        self.Params.
        """
        num_input_assignments = len(assignments)
        consensus_assignment = []
        # if the assignments don't all have the same number
        # of levels, the resulting assignment will have a max number
        # of levels equal to the number of levels in the assignment
        # with the fewest number of levels. this is to avoid
        # a case where, for example, there are n assignments, one of
        # which has 7 levels, and the other n-1 assignments have 6 levels.
        # A 7th level in the result would be misleading because it
        # would appear to the user as though it was the consensus
        # across all n assignments.
        num_levels = min([len(a) for a in assignments])
        # iterate over the assignment levels
        for level in range(num_levels):
            # count the different taxonomic assignments at the current level.
            # the counts are computed based on the current level and all higher
            # levels to reflect that, for example, 'p__A; c__B; o__C' and
            # 'p__X; c__Y; o__C' represent different taxa at the o__ level (since
            # they are different at the p__ and c__ levels).
            current_level_assignments = \
                Counter([tuple(e[:level + 1]) for e in assignments])
            # identify the most common taxonomic assignment, and compute the
            # fraction of assignments that contained it. it's safe to compute the
            # fraction using num_assignments because the deepest level we'll
            # ever look at here is num_levels (see above comment on how that
            # is decided).
            tax, max_count = current_level_assignments.most_common(1)[0]
            max_consensus_fraction = max_count / num_input_assignments
            # check whether the most common taxonomic assignment is observed
            # in at least min_consensus_fraction of the sequences
            if max_consensus_fraction >= self.Params['min_consensus_fraction']:
                # if so, append the current level only (e.g., 'o__C' if tax is
                # 'p__A; c__B; o__C', and continue on to the next level
                consensus_assignment.append((tax[-1], max_consensus_fraction))
            else:
                # if not, there is no assignment at this level, and we're
                # done iterating over levels
                break
        # construct the results
        # determine the number of levels in the consensus assignment
        consensus_assignment_depth = len(consensus_assignment)
        if consensus_assignment_depth > 0:
            # if it's greater than 0, generate a list of the
            # taxa assignments at each level
            assignment_result = [a[0] for a in consensus_assignment]
            # and assign the consensus_fraction_result as the
            # consensus fraction at the deepest level
            consensus_fraction_result = \
                consensus_assignment[consensus_assignment_depth - 1][1]
        else:
            # if there are zero assignments, indicate that the taxa is
            # unknown
            assignment_result = [self.Params['unassignable_label']]
            # and assign the consensus_fraction_result to 1.0 (this is
            # somewhat arbitrary, but could be interpreted as all of the
            # assignments suggest an unknown taxonomy)
            consensus_fraction_result = 1.0
        return (
            assignment_result, consensus_fraction_result, num_input_assignments
        )
class SortMeRNATaxonAssigner(TaxonAssigner):
    """ Assign taxonomy using SortMeRNA
    """
    Name = 'SortMeRNATaxonAssigner'
    Application = "SortMeRNA"
    Citation = ("SortMeRNA is hosted at:\n"
                "http://bioinfo.lifl.fr/RNA/sortmerna\n"
                "https://github.com/biocore/sortmerna\n\n"
                "The following paper should be cited if this resource is "
                "used:\n\n"
                "Kopylova, E., Noe L. and Touzet, H.,\n"
                "SortMeRNA: fast and accurate filtering of ribosomal RNAs "
                "in\n"
                "metatranscriptomic data, Bioinformatics (2012) 28(24)\n"
                )
    _tracked_properties = ['Application', 'Citation']

    def __init__(self, params):
        """Return new SortMeRNATaxonAssigner with user params merged over
        the defaults below.
        """
        _params = {
            # id to taxonomy filepath
            'id_to_taxonomy_fp': None,
            # reference sequences filepath
            'reference_sequences_fp': None,
            # reference sequences indexed database
            'sortmerna_db': None,
            # Fraction of sequence hits that a taxonomy assignment
            # must show up in to be considered the consensus assignment
            'min_consensus_fraction': 0.51,
            # minimum identity to consider a hit
            'min_percent_id': 90.0,
            # minimum query coverage to consider a hit
            'min_percent_cov': 90.0,
            # output 10 best alignments
            'best_N_alignments': 10,
            # E-value
            'e_value': 1,
            # threads
            'threads': 1,
            # label to apply for queries that cannot be assigned
            'unassignable_label': 'Unassigned'
        }
        _params.update(params)
        super(SortMeRNATaxonAssigner, self).__init__(_params)

    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 HALT_EXEC=False):
        """Returns mapping of each seq to (taxonomy, consensus fraction, n).

        Parameters
        ----------
        seq_path : str, mandatory
            The filepath to input sequences.
        result_path : str, optional
            The filepath to store resulting alignments.
        log_path : str, optional
            The filepath to store logging information.
        HALT_EXEC : bool, debugging parameter
            If passed, will exit just before the sortmerna command in issued
            and will print out the command that would have been called
            to stdout.

        Returns
        -------
        dict if result_path=None
            The results will be stored in a dict:
            dict{query_id:[tax, consensus fraction, n]}
        None if result_path
            The results will be written to result_path as tab-separated
            lines of:
            query_id <tab> tax <tab> consensus fraction <tab> n
        The values represent:
            tax: the consensus taxonomy assignment
            consensus fraction: the fraction of the assignments for the
            query that contained the lowest level tax assignment that is
            included in tax (e.g., if the assignment goes to genus level,
            this will be the fraction of assignments that had the consensus
            genus assignment)
            n: the number of assignments that were considered when
            constructing the consensus
        """
        # Check input reference sequence and taxonomy are provided
        if self.Params['reference_sequences_fp'] is None:
            raise ValueError("Filepath for reference sequences is mandatory.")
        if self.Params['id_to_taxonomy_fp'] is None:
            raise ValueError("Filepath for id to taxonomy map is mandatory.")
        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))
        self.dirs_to_remove = []
        # Indexed database not provided, build it
        if not self.Params['sortmerna_db']:
            output_dir = mkdtemp()
            self.sortmerna_db, files_to_remove = \
                build_database_sortmerna(
                    abspath(self.Params['reference_sequences_fp']),
                    output_dir=output_dir)
            # the generated db files live inside output_dir, so removing
            # the directory below also removes them
            self.dirs_to_remove.append(output_dir)
        # Indexed database provided
        else:
            self.sortmerna_db = self.Params['sortmerna_db']
        # Set SortMeRNA's output directory
        if result_path is None:
            output_dir = mkdtemp()
            self.dirs_to_remove.append(output_dir)
        else:
            output_dir = dirname(abspath(result_path))
        # Call sortmerna mapper.
        # Bug fix: forward the caller's HALT_EXEC flag instead of a
        # hard-coded False -- previously the debugging parameter was
        # accepted but silently ignored.
        app_result = \
            sortmerna_map(seq_path=seq_path,
                          output_dir=output_dir,
                          sortmerna_db=self.sortmerna_db,
                          refseqs_fp=self.Params['reference_sequences_fp'],
                          e_value=self.Params['e_value'],
                          threads=self.Params['threads'],
                          best=self.Params['best_N_alignments'],
                          HALT_EXEC=HALT_EXEC)
        with open(self.Params['id_to_taxonomy_fp'], "U") as id_to_taxonomy_f:
            self.id_to_taxonomy_map = \
                self._parse_id_to_taxonomy_file(id_to_taxonomy_f)
        blast_tabular_fp = app_result['BlastAlignments'].name
        query_to_assignments = self._blast_to_tax_assignments(blast_tabular_fp)
        result = self._tax_assignments_to_consensus_assignments(
            query_to_assignments)
        # Write results to file
        if result_path is not None:
            with open(result_path, 'w') as of:
                of.write('#OTU ID\ttaxonomy\tconfidence\tnum hits\n')
                for seq_id, (assignment, consensus_fraction, n) in result.items():
                    assignment_str = ';'.join(assignment)
                    of.write('%s\t%s\t%1.2f\t%d\n' % (
                        seq_id, assignment_str, consensus_fraction, n))
            result = None
            logger.info('Result path: %s' % result_path)
        else:
            # If no result_path was provided, the result dict is
            # returned as-is.
            logger.info('Result path: None, returned as dict.')
        # clean up. Bug fix: use an explicit loop instead of map() --
        # under Python 3, map() is lazy and the temp dirs were never
        # actually removed.
        for dir_to_remove in self.dirs_to_remove:
            rmtree(dir_to_remove)
        return result

    def _get_logger(self, log_path=None):
        """Return a logger writing to log_path, or a silent logger if
        log_path is None.
        """
        if log_path is not None:
            handler = logging.FileHandler(log_path, mode='w')
        else:
            # swallow all records when no log file was requested
            class NullHandler(logging.Handler):
                def emit(self, record):
                    pass
            handler = NullHandler()
        logger = logging.getLogger("SortMeRNATaxonAssigner logger")
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        return logger

    def _blast_to_tax_assignments(self,
                                  blast_output_fp):
        """ Parse SortMeRNA's Blast-like tabular format for query
        IDs and the references they map to, use the reference IDs
        to find the associated taxonomies in the id_to_taxonomy_map.

        Three types of alignments are possible,
        1. The Null alignment (E-value threshold failed):
           not16S.1_130\t*\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t*\t0
        2. All alignments for a query pass the E-value threshold
           but fail the %id threshold (3rd column is %id):
           f1_4866\t426848\t85.4\t121\t15\t3\t1\t121\t520\t641\t4.79e-32\t131\t72M1D7M1I13M1D28M31S\t79.6
           f1_4866\t342684\t84\t91\t9\t6\t1\t91\t522\t612\t2.8e-19\t89\t55M1D4M1I12M1D3M1D4M1I1M1I9M61S\t59.9
        3. Some/all alignments for a query pass both E-value and %id
           thresholds:
           f2_1271\t295053\t100\t128\t0\t0\t1\t128\t520\t647\t1.15e-59\t223\t128M\t100
           f2_1271\t42684\t84.8\t124\t17\t2\t1\t124\t527\t650\t2.63e-32\t132\t101M1D6M1I16M4S\t96.9

        Parameters
        ----------
        blast_output_fp : str
            Filepath to Blast-like tabular alignments.

        Returns
        -------
        result : dict of list of lists
            The keys in the dict correspond to query IDs and
            the values are a list of lists holding associated
            taxonomies.
        """
        min_percent_id = self.Params['min_percent_id']
        result = defaultdict(list)
        with open(blast_output_fp, "U") as blast_output:
            for line in blast_output:
                fields = line.split('\t')
                query_id = fields[0]
                subject_id = fields[1]
                percent_id = float(fields[2])
                # sequence was not aligned
                if subject_id == "*":
                    result[query_id].append([])
                # sequence was aligned, passing %id threshold
                elif percent_id >= min_percent_id:
                    # if exists, remove the empty alignment (failing %id
                    # threshold) for this sequence (Blast tabular output
                    # will list all alignments passing E-value threshold,
                    # not necessarily the %id threshold). It should happen
                    # rarely that an alignment passing the %id threshold
                    # comes after an alignment that failed the threshold,
                    # but it can happen (Blast alignments are often ordered
                    # from highest %id to lowest), though as sortmerna uses
                    # a heuristic, this isn't always guaranteed.
                    if [] in result[query_id]:
                        result[query_id].remove([])
                    # add alignment passing %id threshold
                    subject_tax = self.id_to_taxonomy_map[
                        subject_id].strip().split(';')
                    result[query_id].append(subject_tax)
                # sequence was aligned, however failing %id threshold
                # if no alignment results have been recorded for this
                # sequence up to now, add an empty list
                elif not result[query_id]:
                    result[query_id].append([])
        return result
class BlastTaxonAssigner(TaxonAssigner):
    """ Assign taxon best on best blast hit above a threshold
    """
    Name = 'BlastTaxonAssigner'
    # number of query sequences blasted per invocation of blastall
    SeqsPerBlastRun = 1000

    def __init__(self, params):
        """ Initialize the object
        """
        _params = {
            'Min percent identity': 90.0,
            'Max E value': 1e-30,
            'Application': 'blastn/megablast'
        }
        _params.update(params)
        TaxonAssigner.__init__(self, _params)

    def __call__(self, seq_path=None, seqs=None,
                 result_path=None, log_path=None):
        """Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.
        """
        assert seq_path or seqs, \
            "Must provide either seqs or seq_path when calling a BlastTaxonAssigner."
        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))
        # assign the blast database, either as a pre-exisiting database
        # specified as self.Params['blast_db'] or by creating a
        # temporary database from the sequence file specified
        # as self.Params['reference_seqs_filepath'].
        # blast_db_dir doubles as the "temp db was built" flag used
        # for cleanup at the end of this method.
        blast_db_dir = None
        db_files_to_remove = []
        try:
            blast_db = self.Params['blast_db']
        except KeyError:
            # build a temporary blast_db
            reference_seqs_path = self.Params['reference_seqs_filepath']
            refseqs_dir, refseqs_name = os.path.split(reference_seqs_path)
            blast_db_dir = mkdtemp(prefix='bltax-', dir=get_qiime_temp_dir())
            blast_db, db_files_to_remove = build_blast_db_from_fasta_path(
                abspath(reference_seqs_path), output_dir=blast_db_dir)
        # build the mapping of sequence identifier
        # (wrt to the blast db seqs) to taxonomy.
        # Use a context manager so the file handle is closed
        # (previously it leaked).
        with open(self.Params['id_to_taxonomy_filepath'], 'U') as id_to_tax_f:
            id_to_taxonomy_map = self._parse_id_to_taxonomy_file(id_to_tax_f)
        # Iterate over the input self.SeqsPerBlastRun seqs at a time.
        # There are two competing issues here when dealing with very large
        # inputs. If all sequences are read in at once, the containing object
        # can be very large, causing the system to page. On the other hand,
        # in such cases it would be very slow to treat each sequence
        # individually, since blast requires a filepath. Each call would
        # therefore involve writing a single sequence to file, opening/closing
        # and removing the file. To balance this, sequences are read in and
        # blasted in chunks of self.SeqsPerBlastRun (defualt: 1000) at a time.
        # This appears to solve the problem with the largest sets I've worked
        # with so far.
        if seq_path:
            # Get a seq iterator
            seqs = parse_fasta(open(seq_path))
        # Build object to keep track of the current set of sequence to be
        # blasted, and the results (i.e., seq_id -> (taxonomy,quaility score)
        # mapping)
        current_seqs = []
        result = {}
        # Iterate over the (seq_id, seq) pairs
        for seq_id, seq in seqs:
            # append the current seq_id,seq to list of seqs to be blasted
            current_seqs.append((seq_id, seq))
            # When there are 1000 in the list, blast them
            if len(current_seqs) == self.SeqsPerBlastRun:
                # update the result object
                result.update(self._seqs_to_taxonomy(
                    current_seqs, blast_db, id_to_taxonomy_map))
                # reset the list of seqs to be blasted
                current_seqs = []
        # Assign taxonomy to the remaining sequences
        result.update(self._seqs_to_taxonomy(
            current_seqs, blast_db, id_to_taxonomy_map))
        # End iteration over the input self.SeqsPerBlastRun seqs at a time.
        # Write log data if we have a path (while the logger can handle
        # being called if we are not logging, some of these steps are slow).
        if log_path is not None:
            num_inspected = len(result)
            logger.info('Number of sequences inspected: %s' % num_inspected)
            num_null_hits = [r[1] for r in result.values()].count(None)
            logger.info('Number with no blast hits: %s' % num_null_hits)
        if result_path:
            # if the user provided a result_path, write the
            # results to file
            with open(result_path, 'w') as of:
                for seq_id, (lineage, confidence, blast_hit_id) in result.items():
                    of.write('%s\t%s\t%s\t%s\n' %
                             (seq_id, lineage, confidence, blast_hit_id))
            result = None
            logger.info('Result path: %s' % result_path)
        else:
            # if no result_path was provided, return the data as a dict
            logger.info('Result path: None, returned as dict.')
        # clean-up temp blastdb files, if a temp blastdb was created.
        # Bug fix: previously this was keyed on the presence of
        # 'reference_seqs_filepath' in self.Params, which raised a
        # NameError on blast_db_dir when both 'blast_db' and
        # 'reference_seqs_filepath' were supplied (no temp db built).
        if blast_db_dir is not None:
            remove_files(db_files_to_remove)
            rmtree(blast_db_dir)
        # return the result
        return result

    def _seqs_to_taxonomy(self, seqs, blast_db, id_to_taxonomy_map):
        """ Assign taxonomy to (seq_id,seq) pairs
        """
        # Handle the case of no seqs passed in
        if not seqs:
            return {}
        # blast the seqs
        blast_hits = self._get_blast_hits(blast_db, seqs)
        # select the best blast hit for each query sequence
        best_blast_hit_ids = self._get_first_blast_hit_per_seq(blast_hits)
        # map the identifier of the best blast hit to (taxonomy, e-value)
        return self._map_ids_to_taxonomy(
            best_blast_hit_ids, id_to_taxonomy_map)

    def _get_logger(self, log_path=None):
        """Return a logger writing to log_path, or a silent logger if
        log_path is None.
        """
        if log_path is not None:
            handler = logging.FileHandler(log_path, mode='w')
        else:
            class NullHandler(logging.Handler):
                def emit(self, record):
                    pass
            handler = NullHandler()
        logger = logging.getLogger("BlastTaxonAssigner logger")
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        return logger

    def _map_ids_to_taxonomy(self, hits, id_to_taxonomy_map):
        """ map {query_id:(best_blast_seq_id,e-val)} to {query_id:(tax,e-val,best_blast_seq_id)}
        """
        for query_id, hit in hits.items():
            query_id = query_id.split()[0]
            try:
                hit_id, e_value = hit
                hits[query_id] = \
                    (id_to_taxonomy_map.get(hit_id, None), e_value, hit_id)
            except TypeError:
                # hit is None (no blast hit for this query)
                hits[query_id] = ('No blast hit', None, None)
        return hits

    def _get_blast_hits(self, blast_db, seqs):
        """ blast each seq in seqs against blast_db and retain good hits
        """
        max_evalue = self.Params['Max E value']
        min_percent_identity = self.Params['Min percent identity']
        # accept either a fraction (e.g. 0.9) or a percentage (e.g. 90.0)
        if min_percent_identity < 1.0:
            min_percent_identity *= 100.0
        seq_ids = [s[0] for s in seqs]
        result = {}
        blast_result = blast_seqs(
            seqs, Blastall, blast_db=blast_db,
            params={'-p': 'blastn', '-n': 'T'},
            add_seq_names=False)
        if blast_result['StdOut']:
            lines = [x for x in blast_result['StdOut']]
            blast_result = BlastResult(lines)
        else:
            # no blast output at all: every query gets its own empty
            # hit list. Bug fix: previously {}.fromkeys(seq_ids, [])
            # shared a single mutable list across all keys.
            return dict((seq_id, []) for seq_id in seq_ids)
        for seq_id in seq_ids:
            blast_result_id = seq_id.split()[0]
            try:
                result[seq_id] = [(e['SUBJECT ID'], float(e['E-VALUE']))
                                  for e in blast_result[blast_result_id][0]
                                  if (float(e['E-VALUE']) <= max_evalue and
                                      float(e['% IDENTITY']) >= min_percent_identity)]
            except KeyError:
                # no entry in the blast output for this query
                result[seq_id] = []
        return result

    def _get_first_blast_hit_per_seq(self, blast_hits):
        """ discard all blast hits except the best for each query sequence
        """
        result = {}
        for k, v in blast_hits.items():
            k = k.split()[0]  # get rid of spaces
            try:
                result[k] = v[0]
            except IndexError:
                # If there is no good blast hit, do we want to
                # leave the key out, or have it point to None?
                result[k] = None
        return result
class MothurTaxonAssigner(TaxonAssigner):
    """Assign taxonomy using Mothur's naive Bayes implementation
    """
    Name = 'MothurTaxonAssigner'
    Application = "Mothur"
    Citation = (
        "Schloss, P.D., et al., Introducing mothur: Open-source, platform-"
        "independent, community-supported software for describing and "
        "comparing microbial communities. Appl Environ Microbiol, 2009. "
        "75(23):7537-41."
    )
    _tracked_properties = ['Application', 'Citation']

    def __init__(self, params):
        """Return new MothurTaxonAssigner with user params merged over
        the defaults below.
        """
        _params = {
            'Confidence': 0.50,
            'Iterations': None,
            'KmerSize': None,
            'id_to_taxonomy_fp': None,
            'reference_sequences_fp': None,
        }
        _params.update(params)
        super(MothurTaxonAssigner, self).__init__(_params)

    def _format_id_to_taxonomy(self, id_to_taxonomy_file):
        """Reformat taxa to comply with Mothur formatting requirements.

        Mothur requires lineages to be semicolon-separated with no space
        following the semicolon. (QIIME convention is to include a
        space.) Taxa may have no internal spaces. Furthermore, each
        lineage must end with a semi-colon.

        Returns the re-formatted id-to-taxonomy file as an open file
        object.
        """
        mothur_tax_file = NamedTemporaryFile(
            prefix='MothurTaxonAssigner_',
            suffix='.txt',
            dir=get_qiime_temp_dir())
        original_taxonomy = self._parse_id_to_taxonomy_file(id_to_taxonomy_file)
        for seq_id, lineage in original_taxonomy.iteritems():
            mothur_tax_file.write(seq_id)
            mothur_tax_file.write('\t')
            taxa = [t.strip() for t in lineage.split(';')]
            for taxon in taxa:
                mothur_tax_file.write(self._format_taxon(taxon))
                mothur_tax_file.write(';')
            mothur_tax_file.write('\n')
        # rewind so the caller (and mothur) can read from the start
        mothur_tax_file.seek(0)
        return mothur_tax_file

    def _unformat_result(self, result):
        """Transform results to remove any changes introduced by formatting.
        """
        unformatted_result = {}
        for seq_id, (taxa, conf) in result.iteritems():
            unformatted_taxa = [self._unformat_taxon(t) for t in taxa]
            unformatted_result[seq_id] = (unformatted_taxa, conf)
        return unformatted_result

    def _format_taxon(self, taxon):
        """Format taxon for MOTHUR, removing internal spaces.

        Original taxon names are saved to self._original_taxa for later lookup.
        """
        # Create private attribute to store unformatted taxon names.
        # If _unformat_taxon() is called without first calling
        # _format_taxon(), this attribute will be missing, and an
        # AttributeError will be raised.
        if not hasattr(self, "_original_taxa"):
            self._original_taxa = {}
        # Escape backslashes
        mothur_taxon = taxon.replace("\\", "\\\\")
        # Escape underscores
        mothur_taxon = mothur_taxon.replace("_", "\\_")
        # Now we can safely replace spaces with underscores
        mothur_taxon = mothur_taxon.replace(' ', '_')
        if mothur_taxon != taxon:
            previously_registered_taxon = self._original_taxa.get(mothur_taxon)
            # If we have not yet registered the escaped taxon name, add it now.
            if previously_registered_taxon is None:
                self._original_taxa[mothur_taxon] = taxon
            # Otherwise, check that the previously registered taxon is
            # consistent with the current taxon. If we have not
            # escaped the taxon names properly, two distinct taxa may
            # be registered under the same name. This should probably
            # never happen, but I can't prove it, so we check and
            # raise an error if the taxa are inconsistent.
            elif taxon != previously_registered_taxon:
                raise ValueError(
                    "Taxon %s conflicts with another taxon, %s. "
                    "Please change one of the names." % (
                        taxon, previously_registered_taxon))
        return mothur_taxon

    def _unformat_taxon(self, taxon):
        """Recover original taxon names that were altered due to formatting.

        Looks up taxon names in the attribute self._original_taxa. If
        self._format_taxon() was never called, this attribute will be
        missing, and an AttributeError will be raised.
        """
        return self._original_taxa.get(taxon, taxon)

    def __call__(self, seq_path, result_path=None, log_path=None):
        """Assign taxonomy via mothur; returns {seq_id:(taxa, conf)} or,
        if result_path is given, writes the results there and returns None.
        """
        seq_file = open(seq_path)
        percent_confidence = int(self.Params['Confidence'] * 100)
        with open(self.Params['id_to_taxonomy_fp'], "U") as tax_file:
            mothur_tax_file = self._format_id_to_taxonomy(tax_file)
        try:
            result = mothur.mothur_classify_file(
                query_file=seq_file,
                ref_fp=self.Params['reference_sequences_fp'],
                tax_fp=mothur_tax_file.name,
                cutoff=percent_confidence,
                iters=self.Params['Iterations'],
                ksize=self.Params['KmerSize'],
                output_fp=None,
                tmp_dir=get_qiime_temp_dir()
            )
        finally:
            mothur_tax_file.close()
            # Bug fix: close the query file handle (previously leaked)
            seq_file.close()
        result = self._unformat_result(result)
        # Bug fix: write the log before the result_path early return,
        # so log_path is honored even when result_path is provided
        # (previously the log write was unreachable in that case).
        if log_path:
            self.writeLog(log_path)
        if result_path is not None:
            with open(result_path, "w") as f:
                for seq_id, (taxa, conf) in result.iteritems():
                    lineage = ';'.join(taxa)
                    f.write("%s\t%s\t%.2f\n" % (seq_id, lineage, conf))
            return None
        return result
class RdpTaxonAssigner(TaxonAssigner):
"""Assign taxon using RDP's naive Bayesian classifier
"""
Name = "RdpTaxonAssigner"
Application = "RDP classfier"
Citation = "Wang, Q, G. M. Garrity, J. M. Tiedje, and J. R. Cole. 2007. Naive Bayesian Classifier for Rapid Assignment of rRNA Sequences into the New Bacterial Taxonomy. Appl Environ Microbiol. 73(16):5261-7."
Taxonomy = "RDP"
_tracked_properties = ['Application', 'Citation', 'Taxonomy']
def __init__(self, params):
"""Return new RdpTaxonAssigner object with specified params.
Standard entries in params are:
Taxonomy: taxonomy used (e.g. RDP, Hugenholtz)
"""
_params = {
'Confidence': 0.50,
'id_to_taxonomy_fp': None,
'reference_sequences_fp': None,
'training_data_properties_fp': None,
'max_memory': None
}
_params.update(params)
TaxonAssigner.__init__(self, _params)
def __call__(self, seq_path, result_path=None, log_path=None):
"""Returns dict mapping {seq_id:(taxonomy, confidence)} for
each seq.
Parameters:
seq_path: path to file of sequences
result_path: path to file of results. If specified, dumps the
result to the desired path instead of returning it.
log_path: path to log, which should include dump of params.
"""
tmp_dir = get_qiime_temp_dir()
min_conf = self.Params['Confidence']
training_data_properties_fp = self.Params[
'training_data_properties_fp']
reference_sequences_fp = self.Params['reference_sequences_fp']
id_to_taxonomy_fp = self.Params['id_to_taxonomy_fp']
max_memory = self.Params['max_memory']
seq_file = open(seq_path, 'U')
if reference_sequences_fp and id_to_taxonomy_fp:
# Train and assign taxonomy
taxonomy_file, training_seqs_file = self._generate_training_files()
results = rdp_classifier.train_rdp_classifier_and_assign_taxonomy(
training_seqs_file, taxonomy_file, seq_file,
min_confidence=min_conf,
classification_output_fp=result_path,
max_memory=max_memory, tmp_dir=tmp_dir)
if result_path is None:
results = self._training_set.fix_results(results)
else:
self._training_set.fix_output_file(result_path)
else:
# Just assign taxonomy, using properties file if passed
if training_data_properties_fp:
fix_ranks = False
else:
fix_ranks = True
results = rdp_classifier.assign_taxonomy(
seq_file, min_confidence=min_conf, output_fp=result_path,
training_data_fp=training_data_properties_fp,
max_memory=max_memory, fixrank=fix_ranks, tmp_dir=tmp_dir)
if log_path:
self.writeLog(log_path)
return results
    def _generate_training_files(self):
        """Returns a tuple of file objects suitable for passing to the
        RdpTrainer application controller.

        Reads 'reference_sequences_fp' and 'id_to_taxonomy_fp' from
        self.Params, builds an RdpTrainingSet, and writes two temp
        files (taxonomy and training sequences) in the QIIME temp dir.

        Side effect: stores the training set on self._training_set so
        the caller can later un-mangle taxon names in the results.
        """
        tmp_dir = get_qiime_temp_dir()
        training_set = RdpTrainingSet()
        reference_seqs_file = open(self.Params['reference_sequences_fp'], 'U')
        id_to_taxonomy_file = open(self.Params['id_to_taxonomy_fp'], 'U')
        for seq_id, seq in parse_fasta(reference_seqs_file):
            training_set.add_sequence(seq_id, seq)
        for line in id_to_taxonomy_file:
            seq_id, lineage_str = map(strip, line.split('\t'))
            training_set.add_lineage(seq_id, lineage_str)
        training_set.dereplicate_taxa()
        # NamedTemporaryFile objects are returned open on purpose: the
        # files are deleted when the handles are closed, so they must
        # stay alive until the trainer has consumed them.
        rdp_taxonomy_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
        rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
        # Rewind so the consumer reads from the beginning.
        rdp_taxonomy_file.seek(0)
        rdp_training_seqs_file = NamedTemporaryFile(
            prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta',
            dir=tmp_dir)
        for rdp_id, seq in training_set.get_training_seqs():
            rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
        rdp_training_seqs_file.seek(0)
        self._training_set = training_set
        return rdp_taxonomy_file, rdp_training_seqs_file
class RdpTrainingSet(object):

    def __init__(self):
        """Create an empty training set for the RDP classifier."""
        # seq_id -> raw sequence string
        self.sequences = {}
        # seq_id -> leaf node of the lineage in the taxonomy tree
        self.sequence_nodes = {}
        # Rank count of the first lineage seen; all others must match.
        self.lineage_depth = None
        # Hierarchical taxonomy under construction.
        self._tree = RdpTree()
def add_sequence(self, seq_id, seq):
self.sequences[seq_id] = seq
def add_lineage(self, seq_id, lineage_str):
for char, escape_str in _QIIME_RDP_ESCAPES:
lineage_str = re.sub(char, escape_str, lineage_str)
lineage = self._parse_lineage(lineage_str)
seq_node = self._tree.insert_lineage(lineage)
self.sequence_nodes[seq_id] = seq_node
def dereplicate_taxa(self):
return self._tree.dereplicate_taxa()
def _parse_lineage(self, lineage_str):
"""Returns a list of taxa from the semi-colon-separated
lineage string of an id_to_taxonomy file.
"""
lineage = lineage_str.strip().split(';')
if self.lineage_depth is None:
self.lineage_depth = len(lineage)
if len(lineage) != self.lineage_depth:
raise ValueError(
'Because the RDP Classifier operates in a bottom-up manner, '
'each taxonomy assignment in the id-to-taxonomy file must have '
'the same number of ranks. Detected %s ranks in the first '
'item of the file, but detected %s ranks later in the file. '
'Offending taxonomy string: %s' %
(self.lineage_depth, len(lineage), lineage_str))
return lineage
def get_training_seqs(self):
"""Returns an iterator of valid training sequences in
RDP-compatible format
Each training sequence is represented by a tuple (rdp_id,
seq). The rdp_id consists of two items: the original sequence
ID with whitespace replaced by underscores, and the lineage
with taxa separated by semicolons.
"""
# Rdp requires unique sequence IDs without whitespace. Can't
# trust user IDs to not have whitespace, so we replace all
# whitespace with an underscore. Classification may fail if
# the replacement method generates a name collision.
for seq_id, node in self.sequence_nodes.iteritems():
seq = self.sequences.get(seq_id)
if seq is not None:
lineage = node.get_lineage()
rdp_id = '%s %s' % (
re.sub('\s',
'_',
seq_id),
';'.join(lineage))
yield rdp_id, seq
def get_rdp_taxonomy(self):
return self._tree.get_rdp_taxonomy()
def fix_output_file(self, result_path):
# Ultimate hack to replace mangled taxa names
temp_results = StringIO()
for line in open(result_path):
line = re.sub(
_QIIME_RDP_TAXON_TAG + "[^;\n\t]*", '', line)
for char, escape_str in _QIIME_RDP_ESCAPES:
line = re.sub(escape_str, char, line)
temp_results.write(line)
open(result_path, 'w').write(temp_results.getvalue())
def fix_results(self, results_dict):
for seq_id, assignment in results_dict.iteritems():
lineage, confidence = assignment
lineage = re.sub(
_QIIME_RDP_TAXON_TAG + "[^;\n\t]*", '', lineage)
for char, escape_str in _QIIME_RDP_ESCAPES:
lineage = re.sub(escape_str, char, lineage)
results_dict[seq_id] = (lineage, confidence)