#!/usr/bin/env python
"""Contains code for assigning taxonomy, using several techniques.

This module has the responsibility for taking a set of sequences and
providing a taxon assignment for each sequence."""
from __future__ import division
__author__ = "Rob Knight, Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Rob Knight", "Greg Caporaso", "Kyle Bittinger",
"Antonio Gonzalez Pena", "David Soergel", "Jai Ram Rideout"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
import logging
import os
import re
from os import remove
from itertools import count
from string import strip
from shutil import copy as copy_file
from tempfile import NamedTemporaryFile
from cStringIO import StringIO
from collections import Counter, defaultdict
from cogent import LoadSeqs, DNA
from cogent.app.formatdb import build_blast_db_from_fasta_path
from cogent.app.blast import blast_seqs, Blastall, BlastResult
from cogent.app import rtax
from cogent.app.util import ApplicationNotFoundError
from cogent.parse.fasta import MinimalFastaParser
from qiime.pycogent_backports.uclust import Uclust
from qiime.pycogent_backports import rdp_classifier
from qiime.pycogent_backports import mothur
from qiime.util import FunctionWithParams, get_rdp_jarpath, get_qiime_temp_dir
# Load Tax2Tree if it's available. If it's not, skip it, but set up
# to raise errors if the user tries to use it.
try:
from t2t.nlevel import load_consensus_map, load_tree, determine_rank_order
from qiime.pycogent_backports import tax2tree
except ImportError:
    def raise_tax2tree_not_found_error(*args, **kwargs):
        raise ApplicationNotFoundError(
            "Tax2Tree cannot be found.\nIs Tax2Tree installed? Is it in "
            "your $PYTHONPATH?\nYou can obtain Tax2Tree from "
            "http://sourceforge.net/projects/tax2tree/.")
    # Bind the names that could not be imported to
    # raise_tax2tree_not_found_error, so any use raises a helpful error.
    load_consensus_map = load_tree = determine_rank_order = tax2tree = \
        raise_tax2tree_not_found_error
"""Contains code for assigning taxonomy, using several techniques.
This module has the responsibility for taking a set of sequences and
providing a taxon assignment for each sequence."""
def validate_rdp_version(rdp_jarpath=None):
if rdp_jarpath is None:
rdp_jarpath = get_rdp_jarpath()
if rdp_jarpath is None:
raise RuntimeError(
"RDP classifier is not installed or not accessible to QIIME. "
"See install instructions here: "
"http://qiime.org/install/install.html#rdp-install"
)
rdp_jarname = os.path.basename(rdp_jarpath)
    version_match = re.search(r"\d\.\d", rdp_jarname)
if version_match is None:
raise RuntimeError(
"Unable to detect RDP Classifier version in file %s" % rdp_jarname
)
version = float(version_match.group())
    if version < 2.2:
        raise RuntimeError(
            "RDP Classifier does not look like version 2.2 or greater. "
"Versions of the software prior to 2.2 have different "
"formatting conventions and are no longer supported by QIIME. "
"Detected version %s from file %s" % (version, rdp_jarpath)
)
return version
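# A minimal usage sketch for validate_rdp_version. The jar path below is
# hypothetical; the function only assumes the standard RDP jar naming
# convention (e.g. rdp_classifier-2.2.jar):
#
#     version = validate_rdp_version('/opt/rdp/rdp_classifier-2.2.jar')
#     # version == 2.2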
class TaxonAssigner(FunctionWithParams):
"""A TaxonAssigner assigns a taxon to each of a set of sequences.
This is an abstract class: subclasses should implement the __call__
method.
"""
Name = 'TaxonAssigner'
def __init__(self, params):
"""Return new TaxonAssigner object with specified params.
Note: expect params to contain both generic and per-method (e.g. for
RDP classifier w/ Hugenholtz taxonomy) params, so leaving it as a dict
rather than setting attributes. Some standard entries in params are:
Taxonomy: taxonomy used (e.g. RDP, Hugenholtz)
Similarity: similarity threshold for assignment, e.g. 0.97
Bootstrap: bootstrap support for assignment, e.g. 0.80
Application: 3rd-party application used, if any, e.g. RDP classifier
"""
self.Params = params
    def __call__(self, seq_path, result_path=None, log_path=None):
"""Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.
Parameters:
seq_path: path to file of sequences
result_path: path to file of results. If specified, should
dump the result to the desired path instead of returning it.
log_path: path to log, which should include dump of params.
"""
        raise NotImplementedError("TaxonAssigner is an abstract class")
@staticmethod
def _parse_id_to_taxonomy_file(f):
""" parse the id_to_taxonomy file into a dict mapping id -> taxonomy
"""
result = {}
for line in f:
line = line.strip()
if line:
identifier, taxonomy = map(strip, line.split('\t'))
result[identifier] = taxonomy
return result
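# _parse_id_to_taxonomy_file expects tab-separated lines mapping a
# reference sequence id to a semicolon-delimited lineage string. A made-up
# two-line example of the file format (<tab> stands for a tab character):
#
#     seq0000001<tab>Archaea;Euryarchaeota;Halobacteriales
#     seq0000002<tab>Bacteria;Proteobacteria;Gammaproteobacteria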
class BlastTaxonAssigner(TaxonAssigner):
""" Assign taxon best on best blast hit above a threshold
"""
Name = 'BlastTaxonAssigner'
SeqsPerBlastRun = 1000
def __init__(self, params):
""" Initialize the object
"""
_params = {
'Min percent identity': 90.0,
'Max E value': 1e-30,
'Application': 'blastn/megablast'
}
_params.update(params)
TaxonAssigner.__init__(self, _params)
def __call__(self, seq_path=None, seqs=None, result_path=None, log_path=None):
"""Returns dict mapping {seq_id:(taxonomy, confidence)} for each seq.
"""
assert seq_path or seqs, \
"Must provide either seqs or seq_path when calling a BlastTaxonAssigner."
# initialize the logger
logger = self._get_logger(log_path)
logger.info(str(self))
        # assign the blast database, either as a pre-existing database
# specified as self.Params['blast_db'] or by creating a
# temporary database from the sequence file specified
# as self.Params['reference_seqs_filepath']
        db_files_to_remove = []
        try:
            blast_db = self.Params['blast_db']
        except KeyError:
# build a temporary blast_db
reference_seqs_path = self.Params['reference_seqs_filepath']
refseqs_dir, refseqs_name = os.path.split(reference_seqs_path)
blast_db, db_files_to_remove = \
build_blast_db_from_fasta_path(reference_seqs_path)
# build the mapping of sequence identifier
        # (with respect to the blast db seqs) to taxonomy
id_to_taxonomy_map = self._parse_id_to_taxonomy_file(\
open(self.Params['id_to_taxonomy_filepath'],'U'))
## Iterate over the input self.SeqsPerBlastRun seqs at a time.
# There are two competing issues here when dealing with very large
# inputs. If all sequences are read in at once, the containing object
# can be very large, causing the system to page. On the other hand,
# in such cases it would be very slow to treat each sequence
# individually, since blast requires a filepath. Each call would
# therefore involve writing a single sequence to file, opening/closing
# and removing the file. To balance this, sequences are read in and
        # blasted in chunks of self.SeqsPerBlastRun (default: 1000) at a time.
# This appears to solve the problem with the largest sets I've worked
# with so far.
if seq_path:
# Get a seq iterator
seqs = MinimalFastaParser(open(seq_path))
        # Build object to keep track of the current set of sequences to be
        # blasted, and the results (i.e., a seq_id -> (taxonomy, quality
        # score) mapping)
current_seqs = []
result = {}
# Iterate over the (seq_id, seq) pairs
for seq_id, seq in seqs:
# append the current seq_id,seq to list of seqs to be blasted
current_seqs.append((seq_id,seq))
            # When there are self.SeqsPerBlastRun seqs in the list, blast them
if len(current_seqs) == self.SeqsPerBlastRun:
# update the result object
result.update(self._seqs_to_taxonomy(\
current_seqs,blast_db,id_to_taxonomy_map))
# reset the list of seqs to be blasted
current_seqs = []
# Assign taxonomy to the remaining sequences
result.update(self._seqs_to_taxonomy(\
current_seqs,blast_db,id_to_taxonomy_map))
## End iteration over the input self.SeqsPerBlastRun seqs at a time.
# Write log data if we have a path (while the logger can handle
# being called if we are not logging, some of these steps are slow).
if log_path is not None:
num_inspected = len(result)
logger.info('Number of sequences inspected: %s' % num_inspected)
num_null_hits = [r[1] for r in result.values()].count(None)
logger.info('Number with no blast hits: %s' % num_null_hits)
if result_path:
# if the user provided a result_path, write the
# results to file
of = open(result_path,'w')
for seq_id, (lineage, confidence, blast_hit_id) in result.items():
of.write('%s\t%s\t%s\t%s\n' %
(seq_id, lineage, confidence, blast_hit_id))
of.close()
result = None
logger.info('Result path: %s' % result_path)
        else:
            # If no result_path was provided, the result is returned
            # as a dict, so no modification is necessary.
            logger.info('Result path: None, returned as dict.')
# clean-up temp blastdb files, if a temp blastdb was created
if 'reference_seqs_filepath' in self.Params:
map(remove,db_files_to_remove)
# return the result
return result
def _seqs_to_taxonomy(self,seqs,blast_db,id_to_taxonomy_map):
""" Assign taxonomy to (seq_id,seq) pairs
"""
# Handle the case of no seqs passed in
if not seqs:
return {}
# blast the seqs
blast_hits = self._get_blast_hits(blast_db,seqs)
# select the best blast hit for each query sequence
best_blast_hit_ids = self._get_first_blast_hit_per_seq(blast_hits)
# map the identifier of the best blast hit to (taxonomy, e-value)
return self._map_ids_to_taxonomy(\
best_blast_hit_ids,id_to_taxonomy_map)
def _get_logger(self, log_path=None):
if log_path is not None:
handler = logging.FileHandler(log_path, mode='w')
else:
class NullHandler(logging.Handler):
def emit(self, record): pass
handler = NullHandler()
logger = logging.getLogger("BlastTaxonAssigner logger")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger
def _map_ids_to_taxonomy(self, hits, id_to_taxonomy_map):
""" map {query_id:(best_blast_seq_id,e-val)} to {query_id:(tax,e-val,best_blast_seq_id)}
"""
for query_id, hit in hits.items():
query_id=query_id.split()[0]
try:
hit_id, e_value = hit
hits[query_id] = \
(id_to_taxonomy_map.get(hit_id, None),e_value,hit_id)
except TypeError:
hits[query_id] = ('No blast hit', None, None)
return hits
def _get_blast_hits(self,blast_db,seqs):
""" blast each seq in seqs against blast_db and retain good hits
"""
max_evalue = self.Params['Max E value']
min_percent_identity = self.Params['Min percent identity']
if min_percent_identity < 1.0:
min_percent_identity *= 100.0
seq_ids = [s[0] for s in seqs]
result = {}
blast_result = blast_seqs(\
seqs,Blastall,blast_db=blast_db,\
params={'-p':'blastn','-n':'T'},\
add_seq_names=False)
if blast_result['StdOut']:
lines = [x for x in blast_result['StdOut']]
blast_result = BlastResult(lines)
else:
            return dict.fromkeys(seq_ids, [])
for seq_id in seq_ids:
blast_result_id = seq_id.split()[0]
try:
result[seq_id] = [(e['SUBJECT ID'],float(e['E-VALUE'])) \
for e in blast_result[blast_result_id][0]
if (float(e['E-VALUE']) <= max_evalue and \
float(e['% IDENTITY']) >= min_percent_identity)]
except KeyError:
result[seq_id] = []
return result
def _get_first_blast_hit_per_seq(self,blast_hits):
""" discard all blast hits except the best for each query sequence
"""
result = {}
for k,v in blast_hits.items():
k = k.split()[0] #get rid of spaces
try:
result[k] = v[0]
except IndexError:
# If there is no good blast hit, do we want to
# leave the key out, or have it point to None?
result[k] = None
return result
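# A minimal usage sketch for BlastTaxonAssigner (file paths are
# hypothetical). When 'blast_db' is not supplied, a temporary BLAST
# database is built from the reference sequences and removed afterwards:
#
#     assigner = BlastTaxonAssigner({
#         'reference_seqs_filepath': '/path/to/ref_seqs.fasta',
#         'id_to_taxonomy_filepath': '/path/to/id_to_taxonomy.txt'})
#     result = assigner(seq_path='/path/to/queries.fasta')
#     # result maps seq_id -> (lineage, e-value, best blast hit id)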
class MothurTaxonAssigner(TaxonAssigner):
"""Assign taxonomy using Mothur's naive Bayes implementation
"""
Name = 'MothurTaxonAssigner'
Application = "Mothur"
Citation = (
"Schloss, P.D., et al., Introducing mothur: Open-source, platform-"
"independent, community-supported software for describing and "
"comparing microbial communities. Appl Environ Microbiol, 2009. "
"75(23):7537-41."
)
_tracked_properties = ['Application', 'Citation']
def __init__(self, params):
_params = {
'Confidence': 0.80,
'Iterations': None,
'KmerSize': None,
'id_to_taxonomy_fp': None,
'reference_sequences_fp': None,
}
_params.update(params)
super(MothurTaxonAssigner, self).__init__(_params)
def __call__(self, seq_path, result_path=None, log_path=None):
seq_file = open(seq_path)
percent_confidence = int(self.Params['Confidence'] * 100)
result = mothur.mothur_classify_file(
query_file=seq_file,
ref_fp=self.Params['reference_sequences_fp'],
tax_fp=self.Params['id_to_taxonomy_fp'],
cutoff=percent_confidence,
iters=self.Params['Iterations'],
ksize=self.Params['KmerSize'],
output_fp=result_path,
)
if log_path:
self.writeLog(log_path)
return result
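# A minimal usage sketch for MothurTaxonAssigner (hypothetical paths;
# assumes mothur is installed and accessible to QIIME):
#
#     assigner = MothurTaxonAssigner({
#         'reference_sequences_fp': '/path/to/ref_seqs.fasta',
#         'id_to_taxonomy_fp': '/path/to/id_to_taxonomy.txt',
#         'Confidence': 0.80})
#     result = assigner('/path/to/queries.fasta')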
class RdpTaxonAssigner(TaxonAssigner):
"""Assign taxon using RDP's naive Bayesian classifier
"""
Name = "RdpTaxonAssigner"
Application = "RDP classfier"
Citation = "Wang, Q, G. M. Garrity, J. M. Tiedje, and J. R. Cole. 2007. Naive Bayesian Classifier for Rapid Assignment of rRNA Sequences into the New Bacterial Taxonomy. Appl Environ Microbiol. 73(16):5261-7."
Taxonomy = "RDP"
_tracked_properties = ['Application','Citation','Taxonomy']
def __init__(self, params):
"""Return new RdpTaxonAssigner object with specified params.
Standard entries in params are:
Taxonomy: taxonomy used (e.g. RDP, Hugenholtz)
"""
_params = {
'Confidence': 0.80,
'id_to_taxonomy_fp': None,
'reference_sequences_fp': None,
'training_data_properties_fp': None,
'max_memory': None
}
_params.update(params)
TaxonAssigner.__init__(self, _params)
def __call__(self, seq_path, result_path=None, log_path=None):
"""Returns dict mapping {seq_id:(taxonomy, confidence)} for
each seq.
Parameters:
seq_path: path to file of sequences
result_path: path to file of results. If specified, dumps the
result to the desired path instead of returning it.
log_path: path to log, which should include dump of params.
"""
tmp_dir = get_qiime_temp_dir()
min_conf = self.Params['Confidence']
training_data_properties_fp = self.Params['training_data_properties_fp']
reference_sequences_fp = self.Params['reference_sequences_fp']
id_to_taxonomy_fp = self.Params['id_to_taxonomy_fp']
max_memory = self.Params['max_memory']
seq_file = open(seq_path, 'U')
if reference_sequences_fp and id_to_taxonomy_fp:
# Train and assign taxonomy
taxonomy_file, training_seqs_file = self._generate_training_files()
results = rdp_classifier.train_rdp_classifier_and_assign_taxonomy(
training_seqs_file, taxonomy_file, seq_file,
min_confidence=min_conf,
classification_output_fp=result_path,
max_memory=max_memory, tmp_dir=tmp_dir)
if result_path is None:
results = self._training_set.fix_results(results)
else:
self._training_set.fix_output_file(result_path)
else:
# Just assign taxonomy, using properties file if passed
if training_data_properties_fp:
fix_ranks = False
else:
fix_ranks = True
results = rdp_classifier.assign_taxonomy(
seq_file, min_confidence=min_conf, output_fp=result_path,
training_data_fp=training_data_properties_fp,
max_memory=max_memory, fixrank=fix_ranks, tmp_dir=tmp_dir)
if log_path:
self.writeLog(log_path)
return results
def _generate_training_files(self):
"""Returns a tuple of file objects suitable for passing to the
RdpTrainer application controller.
"""
tmp_dir = get_qiime_temp_dir()
training_set = RdpTrainingSet()
reference_seqs_file = open(self.Params['reference_sequences_fp'], 'U')
id_to_taxonomy_file = open(self.Params['id_to_taxonomy_fp'], 'U')
for seq_id, seq in MinimalFastaParser(reference_seqs_file):
training_set.add_sequence(seq_id, seq)
for line in id_to_taxonomy_file:
seq_id, lineage_str = map(strip, line.split('\t'))
training_set.add_lineage(seq_id, lineage_str)
training_set.dereplicate_taxa()
rdp_taxonomy_file = NamedTemporaryFile(
prefix='RdpTaxonAssigner_taxonomy_', suffix='.txt', dir=tmp_dir)
rdp_taxonomy_file.write(training_set.get_rdp_taxonomy())
rdp_taxonomy_file.seek(0)
rdp_training_seqs_file = NamedTemporaryFile(
prefix='RdpTaxonAssigner_training_seqs_', suffix='.fasta',
dir=tmp_dir)
for rdp_id, seq in training_set.get_training_seqs():
rdp_training_seqs_file.write('>%s\n%s\n' % (rdp_id, seq))
rdp_training_seqs_file.seek(0)
self._training_set = training_set
return rdp_taxonomy_file, rdp_training_seqs_file
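# A minimal usage sketch for RdpTaxonAssigner (hypothetical paths).
# Supplying both reference files triggers retraining via
# _generate_training_files before classification:
#
#     assigner = RdpTaxonAssigner({
#         'reference_sequences_fp': '/path/to/ref_seqs.fasta',
#         'id_to_taxonomy_fp': '/path/to/id_to_taxonomy.txt'})
#     result = assigner('/path/to/queries.fasta')
#     # result maps seq_id -> (lineage, confidence)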
class RdpTrainingSet(object):
def __init__(self):
self._tree = RdpTree()
self.sequences = {}
self.sequence_nodes = {}
self.lineage_depth = None
def add_sequence(self, seq_id, seq):
self.sequences[seq_id] = seq
def add_lineage(self, seq_id, lineage_str):
for char, escape_str in _QIIME_RDP_ESCAPES:
lineage_str = re.sub(char, escape_str, lineage_str)
lineage = self._parse_lineage(lineage_str)
seq_node = self._tree.insert_lineage(lineage)
self.sequence_nodes[seq_id] = seq_node
def dereplicate_taxa(self):
return self._tree.dereplicate_taxa()
def _parse_lineage(self, lineage_str):
"""Returns a list of taxa from the semi-colon-separated
lineage string of an id_to_taxonomy file.
"""
lineage = lineage_str.strip().split(';')
if self.lineage_depth is None:
self.lineage_depth = len(lineage)
if len(lineage) != self.lineage_depth:
raise ValueError(
'Because the RDP Classifier operates in a bottom-up manner, '
'each taxonomy assignment in the id-to-taxonomy file must have '
'the same number of ranks. Detected %s ranks in the first '
'item of the file, but detected %s ranks later in the file. '
'Offending taxonomy string: %s' %
(self.lineage_depth, len(lineage), lineage_str))
return lineage
def get_training_seqs(self):
"""Returns an iterator of valid training sequences in
RDP-compatible format
Each training sequence is represented by a tuple (rdp_id,
seq). The rdp_id consists of two items: the original sequence
ID with whitespace replaced by underscores, and the lineage
with taxa separated by semicolons.
"""
# Rdp requires unique sequence IDs without whitespace. Can't
# trust user IDs to not have whitespace, so we replace all
# whitespace with an underscore. Classification may fail if
# the replacement method generates a name collision.
for seq_id, node in self.sequence_nodes.iteritems():
seq = self.sequences.get(seq_id)
if seq is not None:
lineage = node.get_lineage()
                rdp_id = '%s %s' % (re.sub(r'\s', '_', seq_id), ';'.join(lineage))
yield rdp_id, seq
def get_rdp_taxonomy(self):
return self._tree.get_rdp_taxonomy()
def fix_output_file(self, result_path):
# Ultimate hack to replace mangled taxa names
temp_results = StringIO()
for line in open(result_path):
line = re.sub(
_QIIME_RDP_TAXON_TAG + "[^;\n\t]*", '', line)
for char, escape_str in _QIIME_RDP_ESCAPES:
line = re.sub(escape_str, char, line)
temp_results.write(line)
open(result_path, 'w').write(temp_results.getvalue())
def fix_results(self, results_dict):
for seq_id, assignment in results_dict.iteritems():
lineage, confidence = assignment
lineage = re.sub(
_QIIME_RDP_TAXON_TAG + "[^;\n\t]*", '', lineage)
for char, escape_str in _QIIME_RDP_ESCAPES:
lineage = re.sub(escape_str, char, lineage)
results_dict[seq_id] = (lineage, confidence)
return results_dict
class RdpTree(object):
"""Simple, specialized tree class used to generate a taxonomy
file for the Rdp Classifier.
"""
    # Single-letter placeholder rank names, indexed by node depth (depth 0
    # is handled separately as "norank"; leaves are reported as "genus").
    taxonomic_ranks = ' abcdefghijklmnopqrstuvwxyz'
def __init__(self, name='Root', parent=None, counter=None):
if counter is None:
self.counter = count(0)
else:
self.counter = counter
self.id = self.counter.next()
self.name = name
self.parent = parent
self.seq_ids = []
if parent is None:
self.depth = 0
else:
self.depth = parent.depth + 1
self.children = dict() # name => subtree
def insert_lineage(self, lineage):
"""Inserts an assignment into the taxonomic tree.
Lineage must support the iterator interface, or provide an
__iter__() method that returns an iterator.
"""
lineage = iter(lineage)
try:
taxon = lineage.next()
if taxon not in self.children:
self.children[taxon] = self.__class__(
name=taxon, parent=self, counter=self.counter)
retval = self.children[taxon].insert_lineage(lineage)
except StopIteration:
retval = self
return retval
def get_lineage(self):
if self.parent is not None:
return self.parent.get_lineage() + [self.name]
else:
return [self.name]
def get_nodes(self):
yield self
for child in self.children.values():
child_nodes = child.get_nodes()
for node in child_nodes:
yield node
def dereplicate_taxa(self):
# We check that there are no duplicate taxon names (case insensitive)
# at a given depth. We must do a case insensitive check because the RDP
# classifier converts taxon names to lowercase when it checks for
# duplicates, and will throw an error otherwise.
taxa_by_depth = {}
for node in self.get_nodes():
name = node.name
depth = node.depth
current_names = taxa_by_depth.get(depth, set())
if name.lower() in current_names:
node.name = name + _QIIME_RDP_TAXON_TAG + str(node.id)
else:
current_names.add(name.lower())
taxa_by_depth[depth] = current_names
def get_rdp_taxonomy(self):
"""Returns a string, in Rdp-compatible format.
"""
# RDP uses 0 for the parent ID of the root node
if self.parent is None:
parent_id = 0
else:
parent_id = self.parent.id
# top rank name must be norank, and bottom rank must be genus
if self.depth == 0:
rank_name = "norank"
elif self.children:
rank_name = self.taxonomic_ranks[self.depth]
else:
rank_name = "genus"
fields = [
self.id, self.name, parent_id, self.depth, rank_name]
taxonomy_str = '*'.join(map(str, fields)) + "\n"
# Recursively append lines from sorted list of subtrees
child_names = self.children.keys()
child_names.sort()
subtrees = [self.children[name] for name in child_names]
for subtree in subtrees:
taxonomy_str += subtree.get_rdp_taxonomy()
return taxonomy_str
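# Illustration of the format produced by get_rdp_taxonomy. A tree built
# from the single made-up lineage ['Bacteria', 'Proteobacteria'] would
# serialize as id*name*parent_id*depth*rank lines:
#
#     0*Root*0*0*norank
#     1*Bacteria*0*1*a
#     2*Proteobacteria*1*2*genus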
_QIIME_RDP_TAXON_TAG = "_qiime_unique_taxon_tag_"
_QIIME_RDP_ESCAPES = [
("&", "_qiime_ampersand_escape_"),
(">", "_qiime_greaterthan_escape_"),
("<", "_qiime_lessthan_escape_"),
]
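# Sketch of the escape round trip performed by RdpTrainingSet: characters
# that confuse the RDP Classifier are substituted before training and
# restored in fix_results/fix_output_file. For a made-up lineage:
#
#     lineage = 'Bacteria;Foo<Bar'
#     for char, escape_str in _QIIME_RDP_ESCAPES:
#         lineage = re.sub(char, escape_str, lineage)
#     # lineage == 'Bacteria;Foo_qiime_lessthan_escape_Bar'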
class RtaxTaxonAssigner(TaxonAssigner):
"""Assign taxon using RTAX
"""
Name = "RtaxTaxonAssigner"
Application = "RTAX classifier" # ", version 0.98" # don't hardcode the version number, as it may change, and then the log output test would fail
Citation = "Soergel D.A.W., Dey N., Knight R., and Brenner S.E. 2012. Selection of primers for optimal taxonomic classification of environmental 16S rRNA gene sequences. ISME J (6), 1440-1444"
_tracked_properties = ['Application','Citation']
def __init__(self, params):
"""Return new RtaxTaxonAssigner object with specified params.
"""
_params = {
'id_to_taxonomy_fp': None,
'reference_sequences_fp': None,
# 'delimiter': ","
'header_id_regex' : "\\S+\\s+(\\S+?)\/", # use the amplicon ID, not including /1 or /3, as the primary key for the query sequences
'read_id_regex' : "\\S+\\s+(\\S+)", # OTU clustering produces ">clusterID read_1_id"
'amplicon_id_regex' : "(\\S+)\\s+(\\S+?)\/", # split_libraries produces >read_1_id ampliconID/1 . This makes a map between read_1_id and ampliconID.
'read_1_seqs_fp' : None,
'read_2_seqs_fp' : None,
'single_ok' : False,
'no_single_ok_generic' : False
}
_params.update(params)
TaxonAssigner.__init__(self, _params)
def __call__(self, seq_path, result_path=None, log_path=None):
"""Returns dict mapping {seq_id:(taxonomy, confidence)} for
each seq.
Parameters:
seq_path: path to file of sequences
result_path: path to file of results. If specified, dumps the
result to the desired path instead of returning it.
log_path: path to log, which should include dump of params.
"""
if log_path:
self.writeLog(log_path)
reference_sequences_fp = self.Params['reference_sequences_fp']
assert reference_sequences_fp, \
"Must provide reference_sequences_fp when calling an RtaxTaxonAssigner."
id_to_taxonomy_fp = self.Params['id_to_taxonomy_fp']
assert id_to_taxonomy_fp, \
"Must provide id_to_taxonomy_fp when calling an RtaxTaxonAssigner."
        # delimiter = self.Params['delimiter']
        read_1_seqs_fp = self.Params['read_1_seqs_fp']
        assert read_1_seqs_fp, \
            "Must provide read_1_seqs_fp when calling an RtaxTaxonAssigner."
        # following params may all be null
        read_2_seqs_fp = self.Params['read_2_seqs_fp']
        single_ok = self.Params['single_ok']
        no_single_ok_generic = self.Params['no_single_ok_generic']
        header_id_regex = self.Params['header_id_regex']
        assert header_id_regex, \
            "Must not provide an empty header_id_regex when calling an " \
            "RtaxTaxonAssigner; leave it unset to use the default if in doubt."
        read_id_regex = self.Params['read_id_regex']
        amplicon_id_regex = self.Params['amplicon_id_regex']
# seq_file = open(seq_path, 'r')
results = rtax.assign_taxonomy(seq_path, reference_sequences_fp, id_to_taxonomy_fp,
read_1_seqs_fp, read_2_seqs_fp, single_ok=single_ok, no_single_ok_generic=no_single_ok_generic,
header_id_regex=header_id_regex, read_id_regex=read_id_regex,
amplicon_id_regex=amplicon_id_regex, output_fp=result_path,
log_path=log_path,base_tmp_dir=get_qiime_temp_dir())
return results
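# Illustration of the default RTAX regexes on made-up FASTA labels (the
# raw-string forms below are equivalent to the escaped defaults above):
#
#     re.search(r"\S+\s+(\S+)", 'cluster17 read_1_id').group(1)
#     # -> 'read_1_id'  (read_id_regex, on OTU clustering output)
#     re.search(r"(\S+)\s+(\S+?)/", 'read_1_id ampliconID/1').groups()
#     # -> ('read_1_id', 'ampliconID')  (amplicon_id_regex, on
#     #    split_libraries output)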
class Tax2TreeTaxonAssigner(TaxonAssigner):
"""Assign taxon using Tax2Tree
"""
Name = "Tax2TreeTaxonAssigner"
Application = "Tax2Tree"
Citation = "Daniel McDonald"
def __init__(self, params):
"""Returns a new Tax2TreeAssigner object with specified params
"""
_params = {
#Required. Used as consensus map.
'id_to_taxonomy_fp': None,
#Required. The aligned and filtered tree of combined input and reference seqs.
'tree_fp': None,
}
_params.update(params)
TaxonAssigner.__init__(self, _params)
def __call__(self, seq_path=None, result_path=None, log_path=None):
"""Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq
        Note that "confidence" is reported only for consistency with the
        other assigners; all assignments will have a score of 0 because a
        method for determining confidence is not currently implemented.
Parameters:
seq_path: path to file of sequences. The sequences themselves are
never actually used, but they are needed for their ids.
result_path: path to file of results. If specified, dumps the
result to the desired path instead of returning it.
log_path: path to log, which should include dump of params.
"""
# initialize the logger
logger = self._get_logger(log_path)
logger.info(str(self))
with open(seq_path, 'U') as f:
seqs = dict(MinimalFastaParser(f))
consensus_map = tax2tree.prep_consensus(open(self.Params['id_to_taxonomy_fp']), seqs.keys())
seed_con = consensus_map[0].strip().split('\t')[1]
determine_rank_order(seed_con)
tipnames_map = load_consensus_map(consensus_map, False)
tree = load_tree(open(self.Params['tree_fp']), tipnames_map)
results = tax2tree.generate_constrings(tree, tipnames_map)
results = tax2tree.clean_output(results, seqs.keys())
if result_path:
# if the user provided a result_path, write the
# results to file
with open(result_path,'w') as f:
for seq_id, (lineage, confidence) in results.iteritems():
f.write('%s\t%s\t%s\n' %(seq_id, lineage, confidence))
logger.info('Result path: %s' % result_path)
return results
def _get_logger(self, log_path=None):
if log_path is not None:
handler = logging.FileHandler(log_path, mode='w')
else:
class NullHandler(logging.Handler):
def emit(self, record): pass
handler = NullHandler()
logger = logging.getLogger("Tax2TreeTaxonAssigner logger")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger
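# A minimal usage sketch for Tax2TreeTaxonAssigner (hypothetical paths;
# the tree must already contain both the query and reference sequences):
#
#     assigner = Tax2TreeTaxonAssigner({
#         'id_to_taxonomy_fp': '/path/to/id_to_taxonomy.txt',
#         'tree_fp': '/path/to/combined.tre'})
#     result = assigner('/path/to/rep_set.fasta')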
class UclustConsensusTaxonAssigner(TaxonAssigner):
"""Assign taxonomy using uclust
"""
Name = "UclustConsensusTaxonAssigner"
Application = "uclust"
Citation = """uclust citation: Search and clustering orders of magnitude faster than BLAST. Edgar RC. Bioinformatics. 2010 Oct 1;26(19):2460-1.
uclust-based consensus taxonomy assigner by Greg Caporaso, citation: QIIME allows analysis of high-throughput community sequencing data. Caporaso JG, Kuczynski J, Stombaugh J, Bittinger K, Bushman FD, Costello EK, Fierer N, Pena AG, Goodrich JK, Gordon JI, Huttley GA, Kelley ST, Knights D, Koenig JE, Ley RE, Lozupone CA, McDonald D, Muegge BD, Pirrung M, Reeder J, Sevinsky JR, Turnbaugh PJ, Walters WA, Widmann J, Yatsunenko T, Zaneveld J, Knight R. Nat Methods. 2010 May;7(5):335-6.
"""
def __init__(self, params):
"""Returns a new UclustConsensusTaxonAssigner object with specified params
"""
_params = {
# Required, mapping of reference sequence to taxonomy
'id_to_taxonomy_fp': None,
# Required, reference sequence fasta file
'reference_sequences_fp': None,
# max-accepts parameter, as passed to uclust
'max_accepts': 3,
# Fraction of sequence hits that a taxonomy assignment
# must show up in to be considered the consensus assignment
'min_consensus_fraction':0.51,
# minimum identity to consider a hit (passed to uclust as --id)
'similarity':0.90,
# label to apply for queries that cannot be assigned
'unassignable_label':'Unassigned'
}
_params.update(params)
TaxonAssigner.__init__(self, _params)
        if self.Params['id_to_taxonomy_fp'] is None:
            raise ValueError(
                "id_to_taxonomy_fp must be provided when instantiating a "
                "UclustConsensusTaxonAssigner.")
        if self.Params['reference_sequences_fp'] is None:
            raise ValueError(
                "reference_sequences_fp must be provided when instantiating "
                "a UclustConsensusTaxonAssigner.")
id_to_taxonomy_f = open(self.Params['id_to_taxonomy_fp'],'U')
self.id_to_taxonomy = self._parse_id_to_taxonomy_file(id_to_taxonomy_f)
def __call__(self,
seq_path,
result_path=None,
uc_path=None,
log_path=None,
HALT_EXEC=False):
"""Returns mapping of each seq to (tax, consensus fraction, n)
Results:
If result_path is specified, the results will be written to file
as tab-separated lines of:
query_id <tab> tax <tab> consensus fraction <tab> n
If result_path is None (default), the results will be returned
as a dict of:
{'query_id': (tax, consensus fraction, n)}
In both cases, the values are:
tax: the consensus taxonomy assignment
consensus fraction: the fraction of the assignments for the
query that contained the lowest level tax assignment that is
included in tax (e.g., if the assignment goes to genus level,
this will be the fraction of assignments that had the consensus
genus assignment)
n: the number of assignments that were considered when constructing
the consensus
Parameters:
seq_path: path to file of query sequences
result_path: path where results should be written. If None (default),
returns results as a dict
        uc_path: path where .uc file should be saved. If None (default) and
        log_path is specified, the .uc contents will be appended to
        the log file.
log_path: path where run log should be written. If None (default), no
log file is written.
        HALT_EXEC: debugging parameter. If passed, exits just before the
        uclust command is issued and prints the command that would have
        been run to stdout.
"""
# initialize the logger
logger = self._get_logger(log_path)
logger.info(str(self))
# set the user-defined parameters
params = {'--id':self.Params['similarity'],
'--maxaccepts':self.Params['max_accepts']}
# initialize the application controller object
app = Uclust(params,
HALT_EXEC=HALT_EXEC)
# Configure for consensus taxonomy assignment
app.Parameters['--rev'].on()
app.Parameters['--lib'].on(self.Params['reference_sequences_fp'])
app.Parameters['--libonly'].on()
app.Parameters['--allhits'].on()
if uc_path is None:
uc = NamedTemporaryFile(prefix='UclustConsensusTaxonAssigner_',
suffix='.uc',
dir=get_qiime_temp_dir())
uc_path = uc.name
store_uc_in_log = True
else:
store_uc_in_log = False
app_result = app({'--input':seq_path,
'--uc':uc_path})
result = self._uc_to_assignment(app_result['ClusterFile'])
if result_path is not None:
# if the user provided a result_path, write the
# results to file
of = open(result_path,'w')
for seq_id, (assignment, consensus_fraction, n) in result.items():
assignment_str = ';'.join(assignment)
of.write('%s\t%s\t%1.2f\t%d\n' %
(seq_id, assignment_str, consensus_fraction, n))
of.close()
result = None
logger.info('Result path: %s' % result_path)
else:
# If no result_path was provided, the result dict is
# returned as-is.
logger.info('Result path: None, returned as dict.')
if store_uc_in_log:
            # This is a little hackish, but we don't have a good way to
            # pass the uc_path value through the assign_taxonomy.py script
            # right now, so we write the .uc contents to the user-specified
            # log file (since this is being stored for logging purposes).
app_result['ClusterFile'].seek(0)
logger.info('\n.uc file contents:\n')
for line in app_result['ClusterFile']:
logger.info(line.strip())
return result
def _get_logger(self, log_path=None):
if log_path is not None:
handler = logging.FileHandler(log_path, mode='w')
else:
class NullHandler(logging.Handler):
def emit(self, record): pass
handler = NullHandler()
logger = logging.getLogger("UclustConsensusTaxonAssigner logger")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger
def _get_consensus_assignment(self, assignments):
""" compute the consensus assignment from a list of assignments
"""
num_input_assignments = len(assignments)
consensus_assignment = []
        # If the assignments don't all have the same number of levels,
        # the resulting assignment will be truncated to the number of
        # levels in the assignment with the fewest levels. This avoids
        # a case where, for example, there are n assignments, one of
        # which has 7 levels, and the other n-1 assignments have 6 levels.
# A 7th level in the result would be misleading because it
# would appear to the user as though it was the consensus
# across all n assignments.
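        # Concrete illustration: for assignments with 7, 6, and 6 levels,
        # num_levels below is 6, so the deepest level of the 7-level
        # assignment is ignored.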
num_levels = min([len(a) for a in assignments])
# iterate over the assignment levels
for level in range(num_levels):
# count the different taxonomic assignments at the current level.
# the counts are computed based on the current level and all higher
# levels to reflect that, for example, 'p__A; c__B; o__C' and
# 'p__X; c__Y; o__C' represent different taxa at the o__ level (since