This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 268
/
identify_chimeric_seqs.py
executable file
·624 lines (505 loc) · 23.7 KB
/
identify_chimeric_seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
#!/usr/bin/env python
# File created on 05 Oct 2009.
from __future__ import division
from os import remove, makedirs, system
from os.path import split, splitext, basename, isdir, abspath, isfile, exists
from subprocess import PIPE, Popen
from cogent.util.misc import remove_files, app_path
from cogent.parse.fasta import MinimalFastaParser
from cogent.app.formatdb import build_blast_db_from_fasta_path
from cogent.app.parameters import ValuedParameter, FlagParameter
from cogent.app.util import CommandLineApplication, ResultPath,\
ApplicationError, ApplicationNotFoundError
from cogent.util.misc import remove_files
from qiime.util import FunctionWithParams, degap_fasta_aln, \
write_degapped_fasta_to_file, get_tmp_filename
from qiime.assign_taxonomy import BlastTaxonAssigner
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Greg Caporaso","Jens Reeder"]
__license__ = "GPL"
__version__ = "1.6.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Release"
#NOTE: The next ChimeraSlayer release will have an option to increase the number of
# iterations performed. Currently the hard coded default is 100, which leads to
# considerable fluctuations in the classification. As of Brian Haas recommendation
# we should increase this value to (optimally to 1000, need to test if a lower value
# is a good compromise)
class ChimeraChecker(FunctionWithParams):
"""A ChimeraChecker takes a sequence collection and returns list of chimeric seqs.
This is an abstract class: subclasses should implement the __call__
method.
Note: sequence ids should be preserved during this process, i.e. the
description lines should be saved/restored if the alignment app is
destructive to them.
"""
Name = 'ChimeraChecker'
def __init__(self, params):
"""Return new ChimeraChecker object with specified params.
"""
self.Params = params
def __call__ (self, seq_path, result_path=None, log_path=None):
"""Returns alignment from sequences.
Parameters:
seq_path: path to file of sequences
result_path: path to file of results. If specified, should
dump the result to the desired path as fasta, otherwise will
return list of sequence identifiers.
log_path: path to log, which should include dump of params.
"""
# return list of the chimeric sequences
return self.getResult(seq_path)
def getResult(self,seq_path):
""" Must be overwritten in subclasses """
raise NotImplementedError, "ChimeraChecker is an abstract class"
def cleanUp(self):
""" Overwrite if any clean-up is necessasry
Included here so all ChimeraCheckers can call cleanUp() without
having to know if it is defined for a certain sub-class.
"""
pass
class MetaChimeraChecker(ChimeraChecker):
""" """
Name = 'MetaChimeraChecker'
def __init__(self,params):
"""
"""
try:
chimera_checker_constructors = \
params['chimera_checker_constructors']
except KeyError:
raise ValueError,\
"Must provide 'chimera_checker_constructors' to define %s." \
% self.Name
chimera_checkers = []
for constructor, constructor_params in chimera_checker_constructors:
chimera_checkers.append(constructor(constructor_params))
self.chimera_checkers = chimera_checkers
def getResult(self,seqs_fp):
raise NotImplementedError, "%s is not yet written" % self.Name
class ChimeraSlayerChimeraChecker(ChimeraChecker):
""" """
# Most of the work is done in the ChimeraSlayer_app controller,
# so we have a very lean Class here
Name = 'ChimeraSlayerChimeraChecker'
def __init__(self, params):
"""Return new ChimeraSlayerChimeraChecker object with specified params."""
_params = {}
_params.update(params)
ChimeraChecker.__init__(self, _params)
def cleanUp(self):
"""Remove all intermediate files"""
#All intermediates should be removed by app controller
pass
def __call__ (self, seq_path, db_FASTA_fp=None, db_NAST_fp=None,
result_path=None, log_path=None, min_div_ratio=None,
keep_intermediates=False):
"""Run chimeraSlayer on input seqs."""
chimeras = get_chimeras_from_Nast_aligned(seq_path, ref_db_aligned_fp=db_NAST_fp,
ref_db_fasta_fp=db_FASTA_fp,
min_div_ratio=min_div_ratio,
keep_intermediates=keep_intermediates)
return chimeras
def chimeraSlayer_identify_chimeras(seqs_fp, db_FASTA_fp=None,
db_NAST_fp=None, output_fp=None,
min_div_ratio=None, keep_intermediates=False):
""" """
params = {}
cc = ChimeraSlayerChimeraChecker(params)
if output_fp:
of = open(output_fp,'w')
for seq_id, parents in cc(seqs_fp, db_FASTA_fp=db_FASTA_fp,
db_NAST_fp=db_NAST_fp,
min_div_ratio=min_div_ratio,
keep_intermediates=keep_intermediates):
of.write('\t'.join([seq_id] + parents))
of.write('\n')
of.close()
result = None
else:
result = list(cc(seqs_fp, db_FASTA_fp=db_FASTA_fp,
db_NAST_fp=db_NAST_fp,
min_div_ratio=min_div_ratio,
keep_intermediates=keep_intermediates))
if not keep_intermediates:
cc.cleanUp()
return result
class BlastFragmentsChimeraChecker(ChimeraChecker):
""" """
Name = 'BlastFragmentsChimeraChecker'
def __init__(self, params):
"""Return new BlastFragmentsChimeraChecker object with specified params.
"""
_params = {'max_e_value':1e-30,\
'min_pct_id':0.90,\
'num_fragments':3,\
'taxonomy_depth':4}
_params.update(params)
try:
id_to_taxonomy_fp = params['id_to_taxonomy_fp']
except KeyError:
raise ValueError,\
"id_to_taxonomy_filepath must be provided to %s" % self.Name
# Create the blast database if it hasn't been provided
if 'blast_db' not in params or params['blast_db'] == None:
try:
reference_seqs_fp = params['reference_seqs_fp']
except KeyError:
raise ValueError, \
"refseqs_fp or blast_db must be provided to %s" % self.Name
blast_db, self._db_files_to_remove = \
build_blast_db_from_fasta_path(reference_seqs_fp)
else:
blast_db = params['blast_db']
self._db_files_to_remove = []
self._taxon_assigner = BlastTaxonAssigner(\
{'blast_db':blast_db,\
'id_to_taxonomy_filepath':id_to_taxonomy_fp,
'Max E value':_params['max_e_value'],
'Min percent identity':_params['min_pct_id']
})
ChimeraChecker.__init__(self, _params)
def cleanUp(self):
""" Remove temporary blast database files, if applicable
"""
remove_files(self._db_files_to_remove,error_on_missing=False)
def _fragment_seq(self,seq):
""" Returns list of n roughly equal-sized fragments of seq
Modified from Python Cookbook entry comment at:
http://code.activestate.com/recipes/425397/
** This method needs to be parameterized, as there are other ways
we might want to fragment sequences. For example, we might want to
specify breakpoints. Currently, this will split seq into
self.num_fragments roughly equal-sized fragments.
"""
num_fragments = self.Params['num_fragments']
results = []
start = 0
for i in range(num_fragments):
# My notes:
# len(seq[i::n]) gives the number of even multiples of
# num_fragments exist between i (inclusive) and the end of the seq.
stop = start + len(seq[i::num_fragments])
results.append(seq[start:stop])
start = stop
return results
def getResult(self,seq_path):
""" """
# Iterate over seq_id, seq pairs from seq_path
for seq_id, seq in MinimalFastaParser(open(seq_path)):
# Generate a list of fragments
fragments = self._fragment_seq(seq)
# Assign the taxonomy for each of the fragments
taxon_assignments = \
[self._get_taxonomy(fragment) for fragment in fragments]
# Test whether the taxon_assignments suggest that the
# current seq is a chimera
if self._is_chimeric(taxon_assignments):
# If so, yield the seq_id and the list of taxon_assignments
yield seq_id, taxon_assignments
def _is_chimeric(self,taxon_assignments):
"""From list of taxon assignments, determine if sequence is chimeric
** This method needs to be parameterized, as there are other ways
we might want to test for chimeras based on taxon_assignments.
This method checks that taxon assignments are identical to a
depth of depth (the input parameter). Assignments to
'No blast hit' are ignored, as these cannot provide evidence
for a sequence being chimeric. This function considers a sequnce
non-chimeric until it finds evidence that it is chimeric.
Currently only the first blast hit is considered -- this is
something we will want to change.
When tested against a balanced test set containing 84 chimeric
sequences and 84 non-chimeric sequences
(precision, recall, f-measure)
respectively are:
@ depth=3 (0.67,0.36,0.47)
@ depth=4 (0.67,0.60,0.63)
@ depth=5 (0.56,0.64,0.60)
"""
depth = self.Params['taxonomy_depth']
# split the assignments, ignoring fragments which had no blast
# result (as these cannot provide evidence for a sequence being
# chimeric)
split_assignments = [ta.split(';') \
for ta in taxon_assignments if ta != 'No blast hit']
for i in range(depth):
try:
assignments_at_depth_i =\
[a[i].strip() for a in split_assignments]
except IndexError:
# We return False if there is no information at current
# depth because there is not enough information to call
# the sequence chimeric
return False
if len(set(assignments_at_depth_i)) > 1:
return True
return False
def _get_taxonomy(self,fragment):
""" Return the taxonomy of fragment
This function is awful right now, because the blast app
controller takes a file. This therefore results in each
fragment being written to file and subsequently cleaned up.
This can be _very_ slow. I recently updated BlastTaxonAssigner
to handle this better, but it will involve some re-writes here
to get it all going.
"""
# Pass the temporary file to the taxon assigner, and get the
# taxonomy and quality score back
r = self._taxon_assigner(seqs=[('fragment',fragment)])['fragment']
taxonomy, quality_score = r[0], r[1]
# Return the taxonomy
return taxonomy
def blast_fragments_identify_chimeras(seqs_fp,id_to_taxonomy_fp,\
reference_seqs_fp=None,blast_db=None,min_pct_id=0.90,\
max_e_value=1e-30,num_fragments=3,output_fp=None,taxonomy_depth=4):
""" """
params = {'id_to_taxonomy_fp': id_to_taxonomy_fp,\
'reference_seqs_fp':reference_seqs_fp,\
'blast_db':blast_db,\
'min_pct_id':min_pct_id,\
'max_e_value':max_e_value,\
'num_fragments':num_fragments,\
'taxonomy_depth':taxonomy_depth}
bcc = BlastFragmentsChimeraChecker(params)
if output_fp:
of = open(output_fp,'w')
for seq_id, tax_assignments in bcc(seqs_fp):
of.write('\t'.join([seq_id] + tax_assignments))
of.write('\n')
of.close()
result = None
else:
result = list(bcc(seqs_fp))
bcc.cleanUp()
return result
######App controller for ChimeraSlayer and convenience functions
class ChimeraSlayer(CommandLineApplication):
""" ChimeraSlayer ApplicationController
"""
_command = 'ChimeraSlayer.pl'
_input_handler = '_input_as_parameters'
_parameters = {\
# multi-fasta file containing query sequences in alignment format
'--query_NAST':ValuedParameter('--',Name='query_NAST', Delimiter=' ',
IsPath=True),
'--db_NAST':ValuedParameter('--',Name='db_NAST', Delimiter=' ',
IsPath=True),
'--db_FASTA':ValuedParameter('--',Name='db_FASTA', Delimiter=' ',
IsPath=True),
'--exec_dir':ValuedParameter('--', Name='exec_dir', Delimiter=' ',
IsPath=True),
'-R':ValuedParameter('-', Name='R', Delimiter=' ')
}
_suppress_stdout = False
_suppress_stderr = False
def _input_as_parameters(self,data):
""" Set the input paths (a NAST aligned fasta filepath)
"""
# The list of values which can be passed on a per-run basis
allowed_values = ['--query_NAST', '--db_NAST', '--db_FASTA', '-R']
unsupported_parameters = set(data.keys()) - set(allowed_values)
if unsupported_parameters:
raise ApplicationError,\
"Unsupported parameter(s) passed when calling ChimeraSlayer: %s" %\
' '.join(unsupported_parameters)
return ''
def _get_result_paths(self,data):
""" Set the result paths """
result = {}
inp_file_name = str(self.Parameters['--query_NAST'].Value)
inp_file_name = inp_file_name.rstrip('"')
inp_file_name = inp_file_name.lstrip('"')
exec_dir = self.Parameters['--exec_dir']
if exec_dir.isOn():
exec_dir = str(exec_dir.Value)
exec_dir = exec_dir.lstrip('"')
exec_dir = exec_dir.rstrip('"')
if inp_file_name[0] == '/':
# path is already absolute
pass
else:
inp_file_name = exec_dir +"/" +inp_file_name
if not exists(inp_file_name+".CPS.CPC"):
raise ApplicationError,"Calling ChimeraSlayer failed."
result['CPS'] = ResultPath(Path=inp_file_name + ".CPS.CPC",\
IsWritten=True)
return result
def remove_intermediate_files(self):
"""Remove all intermediate files."""
#tmp files are written in the current dir,
#app controller always jumps into dir specified via exec_dir
#Note: blast intermediates are not removed
exec_dir = str(self.Parameters['--exec_dir'].Value)
inp_file_name = str(self.Parameters['--query_NAST'].Value)
exec_dir = exec_dir.rstrip('"')
exec_dir = exec_dir.lstrip('"')
inp_file_name = inp_file_name.rstrip('"')
inp_file_name = inp_file_name.lstrip('"')
tmp_suffixes = [".CPS", ".CPS.CPC", ".CPS_RENAST",".CPS_RENAST.cidx",
".CPS.CPC.wTaxons", ".cidx"]
cs_tmp_files = [exec_dir +'/'+ inp_file_name + x for x in tmp_suffixes]
remove_files(cs_tmp_files, error_on_missing=False)
db_param = self.Parameters['--db_NAST']
if db_param.isOn():
nast_db_name = str(db_param.Value)
nast_db_name = nast_db_name.rstrip('"')
nast_db_name = nast_db_name.lstrip('"')
#Better do not remove this file since other ChimeraSlayer
#instances running on the same ref set might use this file
#Should be rather deleted in the calling function
# remove_files([nast_db_name + ".cidx"],
# error_on_missing=False)
fasta_param = self.Parameters['--db_FASTA']
if fasta_param.isOn():
fasta_name = str(fasta_param.Value)
fasta_name = fasta_name.rstrip('"')
fasta_name = fasta_name.lstrip('"')
blast_db_files = [fasta_name + x for x in [".nsq",".nin",".nhr",".cidx"]]
remove_files(blast_db_files, error_on_missing=False)
def getHelp(self):
"""Method that points to documentation"""
help_str =\
"""##########################################################################################
#
# Required:
#
# --query_NAST multi-fasta file containing query sequences in alignment format
#
# Common opts:
#
# --db_NAST db in NAST format
# --db_FASTA db in fasta format (megablast formatted)
#
#
# -n number of top matching database sequences to compare to (default 15)
# -R min divergence ratio default: 1.007
# -P min percent identity among matching sequences (default: 90)
#
# ## parameters to tune ChimeraParentSelector:
#
# Scoring parameters:
# -M match score (default: +5)
# -N mismatch penalty (default: -4)
# -Q min query coverage by matching database sequence (default: 70)
# -T maximum traverses of the multiple alignment (default: 1)
#
# ## parameters to tune ChimeraPhyloChecker:
#
#
# --windowSize default 50
# --windowStep default 5
# --minBS minimum bootstrap support for calling chimera (default: 90)
# -S percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10)
# --num_parents_test number of potential parents to test for chimeras (default: 3)
# --MAX_CHIMERA_PARENT_PER_ID Chimera/Parent alignments with perID above this are considered non-chimeras (default 100; turned off)
#
# ## misc opts
#
# --printFinalAlignments shows alignment between query sequence and pair of candidate chimera parents
# --printCSalignments print ChimeraSlayer alignments in ChimeraSlayer output
# --exec_dir chdir to here before running
#
#########################################################################################
"""
return help_str
## Start functions for processing output files
def parse_CPS_file(lines):
"""Parse the CPS file from ChimeraSlayer.
One line if this file look like this:
ChimeraSlayer chimera_AJ007403 7000004131495956 S000469847 1.0360 99.35 100 0.9354 89.70 0 YES NAST:4595-4596 ECO:941-942
0 ChimeraSlayer
1 chimera_AJ007403 # the accession of the chimera query
2 S000387216 # reference parent A
3 S000001688 # reference parent B
4 0.9422 # divergence ratio of query to chimera (left_A, right_B)
5 90.00 # percent identity between query and chimera(left_A, right_B)
6 0 # confidence in query as a chimera related to (left_A, right_B)
7 1.0419 # divergence ratio of query to chimera (right_A, left_B)
8 99.52 # percent identity between query and chimera(right_A, left_B)
9 100 # confidence in query as a chimera related to (right_A, left_B)
10 YES # ** verdict as a chimera or not **
11 NAST:4032-4033 # estimated approximate chimera breakpoint in NAST coordinates
12 ECO:767-768 # estimated approximate chimera breakpoint according to the E. coli unaligned reference seq coordinates
"""
chimeras = []
for line in lines:
record = line.split()
try:
id = record[1]
parent1 = record[2]
parent2 = record[3]
verdict = record[10]
except IndexError:
raise ValueError,"Error parsing ChimeraSlayer CPS file."
if verdict=="YES":
chimeras.append((id, [parent1,parent2]))
return chimeras
## End functions for processing output files
## Start convenience functions
def get_chimeras_from_Nast_aligned(seqs_fp, ref_db_aligned_fp=None,
ref_db_fasta_fp=None,
HALT_EXEC=False, min_div_ratio=None,
keep_intermediates=False):
"""remove chimeras from seqs_fp using chimeraSlayer.
seqs_fp: a filepath with the seqs to check in the file
ref_db_aligned_fp: fp to (pynast) aligned reference sequences
ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed on the fly if not provided,
HALT_EXEC: stop execution if true
min_div_ratio: passed to ChimeraSlayer App
"""
files_to_remove = []
#might come in as FilePath object with quotes
seqs_fp = str(seqs_fp)
seqs_fp = seqs_fp.rstrip('"')
seqs_fp = seqs_fp.lstrip('"')
seqs_dir, new_seqs_fp = split(seqs_fp)
#if fp is in current dir, we fake a dir change
if seqs_dir == "":
seqs_dir = "./"
#Chimera Slayer puts some temp files in current dir and some in dir of input file
#use exe_dir to change to dir of input file, so to have all tmp files in one place
params={'--query_NAST': new_seqs_fp,
'--exec_dir': seqs_dir}
if ref_db_aligned_fp==None and ref_db_fasta_fp==None:
#use default db, whose relative position to the
#ChimeraSlayer binary is hardcoded
pass
else:
if not ref_db_fasta_fp:
#make degapped reference file
ref_db_fasta_fp = write_degapped_fasta_to_file(MinimalFastaParser( \
open(ref_db_aligned_fp)))
files_to_remove.append(ref_db_fasta_fp)
#use user db
params.update({'--db_NAST': ref_db_aligned_fp,
'--db_FASTA': ref_db_fasta_fp})
if min_div_ratio !=None:
params.update({'-R':min_div_ratio})
app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
app_results = app()
# this is a FilePath object in case of success.
# How can we test for failure here?
# if not exists(app_results['CPS']):
# raise ApplicationError, "ChimeraSlayer failed. No output file."
chimeras = parse_CPS_file((app_results['CPS']))
if not keep_intermediates:
app.remove_intermediate_files()
remove_files(files_to_remove)
return chimeras
def make_cidx_file(fp):
"""make a CS index file.
fp: reference DB to make an index of
ChimeraSlayer implicetely performs this task when the file is not
already created and deletes it when done. Especially in a parallel
environment it mkaes sense to manually make and delete the file.
"""
if app_path("cdbfasta"):
#cdbfasta comes with the microbiometools/CS
#should always be installed when using CS
args = ["cdbfasta", fp]
#cdbfasta write one line to stderr
stdout, stderr = Popen(args, stderr=PIPE, stdout=PIPE).communicate()
else:
raise ApplicationNotFoundError