This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
utils.py
448 lines (345 loc) · 13.3 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
#!/usr/bin/env python
from __future__ import division
"""Various helper functions."""
__author__ = "Jens Reeder"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Jens Reeder", "Rob Knight", "Jai Ram Rideout"]#remember to add yourself if you make changes
__license__ = "GPL"
__version__ = "1.6.0"
__maintainer__ = "Jens Reeder"
__email__ = "jens.reeder@gmail.com"
__status__ = "Release"
import sys
from os import remove, makedirs, access, X_OK, R_OK
from os.path import exists, isdir
from collections import defaultdict
from re import sub
from time import sleep
from socket import error
from itertools import chain
from subprocess import Popen, PIPE, STDOUT
import pickle
from cogent import Sequence
from cogent.app.util import ApplicationNotFoundError,ApplicationError
from cogent.util.misc import app_path, create_dir
from cogent.parse.flowgram_parser import lazy_parse_sff_handle
from qiime.util import (get_qiime_project_dir, get_qiime_scripts_dir,
FileFormatError, get_tmp_filename)
from qiime.denoiser.flowgram_filter import write_sff_header
# Wrap into explicit function so we can easily move the data dir around.
def get_denoiser_data_dir():
    """Return the directory holding the denoiser error profiles."""
    # Wrapped in a function so the data dir can easily be relocated.
    return get_qiime_project_dir() + "/qiime/support_files/denoiser/Data/"
def get_flowgram_ali_exe():
    """Return the path to the flowgram alignment program."""
    return get_qiime_scripts_dir() + "/FlowgramAli_4frame"
def check_flowgram_ali_exe():
    """Check that we have a working FlowgramAligner.

    Returns True on success.

    Raises ApplicationNotFoundError if the binary is missing, or
    ApplicationError if it cannot be run or does not print the
    expected usage message.
    """
    ali_fp = get_flowgram_ali_exe()
    if not exists(ali_fp):
        # raise X(msg) instead of the Python-2-only "raise X, msg"
        raise ApplicationNotFoundError(
            "The alignment program is not where it's supposed to be: %s"
            % ali_fp)
    # test if it's callable and actually works
    command = "%s -h" % ali_fp
    proc = Popen(command, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=STDOUT)
    if proc.wait() != 0:
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_fp)
    result = proc.stdout.read()
    # check that the help string looks correct
    if not result.startswith("Usage"):
        raise ApplicationError(
            "Calling %s failed. Check permissions and that it is in fact an executable."
            % ali_fp)
    return True
class FlowgramContainerFile():
    """A Flowgram container backed by a file on disk.

    Slower than the in-memory variant, but keeps a very low memory
    footprint. Flowgrams can be added until iteration starts; after
    that the container becomes read-only.
    """

    def __init__(self, header, outdir="/tmp/"):
        """header: sff header used to write a valid .sff.txt file
        outdir: directory for the temporary backing file
        """
        # set up output file
        self.filename = get_tmp_filename(tmp_dir=outdir, prefix="fc",
                                         suffix=".sff.txt")
        self.fh = open(self.filename, "w")
        write_sff_header(header, self.fh)
        self.write_mode = True

    def add(self, flowgram):
        """Append one flowgram; only allowed before iteration starts."""
        if self.write_mode:
            self.fh.write(flowgram.createFlowHeader() + "\n")
        else:
            # raise X(msg) instead of the Python-2-only "raise X, msg"
            raise ValueError("add function can't be called after iteration started.")

    def __iter__(self):
        # make it read_only and reset to start of file
        self.write_mode = False
        self.fh.close()
        (self.flowgrams, self.header) = lazy_parse_sff_handle(open(self.filename))
        return self.flowgrams

    def __del__(self):
        # close the handle before removing the backing file; guard
        # against the file already being gone (e.g. partial __init__)
        try:
            self.fh.close()
        except Exception:
            pass
        if exists(self.filename):
            remove(self.filename)
class FlowgramContainerArray():
    """A Flowgram container backed by a plain list.

    Keeps all flowgrams in memory: faster than the file-backed
    container, but needs a lot of memory.
    """

    def __init__(self, header=None):
        """header is accepted only for interface compatibility with
        the file-backed container and is ignored."""
        self.data = []

    def add(self, flowgram):
        """Append one flowgram to the container."""
        self.data.append(flowgram)

    def __iter__(self):
        """Iterate over the stored flowgrams in insertion order."""
        return iter(self.data)
def make_stats(mapping):
    """Calculate cluster size statistics (counts).

    mapping: the prefix mapping dict (cluster id -> list of member ids)

    Returns a printable table of cluster size vs. number of clusters.
    Sizes are reported as len(members)+1, i.e. counting the centroid.
    """
    counts = defaultdict(int)
    for members in mapping.values():
        counts[len(members)] += 1
    stats = ["Clustersize\t#"]
    # dict.keys().sort() only worked under Python 2; sorted() is portable
    for size in sorted(counts):
        stats.append("%d:\t\t%d" % (size + 1, counts[size]))
    return "\n".join(stats)
def sort_ids(ids, mapping):
    """Return ids ordered by decreasing cluster size in mapping."""
    def _cluster_size(an_id):
        # ids missing from the mapping count as size 0
        try:
            return len(mapping[an_id])
        except KeyError:
            return 0
    # decorate-sort-undecorate: ties are broken by the id itself
    # (descending), exactly like the original tuple sort
    decorated = [(_cluster_size(an_id), an_id) for an_id in ids]
    decorated.sort(reverse=True)
    return [an_id for _, an_id in decorated]
def sort_seqs_by_clustersize(seqs, mapping):
    """Yield (label, seq) pairs ordered by decreasing cluster size.

    seqs: seqs as iterator or list of (label, seq)
    mapping: cluster mapping as dict
    """
    cache = {}
    seq_ids = []
    for label, seq in seqs:
        # the id is the part of the label before the first '|',
        # with trailing spaces removed
        seq_id = label.split("|")[0].rstrip(" ")
        seq_ids.append(seq_id)
        cache[seq_id] = (label, seq)
    for seq_id in sort_ids(seq_ids, mapping):
        yield cache[seq_id]
def get_representatives(mapping, seqs):
    """Yield one representative Sequence per cluster.

    mapping: the prefix mapping dict (cluster id -> list of member ids)
    seqs: iterable of (label, sequence) pairs

    The yielded sequence name carries the cluster size, counting the
    representative itself (len(members) + 1).
    """
    for (label, seq) in seqs:
        # "label in mapping" replaces Python-2-only dict.has_key
        if label in mapping:
            yield Sequence(name="%s: %d" % (label, len(mapping[label]) + 1),
                           seq=seq)
def store_mapping(mapping, outdir, prefix):
    """Store the mapping of denoised seq ids to input ids.

    mapping: dict of cluster id -> list of member ids
    outdir: output directory
    prefix: file name prefix; output goes to <outdir>/<prefix>_mapping.txt
            as "key:\\tid1\\tid2..." lines
    """
    fh = open(outdir + "/" + prefix + "_mapping.txt", "w")
    try:
        # .items() replaces Python-2-only .iteritems()
        for key, valuelist in mapping.items():
            fh.write("%s:" % key)
            for v in valuelist:
                fh.write("\t%s" % v)
            fh.write("\n")
    finally:
        # close even if a write fails
        fh.close()
def store_clusters(mapping, sff_fp, outdir="/tmp/", store_members=False):
    """Store fasta files of centroids/singletons and, optionally,
    per-cluster fasta and flowgram files.

    mapping: the prefix mapping dict (cluster id -> list of member ids)
    sff_fp: path to the .sff.txt file holding the flowgrams
    outdir: output directory
    store_members: if True, append every clustered read to
        per-cluster .fasta/.flows files as well

    Writes <outdir>/singletons.fasta and <outdir>/centroids.fasta,
    the latter sorted by decreasing cluster size.
    """
    # get mapping read to cluster
    invert_map = invert_mapping(mapping)
    (flowgrams, header) = lazy_parse_sff_handle(open(sff_fp))
    leftover_fasta_fh = open(outdir + "/singletons.fasta", "w")
    centroids = []
    for f in flowgrams:
        try:
            key = invert_map[f.Name]
        except KeyError:
            # this flowgram has not been clustered
            continue
        if len(mapping[key]) == 0:
            # do not store singletons in a separate cluster
            leftover_fasta_fh.write(f.toFasta() + "\n")
            continue
        elif f.Name in mapping:
            # save as a centroid; "in" replaces Python-2-only has_key.
            # Size counts the centroid itself.
            centroids.append((len(mapping[f.Name]) + 1, f.Name, f.toSeq()))
        if store_members:
            # NOTE(review): unlike the paths above, no "/" is inserted
            # here -- these rely on outdir carrying a trailing separator
            # (the default "/tmp/" does); confirm against callers.
            flows_fh = open(outdir + key + ".flows", "a")
            fasta_fh = open(outdir + key + ".fasta", "a")
            flows_fh.write("%s\n" % f)
            fasta_fh.write(f.toFasta() + "\n")
            fasta_fh.close()
            flows_fh.close()
    leftover_fasta_fh.close()
    # sort and store ordered by cluster_size
    centroids.sort(reverse=True)
    centroid_fh = open(outdir + "/centroids.fasta", "w")
    for size, name, seq in centroids:
        centroid_fh.write(">%s | cluster size: %d \n%s\n" %
                          (name, size, seq))
    centroid_fh.close()
def squeeze_seq(seq):
    """Collapse runs of identical nucleotides to a single one.

    seq: a string; only A/C/G/T runs (either case) are collapsed,
    all other characters are left untouched.
    """
    # the backreference \1 matches repeats of the captured nucleotide
    return sub(r'([AGCTacgt])\1+', r'\1', seq)
def wait_for_file(filename, interval=10, test_mode=False):
    """Put the process to sleep until the file is there.

    filename: file to wait for
    interval: sleep interval in seconds
    test_mode: raise RuntimeWarning instead of going to sleep
    """
    while True:
        if exists(filename):
            return
        if test_mode:
            raise RuntimeWarning
        sleep(interval)
def wait_for_cluster_ids(ids, interval=10):
    """Put the process to sleep until the jobs with ids are done.

    ids: list of job ids to wait for
    interval: time to sleep in seconds

    NOT USED ANYMORE
    """
    if app_path("qstat"):
        for job_id in ids:
            # qstat prints "Job ..." while the job is still known to
            # the scheduler; poll until that stops.
            while _shell_output("qstat %s" % job_id).startswith("Job"):
                sleep(interval)
    else:
        raise ApplicationNotFoundError(
            "qstat not available. Is it installed?\n"
            "This test may fail if not run on a cluster.")


def _shell_output(command):
    """Run command in a shell, return its combined stdout/stderr.

    Bug fix: the original called getoutput(), which was never imported
    anywhere in this module and raised NameError at call time.
    """
    proc = Popen(command, shell=True, universal_newlines=True,
                 stdout=PIPE, stderr=STDOUT)
    proc.wait()
    return proc.stdout.read()
def init_flowgram_file(filename=None, n=0, l=400, prefix="/tmp/"):
    """Open a file in plain flowgram format and write header information.

    filename: name of output file; a temporary name is generated if None
    n: number of flowgrams in the file
    l: length of each flowgram in the file
    prefix: directory prefix used when generating a temporary name

    Returns an open filehandle and the file name.
    """
    # "is None" is the idiomatic identity test (was "== None")
    if filename is None:
        filename = get_tmp_filename(tmp_dir=prefix, suffix=".dat")
    fh = open(filename, "w")
    # plain flowgram header: "<flowgram count> <flow length>"
    fh.write("%d %d\n" % (n, l))
    return (fh, filename)
def append_to_flowgram_file(identifier, flowgram, fh, trim=False):
    """Add one flowgram to an open plain flowgram file.

    identifier: identifier of this flowgram, written as the first column
    flowgram: the flowgram itself
    fh: filehandle to write in
    trim: Boolean flag for quality trimming flowgrams
    """
    if trim:
        flowgram = flowgram.getQualityTrimmedFlowgram()
    # cache the space-separated representation on the flowgram object,
    # so repeated writes of the same flowgram skip the join
    try:
        spaced = flowgram.spaced_flowgram
    except AttributeError:
        spaced = " ".join(str(signal) for signal in flowgram.flowgram)
        flowgram.spaced_flowgram = spaced
    fh.write("%s %d %s\n" % (identifier, len(flowgram), spaced))
def read_signal_probs(file):
    """Read and check the signal probability file.

    file: path to a whitespace-separated table; lines starting with '#'
          are comments. In each data row column 0 is skipped, odd
          columns (1,3,...) are log values and even columns (2,4,...)
          the matching probabilities.

    Returns (flow_probs, flow_logs): two dicts of lists keyed by signal
    index, with each probability list normalized to sum to 1.
    """
    flow_probs = defaultdict(list)
    flow_logs = defaultdict(list)
    # stream the file instead of readlines() (was loading it all into
    # memory) and close the handle even if a float() conversion fails
    f = open(file)
    try:
        for line in f:
            if line.startswith('#'):
                continue
            # split once per line (the original split every line twice)
            fields = line.strip().split()
            for i, num in enumerate(fields[2::2]):
                flow_probs[i].append(float(num))
            for i, num in enumerate(fields[1::2]):
                flow_logs[i].append(float(num))
    finally:
        f.close()
    # normalize probabilities per signal index
    for p in flow_probs:
        s = sum(flow_probs[p])
        flow_probs[p] = [i / s for i in flow_probs[p]]
    return (flow_probs, flow_logs)
def invert_mapping(mapping):
    """Invert a dict-of-lists mapping.

    Every member id points back to its cluster key, and each key maps
    to itself as a special case:
        Ex: {1:(2,3,4)} ==> {1:1, 2:1, 3:1, 4:1}

    Note: This will overwrite an entry if it is redundant.
    """
    inverted = {}
    for key, members in mapping.items():
        inverted[key] = key  # the key maps to itself
        for member in members:
            inverted[member] = key
    return inverted
def cat_sff_files(list_of_file_handles):
    """Virtually concatenate several sff files.

    list_of_file_handles: list of open filehandles to .sff.txt files

    Returns: (flowgram generator over all files, header of first file)
    """
    # mimicks lazy_parse_sff_handle on multiple files
    # Move to cogent???
    if list_of_file_handles == []:
        return [], None
    try:
        # list() forces parsing now: under Python 3 map() is lazy and
        # the ValueError would otherwise escape this try block
        flowgrams_and_headers = list(map(lazy_parse_sff_handle,
                                         list_of_file_handles))
    except ValueError:
        # raise X(msg) instead of the Python-2-only "raise X, msg"
        raise FileFormatError(
            'Wrong flogram file format. Make sure you pass the sff.txt format '
            'produced by sffinfo. The binary .sff will not work here.')
    flowgram_iterators = [a for a, b in flowgrams_and_headers]
    return chain(*flowgram_iterators), flowgrams_and_headers[0][1]
def files_exist(comma_sep_fps):
    """Check that every file in a comma-separated list exists.

    comma_sep_fps: list of filenames as one comma-separated string
    """
    # all() short-circuits on the first missing file, like the
    # original early return
    return all(exists(fp) for fp in comma_sep_fps.split(","))
def read_denoiser_mapping(mapping_fh):
    """Read the cluster mapping from an open file handle.

    mapping_fh: an open file handle (or iterable of lines) to a
    cluster file. Expected format:
       id1: id2 id3
       id4:
       ...

    Returns a dict mapping centroid id -> list of member ids.
    Blank lines are skipped.
    """
    denoiser_mapping = {}
    for line in mapping_fh:
        # Bug fix: lines read from a file keep their trailing newline,
        # so the old `line == ""` check never fired and a blank line
        # crashed the unpack below with ValueError.
        line = line.strip()
        if not line:
            continue
        centroid, members = line.split(':')
        denoiser_mapping[centroid] = members.split()
    return denoiser_mapping
def write_checkpoint(current_key, ctr, cluster_mapping, ids, bestscores, order, out_fp):
    """Write intermediate results to a checkpoint file.

    current_key: the identifier of the current denoiser round
    ctr: a unique counter to label the checkpoint
    cluster_mapping: an intermediate cluster mapping as dict
    ids: the dict of active ids
    bestscores: a dict of best scores (original docstring was truncated
        here -- presumably per id; confirm against callers)
    order: a list of ids, which defines the order in which flowgrams
        are clustered
    out_fp: output directory; checkpoints go to <out_fp>/checkpoints/

    Returns the path of the written checkpoint file.
    """
    checkpoint_dir = out_fp + "/checkpoints/"
    if not exists(checkpoint_dir):
        create_dir(checkpoint_dir)
    out_fp = checkpoint_dir + "/checkpoint%d.pickle" % ctr
    # pickle needs a binary handle: "w" breaks under Python 3 and
    # "wb" is equally valid under Python 2
    out_fh = open(out_fp, "wb")
    try:
        pickle.dump((current_key, ctr, cluster_mapping, ids, bestscores, order),
                    out_fh)
    finally:
        out_fh.close()
    return out_fp
def read_checkpoint(out_fp):
    """Read in the information stored in a checkpoint file.

    out_fp: path to the checkpoint file (the old docstring said "dir",
    but the value is opened directly as a file)

    Returns the (current_key, ctr, cluster_mapping, ids, bestscores,
    order) tuple that write_checkpoint stored.
    """
    # binary mode matches the pickled payload; the old code opened in
    # text mode and leaked the file handle
    pickle_fh = open(out_fp, "rb")
    try:
        return pickle.load(pickle_fh)
    finally:
        pickle_fh.close()
def sort_mapping_by_size(cluster_mapping):
    """Sort the keys of a dict by decreasing length of their values.

    cluster_mapping: dict with mapping as list of ids

    Returns the keys largest-cluster-first. The old cmp=/key=
    combination effectively compared len(cluster_mapping[k]); cmp= is
    gone in Python 3, so express that directly as the key. sorted() is
    stable, so ties keep their original order either way.
    """
    return sorted(cluster_mapping.keys(),
                  key=lambda k: len(cluster_mapping[k]),
                  reverse=True)