This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 269
/
upstream.py
434 lines (380 loc) · 16.1 KB
/
upstream.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#!/usr/bin/env python
# File created on 20 Feb 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso", "Kyle Bittinger", "Justin Kuczynski",
"Jai Ram Rideout"]
__license__ = "GPL"
__version__ = "1.8.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
from os.path import split, splitext, abspath
from qiime.util import create_dir
from qiime.workflow.util import (print_to_stdout,
generate_log_fp,
WorkflowLogger,
log_input_md5s,
get_params_str)
def run_pick_de_novo_otus(input_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          parallel=False,
                          logger=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout):
    """ Run the QIIME de novo OTU picking workflow.

        The steps performed by this function are:
          1) Pick OTUs;
          2) Pick a representative set;
          3) Align the representative set;
          4) Assign taxonomy;
          5) Filter the alignment prior to tree building - remove positions
             which are all gaps, and specified as 0 in the lanemask;
          6) Build a phylogenetic tree;
          7) Build an OTU table.

        input_fp: path to the input (post-split-libraries) fasta file
        output_dir: directory where all output is written (created if
         necessary)
        command_handler: callable that executes the generated list of
         (description, command string) steps
        params: two-level dict of per-script parameters, keyed by script
         name and then option name
        qiime_config: QIIME configuration values, passed to the logger
        parallel: if True, use the parallel variants of OTU picking,
         taxonomy assignment, and alignment where the chosen method
         supports them
        logger: existing WorkflowLogger to write to; if None a new one is
         created (and closed on success)
        suppress_md5: if True, skip logging the MD5 checksum of the input
        status_update_callback: called with a status string as steps run

        Returns a tuple of absolute paths: (tree_fp, otu_table_fp).
    """
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    cluster_failures = False
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # We created the logger, so we are responsible for closing it.
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp])

    # Prep the OTU picking command
    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust'
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters. The parallel script is
        # method-specific, so it doesn't take a --otu_picking_method
        # option; strip that key before building the option string.
        # (get/pop avoid the NameError the previous try/except KeyError
        # pattern could hit when 'pick_otus' was absent and d was never
        # bound before the get_params_str(d) call below.)
        d = params.get('pick_otus', {}).copy()
        d.pop('otu_picking_method', None)

        if otu_picking_method == 'uclust_ref':
            if 'suppress_new_clusters' in d:
                # The user explicitly suppressed new clusters; remove the
                # flag from the pass-through options and don't cluster
                # reference-picking failures de novo.
                del d['suppress_new_clusters']
                cluster_failures = False
            else:
                # Reads that fail to hit the reference will be clustered
                # de novo in a follow-up step below.
                cluster_failures = True
                failure_otu_picking_method = 'uclust'

        params_str += ' %s' % get_params_str(d)
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -T %s' % (otu_picking_script,
                                                  input_fp,
                                                  pick_otu_dir,
                                                  params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s %s' %\
            (input_fp, pick_otu_dir, params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    if cluster_failures:
        reference_otu_fp = otu_fp
        clustered_failures_dir = '%s/failure_otus/' % pick_otu_dir

        # Strip the method key again before passing the remaining OTU
        # picker options through to the de novo failure-clustering run.
        d = params.get('pick_otus', {}).copy()
        d.pop('otu_picking_method', None)

        if 'denovo_otu_id_prefix' not in d:
            # Prefix de novo OTU ids so they can't collide with reference
            # OTU ids when the two maps are merged below.
            d['denovo_otu_id_prefix'] = 'DeNovoOTU'
        params_str = ' %s' % get_params_str(d)

        failures_list_fp = '%s/%s_failures.txt' % \
            (pick_otu_dir, input_basename)
        failures_fasta_fp = '%s/%s_failures.fasta' % \
            (pick_otu_dir, input_basename)

        filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, failures_list_fp, failures_fasta_fp)

        commands.append([('Generate failures fasta file',
                          filter_fasta_cmd)])

        # Prep the OTU picking command for the failure sequences
        failure_otu_fp = '%s/%s_failures_otus.txt' % (clustered_failures_dir,
                                                      input_basename)
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -m %s %s' %\
            (failures_fasta_fp, clustered_failures_dir,
             failure_otu_picking_method, params_str)

        commands.append(
            [('Pick de novo OTUs for new clusters', pick_otus_cmd)])

        merged_otu_map_fp = '%s/merged_otu_map.txt' % clustered_failures_dir
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
            (reference_otu_fp, failure_otu_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Downstream steps operate on the merged (reference + de novo) map.
        otu_fp = merged_otu_map_fp

    # Prep the representative set picking command
    rep_set_dir = '%s/rep_set/' % output_dir
    create_dir(rep_set_dir)
    rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
    rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)
    try:
        params_str = get_params_str(params['pick_rep_set'])
    except KeyError:
        params_str = ''
    # Build the representative set picking command
    pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
        (otu_fp, input_fp, rep_set_log_fp, rep_set_fp, params_str)
    commands.append([('Pick representative set', pick_rep_set_cmd)])

    # Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
        (output_dir, assignment_method)
    taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
        (assign_taxonomy_dir, input_basename)
    if parallel and (assignment_method == 'rdp' or
                     assignment_method == 'blast' or
                     assignment_method == 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the taxonomy assignment parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
            'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
            (assignment_method, rep_set_fp, assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
            (assign_taxonomy_dir, rep_set_fp, params_str)

    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = 'make_otu_table.py -i %s -t %s -o %s %s' %\
        (otu_fp, taxonomy_fp, otu_table_fp, params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    if cluster_failures:
        # Also build an OTU table restricted to the reference-hit OTUs
        # (i.e., excluding the de novo clustered failures).
        reference_otu_table_fp = '%s/reference_only_otu_table.biom' % output_dir
        # Build the OTU table building command
        make_otu_table_cmd = 'make_otu_table.py -i %s -t %s -o %s %s' %\
            (reference_otu_fp, taxonomy_fp, reference_otu_table_fp, params_str)
        commands.append(
            [('Make reference-only OTU table', make_otu_table_cmd)])

    # Prep the pynast alignment command
    try:
        alignment_method = params['align_seqs']['alignment_method']
    except KeyError:
        alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_rep_set_aligned.fasta' % (pynast_dir, input_basename)
    if parallel and alignment_method == 'pynast':
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the alignment parameters. The parallel script is
        # method-specific, so it doesn't take an --alignment_method
        # option; strip that key before building the option string.
        d = params.get('align_seqs', {}).copy()
        d.pop('alignment_method', None)
        params_str += ' %s' % get_params_str(d)

        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
            (rep_set_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
            (rep_set_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_rep_set_aligned_pfiltered.fasta' %\
        (pynast_dir, input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
        (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
        (filtered_aln_fp, tree_fp, params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return abspath(tree_fp), abspath(otu_table_fp)
# Aliases for run_pick_de_novo_otus — presumably kept so callers using the
# older workflow-function names keep working; verify before removing.
run_qiime_data_preparation = run_pick_otus_through_otu_table = run_pick_de_novo_otus
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime (closed-reference OTUs)

        The steps performed by this function are:
          1) Pick OTUs against a reference collection (new clusters are
             always suppressed);
          2) Build an OTU table with optional pre-defined taxonomy.

        input_fp: path to the input (post-split-libraries) fasta file
        refseqs_fp: path to the reference sequence collection to pick
         OTUs against
        output_dir: directory where all output is written (created if
         necessary)
        taxonomy_fp: optional path to a reference taxonomy map; if
         provided it is passed to make_otu_table.py via -t
        command_handler: callable that executes the generated list of
         (description, command string) steps
        params: two-level dict of per-script parameters, keyed by script
         name and then option name
        qiime_config: QIIME configuration values, passed to the logger
        parallel: if True, use the parallel OTU picker where the chosen
         method supports it
        logger: existing WorkflowLogger to write to; if None a new one is
         created (and closed on success)
        suppress_md5: if True, skip logging MD5 checksums of the inputs
        status_update_callback: called with a status string as steps run

        NOTE(review): unlike run_pick_de_novo_otus, this function has no
        return statement (returns None); callers must derive the OTU
        table path themselves.
    """
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref',
                                     'usearch_ref', 'sortmerna']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        # Default reference-based picker when none is configured.
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # We created the logger, so we are responsible for closing it.
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    if taxonomy_fp:
        # Pass the pre-defined taxonomy map through to make_otu_table.py.
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' %\
        (otu_fp, taxonomy_str, otu_table_fp, params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
# Alias for run_pick_closed_reference_otus — presumably kept so callers using
# the older workflow-function name keep working; verify before removing.
run_pick_reference_otus_through_otu_table = run_pick_closed_reference_otus