This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 268
/
pick_de_novo_otus.py
executable file
·180 lines (157 loc) · 8.78 KB
/
pick_de_novo_otus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python
# File created on 27 Oct 2009.
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Greg Caporaso", "Kyle Bittinger"]
__license__ = "GPL"
__version__ = "1.9.1-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
from qiime.util import make_option
from os import makedirs
from qiime.util import (load_qiime_config,
parse_command_line_parameters,
get_options_lookup)
from qiime.parse import parse_qiime_parameters
from qiime.workflow.upstream import run_pick_de_novo_otus
from qiime.workflow.util import (print_commands,
call_commands_serially,
print_to_stdout,
no_status_updates,
validate_and_set_jobs_to_start)
qiime_config = load_qiime_config()
options_lookup = get_options_lookup()
script_info = {}
script_info[
'brief_description'] = """A workflow for de novo OTU picking, taxonomy assignment, phylogenetic tree construction, and OTU table construction."""
script_info[
'script_description'] = """This script takes a sequence file and performs all processing steps through building the OTU table."""
script_info['script_usage'] = []
script_info['script_usage'].append(
("""Uclust example""",
"""The following command will start an analysis on seqs.fna (-i), which is a post-split_libraries fasta file. The sequence identifiers in this file should be of the form <sample_id>_<unique_seq_id>. The following steps, corresponding to the preliminary data preparation, are applied: Pick de novo OTUs at 97%; pick a representative sequence for each OTU (the OTU centroid sequence); align the representative set with PyNAST; assign taxonomy with the uclust consensus taxonomy assigner; filter the alignment prior to tree building - remove positions which are all gaps, and specified as 0 in the lanemask; build a phylogenetic tree with FastTree; build an OTU table. All output files will be written to the directory specified by -o, and subdirectories as appropriate. ALWAYS SPECIFY ABSOLUTE FILE PATHS (absolute path represented here as $PWD, but will generally look something like /home/ubuntu/my_analysis/).""",
"""%prog -i $PWD/seqs.fna -o $PWD/uclust_otus/"""))
script_info['script_usage'].append(
("SumaClust example",
"The following command will start an analysis on seqs.fna (-i), "
"which is a post-split_libraries fasta file. The sequence identifiers "
"in this file should be of the form <sample_id>_<unique_seq_id>. The "
"following steps, corresponding to the preliminary data preparation, "
"are applied: Pick de novo OTUs at 97%; pick a representative sequence "
"for each OTU (the OTU centroid sequence); align the representative set "
"with PyNAST; assign taxonomy with the RDP consensus taxonomy "
"assigner; filter the alignment prior to tree building - remove "
"positions which are all gaps, and specified as 0 in the lanemask; "
"build a phylogenetic tree with FastTree; build an OTU table. All "
"output files will be written to the directory specified by -o, and "
"subdirectories as appropriate. ALWAYS SPECIFY ABSOLUTE FILE PATHS "
"(absolute path represented here as $PWD, but will generally look "
"something like /home/ubuntu/my_analysis/). ",
"%prog -i $PWD/seqs.fna -o $PWD/sumaclust_otus/ -p "
"$PWD/sumaclust_params.txt"))
script_info['script_usage'].append(
("Swarm example",
"The following command will start an analysis on seqs.fna (-i), "
"which is a post-split_libraries fasta file. The sequence identifiers "
"in this file should be of the form <sample_id>_<unique_seq_id>. The "
"following steps, corresponding to the preliminary data preparation, "
"are applied: Pick de novo OTUs at 97%; pick a representative sequence "
"for each OTU (the OTU centroid sequence); align the representative set "
"with PyNAST; assign taxonomy with the RDP consensus taxonomy "
"assigner; filter the alignment prior to tree building - remove "
"positions which are all gaps, and specified as 0 in the lanemask; "
"build a phylogenetic tree with FastTree; build an OTU table. All "
"output files will be written to the directory specified by -o, and "
"subdirectories as appropriate. ALWAYS SPECIFY ABSOLUTE FILE PATHS "
"(absolute path represented here as $PWD, but will generally look "
"something like /home/ubuntu/my_analysis/). ",
"%prog -i $PWD/seqs.fna -o $PWD/swarm_otus/ -p "
"$PWD/swarm_params.txt"))
script_info['script_usage_output_to_remove'] = ['$PWD/uclust_otus/',
'$PWD/sumaclust_otus/',
'$PWD/swarm_otus/']
script_info[
'output_description'] = """This script will produce an OTU mapping file (pick_otus.py), a representative set of sequences (FASTA file from pick_rep_set.py), a sequence alignment file (FASTA file from align_seqs.py), taxonomy assignment file (from assign_taxonomy.py), a filtered sequence alignment (from filter_alignment.py), a phylogenetic tree (Newick file from make_phylogeny.py) and a biom-formatted OTU table (from make_otu_table.py)."""
script_info['required_options'] = [
make_option('-i', '--input_fp', type='existing_filepath',
help='the input fasta file [REQUIRED]'),
make_option('-o', '--output_dir', type='new_dirpath',
help='the output directory [REQUIRED]'),
]
script_info['optional_options'] = [
make_option('-p', '--parameter_fp', type='existing_filepath',
help='path to the parameter file, which specifies changes' +
' to the default behavior. ' +
'See http://www.qiime.org/documentation/file_formats.html#qiime-parameters .' +
' [if omitted, default values will be used]'),
make_option('-f', '--force', action='store_true',
dest='force', help='Force overwrite of existing output directory' +
' (note: existing files in output_dir will not be removed)' +
' [default: %default]'),
make_option('-w', '--print_only', action='store_true',
dest='print_only', help='Print the commands but don\'t call them -- ' +
'useful for debugging [default: %default]', default=False),
make_option('-a', '--parallel', action='store_true',
dest='parallel', default=False,
help='Run in parallel where available [default: %default]'),
options_lookup['jobs_to_start_workflow']
]
script_info['version'] = __version__
def main():
option_parser, opts, args = parse_command_line_parameters(**script_info)
verbose = opts.verbose
input_fp = opts.input_fp
output_dir = opts.output_dir
verbose = opts.verbose
print_only = opts.print_only
parallel = opts.parallel
# No longer checking that jobs_to_start > 2, but
# commenting as we may change our minds about this.
#if parallel: raise_error_on_parallel_unavailable()
if opts.parameter_fp:
try:
parameter_f = open(opts.parameter_fp, 'U')
except IOError:
raise IOError("Can't open parameters file (%s). Does it exist? Do you have read access?"
% opts.parameter_fp)
params = parse_qiime_parameters(parameter_f)
parameter_f.close()
else:
params = parse_qiime_parameters([])
# empty list returns empty defaultdict for now
jobs_to_start = opts.jobs_to_start
default_jobs_to_start = qiime_config['jobs_to_start']
validate_and_set_jobs_to_start(params,
jobs_to_start,
default_jobs_to_start,
parallel,
option_parser)
try:
makedirs(output_dir)
except OSError:
if opts.force:
pass
else:
# Since the analysis can take quite a while, I put this check
# in to help users avoid overwriting previous output.
option_parser.error("Output directory already exists. Please choose"
" a different directory, or force overwrite with -f.")
if print_only:
command_handler = print_commands
else:
command_handler = call_commands_serially
if verbose:
status_update_callback = print_to_stdout
else:
status_update_callback = no_status_updates
run_pick_de_novo_otus(
input_fp,
output_dir,
command_handler=command_handler,
params=params,
qiime_config=qiime_config,
parallel=parallel,
status_update_callback=status_update_callback)
if __name__ == "__main__":
main()