This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
summarize_taxa.py
executable file
·261 lines (227 loc) · 12.4 KB
/
summarize_taxa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/usr/bin/env python
# File created on 09 Feb 2010
from __future__ import division
__author__ = "Rob Knight"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Rob Knight", "Catherine Lozupone", "Justin Kuczynski",
"Julia Goodrich", "Daniel McDonald", "Antonio Gonzalez Pena",
"Jesse Stombaugh", "Jose Carlos Clemente Litran",
"Greg Caporaso", "Jai Ram Rideout", "Adam Robbins-Pianka"]
__license__ = "GPL"
__version__ = "1.9.1"
__maintainer__ = "Daniel McDonald"
__email__ = "wasade@gmail.com"
from os.path import split, splitext, join
from sys import stdout, stderr
import numpy as np
from biom.table import Table
from biom import load_table
from qiime.util import (parse_command_line_parameters, make_option,
get_options_lookup, create_dir, write_biom_table)
from qiime.summarize_taxa import make_summary, add_summary_mapping
from qiime.parse import parse_mapping_file
from qiime.format import (
write_add_taxa_summary_mapping, format_add_taxa_summary_mapping)
options_lookup = get_options_lookup()
script_info = {}
script_info['brief_description'] = (
"Summarize taxa and store results in a new table or appended to an "
"existing mapping file.")
script_info['script_description'] = (
"The summarize_taxa.py script provides summary information of the "
"representation of taxonomic groups within each sample. It takes an OTU "
"table that contains taxonomic information as input. The taxonomic level "
"for which the summary information is provided is designated with the -L "
"option. The meaning of this level will depend on the format of the taxon "
"strings that are returned from the taxonomy assignment step. The "
"taxonomy strings that are most useful are those that standardize the "
"taxonomic level with the depth in the taxonomic strings. For instance, "
"the Greengenes database uses the following levels: Level 1 = Kingdom "
"(e.g Bacteria), Level 2 = Phylum (e.g Actinobacteria), Level 3 = Class "
"(e.g Actinobacteria), Level 4 = Order (e.g Actinomycetales), Level 5 = "
"Family (e.g Streptomycetaceae), Level 6 = Genus (e.g Streptomyces), "
"Level 7 = Species (e.g mirabilis). "
"By default, the relative abundance of each taxonomic "
"group will be reported, but the raw counts can be returned if -a is "
"passed.\n\nBy default, taxa summary tables will be output in both "
"classic (tab-separated) and BIOM formats. The BIOM-formatted taxa "
"summary tables can be used as input to other QIIME scripts that accept "
"BIOM files.")
script_info['script_usage'] = []
script_info['script_usage'].append(
("Examples:",
"Summarize taxa based at taxonomic levels 2, 3, 4, 5, and 6, and write "
"resulting taxa tables to the directory './tax'",
"%prog -i otu_table.biom -o ./tax"))
script_info['script_usage'].append(
("Examples:",
"Summarize taxa based at taxonomic levels 2, 3, 4, 5, and 6, and write "
"resulting mapping files to the directory './tax'",
"%prog -i otu_table.biom -o tax_mapping/ -m Fasting_Map.txt"))
script_info['output_description'] = (
"There are two possible output formats depending on whether or not a "
"mapping file is provided with the -m option. If a mapping file is not "
"provided, a table is returned where the taxonomic groups are each in a "
"row and there is a column for each sample. If a mapping file is "
"provided, the summary information will be appended to this file. "
"Specifically, a new column will be made for each taxonomic group to "
"which the relative abundances or raw counts will be added to the "
"existing rows for each sample. The addition of the taxonomic information "
"to the mapping file allows for taxonomic coloration of Principal "
"coordinates plots in Emperor. As described in the Emperor "
"documentation, principal coordinates plots can be dynamically colored based on "
"any of the metadata columns in the mapping file. Dynamic coloration of "
"the plots by the relative abundances of each taxonomic group can help to "
"distinguish which taxonomic groups are driving the clustering patterns.")
script_info['required_options'] = [
make_option('-i', '--otu_table_fp', dest='otu_table_fp',
help='Input OTU table filepath [REQUIRED]',
type='existing_filepath'),
]
script_info['optional_options'] = [
make_option('-L', '--level', default='2,3,4,5,6', type='string',
help='Taxonomic level to summarize by. [default: %default]'),
make_option('-m', '--mapping',
help='Input metadata mapping filepath. If supplied, then the '
'taxon information will be added to this file. This option is '
'useful for coloring PCoA plots by taxon abundance or to '
'perform statistical tests of taxon/mapping associations.',
type='existing_filepath'),
make_option('--md_identifier', default='taxonomy', type='string',
help='the relevant observation metadata key '
'[default: %default]'),
make_option('--md_as_string', default=False, action='store_true',
help='metadata is included as string [default: metadata is '
'included as list]'),
make_option('-d', '--delimiter', action='store', type='string',
dest='delimiter', default=';',
help='Delimiter separating taxonomy levels. '
'[default: %default]'),
make_option('-a', '--absolute_abundance', action='store_true',
dest='absolute_abundance', default=False,
help='If present, the absolute abundance of the lineage in '
'each sample is reported. By default, this script uses '
'relative abundance [default: %default]'),
make_option('-l', '--lower_percentage', type='float', default=None,
help='If present, OTUs having higher absolute abundance are '
'trimmed. To remove OTUs that make up more than 5% of the '
'total dataset you would pass 0.05. [default: %default]'),
make_option('-u', '--upper_percentage', type='float', default=None,
help='If present, OTUs having lower absolute abundance are '
'trimmed. To remove the OTUs that makes up less than 45% of '
'the total dataset you would pass 0.45. [default: %default]'),
make_option('-t', '--transposed_output', action='store_true',
dest='transposed_output', default=False,
help='If present, the output will be written transposed from '
'the regular output. This is helpful in cases when you want '
'to use Site Painter to visualize your data '
'[default: %default]'),
options_lookup['output_dir'],
make_option('--suppress_classic_table_output', action='store_true',
default=False, help='If present, the classic (TSV) format '
'taxon table will not be created in the output directory. '
'This option is ignored if -m/--mapping is present '
'[default: %default]'),
make_option('--suppress_biom_table_output', action='store_true',
default=False, help='If present, the BIOM-formatted taxon '
'table will not be created in the output directory. This '
'option is ignored if -m/--mapping is present '
'[default: %default]')
]
script_info['option_label'] = {'otu_table_fp': 'OTU table filepath',
'output_fp': 'Output filepath',
'mapping': 'QIIME-formatted mapping filepath',
'level': 'Summarize level',
'delimiter': 'Taxonomic delimiter',
'absolute_abundance': 'Use absolute abundance',
'lower_percentage': 'Top % of OTUs to remove',
'upper_percentage': 'Bottom % of OTUs to '
'remove'}
script_info['version'] = __version__
def main():
option_parser, opts, args = parse_command_line_parameters(**script_info)
lower_percentage = opts.lower_percentage
upper_percentage = opts.upper_percentage
otu_table_fp = opts.otu_table_fp
otu_table = load_table(otu_table_fp)
delimiter = opts.delimiter
mapping_fp = opts.mapping
md_as_string = opts.md_as_string
md_identifier = opts.md_identifier
levels = opts.level.split(',')
suppress_classic_table_output = opts.suppress_classic_table_output
suppress_biom_table_output = opts.suppress_biom_table_output
if upper_percentage is not None and lower_percentage is not None:
raise ValueError(
"upper_percentage and lower_percentage are mutually exclusive")
if upper_percentage is not None and lower_percentage is not None and \
mapping:
raise ValueError("upper_percentage and lower_percentage can not be "
"using with mapping file")
if upper_percentage is not None and \
(upper_percentage < 0 or upper_percentage > 1.0):
raise ValueError('max_otu_percentage should be between 0.0 and 1.0')
if lower_percentage is not None and \
(lower_percentage < 0 or lower_percentage > 1.0):
raise ValueError('lower_percentage should be between 0.0 and 1.0')
if mapping_fp:
mapping_file = open(mapping_fp, 'U')
mapping, header, comments = parse_mapping_file(mapping_file)
# use the input Mapping file for producing the output filenames
map_dir_path, map_fname = split(mapping_fp)
map_basename, map_fname_ext = splitext(map_fname)
else:
if suppress_classic_table_output and suppress_biom_table_output:
option_parser.error("Both classic and BIOM output formats were "
"suppressed.")
if not opts.absolute_abundance:
otu_table = otu_table.norm(axis='sample', inplace=False)
# introduced output directory to will allow for multiple outputs
if opts.output_dir:
create_dir(opts.output_dir, False)
output_dir_path = opts.output_dir
else:
output_dir_path = './'
# use the input OTU table to produce the output filenames
dir_path, fname = split(otu_table_fp)
basename, fname_ext = splitext(fname)
# Iterate over the levels and generate a summarized taxonomy for each
for level in levels:
if mapping_fp:
# define output filename
output_fname = join(output_dir_path,
map_basename + '_L%s.txt' % (level))
summary, tax_order = add_summary_mapping(otu_table,
mapping,
int(level),
md_as_string,
md_identifier)
write_add_taxa_summary_mapping(summary, tax_order, mapping,
header, output_fname, delimiter)
else:
# define the output filename. The extension will be added to the
# end depending on the output format
output_fname = join(output_dir_path, basename + '_L%s' % level)
summary, header = make_summary(otu_table,
int(level),
upper_percentage,
lower_percentage,
md_as_string,
md_identifier)
sample_ids = header[1:]
observation_ids = []
data = []
for row in summary:
# Join taxonomic levels to create an observation ID.
observation_ids.append(delimiter.join(row[0]))
data.append(row[1:])
table = Table(np.asarray(data), observation_ids, sample_ids)
if opts.transposed_output:
table = table.transpose()
if not suppress_classic_table_output:
with open(output_fname + '.txt', 'w') as outfile:
outfile.write(table.to_tsv())
if not suppress_biom_table_output:
write_biom_table(table, output_fname + '.biom')
if __name__ == "__main__":
main()