This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
compute_core_microbiome.py
executable file
·149 lines (120 loc) · 6.37 KB
/
compute_core_microbiome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
# File created on 12 Jun 2012
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
from os.path import join
from matplotlib import use
use('Agg', warn=False)
from pylab import xlim, ylim, xlabel, ylabel, plot, savefig
from numpy import linspace
from cogent.util.misc import create_dir
from biom.parse import parse_biom_table
from biom.exception import TableException
from qiime.util import parse_command_line_parameters, make_option
from qiime.format import format_biom_table
from qiime.core_microbiome import filter_table_to_core
from qiime.filter import sample_ids_from_metadata_description
script_info = {}
script_info['brief_description'] = "Identify the core microbiome."
script_info['script_description'] = ""
script_info['script_usage'] = []
script_info['script_usage'].append(("","Identify the core OTUs in otu_table.biom, defined as the OTUs that are present in at least 50% of the samples. Write the list of core OTUs to a text file, and a new BIOM file containing only the core OTUs.","%prog -i otu_table.biom -o otu_table_core"))
script_info['script_usage'].append(("","Identify the core OTUs in otu_table.biom, defined as the OTUs that are present in all of the samples in the 'Fast' treatment (as specified in the mapping file). Write the list of core OTUs to a text file.","%prog -i otu_table.biom -o otu_table_core_fast --mapping_fp map.txt --valid_states \"Treatment:Fast\""))
script_info['output_description']= ""
script_info['required_options'] = [\
make_option('-i','--input_fp',type="existing_filepath",help='the input otu table in BIOM format'),
make_option('-o','--output_dir',type="new_dirpath",help='directory to store output data'),
]
script_info['optional_options'] = [
make_option('--max_fraction_for_core',type=float,
help='the max fractions of samples that an OTU must be observed in to be considered part of the core as a number in the range [0,1] [default: %default]',default=1.0),
make_option('--min_fraction_for_core',type=float,
help='the min fractions of samples that an OTU must be observed in to be considered part of the core as a number in the range [0,1] [default: %default]',default=0.5),
make_option('--num_fraction_for_core_steps',type=int,
help='the number of evenly sizes steps to take between min_fraction_for_core and max_fraction_for_core [default: %default]',default=11),
make_option('--otu_md',default='taxonomy',
help='the otu metadata category to write to the output file [defualt: %default]'),\
make_option('--mapping_fp',type='existing_filepath',
help='mapping file path (for use with --valid_states) [default: %default]'),
make_option('--valid_states',
help='description of sample ids to retain (for use with --mapping_fp) [default: %default]')
]
script_info['version'] = __version__
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
input_fp = opts.input_fp
output_dir = opts.output_dir
if opts.num_fraction_for_core_steps < 2:
option_parser.error("Must perform at least two steps. Increase --num_fraction_for_core_steps.")
fractions_for_core = linspace(opts.min_fraction_for_core,
opts.max_fraction_for_core,
opts.num_fraction_for_core_steps)
otu_md = opts.otu_md
valid_states = opts.valid_states
mapping_fp = opts.mapping_fp
create_dir(output_dir)
if valid_states and opts.mapping_fp:
sample_ids = sample_ids_from_metadata_description(open(mapping_fp,'U'),
valid_states)
if len(sample_ids) < 1:
option_parser.error(\
"--valid_states pattern didn't match any entries in mapping file: \"%s\"" %\
valid_states)
else:
# get core across all samples if user doesn't specify a subset of the
# samples to work with
sample_ids = None
input_table = parse_biom_table(open(input_fp,'U'))
otu_counts = []
summary_figure_fp = join(output_dir,'core_otu_size.pdf')
for fraction_for_core in fractions_for_core:
# build a string representation of the fraction as that gets used
# several times
fraction_for_core_str = "%1.0f" % (fraction_for_core * 100.)
# prep output files
output_fp = join(output_dir,'core_otus_%s.txt' % fraction_for_core_str)
output_table_fp = join(output_dir,'core_table_%s.biom' % fraction_for_core_str)
output_f = open(output_fp,'w')
try:
core_table = filter_table_to_core(input_table,
sample_ids,
fraction_for_core)
except TableException:
output_f.write("# No OTUs present in %s %% of samples." % fraction_for_core_str)
output_f.close()
otu_counts.append(0)
continue
# write some header information to file
if sample_ids == None:
output_f.write("# Core OTUs across %s %% of samples.\n" % fraction_for_core_str)
else:
output_f.write(\
"# Core OTUs across %s %% of samples matching the sample metadata pattern \"%s\":\n# %s\n" %\
(fraction_for_core_str, valid_states,' '.join(sample_ids)))
# write the otu id and corresponding metadata for all core otus
otu_count = 0
for value, id_, md in core_table.iterObservations():
output_f.write('%s\t%s\n' % (id_,md[otu_md]))
otu_count += 1
output_f.close()
# write the core biom table
output_table_f = open(output_table_fp,'w')
output_table_f.write(format_biom_table(core_table))
output_table_f.close()
# append the otu count to the list of counts
otu_counts.append(otu_count)
plot(fractions_for_core, otu_counts)
xlim(min(fractions_for_core),max(fractions_for_core))
ylim(0,max(otu_counts)+1)
xlabel("Fraction of samples that OTU must be observed in to be considered 'core'")
ylabel("Number of OTUs")
savefig(summary_figure_fp)
if __name__ == "__main__":
main()