This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
beta_significance.py
executable file
·181 lines (153 loc) · 7.45 KB
/
beta_significance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python
# File created on 4 June 2010
from __future__ import division
__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Justin Kuczynski", "Jose Antonio Navas", "Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Justin Kuczynski"
__email__ = "justinak@gmail.com"
from qiime.util import make_option
import os
from numpy import array
import warnings
warnings.filterwarnings('ignore', 'Not using MPI as mpi4py not found')
from cogent.maths.unifrac.fast_unifrac import fast_unifrac_permutations_file, TEST_ON_PAIRWISE, TEST_ON_TREE, TEST_ON_ENVS
from cogent.maths.unifrac.fast_unifrac import fast_p_test_file
from qiime.util import parse_command_line_parameters
from qiime.format import format_unifrac_sample_mapping
from biom.parse import parse_biom_table
script_info = {}
script_info['brief_description'] = "This script runs any of a set of common tests to determine if a sample is statistically significantly different from another sample"
script_info['script_description'] = "The tests are conducted on each pair of samples present in the input otu table. See the unifrac tutorial online for more details (http://bmf2.colorado.edu/unifrac/tutorial.psp)"
script_info['script_usage'] = []
script_info['script_usage'].append(("Example:","Perform 100 randomizations of sample/sequence assignments, and record the probability that sample 1 is phylogenetically different from sample 2, using the unifrac monte carlo significance test. The test is run for all pairs of samples.","%prog -i otu_table.biom -t rep_set.tre -s unweighted_unifrac -o unw_sig.txt"))
script_info['output_description']= "The script outputs a tab delimited text file with each pair of samples and a p value representing the probability that a random sample/sequence assignment will result in more dissimilar samples than the actual pair of samples."
script_info['required_options'] = [\
make_option('-i', '--input_path',type='existing_filepath',
help='input otu table in biom format'),
make_option('-o', '--output_path',type='new_filepath',
help='output results path'),
make_option('-s', '--significance_test',type='choice',
choices=['unweighted_unifrac', 'weighted_unifrac', 'weighted_normalized_unifrac', 'p-test'],
help="significance test to use, options are 'unweighted_unifrac', 'weighted_unifrac', 'weighted_normalized_unifrac', or 'p-test'"),
make_option('-t', '--tree_path',type='existing_filepath',help='path to newick tree file'),
]
script_info['optional_options'] = [
make_option('-n', '--num_iters', default=100, type="int",
help='number of monte carlo randomizations [default: %default]'),
make_option('-k', '--type_of_test', type='choice',
choices=['all_together', 'each_pair', 'each_sample'], default='each_pair',
help="type of significance test to perform, options are 'all_together', 'each_pair' or 'each_sample'. [default: %default]"),
]
script_info['version'] = __version__
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
otu_table_fp = opts.input_path
otu_table = parse_biom_table(open(otu_table_fp, 'U'))
sample_ids = otu_table.SampleIds
otu_ids = otu_table.ObservationIds
## This is not memory safe: need to be able to load the otu table as ints
otu_table_array = array(list(otu_table.iterObservationData()),dtype='int')
if opts.type_of_test == 'all_together':
type_of_test = TEST_ON_TREE
header_text = "sample\tp value\tp value (Bonferroni corrected)\n"
elif opts.type_of_test == 'each_pair':
type_of_test = TEST_ON_PAIRWISE
header_text = "sample 1\tsample 2\tp value\tp value (Bonferroni corrected)\n"
elif opts.type_of_test == 'each_sample':
type_of_test = TEST_ON_ENVS
header_text = "sample\tp value\tp value (Bonferroni corrected)\n"
if opts.significance_test == 'p-test':
raise RuntimeError('significance test type "each_sample" not allowed for p-test')
else:
raise RuntimeError('significance test type "%s" not found' %\
opts.type_of_test)
# note, uses ugly temp file
if opts.significance_test == 'unweighted_unifrac':
tree_in = open(opts.tree_path,'U')
output_fp = opts.output_path + '_envs.tmp'
result = format_unifrac_sample_mapping(
sample_ids, otu_ids, otu_table_array)
of = open(output_fp, 'w')
of.write('\n'.join(result))
of.close()
envs_in = open(output_fp,'U')
result = fast_unifrac_permutations_file(tree_in, envs_in,
weighted=False, num_iters=opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
envs_in.close()
os.remove(output_fp)
of = open(opts.output_path,'w')
of.write("#unweighted unifrac significance test\n")
of.write(header_text)
for line in result:
of.write('\t'.join(map(str,line)) + '\n')
of.close()
elif opts.significance_test == 'p-test':
tree_in = open(opts.tree_path,'U')
output_fp = opts.output_path + '_envs.tmp'
result = format_unifrac_sample_mapping(
sample_ids, otu_ids, otu_table_array)
of = open(output_fp, 'w')
of.write('\n'.join(result))
of.close()
envs_in = open(output_fp,'U')
result = fast_p_test_file(tree_in, envs_in,
num_iters=opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
envs_in.close()
os.remove(output_fp)
of = open(opts.output_path,'w')
of.write(\
"#andy martin's p-test significance test\n")
of.write(header_text)
for line in result:
of.write('\t'.join(map(str,line)) + '\n')
of.close()
elif opts.significance_test == 'weighted_unifrac':
tree_in = open(opts.tree_path,'U')
output_fp = opts.output_path + '_envs.tmp'
result = format_unifrac_sample_mapping(
sample_ids, otu_ids, otu_table_array)
of = open(output_fp, 'w')
of.write('\n'.join(result))
of.close()
envs_in = open(output_fp,'U')
result = fast_unifrac_permutations_file(tree_in, envs_in,
weighted=True, num_iters=opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
envs_in.close()
os.remove(output_fp)
of = open(opts.output_path,'w')
of.write(\
"#weighted unifrac significance test\n")
of.write(header_text)
for line in result:
of.write('\t'.join(map(str,line)) + '\n')
of.close()
elif opts.significance_test == 'weighted_normalized_unifrac':
tree_in = open(opts.tree_path, 'U')
output_fp = opts.output_path + '_envs.tmp'
result = format_unifrac_sample_mapping(
sample_ids, otu_ids, otu_table_array)
of = open(output_fp, 'w')
of.write('\n'.join(result))
of.close()
envs_in = open(output_fp, 'U')
result = fast_unifrac_permutations_file(tree_in, envs_in,
weighted='correct', num_iters = opts.num_iters, verbose=opts.verbose, test_on=type_of_test)
envs_in.close()
os.remove(output_fp)
of = open(opts.output_path, 'w')
of.write(\
"#weighted normalized unifrac significance test\n")
of.write(\
"sample 1\tsample 2\tp value\tp value (Bonferroni corrected)\n")
for line in result:
of.write('\t'.join(map(str,line)) + '\n')
of.close()
else:
raise RuntimeError('significance test "%s" not found' %\
opts.significance_test)
if __name__ == "__main__":
main()