/
util.py
164 lines (143 loc) · 5.61 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
__author__ = "Daniel McDonald"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Catherine Lozupone", "Yoshiki Vazquez Baeza"]
__license__ = "BSD"
__version__ = "1.7.0-dev"
__maintainer__ = "Yoshiki Vazquez Baeza"
__email__ = "yoshik89@gmail.com"
__status__ = "Development"
from scipy.spatial import procrustes
from numpy.ma.extras import apply_along_axis
from numpy.ma import MaskedArray
from numpy import (shape, vstack, zeros, sum as numpy_sum, sort as numpy_sort,
nan as numpy_nan, array, median, average)
def summarize_pcoas(master_pcoa, support_pcoas, method='IQR', apply_procrustes=True):
"""returns the average PCoA vector values for the support pcoas
Also returns the ranges as calculated with the specified method.
The choices are:
IQR: the Interquartile Range
ideal fourths: Ideal fourths method as implemented in scipy
"""
if apply_procrustes:
# perform procrustes before averaging
support_pcoas = [list(sp) for sp in support_pcoas]
master_pcoa = list(master_pcoa)
for i, pcoa in enumerate(support_pcoas):
master_std, pcoa_std, m_squared = procrustes(master_pcoa[1],pcoa[1])
support_pcoas[i][1] = pcoa_std
master_pcoa[1] = master_std
m_matrix = master_pcoa[1]
m_eigvals = master_pcoa[2]
m_names = master_pcoa[0]
jn_flipped_matrices = []
all_eigvals = []
for rep in support_pcoas:
matrix = rep[1]
eigvals = rep[2]
all_eigvals.append(eigvals)
jn_flipped_matrices.append(_flip_vectors(matrix, m_matrix))
matrix_average, matrix_low, matrix_high = _compute_jn_pcoa_avg_ranges(\
jn_flipped_matrices, method)
#compute average eigvals
all_eigvals_stack = vstack(all_eigvals)
eigval_sum = numpy_sum(all_eigvals_stack, axis=0)
eigval_average = eigval_sum / float(len(all_eigvals))
return matrix_average, matrix_low, matrix_high, eigval_average, m_names
def _flip_vectors(jn_matrix, m_matrix):
"""transforms PCA vectors so that signs are correct"""
m_matrix_trans = m_matrix.transpose()
jn_matrix_trans = jn_matrix.transpose()
new_matrix= zeros(jn_matrix_trans.shape, float)
for i, m_vector in enumerate(m_matrix_trans):
jn_vector = jn_matrix_trans[i]
disT = list(m_vector - jn_vector)
disT = sum(map(abs, disT))
jn_flip = jn_vector*[-1]
disF = list(m_vector - jn_flip)
disF = sum(map(abs, disF))
if disT > disF:
new_matrix[i] = jn_flip
else:
new_matrix[i] = jn_vector
return new_matrix.transpose()
def _compute_jn_pcoa_avg_ranges(jn_flipped_matrices, method):
"""Computes PCoA average and ranges for jackknife plotting
returns 1) an array of jn_averages
2) an array of upper values of the ranges
3) an array of lower values for the ranges
method: the method by which to calculate the range
IQR: Interquartile Range
ideal fourths: Ideal fourths method as implemented in scipy
"""
x, y = shape(jn_flipped_matrices[0])
matrices = len(jn_flipped_matrices)
summary_matrix = array(jn_flipped_matrices).reshape(matrices, x * y)
matrix_average = average(jn_flipped_matrices, axis=0)
if method == 'IQR':
result = matrix_IQR(summary_matrix)
matrix_low = result[0].reshape(x, y)
matrix_high = result[1].reshape(x, y)
elif method == 'ideal_fourths':
result = idealfourths(summary_matrix, axis=0)
matrix_low = result[0].reshape(x, y)
matrix_high = result[1].reshape(x, y)
elif method == "sdev":
# calculate std error for each sample in each dimension
sdevs = zeros(shape=[x, y])
for j in range(y):
for i in range(x):
vals = array([pcoa[i][j] for pcoa in jn_flipped_matrices])
sdevs[i, j] = vals.std(ddof=1)
matrix_low = -sdevs/2
matrix_high = sdevs/2
return matrix_average, matrix_low, matrix_high
def IQR(x):
"""calculates the interquartile range of x
x can be a list or an array
returns min_val and max_val of the IQR"""
x.sort()
#split values into lower and upper portions at the median
odd = len(x) % 2
midpoint = int(len(x)/2)
if odd:
low_vals = x[:midpoint]
high_vals = x[midpoint+1:]
else: #if even
low_vals = x[:midpoint]
high_vals = x[midpoint:]
#find the median of the low and high values
min_val = median(low_vals)
max_val = median(high_vals)
return min_val, max_val
def matrix_IQR(x):
"""calculates the IQR for each column in an array
"""
num_cols = x.shape[1]
min_vals = zeros(num_cols)
max_vals = zeros(num_cols)
for i in range(x.shape[1]):
col = x[:, i]
min_vals[i], max_vals[i] = IQR(col)
return min_vals, max_vals
def idealfourths(data, axis=None):
"""This function returns an estimate of the lower and upper quartiles of the data along
the given axis, as computed with the ideal fourths. This function was taken
from scipy.stats.mstat_extra.py (http://projects.scipy.org/scipy/browser/trunk/scipy/stats/mstats_extras.py?rev=6392)
"""
def _idf(data):
x = data.compressed()
n = len(x)
if n < 3:
return [numpy_nan,numpy_nan]
(j,h) = divmod(n/4. + 5/12.,1)
j = int(j)
qlo = (1-h)*x[j-1] + h*x[j]
k = n - j
qup = (1-h)*x[k] + h*x[k-1]
return [qlo, qup]
data = numpy_sort(data, axis=axis).view(MaskedArray)
if (axis is None):
return _idf(data)
else:
return apply_along_axis(_idf, axis, data)