-
Notifications
You must be signed in to change notification settings - Fork 4
/
mytstats.py
195 lines (164 loc) · 6.76 KB
/
mytstats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import numpy as np
__all__ = ['tstatistic',
'nantstatistic',
'diffmean']
def diffmean(a, b, axis=0):
"""Difference of means statistic.
Parameters
----------
a,b : ndarray with shapes equal along all dims except axis
Input data for the calculation.
axis : int
Specify the axis along which the statistic will be computed.
Returns
-------
dm : ndarray with one less dimension"""
return a.mean(axis=axis) - b.mean(axis=axis)
def tstatistic(a, b, axis=0, equal_var=True):
"""Computes a two-sample t-statistic on a and b along the specific axis
Code is lifted from scipy.stats.ttest_ind except that there is no
calculation of the associated p-value
Parameters
----------
a,b : ndarray with shapes equal along all dims except axis
Input data for the calculation.
axis : int
Specify the axis along which the statistic will be computed.
equal_var : bool
Specify if the statistic will use a pooled estimate of the variance (True)
or if it will make no assumption about equal variance (False).
Returns
-------
t : ndarray with one less dimension"""
v1 = np.var(a, axis, ddof=1)
v2 = np.var(b, axis, ddof=1)
n1 = a.shape[axis]
n2 = b.shape[axis]
if equal_var:
df = n1 + n2 - 2
svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df)
denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
else:
vn1 = v1 / n1
vn2 = v2 / n2
df = ((vn1 + vn2)**2) / ((vn1**2) / (n1 - 1) + (vn2**2) / (n2 - 1))
# If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
# Hence it doesn't matter what df is as long as it's not NaN.
df = np.where(np.isnan(df), 1, df)
denom = np.sqrt(vn1 + vn2)
d = np.mean(a, axis) - np.mean(b, axis)
t = np.divide(d, denom)
return t
def nantstatistic(a, b, axis = 0, equal_var = True):
"""Computes a two-sample t-statistic on a and b along the specific axis
Uses nan* functions which can be slightly slower.
Code is lifted from scipy.stats.ttest_ind except that there is no
calculation of the associated p-value
Parameters
----------
a,b : ndarray with shapes equal along all dims except axis
Input data for the calculation.
axis : int
Specify the axis along which the statistic will be computed.
equal_var : bool
Specify if the statistic will use a pooled estimate of the variance (True)
or if it will make no assumption about equal variance (False).
Returns
-------
t : ndarray with one less dimension
"""
v1 = np.nanvar(a, axis, ddof=1)
v2 = np.nanvar(b, axis, ddof=1)
n1 = a.shape[axis]
n2 = b.shape[axis]
if equal_var:
df = n1 + n2 - 2
svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / np.float(df)
denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
else:
vn1 = v1 / n1
vn2 = v2 / n2
df = ((vn1 + vn2)**2) / ((vn1**2) / (n1 - 1) + (vn2**2) / (n2 - 1))
# If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
# Hence it doesn't matter what df is as long as it's not NaN.
df = np.where(np.isnan(df), 1, df)
denom = np.sqrt(vn1 + vn2)
d = np.nanmean(a, axis) - np.nanmean(b, axis)
t = np.divide(d, denom)
return t
'''TODO: implement in numba so they can be used in a numbized permutation test
"""Attempt to import numba and define numba compiled versions of these functions"""
import os
import sys
try:
import numba as nb
print 'mytstats: Successfully imported numba version %s' % (nb.__version__)
NB_SUCCESS = True
except OSError:
try:
"""On Windows it is neccessary to be on the same drive as the LLVM DLL
in order to import numba without generating a "Windows Error 161: The specified path is invalid."""
curDir = os.getcwd()
targetDir = os.path.splitdrive(sys.executable)[0]
os.chdir(targetDir)
import numba as nb
print 'mytstats: Successfully imported numba version %s' % (nb.__version__)
NB_SUCCESS = True
except OSError:
NB_SUCCESS = False
print 'mytstats: Could not load numba\n(may be a path issue try starting python in C:\\)'
finally:
os.chdir(curDir)
except ImportError:
NB_SUCCESS = False
print 'mytstats: Could not load numba'
"""TODO: (1) Test numba functions (this code is just copied from above)
if the function can just be decorated then do that,
but i think it may need to be modified
(2) Add numba permutation function that utlizes these statistics
(this is where the speed-up will happen)"""
if NB_SUCCESS and False:
__all__.extend(['nb_tstatistic', 'nb_nantstatistic'])
@nb.jit(nb.float64[:](nb.float64[:],nb.float64[:], nb.int32, nb.boolean), nopython = True)
def nb_tstatistic(a, b, axis, equal_var):
v1 = np.var(a, axis, ddof=1)
v2 = np.var(b, axis, ddof=1)
n1 = a.shape[axis]
n2 = b.shape[axis]
if equal_var:
df = n1 + n2 - 2
svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / np.float(df)
denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
else:
vn1 = v1 / n1
vn2 = v2 / n2
df = ((vn1 + vn2)**2) / ((vn1**2) / (n1 - 1) + (vn2**2) / (n2 - 1))
# If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
# Hence it doesn't matter what df is as long as it's not NaN.
df = np.where(np.isnan(df), 1, df)
denom = np.sqrt(vn1 + vn2)
d = np.mean(a, axis) - np.mean(b, axis)
t = np.divide(d, denom)
return t
@nb.jit(nb.float64[:](nb.float64[:],nb.float64[:], nb.int, nb.boolean), nopython = True)
def nb_nantstatistic(a, b, axis, equal_var):
v1 = np.nanvar(a, axis, ddof=1)
v2 = np.nanvar(b, axis, ddof=1)
n1 = a.shape[axis]
n2 = b.shape[axis]
if equal_var:
df = n1 + n2 - 2
svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df)
denom = np.sqrt(svar * (1.0 / n1 + 1.0 / n2))
else:
vn1 = v1 / n1
vn2 = v2 / n2
df = ((vn1 + vn2)**2) / ((vn1**2) / (n1 - 1) + (vn2**2) / (n2 - 1))
# If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
# Hence it doesn't matter what df is as long as it's not NaN.
df = np.where(np.isnan(df), 1, df)
denom = np.sqrt(vn1 + vn2)
d = np.nanmean(a, axis) - np.nanmean(b, axis)
t = np.divide(d, denom)
return t
'''