-
Notifications
You must be signed in to change notification settings - Fork 12
/
monte_carlo.py
281 lines (225 loc) · 9.6 KB
/
monte_carlo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
__author__ = 'Olga'
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
from scipy.stats import beta
import seaborn as sns
from .visualize import MODALITY_ORDER
from .names import NEAR_ZERO, NEAR_HALF, NEAR_ONE, BIMODAL, NULL_MODEL
def _assign_modality_from_estimate(mean_alpha, mean_beta):
    """Map MCMC-estimated Beta shape parameters to a modality label.

    Parameters
    ----------
    mean_alpha : float
        Posterior mean of the Beta distribution's alpha shape parameter.
    mean_beta : float
        Posterior mean of the Beta distribution's beta shape parameter.

    Returns
    -------
    str
        One of ``NEAR_ONE``, ``NEAR_ZERO``, ``BIMODAL``, ``NEAR_HALF`` or
        ``NULL_MODEL``.
    """
    # alpha dominating (with beta still above 1) skews the density towards 1
    if mean_beta > 1 and mean_alpha > mean_beta:
        return NEAR_ONE
    # beta dominating (with alpha still above 1) skews the density towards 0
    if mean_alpha > 1 and mean_beta > mean_alpha:
        return NEAR_ZERO
    # both shape parameters below 1: U-shaped density with a valley in the
    # middle and mass piled up at both extremes
    if mean_alpha < 1 and mean_beta < 1:
        return BIMODAL
    # both comfortably above 1: a single hump in the middle, low probability
    # near the extremes of 0 and 1
    if mean_alpha >= 1.5 and mean_beta >= 1.5:
        return NEAR_HALF
    # anything else (e.g. both parameters near 1) is treated as the
    # uniform/null model
    return NULL_MODEL
def _print_and_plot(mean_alpha, mean_beta, alphas, betas, n_iter, data):
    # Debugging helper: print the estimated parameters and modality, then
    # draw two panels -- the MCMC traces of alpha/beta on the left, and the
    # sampled Beta densities over a histogram of the data on the right.
    print
    print mean_alpha, mean_beta, ' estimated modality:', \
        _assign_modality_from_estimate(mean_alpha, mean_beta)
    # Local imports so the plotting dependencies are only needed when this
    # debug helper is actually invoked
    import numpy as np
    from scipy.stats import beta
    import matplotlib.pyplot as plt
    import prettyplotlib as ppl
    fig, axes = plt.subplots(ncols=2, figsize=(12, 4))
    ax = axes[0]
    # Trace plot: sampled alpha/beta values across MCMC iterations
    ppl.plot(alphas, label='alpha', ax=ax)
    ppl.plot(betas, label='beta', ax=ax)
    ppl.legend(ax=ax)
    # Horizontal lines mark the posterior mean of each parameter
    ax.hlines(mean_alpha, 0, n_iter)
    ax.hlines(mean_beta, 0, n_iter)
    ax.annotate('mean_alpha = {:.5f}'.format(mean_alpha),
                (0, mean_alpha), fontsize=12,
                xytext=(0, 1), textcoords='offset points')
    ax.annotate('mean_beta = {:.5f}'.format(betas.mean()),
                (0, mean_beta), fontsize=12,
                xytext=(0, 1), textcoords='offset points')
    ax.set_xlim(0, n_iter)
    ax = axes[1]
    # Histogram of the observed data, binned over [0, 1)
    ppl.hist(data, facecolor='grey', alpha=0.5, bins=np.arange(0, 1, 0.05),
             zorder=10, ax=ax)
    ymin, ymax = ax.get_ylim()
    one_x = np.arange(0, 1.01, 0.01)
    # Repeat the x grid once per MCMC sample so each sampled (alpha, beta)
    # pair is drawn as its own translucent density curve
    x = np.repeat(one_x, n_iter).reshape(len(one_x), n_iter)
    # NOTE(review): np.vstack is given a generator expression here; newer
    # numpy versions require a sequence -- confirm against the pinned numpy
    beta_distributions = np.vstack((beta(a, b).pdf(one_x)
                                    for a, b in zip(alphas, betas))).T
    ppl.plot(x, beta_distributions, color=ppl.colors.set2[0], alpha=0.1,
             linewidth=2, ax=ax, zorder=1)
    ax.set_ylim(0, ymax)
# Reference Beta distributions: one frozen scipy.stats distribution per
# modality, used as canonical shapes for each class.
rv_included = beta(5, 1)      # mass piled up near 1
rv_excluded = beta(1, 5)      # mass piled up near 0
rv_middle = beta(5, 5)        # single hump around 0.5
rv_uniform = beta(1, 1)       # flat / uninformative
rv_bimodal = beta(.65, .65)   # U-shaped, mass at both extremes

# Modality label -> canonical frozen Beta distribution
models = {
    NEAR_ONE: rv_included,
    NEAR_ZERO: rv_excluded,
    NEAR_HALF: rv_middle,
    NULL_MODEL: rv_uniform,
    BIMODAL: rv_bimodal,
}
def estimate_alpha_beta(data, n_iter=1000, plot=False):
    """Estimate Beta distribution parameters of ``data`` via MCMC.

    Parameters
    ----------
    data : array-like
        Observed values, expected inside the open interval (0, 1) since the
        Beta likelihood is undefined at exactly 0 or 1.
    n_iter : int
        Number of MCMC samples to draw.
    plot : bool
        If True, show pymc's diagnostic trace plots.

    Returns
    -------
    (float, float)
        Posterior means of the alpha and beta shape parameters.
    """
    # Weakly-informative exponential priors on both shape parameters
    alpha_var = pm.Exponential('alpha', .5)
    beta_var = pm.Exponential('beta', .5)
    observations = pm.Beta('observations', alpha_var, beta_var, value=data,
                           observed=True)
    # BUG FIX: the observed node must be part of the model -- previously only
    # the priors were passed to pm.Model, so the sampler ignored the data
    # entirely and just sampled the priors.
    model = pm.Model([alpha_var, beta_var, observations])
    mcmc = pm.MCMC(model)
    mcmc.sample(n_iter)
    if plot:
        from pymc.Matplot import plot
        plot(mcmc)
        sns.despine()
    alphas = mcmc.trace('alpha')[:]
    betas = mcmc.trace('beta')[:]
    return alphas.mean(), betas.mean()
def estimate_modality(data, n_iter=1000, plot=False):
    """Estimate the modality of ``data`` by MCMC over Beta parameters.

    Every posterior (alpha, beta) sample is classified into a modality via
    ``_assign_modality_from_estimate``, and the per-modality counts across
    all samples are returned.

    Parameters
    ----------
    data : array-like
        Observed values, expected inside the open interval (0, 1).
    n_iter : int
        Number of MCMC samples to draw.
    plot : bool
        If True, show pymc's diagnostic trace plots.

    Returns
    -------
    pandas.Series
        Count of MCMC samples assigned to each modality in MODALITY_ORDER;
        modalities never sampled appear with a count of 0.
    """
    # Weakly-informative exponential priors on both shape parameters
    alpha_var = pm.Exponential('alpha', .5)
    beta_var = pm.Exponential('beta', .5)
    # BUG FIX: the observed data node was commented out, so the sampler
    # explored only the priors and `data` was never used. Condition on it.
    observations = pm.Beta('observations', alpha_var, beta_var, value=data,
                           observed=True)
    model = pm.Model([alpha_var, beta_var, observations])
    mcmc = pm.MCMC(model)
    mcmc.sample(n_iter)
    if plot:
        from pymc.Matplot import plot
        plot(mcmc)
        sns.despine()
    alphas = mcmc.trace('alpha')[:]
    betas = mcmc.trace('beta')[:]
    # Classify every posterior sample and count how often each modality wins
    counter = Counter(_assign_modality_from_estimate(a, b)
                      for a, b in zip(alphas, betas))
    # Ensure every known modality appears in the output, even with count 0
    for modality in MODALITY_ORDER:
        if modality not in counter:
            counter[modality] = 0
    return pd.Series(counter)
def estimate_modality_latent(data, n_iter=1000,
                             model_params=((2, 1), (2, 2), (1, 2),
                                           (.5, .5), (1, 1))):
    """Estimate modality via a latent assignment to candidate Beta models.

    A categorical latent variable picks one of the candidate (alpha, beta)
    pairs; MCMC samples of that assignment are counted per modality.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        Observed values in [0, 1]; exact 0s and 1s are nudged inward since
        the Beta likelihood is undefined at the endpoints.
    n_iter : int
        Number of MCMC samples to draw.
    model_params : sequence of (alpha, beta) pairs
        Candidate Beta shape parameters, ordered to match MODALITY_ORDER.
        (Fixed to a tuple default -- a mutable list default is shared across
        calls.)

    Returns
    -------
    pandas.Series
        Count of MCMC samples assigned to each modality in MODALITY_ORDER;
        modalities never sampled appear with a count of 0.
    """
    # Work on a copy so the caller's data is not silently modified, and
    # nudge exact 0/1 values inside the open interval where Beta is defined
    data = data.copy()
    data[data == 0] = 0.001
    data[data == 1] = 0.999
    # Latent indicator of which candidate model generated the data, with a
    # uniform prior over the candidates
    assignment = pm.Categorical('assignment',
                                [1. / len(model_params)] * len(model_params))
    # Shape parameters are deterministic functions of the current assignment
    alpha_var = pm.Lambda('alpha_var', lambda a=assignment: model_params[a][0])
    beta_var = pm.Lambda('beta_var', lambda a=assignment: model_params[a][1])
    observations = pm.Beta('observations', alpha_var, beta_var, value=data,
                           observed=True)
    model = pm.Model([assignment, observations])
    mcmc = pm.MCMC(model)
    mcmc.sample(n_iter)
    assignment_trace = mcmc.trace('assignment')[:]
    # Count how often each candidate model was selected across the samples
    counter = Counter(MODALITY_ORDER[i] for i in assignment_trace)
    # Ensure every known modality appears in the output, even with count 0
    for modality in MODALITY_ORDER:
        if modality not in counter:
            counter[modality] = 0
    return pd.Series(counter)
class MonteCarloModalities(object):
    """Classify the modality of features via MCMC over a latent assignment
    to candidate Beta models.

    A categorical latent variable picks one of the candidate (alpha, beta)
    pairs; for each feature, MCMC samples of that assignment are counted per
    modality label in MODALITY_ORDER.
    """

    def __init__(self, n_iter=1000, model_params=((2, 1), (2, 2), (1, 2),
                                                  (.5, .5), (1, 1))):
        """Set up the shared latent-assignment nodes.

        Parameters
        ----------
        n_iter : int
            Number of MCMC samples to draw per feature.
        model_params : sequence of (alpha, beta) pairs
            Candidate Beta shape parameters, ordered to match MODALITY_ORDER.
        """
        self.n_iter = n_iter
        # Latent indicator of which candidate model generated the data,
        # with a uniform prior over the candidates
        self.assignment = pm.Categorical('assignment',
                                         [1. / len(model_params)] * len(
                                             model_params))
        # Shape parameters are deterministic functions of the assignment
        self.alpha_var = pm.Lambda('alpha_var',
                                   lambda a=self.assignment: model_params[a][0])
        self.beta_var = pm.Lambda('beta_var',
                                  lambda a=self.assignment: model_params[a][1])

    def fit_single_feature(self, feature, near_0=0.001, near_1=0.999):
        """Estimate the beta parameters of a single feature

        Parameters
        ----------
        feature : pandas.Series
            A (n_samples,) shaped vector of observed values of a feature which
            you desire to predict the beta distribution parameters
        near_0 : float
            The Beta distribution is not defined at exactly 0, and we must
            replace all values equal to 0 with a small number that is near
            zero, which is this number
        near_1 : float
            The Beta distribution is not defined at exactly 1, and we must
            replace all values equal to 1 with 1 minus a small number, which
            is this number

        Returns
        -------
        pandas.Series
            Count of MCMC samples assigned to each modality; modalities
            never sampled appear with a count of 0.
        """
        # Work on a copy so the caller's series is not modified in place
        feature = feature.copy()
        feature[feature == 0] = near_0
        feature[feature == 1] = near_1
        observations = pm.Beta('observations', self.alpha_var, self.beta_var,
                               value=feature, observed=True)
        model = pm.Model([self.assignment, observations])
        mcmc = pm.MCMC(model)
        mcmc.sample(self.n_iter)
        assignment_trace = mcmc.trace('assignment')[:]
        # Count how often each candidate model was selected across samples
        counter = Counter(MODALITY_ORDER[i] for i in assignment_trace)
        # Ensure every known modality appears in the output, even at count 0
        for modality in MODALITY_ORDER:
            if modality not in counter:
                counter[modality] = 0
        return pd.Series(counter)

    def fit(self, data, near_0=0.001, near_1=0.999):
        """Estimate the beta parameters of many features

        Parameters
        ----------
        data : pandas.DataFrame
            A (n_samples, n_features) shaped matrix of observed values of a
            feature which you desire to predict the beta distribution
            parameters of each column (feature)
        near_0 : float
            The Beta distribution is not defined at exactly 0, and we must
            replace all values equal to 0 with a small number that is near
            zero, which is this number
        near_1 : float
            The Beta distribution is not defined at exactly 1, and we must
            replace all values equal to 1 with 1 minus a small number, which
            is this number

        Returns
        -------
        pandas.DataFrame
            Per-column modality counts from ``fit_single_feature``.
        """
        # BUG FIX: this previously called self.estimate_modality_latent,
        # a method that does not exist on this class; fit_single_feature is
        # the per-column worker.
        return data.apply(self.fit_single_feature, near_0=near_0,
                          near_1=near_1)