debug additional rate models, to be included in rate model chapter of book
1 parent 18562a3 commit b2dc69203a37e1cc142b46677e7926a7561e6fdc @aflaxman committed Feb 29, 2012
Showing with 53 additions and 32 deletions.
  1. +10 −9 book/theory-rate_model.tex
  2. +19 −21 rate_model.py
  3. +24 −2 tests/validate_rate_model.py
book/theory-rate_model.tex
@@ -2,7 +2,7 @@ \chapter{Statistical Models for Rates, Ratios, and Durations}
\label{theory-rate_model}
With a solid development of the model of process that will be used to
-represent the consistency of varied epidemiological rates as disease
+represent the consistency between related parameters as disease
moves through a population, I now turn to modeling the
epidemiological rate data collected in a systematic review of the
available data. There are three parts to this model, treated in turn:
@@ -11,7 +11,7 @@ \chapter{Statistical Models for Rates, Ratios, and Durations}
variation.
A useful theoretical framework to guide the development of
-meta-regression techniques is to consider how the model would proceed
+a meta-regression technique is to consider how the model would proceed
if, for each and every study identified in systematic review, complete
microdata were available. Of course, it is unusual that microdata are
available for even \emph{one} study from the systematic review. But
@@ -55,12 +55,13 @@ \chapter{Statistical Models for Rates, Ratios, and Durations}
The forest plot in Figure~\ref{rate-model-schiz-forest} shows the
results of combining $16$ studies using $7$ different data models. As
-the figure demonstrates, the choice of data model can have a huge
-effect on the estimated uncertainty, and can have a noticeable effect
-on the estimated median as well. The models I displayed produce point
-estimates ranging from $1.2$ to $4.0$ per thousand, and uncertainty
-intervals with widths ranging from $0.1$ to $2.9$. Clearly, the
-choice of data model influences the estimate.
+the figure demonstrates, the choice of data model can have a
+substantive effect on the estimated uncertainty, and can have a
+noticeable effect on the estimated median as well. The models I
+displayed produce point estimates ranging from $1.2$ to $4.0$ per
+thousand, and uncertainty intervals with widths ranging from $0.1$ to
+$2.9$. When analyzing sparse and noisy data, the choice of the data model
+matters.
\begin{figure}[h]
\begin{center}
@@ -221,7 +222,7 @@ \section{Beta-binomial model}
\dens(\pi\given \alpha, \beta)
\propto \pi^{\alpha-1}(1-\pi)^{\beta-1}
\]
-and has a high degree of flexibility. It also always takes values
+and has a high degree of flexibility. It always takes values
between zero and one, making it an appropriate distribution for a
probability. Figure~\ref{rate-model-beta} shows the probability
density of the beta distribution for several combinations of $\alpha$
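
(A minimal sketch for reproducing a density plot along the lines of Figure~\ref{rate-model-beta}, using scipy and matplotlib rather than the book's own code; the (alpha, beta) pairs below are illustrative assumptions, not necessarily the ones in the figure.)

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

pi_grid = np.linspace(0.001, 0.999, 500)
for alpha, beta in [(.5, .5), (1., 1.), (2., 5.), (10., 10.)]:
    # beta density, proportional to pi**(alpha-1) * (1-pi)**(beta-1)
    plt.plot(pi_grid, stats.beta.pdf(pi_grid, alpha, beta),
             label='alpha=%s, beta=%s' % (alpha, beta))
plt.xlabel('pi')
plt.ylabel('density')
plt.legend()
plt.show()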
rate_model.py
@@ -26,9 +26,9 @@ def binom(name, pi, p, n):
def p_obs(value=p, pi=pi, n=n):
return mc.binomial_like(value*n, n, pi)
- # for any observation with n=0, make predictions for n=1.e10, to use for predictive validity
+ # for any observation with n=0, make predictions for n=1e6, to use for predictive validity
n_nonzero = pl.array(n.copy(), dtype=int)
- n_nonzero[n==0] = 1e10
+ n_nonzero[n==0] = 1e6
@mc.deterministic(name='p_pred_%s'%name)
def p_pred(pi=pi, n=n_nonzero):
return mc.rbinomial(n, pi) / (1.*n)
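
(A quick numpy check of the n=0 placeholder idea, with np.random.binomial standing in for mc.rbinomial: at the new placeholder of 1e6, the predictive draw X/n is already tightly concentrated around pi, so little is lost relative to 1e10, and the sample size stays comfortably inside integer-sampler ranges.)

import numpy as np

pi, n = 2e-3, int(1e6)
draws = np.random.binomial(n, pi, size=1000) / float(n)
# mean ~ 2e-3; std ~ sqrt(pi*(1-pi)/n) ~ 4.5e-5, about 2% relative noise
print(draws.mean(), draws.std())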
@@ -54,26 +54,23 @@ def beta_binom(name, pi, p, n):
assert pl.all(p >= 0), 'observed values must be non-negative'
assert pl.all(n >= 0), 'effective sample size must be non-negative'
+ p_n = mc.Uniform('p_n_%s'%name, lower=1.e4, upper=1.e9, value=1.e4) # convergence requires getting these bounds right
+ pi_latent = [mc.Beta('pi_latent_%s_%d'%(name,i), pi[i]*p_n, (1-pi[i])*p_n, value=pi_i) for i, pi_i in enumerate(pi.value)]
- p_alpha = mc.Uninformative('p_alpha_%s'%name, value=1.)
- @mc.deterministic(name='p_beta_%s'%name)
- def p_beta(p_alpha=p_alpha, pi=pi):
- return p_alpha * (1-pi) / pi
-
- pi_latent = [mc.Beta('pi_latent_%s_%d'%(name,i), p_alpha, p_beta[i], value=pi_i) for i, pi_i in enumerate(pi.value)]
-
+ i_nonzero = (n!=0.)
@mc.observed(name='p_obs_%s'%name)
def p_obs(value=p, pi=pi_latent, n=n):
- return mc.binomial_like(value*n, n, pi)
+ pi_flat = pl.array(pi)
+ return mc.binomial_like((value*n)[i_nonzero], n[i_nonzero], pi_flat[i_nonzero])
- # for any observation with n=0, make predictions for n=1.e10, to use for predictive validity
+ # for any observation with n=0, make predictions for n=1.e6, to use for predictive validity
n_nonzero = pl.array(n.copy(), dtype=int)
- n_nonzero[n==0] = 1e10
+ n_nonzero[n==0] = 1.e6
@mc.deterministic(name='p_pred_%s'%name)
def p_pred(pi=pi_latent, n=n_nonzero):
return mc.rbinomial(n, pi) / (1.*n)
- return dict(p_alpha=p_alpha, p_beta=p_beta, pi_latent=pi_latent, p_obs=p_obs, p_pred=p_pred)
+ return dict(p_n=p_n, pi_latent=pi_latent, p_obs=p_obs, p_pred=p_pred)
def poisson(name, pi, p, n):
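
(On the reparameterization in the hunk above: the old code set p_beta = p_alpha*(1-pi)/pi, the new code writes the beta parameters as alpha = pi*p_n and beta = (1-pi)*p_n. Both give the latent rate mean pi, but p_n now acts as an explicit prior sample size controlling overdispersion, with the bounded range the comment says convergence requires. A quick numpy check of those moments, with illustrative values:)

import numpy as np

pi, p_n = 0.03, 1.e4
samples = np.random.beta(pi * p_n, (1. - pi) * p_n, size=100000)
print(samples.mean())                              # ~ 0.03, i.e. pi
print(samples.var(), pi * (1. - pi) / (p_n + 1.))  # both ~ 2.9e-6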
@@ -94,13 +91,14 @@ def poisson(name, pi, p, n):
assert pl.all(p >= 0), 'observed values must be non-negative'
assert pl.all(n >= 0), 'effective sample size must be non-negative'
+ i_nonzero = (n!=0.)
@mc.observed(name='p_obs_%s'%name)
def p_obs(value=p, pi=pi, n=n):
- return mc.poisson_like(value*n, (pi*n).clip(1.e-9,pl.inf))
+ return mc.poisson_like((value*n)[i_nonzero], (pi*n)[i_nonzero])
- # for any observation with n=0, make predictions for n=1.e10, to use for predictive validity
+ # for any observation with n=0, make predictions for n=1.e6, to use for predictive validity
n_nonzero = pl.array(n.copy(), dtype=float)
- n_nonzero[n==0.] = 1.e10
+ n_nonzero[n==0.] = 1.e6
@mc.deterministic(name='p_pred_%s'%name)
def p_pred(pi=pi, n=n_nonzero):
return mc.rpoisson((pi*n).clip(1.e-9, pl.inf)) / (1.*n)
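
(The changed likelihood masks out rows with n == 0 instead of clipping the Poisson mean at 1e-9; such rows carry no count information about pi. The same idiom in plain numpy/scipy, with made-up data:)

import numpy as np
from scipy import stats

p = np.array([1e-3, 2e-3, 0.])   # observed rates
n = np.array([1e5, 5e4, 0.])     # effective sample sizes; last row is empty
pi = 1.5e-3                       # a candidate rate level
i_nonzero = (n != 0.)
# only the informative rows enter the log-likelihood
print(stats.poisson.logpmf((p * n)[i_nonzero], (pi * n)[i_nonzero]).sum())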
@@ -251,21 +249,21 @@ def offset_log_normal(name, pi, sigma, p, s):
Returns dict of PyMC objects, including 'p_obs' and 'p_pred'
the observed stochastic likelihood and data predicted stochastic
"""
- assert pl.all(p > 0), 'observed values must be positive'
+ assert pl.all(p >= 0), 'observed values must be non-negative'
assert pl.all(s >= 0), 'standard error must be non-negative'
- p_zeta = mc.Uniform('p_zeta_%s'%name, 1.e-9, 1.e9, value=1.e-6)
+ p_zeta = mc.Uniform('p_zeta_%s'%name, 1.e-9, 10., value=1.e-6)
i_inf = pl.isinf(s)
@mc.observed(name='p_obs_%s'%name)
- def p_obs(value=p, pi=pi, sigma=sigma, s=s/p, p_zeta=p_zeta):
+ def p_obs(value=p, pi=pi, sigma=sigma, s=s, p_zeta=p_zeta):
return mc.normal_like(pl.log(p[~i_inf]+p_zeta), pl.log(pi[~i_inf]+p_zeta),
- 1./(sigma**2. + (s[~i_inf]/(p[~i_inf]+p_zeta))**2.))
+ 1./(sigma**2. + (s/(p+p_zeta))[~i_inf]**2.))
s_noninf = s.copy()
s_noninf[i_inf] = 0.
@mc.deterministic(name='p_pred_%s'%name)
def p_pred(pi=pi, sigma=sigma, s=s_noninf, p_zeta=p_zeta):
- return pl.exp(mc.rnormal(pl.log(pi+p_zeta), 1./(sigma**2. + (s/(pi+p_zeta))**2)) - p_zeta)
+ return pl.exp(mc.rnormal(pl.log(pi+p_zeta), 1./(sigma**2. + (s/(pi+p_zeta))**2.))) - p_zeta
return dict(p_zeta=p_zeta, p_obs=p_obs, p_pred=p_pred)
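
(The p_pred fix matters because the likelihood models log(p + zeta) as normal around log(pi + zeta), so predictions must invert that transform with exp(.) - zeta; the old code computed exp(. - zeta). A round-trip check in numpy, with illustrative values:)

import numpy as np

pi, sigma, zeta = 4e-3, 0.2, 1e-6
z = np.random.normal(np.log(pi + zeta), sigma, size=100000)
p_pred = np.exp(z) - zeta
print(np.median(p_pred))   # ~ 4e-3: the transform round-trips to pi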
tests/validate_rate_model.py
@@ -25,14 +25,35 @@
pl.seterr('ignore')
-def validate_rate_model(rate_type='neg_binom', replicate=0):
+def validate_rate_model(rate_type='neg_binom', data_type='epilepsy', replicate=0):
# set random seed for reproducibility
mc.np.random.seed(1234567 + replicate)
# load epilepsy data
model = dismod3.data.load('/home/j/Project/dismod/output/dm-32377/')
data = model.get_data('p')
+
+ #data = data.ix[:20, :]
+ # replace data with synthetic data if requested
+ if data_type == 'epilepsy':
+ # no replacement needed
+ pass
+ elif data_type == 'binom':
+ N = 1.e6
+ data['effective_sample_size'] = N
+ mu = data['value'].mean()
+ data['value'] = mc.rbinomial(N, mu, size=len(data.index)) / N
+
+ elif data_type == 'poisson':
+ N = 1.e6
+ data['effective_sample_size'] = N
+ mu = data['value'].mean()
+ data['value'] = mc.rpoisson(N*mu, size=len(data.index)) / N
+
+ else:
+ raise TypeError, 'Unknown data type "%s"' % data_type
+
# sample prevalence data
i_test = mc.rbernoulli(.25, size=len(data.index))
i_nan = pl.isnan(data['effective_sample_size'])
@@ -47,7 +68,7 @@ def validate_rate_model(rate_type='neg_binom', replicate=0):
data['effective_sample_size'][i_test] = 0.
data['value'] = pl.maximum(data['value'], 1.e-12)
-
+
model.input_data = data
@@ -68,6 +89,7 @@ def validate_rate_model(rate_type='neg_binom', replicate=0):
# fit model
dismod3.fit.fit_asr(model, 'p', iter=20000, thin=10, burn=10000)
+ #dismod3.fit.fit_asr(model, 'p', iter=100, thin=1, burn=0)
# compare estimate to hold-out
data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
