## Load and get familiar with the data

Run the following commands to get the data:

```
git clone https://github.com/bsmith89/statsclub_pymc3
cd statsclub_pymc3
make data/clean_data.tsv
```

We'll load this cleaned dataset.

In [None]:
import pandas as pd

# Read the tab-separated table built by the `make data/clean_data.tsv` step.
data = pd.read_csv('data/clean_data.tsv', sep='\t')
# Show (rows, columns) of the loaded table.
data.shape

In [None]:
# Preview the first rows to see which columns the table provides.
data.head()

Let's take a look at the distribution of radon measurements.

In [None]:
import matplotlib.pyplot as plt

# Histogram of the raw radon readings in 100 bins; the returned
# (counts, edges, patches) tuple is discarded into `_`.
_ = plt.hist(data.radon, bins=100)
# Symmetric-log y axis: per the matplotlib docs it is linear around zero, so
# bins with zero counts can still be drawn next to very tall bins.
plt.yscale('symlog')

Clearly a skewed distribution, and bounded at 0.

We would normally think to log-transform these data, but we have a bunch of
observations that were below the detection limit (0.1 of whatever units these measurements are in).

In [None]:
# Smallest non-zero radon value in the table, for comparison with the stated
# 0.1 detection limit.
data.loc[data['radon'] != 0, 'radon'].min()

## Data transformations

We'll replace those zeros with half of the detection limit, purely as a heuristic.

In [None]:
# Start the new column from the raw readings.
data['radon_nonzero'] = data['radon']
# Replace zero readings with 0.05, i.e. half of the 0.1 detection limit.
# This is done as ONE .loc assignment on `data`: the chained form
# `data.radon_nonzero[mask] = ...` assigns into an intermediate Series, which
# triggers SettingWithCopyWarning and leaves `data` unchanged when pandas
# copy-on-write is active.
data.loc[data['radon'] == 0, 'radon_nonzero'] = 0.05

Now we can see that, once log-transformed, these data are approximately normally distributed.

Much better for our purposes.

In [None]:
import numpy as np

# Histogram (100 bins) of the natural log of the zero-adjusted readings.
_ = plt.hist(np.log(data.radon_nonzero), bins=100)

In [None]:
# Natural log of the zero-adjusted readings; this is the response variable
# used by every model below.
data['log_radon'] = np.log(data['radon_nonzero'])

To make our analysis easier to manage, we'll start by just looking at measurements from Minnesota.

In [None]:
# Keep only the Minnesota rows. Take an explicit copy: later cells add columns
# to `d` (predict0, resid0, county_idx), and assigning into a filtered slice
# of `data` raises SettingWithCopyWarning.
d = data[data['state'] == 'MN'].copy()

The first factor that we're going to consider for predicting radon levels is
where the measurement was taken.

We know that radon comes out of the ground, so it makes sense that levels are
higher in basements compared to the rest of a house.

That's approximately what we see.

In [None]:
# Log-radon against the basement indicator for the Minnesota rows; columns are
# looked up by name in `d`, and alpha=0.5 makes overlapping points visible.
plt.scatter('is_basement', 'log_radon', data=d, alpha=0.5)

In [None]:
import seaborn as sns

# Distribution of log-radon for each value of is_basement.
# x and y are passed by keyword: seaborn 0.12+ only accepts `data`
# positionally, so the positional form fails on current releases.
sns.violinplot(x='is_basement', y='log_radon', data=d)

## Classic linear model (Complete pooling)

In [None]:
import patsy

# Build the response (y) and design matrix (x) as DataFrames from the formula.
# x includes patsy's 'Intercept' column in addition to is_basement.
y, x = patsy.dmatrices('log_radon ~ is_basement', data=d, return_type='dataframe')
# n = number of rows (observations), r = number of design-matrix columns.
n, r = x.shape

In [None]:
import statsmodels.api as sm

# Ordinary least squares of log_radon on the design matrix, using all
# Minnesota rows together (no county information).
fit0 = sm.OLS(y, x).fit()

# Display the regression summary table as the cell output.
fit0.summary()

In [None]:
def jitter(x, perc=0.05):
    """Return ``x`` with a small amount of uniform random noise added.

    Useful for scatter plots of discrete values, where identical points
    would otherwise be drawn on top of each other.

    Parameters
    ----------
    x : array-like of numbers
        Values to perturb. Any shape is accepted; NaN entries are ignored
        when measuring the data range and stay NaN in the result.
    perc : float, default 0.05
        Width of the noise band as a fraction of the range of ``x``. Each
        value is shifted by at most ``perc * (max - min) / 2``.

    Returns
    -------
    Same shape as ``x``
        ``x`` plus the noise (the result type is whatever ``x + ndarray``
        produces, e.g. a Series stays a Series).

    Raises
    ------
    ValueError
        If ``x`` is empty, since its range is undefined.
    """
    # NumPy reductions instead of the builtin max/min: vectorised, valid for
    # N-d input, and not dependent on element order when NaN is present.
    span = np.nanmax(x) - np.nanmin(x)
    # Uniform noise on [-0.5, 0.5), drawn with the same shape as the input.
    noise = np.random.random_sample(np.shape(x)) - 0.5
    return x + noise * perc * span

In [None]:
# Pearson residuals against the fitted values. The fitted values are jittered
# horizontally (10% of their range) so residuals sharing a fitted value do not
# all fall on one vertical line.
plt.scatter(jitter(fit0.predict(), 0.1),
            fit0.resid_pearson, s=0.5)

In [None]:
# Store the OLS fitted values next to the observations.
d['predict0'] = fit0.predict()
# Residual = observed - fitted. This matches the sign convention of the
# fit0.resid_pearson values plotted in the previous cell; the earlier
# fitted - observed form mirrored the plot relative to that one.
d['resid0'] = d['log_radon'] - d['predict0']

# Residual distribution for each value of is_basement. x/y are keywords
# because seaborn 0.12+ no longer accepts them positionally.
sns.violinplot(x='is_basement', y='resid0', data=d)

In [None]:
# Observations, with the basement indicator jittered so points do not overlap.
plt.scatter(jitter(d['is_basement']), d['log_radon'], s=1)
# Fitted line between is_basement = 0 and is_basement = 1.
# fit0.params is a label-indexed Series (Intercept first, then the slope), so
# positions are read with .iloc; plain [0] / [1] on such a Series is
# deprecated positional indexing in recent pandas.
plt.plot([0, 1],
         [fit0.params.iloc[0], fit0.params.iloc[0] + fit0.params.iloc[1]],
         color='k')

## Bayesian version of the complete pooling model

In [None]:
import pymc3 as pm
import theano.tensor as tt

# Same linear predictor as the OLS fit above, expressed as a PyMC3 model: one
# coefficient vector shared by all observations.
with pm.Model() as model0:
    # One coefficient per design-matrix column (r columns), stored as an
    # (r, 1) column vector so it can be matrix-multiplied with x. The prior
    # mean is left at pm.Normal's default.
    beta = pm.Normal('beta', sd=10, shape=(r, 1))
    # Observation-noise standard deviation.
    sigma = pm.HalfCauchy('sigma', beta=2)
    
    # Linear predictor x @ beta, shape (n, 1).
    mu = tt.dot(x.values, beta)
    
    # Likelihood of the observed log-radon values.
    obs = pm.Normal('obs', mu=mu, sd=sigma, observed=y)

In [None]:
# Evaluate the model log-probability at its default test point; a finite value
# is a quick sign that the model was specified without shape/support errors.
model0.logp(model0.test_point)

In [None]:
# Sample from model0 with 1000 tuning iterations; all other sampler settings
# are pm.sample's defaults.
with model0:
    trace0 = pm.sample(tune=1000)

In [None]:
# Trace plot for every variable in trace0.
pm.traceplot(trace0)

## No pooling model

In [None]:
# Distinct counties, in order of first appearance in d.
counties = d.state_county.unique()
# County name -> integer code (its position in `counties`).
county_lookup = {county: code for code, county in enumerate(counties)}
# Series.map performs one dictionary lookup per row. Series.replace with a
# dict this large is much slower and, in recent pandas, emits a deprecation
# warning about silently downcasting the object column to integers.
d['county_idx'] = d['state_county'].map(county_lookup)

In [None]:
# x1: observation-level predictor(s). The 'Intercept' column is dropped
# because the county indicator columns in x2 take over the intercept role.
y, x1 = patsy.dmatrices('log_radon ~ is_basement', data=d, return_type='dataframe')
x1.drop(columns=['Intercept'], inplace=True)
n, r1 = x1.shape

# x2: county indicator matrix; '- 1' removes patsy's intercept from the
# formula. r2 is the number of county columns.
x2 = patsy.dmatrix('state_county - 1', data=d, return_type='dataframe')
r2 = x2.shape[1]


# No pooling: every county gets its own intercept (gamma) with the same fixed
# prior, so the county intercepts share no hyperparameters.
with pm.Model() as model1:
    beta = pm.Normal('beta', sd=10, shape=(r1, 1))
    gamma = pm.Normal('gamma', sd=10, shape=(r2, 1))
    sigma = pm.HalfCauchy('sigma', beta=2)
    
    # Basement effect plus the intercept of the county each row belongs to.
    mu = tt.dot(x1.values, beta) + tt.dot(x2.values, gamma)
    
    obs = pm.Normal('obs', mu=mu, sd=sigma, observed=y)

In [None]:
# Sample from the no-pooling model with 1000 tuning iterations.
with model1:
    trace1 = pm.sample(tune=1000)

In [None]:
# Trace plot for beta and sigma only; the county intercepts (gamma) are shown
# separately.
pm.traceplot(trace1, var_names=['beta', 'sigma'])

In [None]:
# Forest plot of the per-county intercepts.
pm.forestplot(trace1, var_names=['gamma'])

## Partial pooling model

In [None]:
# Same design matrices as the no-pooling model: x1 holds is_basement without
# the intercept, x2 holds one indicator column per county.
y, x1 = patsy.dmatrices('log_radon ~ is_basement', data=d, return_type='dataframe')
x1.drop(columns=['Intercept'], inplace=True)
n, r1 = x1.shape

x2 = patsy.dmatrix('state_county - 1', data=d, return_type='dataframe')
r2 = x2.shape[1]


# Partial pooling: the county intercepts share a common prior whose mean and
# standard deviation are themselves estimated from the data.
with pm.Model() as model2:
    beta = pm.Normal('beta', sd=10, shape=(r1, 1))
    
    # Hyperpriors for the distribution of county intercepts.
    gamma_hyper_mean = pm.Normal('gamma_hyper_mean', sd=10)
    gamma_hyper_sd = pm.HalfCauchy('gamma_hyper_sd', beta=2)
    # County intercepts drawn directly from that shared distribution.
    gamma = pm.Normal('gamma', mu=gamma_hyper_mean, sd=gamma_hyper_sd, shape=(r2, 1))
    
    sigma = pm.HalfCauchy('sigma', beta=2)
    
    mu = tt.dot(x1.values, beta) + tt.dot(x2.values, gamma)
    
    obs = pm.Normal('obs', mu=mu, sd=sigma, observed=y)

In [None]:
# Sample from the partial-pooling model with 1000 tuning iterations.
with model2:
    trace2 = pm.sample(tune=1000)

In [None]:
# Trace plot for the scalar parameters and hyperparameters (gamma omitted).
pm.traceplot(trace2, var_names=['beta', 'sigma', 'gamma_hyper_mean', 'gamma_hyper_sd'])

## Reparameterize partial pooling model

In [None]:
# Same design matrices as the previous models.
y, x1 = patsy.dmatrices('log_radon ~ is_basement', data=d, return_type='dataframe')
x1.drop(columns=['Intercept'], inplace=True)
n, r1 = x1.shape

x2 = patsy.dmatrix('state_county - 1', data=d, return_type='dataframe')
r2 = x2.shape[1]


# Partial pooling again, but with the county intercepts re-expressed as
# hyper-mean + hyper-sd * (auxiliary variable) instead of being sampled
# directly from Normal(hyper-mean, hyper-sd).
with pm.Model() as model3:
    beta = pm.Normal('beta', sd=10, shape=(r1, 1))
    
    gamma_hyper_mean = pm.Normal('gamma_hyper_mean', sd=10)
    gamma_hyper_sd = pm.HalfCauchy('gamma_hyper_sd', beta=2)
    # Auxiliary per-county variable with pm.Normal's default location and
    # scale (a standard normal per the PyMC3 docs).
    gamma_ = pm.Normal('gamma_', shape=(r2, 1))
    # Shift and scale the auxiliary variable; recorded in the trace as 'gamma'.
    gamma = pm.Deterministic('gamma', gamma_hyper_mean + gamma_hyper_sd * gamma_)
    
    sigma = pm.HalfCauchy('sigma', beta=2)

    mu = tt.dot(x1.values, beta) + tt.dot(x2.values, gamma)
    
    obs = pm.Normal('obs', mu=mu, sd=sigma, observed=y)

In [None]:
# Sample from the reparameterized model with 1000 tuning iterations.
with model3:
    trace3 = pm.sample(tune=1000)

In [None]:
# Trace plot for the same scalar parameters as for trace2, for comparison.
pm.traceplot(trace3, var_names=['beta', 'sigma', 'gamma_hyper_mean', 'gamma_hyper_sd'])

## Add county-level uranium as predictor

In [None]:
# Same design matrices as the previous models.
y, x1 = patsy.dmatrices('log_radon ~ is_basement', data=d, return_type='dataframe')
x1.drop(columns=['Intercept'], inplace=True)
n, r1 = x1.shape

x2 = patsy.dmatrix('state_county - 1', data=d, return_type='dataframe')
r2 = x2.shape[1]

# u: county-level uranium values, one row per distinct
# (state_county, county_uranium) pair.
# NOTE(review): presumably county_uranium is constant within a county, giving
# exactly one row per county — the assert below would fail otherwise.
u = d[['state_county', 'county_uranium']].drop_duplicates()
# Rename the counties to patsy's column labels ('state_county[<name>]') so u
# can be reordered to match the columns of x2.
u['state_county'] = u.state_county.map(lambda x: 'state_county[' + x + ']')
u = u.set_index('state_county')
u = u.loc[x2.columns]

# Row i of u must describe the county in column i of x2.
assert np.all(u.index == x2.columns)


# Partial pooling where the expected county intercept is a linear function of
# the county's uranium value.
with pm.Model() as model4:
    beta = pm.Normal('beta', sd=10, shape=(r1, 1))
    
    # County-level regression: intercept and slope on uranium.
    gamma_hyper_beta0 = pm.Normal('gamma_hyper_beta0', sd=10)
    gamma_hyper_beta = pm.Normal('gamma_hyper_beta', sd=10)
    # Expected intercept of each county given its uranium value.
    gamma_hyper_mu = pm.Deterministic('gamma_hyper_mu', gamma_hyper_beta0 + gamma_hyper_beta * u.values)
    
    gamma_hyper_sd = pm.HalfCauchy('gamma_hyper_sd', beta=2)
    # Auxiliary per-county variable (pm.Normal defaults), scaled and shifted
    # below to give the county intercepts.
    gamma_ = pm.Normal('gamma_', shape=(r2, 1))
    
    gamma = pm.Deterministic('gamma', gamma_hyper_mu + gamma_ * gamma_hyper_sd)
    
    sigma = pm.HalfCauchy('sigma', beta=2)
    
    mu = tt.dot(x1.values, beta) + tt.dot(x2.values, gamma)
    
    obs = pm.Normal('obs', mu=mu, sd=sigma, observed=y)

In [None]:
# Sample from the uranium model with 1000 tuning iterations.
with model4:
    trace4 = pm.sample(tune=1000)

In [None]:
# Trace plot for the scalar parameters; the per-county vectors are omitted.
pm.traceplot(trace4,
             var_names=['beta', 'gamma_hyper_beta0',
                        'gamma_hyper_beta', 'gamma_hyper_sd',
                        'sigma'])

In [None]:
# Forest plot of the per-county intercepts under the uranium model.
pm.forestplot(trace4, var_names=['gamma'])

In [None]:
# Transpose the sampled gamma_hyper_mu array and take its first slice.
# Assumes the samples are shaped (draw, county, 1), which would make the
# result (county, draw) — TODO confirm.
trace4.gamma_hyper_mu.T[0]

In [None]:
quantiles = [0.05, 0.25, 0.5, 0.75, 0.95]
# The rows of gamma and gamma_hyper_mu follow the column order of x2 (u was
# aligned to x2.columns when the model was built). `counties` is in order of
# first appearance in d, which is not guaranteed to be the same order, so the
# row labels are taken from x2.columns, with patsy's 'state_county[...]'
# wrapper stripped to recover the plain county names.
county_labels = [col[len('state_county['):-1] for col in x2.columns]
# Posterior quantiles of the uranium-predicted county intercept
# (one row per county, one column per quantile).
gamma_dist_prd = pd.DataFrame(np.quantile(trace4.gamma_hyper_mu, quantiles, axis=0).T[0],
                              columns=quantiles, index=county_labels)
# Posterior quantiles of the estimated county intercept.
gamma_dist_obs = pd.DataFrame(np.quantile(trace4.gamma, quantiles, axis=0).T[0],
                              columns=quantiles, index=county_labels)

In [None]:
# Posterior median of each county intercept: uranium-predicted (x axis)
# against estimated (y axis).
plt.scatter(gamma_dist_prd[0.5], gamma_dist_obs[0.5])
# Vertical bars: 25%-75% posterior interval of the estimated intercept.
plt.vlines(gamma_dist_prd[0.5], gamma_dist_obs[0.25], gamma_dist_obs[0.75], lw=0.5)
# Horizontal bars: 25%-75% posterior interval of the predicted intercept.
plt.hlines(gamma_dist_obs[0.5], gamma_dist_prd[0.25], gamma_dist_prd[0.75], lw=0.5)

## Add a random slope term by county without pooling

In [None]:
# Same design matrices and county-level uranium table as the previous model.
y, x1 = patsy.dmatrices('log_radon ~ is_basement', data=d, return_type='dataframe')
x1.drop(columns=['Intercept'], inplace=True)
n, r1 = x1.shape

x2 = patsy.dmatrix('state_county - 1', data=d, return_type='dataframe')
r2 = x2.shape[1]

# County uranium values, relabelled and reordered to match the columns of x2.
u = d[['state_county', 'county_uranium']].drop_duplicates()
u['state_county'] = u.state_county.map(lambda x: 'state_county[' + x + ']')
u = u.set_index('state_county')
u = u.loc[x2.columns]

# Row i of u must describe the county in column i of x2.
assert np.all(u.index == x2.columns)


# Adds a county-specific slope term (kappa) for is_basement on top of the
# county-specific intercept (gamma). Both are built the same way: a linear
# function of county uranium plus a scaled auxiliary per-county variable.
with pm.Model() as model5:
    beta = pm.Normal('beta', sd=10, shape=(r1, 1))
    
    # County intercepts.
    gamma_hyper_beta0 = pm.Normal('gamma_hyper_beta0', sd=10)
    gamma_hyper_beta = pm.Normal('gamma_hyper_beta', sd=10)
    gamma_hyper_mu = pm.Deterministic('gamma_hyper_mu', gamma_hyper_beta0 + gamma_hyper_beta * u.values)
    gamma_hyper_sd = pm.HalfCauchy('gamma_hyper_sd', beta=2)
    gamma_ = pm.Normal('gamma_', shape=(r2, 1))
    gamma = pm.Deterministic('gamma', gamma_hyper_mu + gamma_ * gamma_hyper_sd)
    
    # County slope adjustments for is_basement.
    kappa_hyper_beta0 = pm.Normal('kappa_hyper_beta0', sd=10)
    kappa_hyper_beta = pm.Normal('kappa_hyper_beta', sd=10)
    kappa_hyper_mu = pm.Deterministic('kappa_hyper_mu', kappa_hyper_beta0 + kappa_hyper_beta * u.values)
    kappa_hyper_sd = pm.HalfCauchy('kappa_hyper_sd', beta=2)
    kappa_ = pm.Normal('kappa_', shape=(r2, 1))
    kappa = pm.Deterministic('kappa', kappa_hyper_mu + kappa_ * kappa_hyper_sd)
    
    # x1.values * x2.values multiplies each county indicator by the row's
    # is_basement value, so kappa only contributes where is_basement is
    # non-zero. Assumes x1 has a single column so the product broadcasts to
    # (n, r2) — TODO confirm.
    # NOTE(review): beta and kappa_hyper_beta0 both act as an overall
    # is_basement slope; check whether both are intended.
    mu = tt.dot(x1.values, beta) + tt.dot(x2.values, gamma) + tt.dot(x1.values * x2.values, kappa)

    sigma = pm.HalfCauchy('sigma', beta=2)
    obs = pm.Normal('obs', mu=mu, sd=sigma, observed=y)

In [None]:
# Sample from the county-slope model with 1000 tuning iterations.
with model5:
    trace5 = pm.sample(tune=1000)

In [None]:
# Trace plot for the scalar parameters of both the intercept (gamma_*) and
# slope (kappa_*) county-level regressions.
pm.traceplot(trace5, var_names=['beta', 'gamma_hyper_beta0',
                                'gamma_hyper_beta', 'gamma_hyper_sd',
                                'sigma', 'kappa_hyper_beta0',
                                'kappa_hyper_beta', 'kappa_hyper_sd'])

In [None]:
# Forest plot of the per-county slope adjustments.
pm.forestplot(trace5, var_names=['kappa'])

In [None]:
# Forest plot of the per-county intercepts.
pm.forestplot(trace5, var_names=['gamma'])

In [None]:
# List the variable names recorded in the trace.
trace5.varnames

In [None]:
# Fit model5 with pm.fit at its default settings, as an alternative to
# sampling. NOTE(review): the variable name suggests ADVI, which is pm.fit's
# default method per the PyMC3 docs — confirm for the installed version.
with model5:
    advi5 = pm.fit()