Bayes Rules! book (Chapter 8):

https://www.bayesrulesbook.com/chapter-8.html

Materials from the Bayes Rules! GitHub repository:

https://github.com/bayes-rules/bayesrules

# Imports

In [42]:
import math, pyreadr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy.stats import norm, beta, binom, mode
from os.path import exists

import pyro
import torch as t
import pyro.distributions as pyro_dist
from pyro.infer import MCMC
from pyro.infer.mcmc.nuts import HMC

# MOMA sample

In [6]:
# Load the MoMA sample, keeping a CSV cache so the remote file is only downloaded once.
moma_data = "https://github.com/bayes-rules/bayesrules/raw/master/data/moma_sample.rda"

# Paths are relative to the notebook's working directory so the cell runs on any
# machine instead of depending on one user's home directory.
csv_path = "moma_sample.csv"
rda_path = "moma_sample.rda"

if exists(csv_path):
    df = pd.read_csv(csv_path)
else:
    # pyreadr downloads remote file, saves locally and converts the RDA datafile to a pandas DataFrame
    file_path = rda_path
    pyreadr.download_file(moma_data, file_path)
    result = pyreadr.read_r(file_path)
    df = result['moma_sample']
    # index=False: otherwise the cached CSV gains an extra "Unnamed: 0" column on
    # reload, so the cached frame would differ from the freshly downloaded one.
    df.to_csv(csv_path, index=False)

In [17]:
# Number of records flagged `genx` in the sample, shown as a float
df["genx"].sum().astype(float)

14.0

In [18]:
# Seed Pyro/torch so the sampled chain (and every number derived from it below)
# is reproducible across kernel restarts. The value itself is arbitrary.
pyro.set_rng_seed(84735)

# Binomial trial count used by the model.
# NOTE(review): presumably the number of rows in the MoMA sample — confirm against len(df).
N_TRIALS = 100

# Observed `genx` count, built once here instead of being recomputed from the
# DataFrame on every evaluation of the model during sampling.
genx_observed = t.tensor([df.genx.sum().astype(float)])


def model():
    """Beta-Binomial model for the proportion `p` of `genx` records.

    Prior:      p ~ Beta(4, 6)
    Likelihood: obs | p ~ Binomial(N_TRIALS, p), conditioned on `genx_observed`.
    """
    p = pyro.sample('p', pyro_dist.Beta(4,6))
    pyro.sample('obs', pyro_dist.Binomial(N_TRIALS, p), obs=genx_observed)


kernel = HMC(model, step_size=.9, num_steps=4)
# 1000 retained draws after 250 warm-up iterations.
mcmc = MCMC(kernel, num_samples=1000, warmup_steps=250)
mcmc.run()
mcmc.summary()

Sample: 100%|██████████| 1250/1250 [00:08, 147.83it/s, step size=7.95e-01, acc. prob=0.955]


                mean       std    median      5.0%     95.0%     n_eff     r_hat
         p      0.16      0.04      0.16      0.10      0.22   4064.12      1.00

Number of divergences: 0





In [43]:
# Most frequent posterior draw of p after rounding to 3 decimals
p_draws = mcmc.get_samples()['p'].detach().numpy()
mode(p_draws.round(3))

ModeResult(mode=array([0.147]), count=array([18]))

In [35]:
# Posterior draws of p from the MCMC run, as a NumPy array (reused by later cells).
data = mcmc.get_samples()['p'].detach().numpy()

# Local generator: reproducible draws without touching NumPy's global random state.
rng = np.random.default_rng(84735)
n_draws = len(data)

# Observed successes and the trial count used by the model above.
genx_count = int(df.genx.sum())
n_trials = 100

prior = rng.beta(4, 6, n_draws)

# As a function of pi, the Binomial likelihood pi^y * (1 - pi)^(n - y) normalises to a
# Beta(y + 1, n - y + 1) density, so draws from that Beta trace the scaled likelihood.
# The earlier version sampled Binomial(100, 0.16) / 100, i.e. the sampling distribution
# of the proportion at the posterior mean, which is not the likelihood and is centred
# on the posterior rather than on the observed proportion.
likelihood = rng.beta(genx_count + 1, n_trials - genx_count + 1, n_draws)

ff.create_distplot([prior, likelihood, data], ['prior', 'likelihood', 'mcmc-post'], show_rug=False, bin_size=.01)

In [47]:
# Posterior probability that pi < 0.2: the share of MCMC draws below that cutoff
d = pd.Series(data)
int((d < .2).sum()) / d.size

0.853

## Posterior Prediction

In [52]:
# Upper end of the interval (22%) applied to 20 new records
20 * 22 / 100

4.4

Scenario:

We are handed 20 new records. How many of these records would we predict are genx? 

Using the results from above

- Mean=0.16, ~3 people
- 90% CI (5% ... 95%) = 0.10 ... 0.22, ~2 - 4 people

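Or we can simulate the results by plugging each posterior draw of $\pi$ into a $\text{Binomial}(20, \pi)$ and drawing one predicted count for each; together these draws approximate the posterior predictive distribution.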

In [59]:
# One simulated count out of 20 trials for every posterior draw of pi
preds = [np.random.binomial(n=20, p=pi_draw) for pi_draw in data]

In [60]:
# Bar chart of how often each predicted count occurs among the simulated draws
pred_counts = pd.Series(preds).value_counts()
px.bar(pred_counts)

In [58]:
# Sanity check: `preds` holds one simulated count per element of `data`
len(preds)

1000