Bayes Rule Book:

https://www.bayesrulesbook.com/chapter-2.html

Materials from the Bayes Rule github:

https://github.com/bayes-rules/bayesrules

# Imports

In [1]:
import pyreadr, math
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from os.path import exists

# Ch 2 - Bayes Rule

## Fake News

In [2]:
fake_news_data_url = "https://github.com/bayes-rules/bayesrules/raw/master/data/fake_news.rda"

if exists("/Users/zr/Geek/tutorials/bayesian_rules/ch2/fake_news.csv"):
    df = pd.read_csv("/Users/zr/Geek/tutorials/bayesian_rules/ch2/fake_news.csv")
else:
    # pyreadr downloads remote file, saves locally and converts the RDA datafile to a pandas DataFrame
    file_path = "/Users/zr/Geek/tutorials/bayesian_rules/ch2/fake_news.rda"
    pyreadr.download_file(fake_news_data_url, file_path)
    result = pyreadr.read_r(file_path)
    df = result['fake_news']
    df.to_csv("/Users/zr/Geek/tutorials/bayesian_rules/ch2/fake_news.csv")

### The Prior

<img src="images/ch2_tabyl.png" width=300/>

In [3]:
prior = df.groupby(by="type")["type"].count().to_frame("n_records")
prior["pct"] = prior.n_records / len(df)
prior.append(prior.sum().rename("total"))

Unnamed: 0_level_0,n_records,pct
type,Unnamed: 1_level_1,Unnamed: 2_level_1
fake,60.0,0.4
real,90.0,0.6
total,150.0,1.0


### The Likelihood

<img src="images/ch2_has_excl_table.png" width=300/>

In [4]:
likelihood = pd.pivot_table(df, values=["anger"], index=['title_has_excl'], columns=['type'], aggfunc="count", margins=True, margins_name="total")
likelihood.columns = likelihood.columns.droplevel()
likelihood

type,fake,real,total
title_has_excl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,44,88,132
True,16,2,18
total,60,90,150


### Concatenating Prior & Likelihood

<img src="images/ch2_prob_vs_likelihood.png" width=300 />

In [5]:
prior_prob_tbl        = prior.pct.rename("prior probability").to_frame().T
# tbl.index = tbl.title_has_excl
likelihood_tbl        = (likelihood[likelihood.index==True] / likelihood.sum()).rename({True: "likelihood"})
prob_vs_liklihood_tbl = prior_prob_tbl.append(likelihood_tbl)

prob_vs_liklihood_tbl["total"] = prob_vs_liklihood_tbl.sum(axis=1)
prob_vs_liklihood_tbl

Unnamed: 0,fake,real,total
prior probability,0.4,0.6,1.0
likelihood,0.133333,0.011111,0.204444


Quick notes on the above:

- `prior probability`: this is simply $P(fake)$
- `likelihood`: given *title_has_excl*, this measures the likelihood that it came from a fake or real source, e.g. $L(! | fake)$ & $L(! | -fake)$

Interpreting the above table:

The `prior probability` is our frequentist observation of fake|real articles without any data. We then observe that the *title_has_excl*, so we ask ourselves the likelihood that this would occur given that our article was fake vs real. 

The likelihood that we would see an "!" in our title, given our article comes from the fake distribution is far greater than the likelihood we'd see an "!" given our article came from a real distribution *(take that NY Post)*. 

### Solving the joint probability table

<img src="images/joint_prob_table.png" width=300 />

- $B | B^c = fake | real$ 
- $A | A^c = has !| no !$

|     |fake|real|Total|
| --  | -- | -- | --  |
| !   |    |    |     |
|~ !  |    |    |     |
|Total|0.4 | 0.6|    1|

In [6]:
joint_prob_table = likelihood / likelihood.sum().values.sum()
joint_prob_table["Total"] = joint_prob_table.sum(axis=1)
joint_prob_table.columns.name = ""
joint_prob_table = joint_prob_table.rename({False:"No Excl", True:"Has Excl"})
joint_prob_table.append(joint_prob_table.sum(axis=0).to_frame().T.rename({0:"Total"}))

Unnamed: 0,fake,real,total,Total
No Excl,0.073333,0.146667,0.22,0.44
Has Excl,0.026667,0.003333,0.03,0.06
total,0.1,0.15,0.25,0.5
Total,0.2,0.3,0.5,1.0


### Posterior Probability

<img src="images/prior_post_prob.png" width=300 />

In [7]:
def mul_col(col):
    result = 1
    for c in col:
        result *= c
    return result 

posterior_row = (prob_vs_liklihood_tbl.apply(mul_col, axis=0) / (df.title_has_excl.sum() / len(df))).to_frame().T.rename({0:"posterior"})
posterior_row.total = posterior_row.fake + posterior_row.real
prob_vs_liklihood_tbl.append(posterior_row)

Unnamed: 0,fake,real,total
prior probability,0.4,0.6,1.0
likelihood,0.133333,0.011111,0.204444
posterior,0.444444,0.055556,0.5


### Posterior Simulation

In [8]:
# Sampling articles with replacement
articles = df.loc[:, ['type', 'title_has_excl']]
sample = articles.sample(n=10000, weights=np.where(articles.type == "fake", prior.pct.fake, prior.pct.real), replace=True, random_state=711)

In [9]:
px.bar(sample.type.value_counts())

In [10]:
sample_prior = sample.groupby(by="type")["type"].count().to_frame("n_records")
sample_prior["pct"] = sample_prior.n_records / len(sample)
sample_prior.append(sample_prior.sum().rename("total"))

Unnamed: 0_level_0,n_records,pct
type,Unnamed: 1_level_1,Unnamed: 2_level_1
fake,3061.0,0.3061
real,6939.0,0.6939
total,10000.0,1.0


<img src="images/sample_articles_have_excl_tbl.png" width=300 />

In [11]:
sample["value"] = 1
article_sim = pd.pivot_table(sample, values="value", index="title_has_excl", columns="type", aggfunc="count", margins=True, margins_name="Total")
article_sim

type,fake,real,Total
title_has_excl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2260,6790,9050
True,801,149,950
Total,3061,6939,10000


<img src="images/ch2_bar_plot_fake_real_excl.png" width=300 />

In [12]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Bar(x=article_sim.Total[:2].rename({False:"no", True: "yes"}).index, y=article_sim.Total[:2], showlegend=False),
    row=1, col=2
)
fig.add_trace(
    go.Bar(name='yes', x=article_sim.T[:2].drop(columns=["Total"]).index, y=article_sim.T[:2].drop(columns=["Total"])[True]),
    row=1, col=1
)
fig.add_trace(
    go.Bar(name='no', x=article_sim.T[:2].drop(columns=["Total"]).index, y=article_sim.T[:2].drop(columns=["Total"])[False]),
    row=1, col=1
)

fig.update_xaxes(title_text="type", row=1, col=1)
fig.update_xaxes(title_text="usage", row=1, col=2)
fig.update_yaxes(title_text="count", row=1, col=1)
fig.update_yaxes(title_text="count", row=1, col=2)

fig.update_layout(
    barmode='stack',
    legend={
        "yanchor": "top",
        "y": .99,
        "xanchor": "left",
        "x": .01
    }
)

## Pop, Soda, Coke

In [13]:
pop_soda_data_url = "https://github.com/bayes-rules/bayesrules/raw/master/data/pop_vs_soda.rda"

if exists("/Users/zr/Geek/tutorials/bayesian_rules/ch2/pop_vs_soda.csv"):
    df = pd.read_csv("/Users/zr/Geek/tutorials/bayesian_rules/ch2/pop_vs_soda.csv")
else:
    # pyreadr downloads remote file, saves locally and converts the RDA datafile to a pandas DataFrame
    file_path = "/Users/zr/Geek/tutorials/bayesian_rules/ch2/pop_vs_soda.rda"
    pyreadr.download_file(pop_soda_data_url, file_path)
    result = pyreadr.read_r(file_path)
    df = result['pop_vs_soda']
    df.to_csv("/Users/zr/Geek/tutorials/bayesian_rules/ch2/pop_vs_soda.csv")

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,state,region,word_for_cola,pop
0,0,alabama,south,pop,True
1,1,alabama,south,pop,True
2,2,alabama,south,pop,True
3,3,alabama,south,pop,True
4,4,alabama,south,pop,True


<img src="images/prior_sodapop_region_tbl.png" width=300 />

In [15]:
df.region.value_counts().to_frame().T / len(df)

Unnamed: 0,midwest,south,northeast,west
region,0.387698,0.240919,0.206247,0.165136


*Note: there appears to be an error in the book where the pct values are assigned to the incorrect region. I've added an issue to the bok's github here: https://github.com/bayes-rules/bayes-rules.github.io/issues/1*

*If the rest of examples build from this erroneous table, I'll remap the values to conform with the error*

In [16]:
# remapping values to align with error in book
df.region = df.region.replace({"northeast":"midwest", "west":"northeast", "midwest":"south", "south":"west"})

In [17]:
prior = df.region.value_counts().to_frame().T / len(df)
prior.round(2)

Unnamed: 0,south,west,midwest,northeast
region,0.39,0.24,0.21,0.17


<img src="images/pop_per_region_tbl.png" width=300>

In [18]:
likelihood = pd.pivot_table(df, index='pop', columns='region', values='state', aggfunc='count').apply(lambda r: r / r.sum(),axis=0)
likelihood

region,midwest,northeast,south,west
pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.726603,0.705721,0.355296,0.920778
True,0.273397,0.294279,0.644704,0.079222


Note: These are likelihood values, e.g. $L(midwest|pop)=64.47\%$

Finding $P(south|pop)$

$P(south|pop) = \frac{L(south|pop)P(south)}{P(pop)}$

1. Find $L(south|pop)$
2. Find $P(south)$
3. Find $P(pop)$
4. Calculate posterior

In [19]:
likelihood_south_pop = likelihood.south[True]
likelihood_south_pop

0.6447041958427524

In [20]:
prior.south

region    0.387698
Name: south, dtype: float64

Using the **Law of Total Probability** to calculate 

$P(pop) = L(midwest|pop)P(midwest) + L(northeast|pop)P(northeast) + L(west|pop)P(west) + L(south|pop)P(south)$

In [21]:
p_pop = likelihood.midwest[True] * prior.midwest +\
        likelihood.northeast[True] * prior.northeast +\
        likelihood.west[True] * prior.west +\
        likelihood.south[True] * prior.south
p_pop

region    0.37402
dtype: float64

*Note: again the book makes an error due to incorrect rounding of values. They show a $P(pop)$ of .2826*

In [22]:
p_south_given_pop = (likelihood_south_pop * prior.south) / p_pop
p_south_given_pop

region    0.668281
dtype: float64

Apply to each region:

In [23]:
def calc_posterior(col):
    region = col.name
    l = likelihood[region][True]
    p = prior[region].values[0]
    posterior = (l*p) / p_pop
    return posterior

posterior = prior.apply(calc_posterior)

prior.index = ['prior']
posterior.index = ['posterior']

<img src="images/prior_posterior_pop_region_tbl.png" width=300 />

In [24]:
pop_prior_posterior = pd.concat([prior, posterior])[["midwest", "northeast", "south", "west"]]
pop_prior_posterior

Unnamed: 0,midwest,northeast,south,west
prior,0.206247,0.165136,0.387698,0.240919
posterior,0.15076,0.129928,0.668281,0.05103


In [25]:
pop_prior_posterior

Unnamed: 0,midwest,northeast,south,west
prior,0.206247,0.165136,0.387698,0.240919
posterior,0.15076,0.129928,0.668281,0.05103


In [26]:
go.Figure([
    go.Bar(
        x=pop_prior_posterior.columns, 
        y=pop_prior_posterior.loc["prior"], 
        text=(pop_prior_posterior.loc["prior"]*100).round(2), 
        name="prior"
    ),
    go.Bar(
        x=pop_prior_posterior.columns, 
        y=pop_prior_posterior.loc["posterior"], 
        text=(pop_prior_posterior.loc["posterior"]*100).round(2),
        name="posterior"
    )
])

# Kasparov

## Prior

In [27]:
prior = pd.DataFrame([[.2, .5, .8],[.1,.25,.65]], index=["pi", "prior"], columns=["weak", "mid", "strong"])
prior

Unnamed: 0,weak,mid,strong
pi,0.2,0.5,0.8
prior,0.1,0.25,0.65


## Likelihoods, using the Binomial Distribution

<img src="images/bin_probs_graph.png" width=300 />

In [28]:
def binomial(n, p):
    probs = []
    for k in range(n+1):
        # (n choose k)
        n_choose_k = math.factorial(n) / (math.factorial(k)*math.factorial(n - k))
        prob = n_choose_k * p**k * (1 - p)**(n-k)
        probs.append(round(prob, 2))
    return probs

fig = make_subplots(rows=1, cols=3, subplot_titles=[f'pi: {pi}' for pi in prior.loc['pi']])
for i, p in enumerate(prior.loc['pi']):
    fig.add_trace(
        go.Bar(y=binomial(6, p), x=[0,1,2,3,4,5,6], text=binomial(6, p)),
        row=1, col=i+1
    )
fig.update_layout(showlegend=False)

fig.show()

<img src="images/likelihood__kasparov_1_game.png" with=300 />

In [29]:
likelihoods = pd.DataFrame(data=[binomial(6, p) for p in prior.loc['pi']], index=["weak", "mid", "strong"])
likelihoods

Unnamed: 0,0,1,2,3,4,5,6
weak,0.26,0.39,0.25,0.08,0.02,0.0,0.0
mid,0.02,0.09,0.23,0.31,0.23,0.09,0.02
strong,0.0,0.0,0.02,0.08,0.25,0.39,0.26


In [30]:
p_l = prior.append(likelihoods[1].to_frame().T).rename({1:"likelihood"})
p_l

Unnamed: 0,weak,mid,strong
pi,0.2,0.5,0.8
prior,0.1,0.25,0.65
likelihood,0.39,0.09,0.0


## Calculating Posterior

To recenter, we have our *prior* and *likelihood* (k=1), now all we need to do is to calculate our *normalizing constant*, $P(K=1)$

Again, to do this, we sum  $\sum_{.2}^{.8}{L(pi|k=1|)*P(pi)}$

In [31]:
normalizing_constant = (p_l.weak.likelihood * p_l.weak.prior) + (p_l.mid.likelihood * p_l.mid.prior) + (p_l.strong.likelihood * p_l.strong.prior)

In [32]:
p_l = p_l.append(p_l.apply(lambda c: (c.prior * c.likelihood)/normalizing_constant).rename("posterior"))
p_l

Unnamed: 0,weak,mid,strong
pi,0.2,0.5,0.8
prior,0.1,0.25,0.65
likelihood,0.39,0.09,0.0
posterior,0.634146,0.365854,0.0


In [33]:
go.Figure([
    go.Bar(
        x=p_l.columns, 
        y=p_l.loc["prior"], 
        text=(p_l.loc["prior"]*100).round(2), 
        name="prior"
    ),
    go.Bar(
        x=p_l.columns, 
        y=p_l.loc["posterior"], 
        text=(p_l.loc["posterior"]*100).round(2),
        name="posterior"
    )
])

Interpretation: *Whereas originally we'd have thought there was a 65% chance Kasparov was a much stronger player $(pi = .8)$, we have updated our beliefs based on the data that of 6 games, Kasparov only won once, that Kasparov is most probably the lesser player*

How would our model have changed had Kasparov won N games?

In [34]:
def update_on_n_wins(n):
    p_l = prior.append(likelihoods[n].to_frame().T).rename({n:"likelihood"})
    normalizing_constant = (p_l.weak.likelihood * p_l.weak.prior) + (p_l.mid.likelihood * p_l.mid.prior) + (p_l.strong.likelihood * p_l.strong.prior)
    p_l = p_l.append(p_l.apply(lambda c: (c.prior * c.likelihood)/normalizing_constant).rename("posterior"))

    return [
        go.Bar(
            x=p_l.columns, 
            y=p_l.loc["prior"], 
            text=(p_l.loc["prior"]*100).round(2), 
            name="prior"
        ),
        go.Bar(
            x=p_l.columns, 
            y=p_l.loc["posterior"], 
            text=(p_l.loc["posterior"]*100).round(2),
            name="posterior"
        )
    ]


In [35]:
fig = make_subplots(rows=7, cols=1, subplot_titles=[f'wins: {n}' for n in range(7)])
for i in range(7):
    prior_bar, posterior_bar = update_on_n_wins(i)
    fig.add_trace(prior_bar, row=i+1, col=1)
    fig.add_trace(posterior_bar, row=i+1, col=1)

fig.update_layout(height=7*250, showlegend=False)
fig.show()

## Simulation

In [36]:
# sample 10k values of pi with prior weighting
sample = prior.loc['pi'].sample(n=10000, weights=prior.loc['prior'], replace=True)

In [37]:
# Simulate 10k match outcomes given pi
games_won = sample.apply(lambda p: np.random.binomial(7, p, 1)[0])
games_won.name = "games_won"

simulations = pd.concat([sample, games_won], axis=1).reset_index(drop=True)
simulations

Unnamed: 0,pi,games_won
0,0.5,2
1,0.8,6
2,0.5,3
3,0.5,2
4,0.5,3
...,...,...
9995,0.8,6
9996,0.8,6
9997,0.8,7
9998,0.8,5


In [38]:
tbl = pd.concat([simulations.pi.value_counts(),simulations.pi.value_counts() / len(simulations)], axis=1)
tbl.columns = ["n", "pct"]
tbl

Unnamed: 0,n,pct
0.8,6455,0.6455
0.5,2545,0.2545
0.2,1000,0.1


In [39]:
fig = make_subplots(rows=1, cols=3, subplot_titles=(".2", ".5", ".8"))

for i, pi in enumerate([.2,.5,.8]):
    value_counts = simulations.loc[simulations.pi==pi, 'games_won'].value_counts()
    fig.add_trace(go.Bar(x=value_counts.index, y=value_counts), row=1, col=i+1)

fig.show()

In [40]:
px.bar(simulations.loc[simulations.games_won==1, "pi"].value_counts())