Bayes Rules! book, Chapter 14 (Naive Bayes Classification):

https://www.bayesrulesbook.com/chapter-14.html

Materials from the Bayes Rules! GitHub repository:

https://github.com/bayes-rules/bayesrules

# Imports

In [1]:
import math, pyreadr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy.stats import norm, beta, binom, mode
from os.path import exists

import pyro
import torch as t
import pyro.distributions as dist
import pyro.distributions.constraints as constraints
from pyro.infer import MCMC
from pyro.infer.mcmc.nuts import HMC, NUTS

# Run on the GPU when torch can see one, otherwise on the CPU.
use_cuda = t.cuda.is_available()
device = t.device("cuda" if use_cuda else "cpu")
# Newly created tensors default to 32-bit floats (CUDA floats when a GPU is available).
t.set_default_tensor_type(t.cuda.FloatTensor if use_cuda else t.FloatTensor)

# Data

In [2]:
file_name = 'penguins_bayes'
folder = 'ch14'

data_url = f"https://github.com/bayes-rules/bayesrules/raw/master/data/{file_name}.rda"

# Local cache: the downloaded .rda and its .csv conversion share this stem.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
local_stem = f"/Users/zr/Geek/tutorials/bayesian_rules/{folder}/{file_name}"
csv_path = f"{local_stem}.csv"

if not exists(csv_path):
    # pyreadr downloads remote file, saves locally and converts the RDA datafile to a pandas DataFrame
    rda_path = f"{local_stem}.rda"
    pyreadr.download_file(data_url, rda_path)
    pyreadr.read_r(rda_path)[file_name].to_csv(csv_path)

# Always load from the cached CSV so that a first run and every later run see the
# same frame: the CSV round trip adds the saved index as an 'Unnamed: 0' column,
# which the clean-up cell below expects to find.
df = pd.read_csv(csv_path)

In [3]:
# 'Unnamed: 0' is the index column written by to_csv, so it only exists when the
# frame was read back from the cached CSV. errors='ignore' keeps this cell working
# for a freshly downloaded frame and when the cell is re-run.
df = df.drop(columns=['sex', 'Unnamed: 0'], errors='ignore')
# Make the weight flag numeric so it can be summed and compared with 0/1 later on.
df['above_average_weight'] = pd.to_numeric(df['above_average_weight'])

In [4]:
# Bars and labels are both derived from the same counts Series. Building the labels
# from df.species.unique() (order of first appearance) while the bars follow
# value_counts() (descending count) can attach a percentage to the wrong species.
species_counts = df.species.value_counts()
species_share = [f'{count*100/len(df):.2f}%' for count in species_counts]
px.bar(species_counts, text=species_share, title='Number of penguins per species')

In [5]:
def fillna_with_species_median(row, frame=None):
    """Fill the missing entries of one row with the medians of that row's species.

    Parameters
    ----------
    row : pd.Series
        A single observation; must carry a ``species`` entry.
    frame : pd.DataFrame, optional
        Data the medians are computed from. Defaults to the notebook-level ``df``.

    Returns
    -------
    pd.Series
        A copy of ``row`` in which every missing entry is replaced by the median
        of the same column over the rows of ``frame`` that share the row's species.
        The input ``row`` is left untouched.
    """
    if frame is None:
        frame = df
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by ``apply``.
    filled = row.copy()
    same_species = frame.species == row.species
    for col in row[row.isna()].index:
        filled[col] = frame.loc[same_species, col].median()
    return filled

# Only rows that contain at least one missing value are rewritten.
rows_with_na = df.isna().any(axis=1)
df.loc[rows_with_na] = df.loc[rows_with_na].apply(fillna_with_species_median, axis=1)

# Model

## One Categorical

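Suppose we find a penguin that is **below average weight**. The stacked bars below show how many penguins of each species fall below and above the average weight: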

In [6]:
# Per-species number of above-average-weight penguins (sum of the 0/1 flag).
above_avg_counts = df[['species', 'above_average_weight']].groupby('species').sum()
# Per-species total number of penguins.
species_counts = df.species.value_counts()
# Whatever is not above average is below average: total minus above-average count.
leftover = above_avg_counts.apply(lambda s: species_counts[s.name] - s, axis=1)

# One stacked bar per species: below-average at the bottom, above-average on top.
species_axis = above_avg_counts.index
fig = go.Figure()
for label, counts in (('below avg', leftover), ('above avg', above_avg_counts)):
    heights = counts.values.squeeze()
    fig.add_trace(go.Bar(x=species_axis, y=heights, text=heights, name=label))
fig.update_layout(barmode='stack')
fig

We can apply Bayes' rule to each species $s$, where $ba$ denotes the event "the penguin is below average weight":

$P(s \mid ba) = \frac{P(ba \mid s) \, P(s)}{P(ba)}$

In [7]:
def calc_bayes_posteriors(data=None, above_average=0.):
    """Posterior probability of every species given a penguin's weight class.

    Applies Bayes' rule per species ``s`` and weight class ``w``::

        P(s | w) = P(w | s) * P(s) / P(w)

    with every probability estimated as a relative frequency in ``data``.

    Parameters
    ----------
    data : pd.DataFrame, optional
        Frame with ``species`` and ``above_average_weight`` columns.
        Defaults to the notebook-level ``df``.
    above_average : float, optional
        Value of ``above_average_weight`` to condition on. The default ``0.``
        selects the below-average-weight penguins.

    Returns
    -------
    list of tuple
        ``(species, posterior)`` pairs, in the order in which the species first
        appear in ``data``. An empty frame yields an empty list.

    Raises
    ------
    ZeroDivisionError
        If ``data`` is non-empty but contains no row with the requested weight
        class, so that ``P(w)`` is zero and the posterior is undefined.
    """
    if data is None:
        data = df
    n_total = len(data)
    if n_total == 0:
        return []

    in_class = data.above_average_weight == above_average
    n_in_class = int(in_class.sum())
    if n_in_class == 0:
        raise ZeroDivisionError(
            f"no rows with above_average_weight == {above_average}; posterior is undefined"
        )
    # P(w) is the same for every species, so it is computed once outside the loop.
    evidence = n_in_class / n_total

    results = []
    for species in data.species.unique():
        is_species = data.species == species
        n_species = int(is_species.sum())
        prior = n_species / n_total
        likelihood = int((in_class & is_species).sum()) / n_species
        posterior = (likelihood * prior) / evidence
        results.append((species, posterior))
    return results

In [8]:
results = calc_bayes_posteriors()

# Unpack once so both traces share the same species order.
species_names = [name for name, _ in results]
posteriors = [posterior for _, posterior in results]
# Prior of a species = its share of all penguins in the data.
species_sizes = [len(df.loc[df.species==name]) for name in species_names]

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=species_names,
        y=posteriors,
        text=[f'{posterior*100:.2f}%' for posterior in posteriors],
        name='posterior',
    )
)
fig.add_trace(
    go.Bar(
        x=species_names,
        y=[size/len(df) for size in species_sizes],
        text=[f'{size*100/len(df):.2f}%' for size in species_sizes],
        name='prior',
    )
)
fig