*Bayes Rules!* book, Chapter 14 (Naive Bayes Classification):

https://www.bayesrulesbook.com/chapter-14.html

Materials from the *Bayes Rules!* GitHub repository:

https://github.com/bayes-rules/bayesrules

# Imports

In [1]:
import math, pyreadr
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from scipy.stats import norm, beta, binom, mode
from os.path import exists

import pyro
import torch as t
import pyro.distributions as dist
import pyro.distributions.constraints as constraints
from pyro.infer import MCMC
from pyro.infer.mcmc.nuts import HMC, NUTS

# Pick the compute device once, then make float32 tensors on that device the default.
use_cuda = t.cuda.is_available()
device = t.device("cuda" if use_cuda else "cpu")
t.set_default_tensor_type(t.cuda.FloatTensor if use_cuda else t.FloatTensor)

# Data

In [2]:
# Dataset name and the chapter folder it is cached under.
file_name = 'penguins_bayes'
folder = 'ch14'

data_url = f"https://github.com/bayes-rules/bayesrules/raw/master/data/{file_name}.rda"
local_stem = f"/Users/zr/Geek/tutorials/bayesian_rules/{folder}/{file_name}"
csv_path = f"{local_stem}.csv"

if not exists(csv_path):
    # No cached CSV yet: pyreadr downloads the remote RDA file, saves it locally and
    # converts it to a pandas DataFrame, which is then cached as CSV for later runs.
    file_path = f"{local_stem}.rda"
    pyreadr.download_file(data_url, file_path)
    result = pyreadr.read_r(file_path)
    df = result[file_name]
    df.to_csv(csv_path)
else:
    df = pd.read_csv(csv_path)

In [3]:
# Drop the columns the analysis does not use. 'Unnamed: 0' is only present when the
# frame was re-read from the cached CSV (the cache is written with its index), so
# missing columns are ignored instead of raising KeyError on the very first run,
# when the frame comes straight from pyreadr.
df = df.drop(columns=['sex', 'Unnamed: 0'], errors='ignore')
# The class indicator must be numeric: later cells sum it and compare it to 0.
df.above_average_weight = pd.to_numeric(df.above_average_weight)

In [4]:
# Bar chart of penguins per species, each bar labelled with that species' share of the data.
# The labels are derived from the same value_counts() Series that is plotted, so every
# percentage is guaranteed to sit on its own bar: value_counts() orders by frequency while
# unique() orders by first appearance, and those two orders are not required to agree.
species_totals = df.species.value_counts()
px.bar(species_totals, text=[f'{total*100/len(df):.2f}%' for total in species_totals])

In [5]:
def fillna_with_species_median(row, frame=None):
    """Return a copy of `row` with each missing value replaced by its species' median.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain a `species` entry.
    frame : pandas.DataFrame, optional
        Data the per-species medians are computed from. Defaults to the
        notebook-level `df`, which keeps `df.apply(fillna_with_species_median, axis=1)`
        working unchanged.

    Returns
    -------
    pandas.Series
        A new Series; the `row` passed in is left untouched.
    """
    source = df if frame is None else frame
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # documented by pandas as producing potentially unexpected behaviour.
    filled = row.copy()
    same_species = source['species'] == filled['species']
    for col in filled[filled.isna()].index:
        filled[col] = source.loc[same_species, col].median()
    return filled

# Only rows that contain at least one missing value are rewritten.
rows_with_na = df.isna().any(axis=1)
df.loc[rows_with_na] = df.loc[rows_with_na].apply(fillna_with_species_median, axis=1)

# Model

## Categorical Feature

Consider we find a penguin that is **below average weight**:

In [6]:
# Per-species count of above-average-weight penguins, and the remainder of each species.
species_counts   = df.species.value_counts()
above_avg_counts = df.groupby('species')[['above_average_weight']].sum()
leftover         = above_avg_counts.apply(lambda per_species: species_counts[per_species.name] - per_species, axis=1)

# One stacked bar per species: below-average at the bottom, above-average on top.
fig = go.Figure()
for trace_name, trace_counts in (('below avg', leftover), ('above avg', above_avg_counts)):
    bar_heights = trace_counts.values.squeeze()
    fig.add_trace(go.Bar(x=above_avg_counts.index, y=bar_heights, text=bar_heights, name=trace_name))
fig.update_layout(barmode='stack')
fig

We can apply Bayes' rule to each species $s$, where $ba$ denotes the observation "below average weight":

$P(s|ba) = \frac{P(ba|s) * P(s)}{P(ba)}$

In [7]:
def calc_bayes_posteriors(frame=None):
    """Posterior probability of each species given a below-average-weight penguin.

    Applies Bayes' rule, P(s | ba) = P(ba | s) * P(s) / P(ba), with every term
    estimated as a relative frequency in the data.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Data with `species` and `above_average_weight` columns. Defaults to the
        notebook-level `df`.

    Returns
    -------
    list of (species, posterior) tuples
        One entry per species, in order of first appearance in the data.
    """
    data = df if frame is None else frame
    n_total = len(data)
    below_avg = data.above_average_weight == 0.
    # P(ba) does not depend on the species, so it is computed once outside the loop.
    # (Named `evidence` rather than `norm` to avoid shadowing scipy.stats.norm.)
    evidence = len(data.loc[below_avg]) / n_total

    results = []
    for species in data.species.unique():
        is_species = data.species == species
        n_species = len(data.loc[is_species])
        prior = n_species / n_total
        likelihood = len(data.loc[below_avg & is_species]) / n_species
        posterior = (likelihood * prior) / evidence
        results.append((species, posterior))
    return results

In [8]:
# Posterior (given below-average weight) next to the prior, one bar pair per species.
results = calc_bayes_posteriors()
species_names = [name for name, _ in results]
posteriors = [posterior for _, posterior in results]
species_sizes = [len(df.loc[df.species==name]) for name in species_names]

fig1 = go.Figure()
fig1.add_trace(
    go.Bar(x=species_names, y=posteriors, text=[f'{p*100:.2f}%' for p in posteriors], name='posterior')
)
fig1.add_trace(
    go.Bar(x=species_names, y=[size/len(df) for size in species_sizes], text=[f'{size*100/len(df):.2f}%' for size in species_sizes], name='prior')
)
fig1

## Quantitative Feature

Consider a penguin with `bill_length_mm == 50.`

In [9]:
# Observed bill-length distribution of each species.
bill_plot_species = ['Adelie', 'Gentoo', 'Chinstrap']
bill_plot_samples = [df.loc[df.species==name, 'bill_length_mm'] for name in bill_plot_species]
fig = ff.create_distplot(bill_plot_samples, bill_plot_species, show_rug=False, bin_size=1)

# Dotted vertical marker at the bill length under consideration (50 mm).
marker_style = dict(color="black",width=3, dash="dot")
fig.add_shape(type="line", x0=50, y0=0, x1=50, y1=.16, line=marker_style)
fig

>The naive Bayes method typically assumes that any quantitative predictor is **continuous and conditionally Normal**.

So our next task is to create 3 Normal distributions that represent each species.

In [10]:
# Fit one Normal per species to bill length, using the species' sample mean and std.
penguin_bill_dists = {}
for species_name in ('Gentoo', 'Adelie', 'Chinstrap'):
    bill_lengths = df.loc[df.species==species_name, 'bill_length_mm']
    penguin_bill_dists[species_name] = norm(loc=bill_lengths.mean(), scale=bill_lengths.std())

# Individual handles to the same fitted distributions.
gentoo_bill_dist    = penguin_bill_dists['Gentoo']
adelie_bill_dist    = penguin_bill_dists['Adelie']
chinstrap_bill_dist = penguin_bill_dists['Chinstrap']

In [11]:
# Overlay each species' observed bill lengths with draws from its fitted Normal.
# The draws come from a seeded generator so the figure is identical on every
# Restart & Run All instead of changing with each execution.
rng = np.random.default_rng(14)
fig = ff.create_distplot(
    [
        df.loc[df.species=='Adelie', 'bill_length_mm'],
        adelie_bill_dist.rvs(len(df), random_state=rng),
        df.loc[df.species=='Gentoo', 'bill_length_mm'],
        gentoo_bill_dist.rvs(len(df), random_state=rng),
        df.loc[df.species=='Chinstrap', 'bill_length_mm'],
        chinstrap_bill_dist.rvs(len(df), random_state=rng)
    ],
    ['Adelie', 'Adelie Dist', 'Gentoo', 'Gentoo Dist', 'Chinstrap', 'Chinstrap Dist'],
    show_rug=False, bin_size=1
)
fig

$P(s|x_1) = \frac{P(x_1|s) * P(s)}{P(x_1)}$

In [12]:
# Evidence P(x1 = 50): the sum over species of P(x1 = 50 | s) * P(s).
# Kept as a notebook-level value for the default 50 mm observation.
normalizing_constant = \
    (adelie_bill_dist.pdf(50)    * df.species.value_counts()['Adelie']    / len(df)) + \
    (gentoo_bill_dist.pdf(50)    * df.species.value_counts()['Gentoo']    / len(df)) + \
    (chinstrap_bill_dist.pdf(50) * df.species.value_counts()['Chinstrap'] / len(df))

def calc_bayes_posteriors(bill_length_mm=50):
    """Posterior probability of each species given an observed bill length.

    Uses the notebook-level `df` for the priors (relative species frequencies) and
    `penguin_bill_dists` for the per-species Normal likelihoods.

    The evidence is computed here from the same `bill_length_mm` that feeds the
    likelihoods, so the two can never disagree and the function can be called for
    any observation, not only the 50 mm example.

    Parameters
    ----------
    bill_length_mm : float, default 50
        Observed bill length at which the Normal densities are evaluated.

    Returns
    -------
    list of (species, posterior) tuples
        One entry per species, in order of first appearance in `df`.
    """
    joint_densities = []
    for species in df.species.unique():
        prior = len(df.loc[df.species==species]) / len(df)
        likelihood = penguin_bill_dists[species].pdf(bill_length_mm)
        joint_densities.append((species, likelihood * prior))

    # P(x1): total density of the observation across all species.
    evidence = sum(joint for _, joint in joint_densities)
    return [(species, joint / evidence) for species, joint in joint_densities]

In [13]:
# Posterior (given a 50 mm bill) next to the prior, one bar pair per species.
results = calc_bayes_posteriors()
species_names = [name for name, _ in results]
posteriors = [posterior for _, posterior in results]
species_sizes = [len(df.loc[df.species==name]) for name in species_names]

fig2 = go.Figure()
fig2.add_trace(
    go.Bar(x=species_names, y=posteriors, text=[f'{p*100:.2f}%' for p in posteriors], name='posterior')
)
fig2.add_trace(
    go.Bar(x=species_names, y=[size/len(df) for size in species_sizes], text=[f'{size*100/len(df):.2f}%' for size in species_sizes], name='prior')
)
fig2

## Two Predictors

Using both `bill_length_mm = 50` & `flipper_length_mm = 195`

In [14]:
# Observed flipper-length distribution of each species.
flipper_plot_species = ['Adelie', 'Gentoo', 'Chinstrap']
flipper_plot_samples = [df.loc[df.species==name, 'flipper_length_mm'] for name in flipper_plot_species]
fig = ff.create_distplot(flipper_plot_samples, flipper_plot_species, show_rug=False, bin_size=1)

# Dotted vertical marker at the flipper length under consideration (195 mm).
marker_style = dict(color="black",width=3, dash="dot")
fig.add_shape(type="line", x0=195, y0=0, x1=195, y1=.13, line=marker_style)
fig

In [15]:
fig = px.scatter(df, x='flipper_length_mm', y='bill_length_mm', color='species')

# Dotted guides running from each axis to the observed penguin at (195, 50):
# first the horizontal one at bill length 50, then the vertical one at flipper length 195.
guide_starts = [
    (df.flipper_length_mm.min(), 50),
    (195, df.bill_length_mm.min()),
]
for start_x, start_y in guide_starts:
    fig.add_shape(type="line",
        x0=start_x, y0=start_y, x1=195, y1=50,
        line=dict(color="black",width=3, dash="dot")
    )
fig.show()

> How do we calculate the likelihood function that incorporates two variables, $L(y|x2,x3)$ ?
> Naive Bayes classification assumes that predictors are conditionally independent, thus:

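$L(y|x_2,x_3)=f(x_2,x_3|y)=f(x_2|y) \cdot f(x_3|y)$

(The book's $x_2, x_3$ — bill length and flipper length — are written $x_1, x_2$ in this notebook.)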

$P(s|x_1,x_2) = \frac{P(x_1,x_2|s) * P(s)}{P(x_1,x_2)}$

In [16]:
# Fit one Normal per species to flipper length, using the species' sample mean and std.
penguin_flipper_dists = {}
for species_name in ('Gentoo', 'Adelie', 'Chinstrap'):
    flipper_lengths = df.loc[df.species==species_name, 'flipper_length_mm']
    penguin_flipper_dists[species_name] = norm(loc=flipper_lengths.mean(), scale=flipper_lengths.std())

# Individual handles to the same fitted distributions.
gentoo_flipper_dist    = penguin_flipper_dists['Gentoo']
adelie_flipper_dist    = penguin_flipper_dists['Adelie']
chinstrap_flipper_dist = penguin_flipper_dists['Chinstrap']


In [17]:
# Evidence P(x1 = 50, x2 = 195): the sum over species of
# P(x1 = 50 | s) * P(x2 = 195 | s) * P(s).
# Kept as a notebook-level value for the default observation.
normalizing_constant = \
    (adelie_bill_dist.pdf(50)    * adelie_flipper_dist.pdf(195)    * df.species.value_counts()['Adelie']    / len(df)) + \
    (gentoo_bill_dist.pdf(50)    * gentoo_flipper_dist.pdf(195)    * df.species.value_counts()['Gentoo']    / len(df)) + \
    (chinstrap_bill_dist.pdf(50) * chinstrap_flipper_dist.pdf(195) * df.species.value_counts()['Chinstrap'] / len(df))

def calc_bayes_posteriors(bill_length_mm=50, flipper_length_mm=195):
    """Posterior probability of each species given bill and flipper length.

    Uses the notebook-level `df` for the priors (relative species frequencies) and
    `penguin_bill_dists` / `penguin_flipper_dists` for the per-species Normal
    likelihoods. The two densities are multiplied, i.e. the predictors are treated
    as conditionally independent given the species.

    The evidence is computed here from the same observation that feeds the
    likelihoods, so the two can never disagree and the function can be called for
    any observation, not only the (50, 195) example.

    Parameters
    ----------
    bill_length_mm : float, default 50
        Observed bill length at which the bill densities are evaluated.
    flipper_length_mm : float, default 195
        Observed flipper length at which the flipper densities are evaluated.

    Returns
    -------
    list of (species, posterior) tuples
        One entry per species, in order of first appearance in `df`.
    """
    joint_densities = []
    for species in df.species.unique():
        prior = len(df.loc[df.species==species]) / len(df)
        likelihood = penguin_bill_dists[species].pdf(bill_length_mm) * penguin_flipper_dists[species].pdf(flipper_length_mm)
        joint_densities.append((species, likelihood * prior))

    # P(x1, x2): total density of the observation across all species.
    evidence = sum(joint for _, joint in joint_densities)
    return [(species, joint / evidence) for species, joint in joint_densities]

In [18]:
# Posterior (given a 50 mm bill and a 195 mm flipper) next to the prior, per species.
results = calc_bayes_posteriors()
species_names = [name for name, _ in results]
posteriors = [posterior for _, posterior in results]
species_sizes = [len(df.loc[df.species==name]) for name in species_names]

fig3 = go.Figure()
fig3.add_trace(
    go.Bar(x=species_names, y=posteriors, text=[f'{p*100:.2f}%' for p in posteriors], name='posterior')
)
fig3.add_trace(
    go.Bar(x=species_names, y=[size/len(df) for size in species_sizes], text=[f'{size*100/len(df):.2f}%' for size in species_sizes], name='prior')
)
fig3

## Conclusion

Naive Bayes makes the following assumptions:

- **Feature Independence**: Predictors $X_i$ are conditionally independent given the class $y$ (*which allows us to simply multiply $p(x_1|y)*p(x_2|y)$*)
- **Normally Distributed**: For quantitative predictors $X_i$, the conditional pmf / pdf $p(x_i|y)$ is defined by a Normal model