In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Swain vs. Alabama ##

In [None]:
# Model proportions for the two categories being sampled. Index 0 (26%) is the
# category that panel_proportion() reads; index 1 (74%) is everyone else.
population_proportions = make_array(0.26, 0.74)
population_proportions

In [None]:
# Draw one random sample of size 100 from the model and display the resulting
# proportion in each category. The output changes every time the cell runs.
sample_proportions(100, population_proportions)

In [None]:
def panel_proportion():
    """Simulate one panel of 100 under the model.

    Returns the simulated proportion that falls in the first category of
    ``population_proportions``, as a plain Python float.
    """
    simulated_panel = sample_proportions(100, population_proportions)
    return simulated_panel.item(0)

In [None]:
# One simulated proportion; re-run the cell to see a different random value.
panel_proportion()

In [None]:
num_simulations = 10000

# Gather every simulated proportion first and convert to an array once.
# Calling np.append inside the loop copies the whole array on each iteration,
# which makes the simulation quadratic in num_simulations.
simulated_proportions = np.array(
    [panel_proportion() for _ in np.arange(num_simulations)]
)

# Convert proportions to counts out of 100. Rounding removes floating-point
# drift (e.g. 0.07 * 100 == 7.000000000000001) so that comparisons against
# whole-number counts, such as `panels <= 8`, behave as expected.
panels = np.round(simulated_proportions * 100)

In [None]:
# Histogram of the simulated counts; bin edges sit on the half-integers so
# each whole-number count gets its own bar.
(
    Table()
    .with_column('Number of Black Men on Panel of 100', panels)
    .hist(bins=np.arange(5.5, 40.))
)
# Red vertical line at a count of 8, the value the next cell compares against.
plots.plot([8, 8], [0, .1], color='red', lw=2);

In [None]:
# Fraction of simulated panels whose count is at or below 8 (the position of
# the red line drawn on the histogram).
np.count_nonzero(panels <= 8) / num_simulations

## Alameda County Jury Panels ##

In [None]:
# One row per ethnicity: 'Eligible' is the model distribution and 'Panels' is
# the observed distribution. Both columns are proportions.
jury = (
    Table()
    .with_column('Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01))
    .with_column('Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04))
)

jury

In [None]:
# Horizontal bar chart of the numeric columns, one group of bars per ethnicity.
jury.barh('Ethnicity')

In [None]:
# Under the model, this is the true distribution of people
# from which the jurors are randomly sampled.
# `model` is the 'Eligible' column of `jury` as an array of proportions.
model = jury.column('Eligible')
model

In [None]:
# Let's simulate a random draw of 1453 jurors from this distribution.
# The result is an array of simulated proportions, one per ethnicity,
# in the same order as `model`.
simulated = sample_proportions(1453, model)
simulated

In [None]:
# The actual observed distribution (Panels) looks quite different
# from the simulation -- try re-running the previous cell and this one
# several times to confirm!
jury_with_simulated = jury.with_column('Simulated', simulated)
jury_with_simulated

In [None]:
# Compare Eligible, Panels, and Simulated side by side for each ethnicity.
jury_with_simulated.barh('Ethnicity')

## Distance Between Distributions

In [None]:
# In this case, we need to understand how each of the 5 categories
# differs from its expected value according to the model.

# Observed proportion minus model proportion, per ethnicity
# (positive means the category is over-represented on panels).
diffs = jury.column('Panels') - jury.column('Eligible')
jury_with_difference = jury.with_column('Difference', diffs)
jury_with_difference

## Total Variation Distance

In [None]:
def tvd(dist1, dist2):
    """Return the total variation distance between two distributions.

    The distance is half the sum of the absolute differences between
    corresponding entries.

    Parameters
    ----------
    dist1, dist2 : array-like of numbers
        The two distributions, with categories in the same order. NumPy
        arrays, lists, and tuples are all accepted.

    Returns
    -------
    The total variation distance as a NumPy float.
    """
    # Convert up front so plain Python sequences work as well as arrays.
    differences = np.asarray(dist1) - np.asarray(dist2)
    # Categories run along axis 0; sum the absolute differences over it.
    return np.abs(differences).sum(axis=0) / 2

In [None]:
# The TVD of our observed data (Panels) from their expected values
# assuming the model is true (Eligible)
obsvd_tvd = tvd(jury.column('Panels'), jury.column('Eligible'))
obsvd_tvd

In [None]:
# The TVD of one model simulation from its expected values.
# jury.column('Eligible') holds the same proportions as `model`.
tvd(sample_proportions(1453, model), jury.column('Eligible'))

In [None]:
def simulated_tvd():
    """Simulate one sample of 1453 under ``model`` and return the total
    variation distance between the simulated proportions and ``model``."""
    simulated_distribution = sample_proportions(1453, model)
    return tvd(simulated_distribution, model)

num_simulations = 10000

# Build the array of simulated TVDs in one step. Calling np.append inside a
# loop copies the whole array on every iteration, which makes the simulation
# quadratic in num_simulations.
tvds = np.array([simulated_tvd() for _ in np.arange(num_simulations)])

In [None]:
# Histogram of the simulated TVDs, with bin edges every 0.005 from 0 up to
# (but not including) 0.05.
title = 'Simulated TVDs (if model is true)'
bins = np.arange(0, .05, .005)

Table().with_column(title, tvds).hist(bins = bins)
# Red vertical line marks the observed TVD for comparison with the simulations.
plots.plot([obsvd_tvd, obsvd_tvd], [0, 50], color='red', lw=2);

## Example: Benford's Law

In [None]:
# Possible first digits: 1 through 9.
digits = np.arange(1, 10)
# Benford model probability for each first digit d: log10(1 + 1/d).
benford_model = np.log10(1 + 1/digits)

In [None]:
# Table pairing each first digit with its probability under the Benford model,
# followed by a bar chart of those probabilities.
benford = (
    Table()
    .with_column('First digit', digits)
    .with_column('Benford Model Probability', benford_model)
)
benford.barh('First digit')

In [None]:
# You don't have to understand how this function works,
# since it uses Python features from beyond STOR 120.
def first_digit(num):
    """Return the first significant (leading non-zero) digit of ``num``.

    The sign, leading zeros, and the decimal point are skipped, so
    ``first_digit(-32)`` is 3 and ``first_digit(0.0045)`` is 4.
    Zero has no non-zero digit, so ``first_digit(0)`` returns 0.

    Raises
    ------
    ValueError
        If the text form of ``num`` contains no digits (e.g. nan or inf).
    """
    saw_zero = False
    for character in str(num):
        if character in '123456789':
            return int(character)
        if character == '0':
            saw_zero = True
        elif character in 'eE':
            # Digits after this point belong to the exponent, not the value.
            break
    if saw_zero:
        return 0
    raise ValueError('no digits found in {!r}'.format(num))

In [None]:
# Example: the first digit of 32 is 3.
first_digit(32)

In [None]:
# Example: the first digit of 17719087 is 1.
first_digit(17719087)

### Benford's Law and COVID-19 Reporting

In [None]:
# Observed data: how many values start with each first digit 1-9.
# NOTE(review): presumably these are reported COVID-19 counts; the data
# source is not shown in this notebook.
COVID_by_digit = Table().with_columns(
    'First Digit', np.arange(1, 10),
    'Count', make_array(194, 106, 72, 51, 52, 38, 36, 22, 10),
)

COVID_by_digit

In [None]:
# Add each digit's share of the total count, so the observed data can be
# compared with the Benford model probabilities.
COVID_by_digit = COVID_by_digit.with_column(
    'Proportion',
    COVID_by_digit.column('Count') / np.sum(COVID_by_digit.column('Count')),
)

COVID_by_digit

In [None]:
# Observed test statistic: total variation distance between the observed
# first-digit proportions and the Benford model. Uses the notebook's tvd
# helper instead of repeating the formula inline.
COVID_observed_tvd = tvd(COVID_by_digit.column('Proportion'), benford_model)
COVID_observed_tvd

In [None]:
def simulate_COVID_count_first_digits():
    """Simulate one set of first digits under the Benford model.

    Draws a sample the same size as the observed data (the total of the
    'Count' column of ``COVID_by_digit``) from ``benford_model`` and returns
    the total variation distance between the simulated proportions and the
    model.
    """
    sample_size = sum(COVID_by_digit.column('Count'))
    simulated_frequencies = sample_proportions(sample_size, benford_model)
    # Call the shared tvd helper; a local variable named `tvd` would shadow it.
    return tvd(simulated_frequencies, benford_model)

In [None]:
# One simulated TVD under the Benford model; re-run for a different value.
simulate_COVID_count_first_digits()

In [None]:
# Simulate the test statistic 10000 times under the Benford model.
# Build the array in one step: calling np.append inside a loop copies the
# whole array on every iteration, which makes the simulation quadratic.
COVID_simulated_tvds = np.array(
    [simulate_COVID_count_first_digits() for _ in np.arange(10000)]
)

In [None]:
# Bin edges every 0.005 from 0 up to (but not including) 0.10.
COVID_bins = np.arange(0, 0.10, 0.005)

# Histogram of the simulated TVDs (column 0 of the one-column table).
(
    Table()
    .with_column('COVID Simulated TVD', COVID_simulated_tvds)
    .hist(0, bins=COVID_bins)
)
# Red vertical line marks the observed TVD.
plots.plot([COVID_observed_tvd, COVID_observed_tvd], [0, 30], color='red', lw=2);

In [None]:
# Empirical p-value: the fraction of simulated TVDs at least as large as the
# observed TVD. Dividing by the length of the simulated array keeps the
# denominator correct if the number of simulations is ever changed.
np.count_nonzero(COVID_simulated_tvds >= COVID_observed_tvd) / len(COVID_simulated_tvds)