In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline
import warnings
warnings.simplefilter(action="ignore", category=UserWarning)

## Alameda County Juries

In [None]:
# Source: ACLU (2018), "Racial and Ethnic Disparities in Alameda County Jury Pools"
# https://www.aclunc.org/sites/default/files/racial_and_ethnic_disparities_in_alameda_county_jury_pools.pdf
# The panels covered by the report contained 1453 people in total.

# Build the table one column at a time (the census cell further down builds by row).
alameda = (
    Table()
    .with_column('Ethnicity', make_array('Asian', 'Black', 'Latino', 'White', 'Other'))
    .with_column('Eligible', make_array(0.15, 0.18, 0.12, 0.54, 0.01))
    .with_column('Panels', make_array(0.26, 0.08, 0.08, 0.54, 0.04))
)

# The two data columns store proportions between 0 and 1; only their display is switched to percent.
alameda.set_format(['Eligible', 'Panels'], PercentFormatter(0))

In [None]:
# Categorical (i.e. non-numeric) data is drawn as a horizontal bar chart,
# with one group of bars per value of the 'Ethnicity' column.
alameda.barh('Ethnicity')

## Total Variation Distance

In [None]:
# First measure of how far apart the two distributions are:
# the signed gap (eligible share minus panel share) for each group.
diff = alameda.with_column(
    'Difference',
    np.subtract(alameda.column('Eligible'), alameda.column('Panels')),
)
diff

In [None]:
# Drop the sign so that every gap, whichever direction it points, raises the metric.
abs_diff = diff.with_column(
    'Abs. Difference',
    np.absolute(diff.column('Difference')),
)
abs_diff

In [None]:
# Whenever one bar goes up another must go down, so each gap is counted twice: halve the total.
0.5 * sum(abs_diff.column('Abs. Difference'))

In [None]:
# define a function to compute the TVD between normalized arrays
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    Each distribution is a sequence of proportions that sums to 1, listed in
    the same category order. The result is half the sum of the absolute
    element-wise differences.

    Args:
        distribution_1: array, list or tuple of proportions.
        distribution_2: array, list or tuple of proportions, same length.

    Returns:
        The distance as a NumPy float.
    """
    # Coerce to float arrays: plain lists/tuples do not support `-`, and
    # unsigned-integer arrays would wrap around on subtraction.
    first = np.asarray(distribution_1, dtype=float)
    second = np.asarray(distribution_2, dtype=float)
    return np.abs(first - second).sum() / 2

In [None]:
# define a function to compute the TVD between two table columns
def table_tvd(table, label_1, label_2):
    """Return the total variation distance between two columns of `table`.

    `label_1` and `label_2` name the columns; each column is handed to
    total_variation_distance as one distribution.
    """
    first_distribution = table.column(label_1)
    second_distribution = table.column(label_2)
    return total_variation_distance(first_distribution, second_distribution)

In [None]:
# Sanity check: this should reproduce the value worked out step by step above.
table_tvd(table=alameda, label_1='Eligible', label_2='Panels')

## Simulating the statistic

In [None]:
# define a function to create a random panel
def get_one_simulated_panel(panel_size=1453):
    """Draw one random panel from the eligible population.

    Args:
        panel_size: number of people to draw. Defaults to 1453, the number of
            people included in the panels of the ACLU report.

    Returns:
        A one-column table ('Ethnicity') with `panel_size` rows, where each
        row is sampled from the ethnicities in `alameda`, weighted by the
        'Eligible' proportions.
    """
    return alameda.select('Ethnicity').sample(panel_size, weights=alameda.column('Eligible'))

In [None]:
# do a single simulation by adding a "Random" column
def simulate_once():
    """Simulate one panel and attach its ethnicity proportions to `alameda`.

    A panel is drawn with get_one_simulated_panel(). For every ethnicity
    listed in `alameda`, the share of the drawn panel with that ethnicity is
    computed. An ethnicity that was never drawn gets a proportion of 0 instead
    of disappearing from the result, so 'Random' always lines up with
    'Eligible' and both columns cover the same categories.

    Returns:
        A copy of `alameda` with an extra 'Random' column (displayed as a
        percentage), sorted by 'Ethnicity'.
    """
    simulated_panel = get_one_simulated_panel()
    drawn = simulated_panel.column('Ethnicity')
    # Count each listed category explicitly so zero-count categories are kept.
    counts = np.array([
        np.count_nonzero(drawn == ethnicity)
        for ethnicity in alameda.column('Ethnicity')
    ])
    # Divide by the size of the panel actually drawn rather than repeating the
    # literal 1453 here.
    with_random = alameda.with_column('Random', counts / simulated_panel.num_rows)
    with_random.set_format('Random', PercentFormatter(0))
    # Sorting by ethnicity keeps the row order that joining on 'Ethnicity' produced.
    return with_random.sort('Ethnicity')

In [None]:
# Show one simulated table; the 'Random' column changes on every run because the panel is drawn at random.
simulate_once()

In [None]:
# Compute the empirical distribution of TVDs by simulation.
# Every statistic is collected first and the array is built once: growing an
# array with np.append inside the loop copies the whole array on each pass.
tvds = np.array([
    table_tvd(simulate_once(), 'Eligible', 'Random')
    for _ in range(5000)  # 5000 repetitions of the simulation
])

results = Table().with_column('TVD', tvds)
results

In [None]:
# Histogram of the simulated TVDs with explicit bin edges 0, 0.01, ..., 0.19.
# Try this one without the binning argument.
results.hist('TVD', bins=np.arange(0, 0.2, 0.01))

## P-value

In [None]:
# Empirical p-value: the share of simulated TVDs that are at least as large as
# the TVD observed between the eligible population and the actual panels.
# The observed value (about 0.14) is computed from `alameda` with the same
# function used for the simulated values, instead of being typed in as a literal.
observed_alameda_tvd = table_tvd(alameda, 'Eligible', 'Panels')
results.where(0, are.above_or_equal_to(observed_alameda_tvd)).num_rows / results.num_rows

## Addendum: Alameda County Race & Ethnicity Distribution

In [None]:
# 2010 Census figures, https://www.census.gov/2010census/popmap/

alameda_race2010 = Table(['Race', 'Population']).with_rows([   # built row by row
    ('White', 649122),
    ('African American', 198654),
    ('Asian', 394560),
    ('AIAN', 9799),
    ('NHPI', 12802),
    ('Some Other Race', 162540),
    ('Two or more Races', 90997),
])

alameda_ethnicity2010 = Table(['Ethnicity', 'Population']).with_rows([
    ('Hispanic or Latino', 339889),
    ('Not Hispanic or Latino', 1170382),
])

# 2020 Census figures, https://www.census.gov/2020census/popmap/
alameda_race = Table(['Race', 'Population']).with_rows([   # built row by row
    ('White', 689396),
    ('African American', 190451),
    ('Asian', 605813),
    ('AIAN', 48407),
    ('NHPI', 25113),
    ('Some Other Race', 327619),  # in the 2020 data this category also includes two or more races
])

alameda_ethnicity = Table(['Ethnicity', 'Population']).with_rows([
    ('Hispanic or Latino', 393749),
    ('Not Hispanic or Latino', 1682353-393749),  # second figure is the Hispanic or Latino count from the row above
])

# Apply distribution formatting to the population counts, then display the 2020 tables.

alameda_race.set_format('Population', DistributionFormatter)
alameda_race.show()

alameda_ethnicity.set_format('Population', DistributionFormatter)
alameda_ethnicity.show()

In [None]:
# How can we compare the 2010 census, the comparison data from the report, and the 2020 census? 
# (Hint: this is easier if you use the 'join' operator from the lecture and Drinks notebook, 
# but it can be done manually with what you know now)

In [None]:
# Can we conclude anything from that? Maybe "demographics is hard"?

## Swain v. Alabama, a significant legal case

In [None]:
# Eligible-population and panel proportions for the Swain v. Alabama case,
# built one column at a time; the numeric columns store proportions between 0 and 1.
swain = (
    Table()
    .with_column('Ethnicity', make_array('Black', 'Other'))
    .with_column('Eligible', make_array(0.26, 0.74))
    .with_column('Panel', make_array(0.08, 0.92))
)

# Display the two data columns as whole-number percentages.
swain.set_format(['Eligible', 'Panel'], PercentFormatter(0))

In [None]:
# Horizontal bar chart with one group of bars per value of the 'Ethnicity' column.
swain.barh('Ethnicity')

In [None]:
# Observed TVD between the eligible population and the actual panel.
table_tvd(table=swain, label_1='Eligible', label_2='Panel')

In [None]:
def simulate_one_swain(number):
    """Simulate one panel of `number` people and attach its proportions to `swain`.

    The panel is sampled from the ethnicities in `swain`, weighted by the
    'Eligible' proportions. For every ethnicity listed in `swain`, the share
    of the drawn panel with that ethnicity is computed. An ethnicity that was
    never drawn (likely when `number` is small) gets a proportion of 0 instead
    of disappearing from the result, so 'Random' always lines up with
    'Eligible'.

    Args:
        number: how many people to draw for the simulated panel.

    Returns:
        A copy of `swain` with an extra 'Random' column (displayed as a
        percentage), sorted by 'Ethnicity'.
    """
    simulated_panel = swain.select('Ethnicity').sample(number, weights=swain.column('Eligible'))
    drawn = simulated_panel.column('Ethnicity')
    # Count each listed category explicitly so zero-count categories are kept.
    counts = np.array([
        np.count_nonzero(drawn == ethnicity)
        for ethnicity in swain.column('Ethnicity')
    ])
    with_random = swain.with_column('Random', counts / number)
    with_random.set_format('Random', PercentFormatter(0))
    # Sorting by ethnicity keeps the row order that joining on 'Ethnicity' produced.
    return with_random.sort('Ethnicity')

In [None]:
# One simulated panel of 100 people, shown next to the real proportions.
simulate_one_swain(number=100)

In [None]:
# Compute the empirical distribution of TVDs.
# Every statistic is collected first and the array is built once: growing an
# array with np.append inside the loop copies the whole array on each pass.

tvds = np.array([
    table_tvd(simulate_one_swain(100), 'Eligible', 'Random')
    for _ in range(5000)  # 5000 repetitions of the simulation; more repetitions are slower but give smoother output
])

results = Table().with_column('TVD between the population & a random sample', tvds)
results

In [None]:
# Histogram of the simulated TVDs (the table's only column) with bin edges 0, 0.01, ..., 0.19.
results.hist(0, bins=np.arange(0, 0.2, 0.01))

In [None]:
# P value: the share of simulated TVDs that are at least as large as the TVD
# observed between the eligible population and the actual panel.
# The observed value (about 0.18) is computed from `swain` with the same
# function used for the simulated values, instead of being typed in as a literal.
observed_swain_tvd = table_tvd(swain, 'Eligible', 'Panel')
results.where(0, are.above_or_equal_to(observed_swain_tvd)).num_rows / results.num_rows