In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 17 #

## Mendel and Pea Flowers ##

In [None]:
# Create the proportions that match the underlying population of pea colors
# under Mendel's model: the first entry (0.75) is the proportion of
# purple-flowering plants, the second entry (0.25) is everything else.
model = make_array(0.75, 0.25)

In [None]:
# Draw one random sample of 929 plants from the model and calculate the
# proportion of each color in that sample (929 is the size of Mendel's sample)
sample_proportions(929, model)

In [None]:
# Statistic: distance between the sample percent of purple plants and 75,
# computed here for one simulated sample of 929 plants

simulated_proportions = sample_proportions(929, model)
purple_percent = 100 * simulated_proportions.item(0)
abs(purple_percent - 75)

In [None]:
# Simulation: randomly draw many samples of size 929 under the model and
# record, for each sample, the distance between its percent of purple plants
# and 75.
#
# All distances are gathered in one pass and turned into an array once;
# growing an array with np.append inside a loop re-copies every element on
# each repetition.
distances = np.array([
    abs(100 * sample_proportions(929, model).item(0) - 75)
    for _ in range(10000)
])

In [None]:
# Histogram of the simulated distances
simulated_distances = Table().with_column('Distance from 75%', distances)
simulated_distances.hist()

In [None]:
# Mendel's observed data: 705 of his 929 plants were purple flowering

observed_purple_proportion = 705 / 929
observed_distance = abs(100 * observed_purple_proportion - 75)
observed_distance

In [None]:
# Compare Mendel's observed distance (red dot at height 0) with the
# distances simulated under the model
distance_histogram_data = Table().with_column('Distance from 75%', distances)
distance_histogram_data.hist()
plots.scatter(observed_distance, 0, color='red', s=30);

## Alameda County Jury Panels ##

In [None]:
# Proportion of each ethnicity among the eligible population ('Eligible')
# and among the people who were on jury panels ('Panels')
ethnicity_labels = make_array('Asian', 'Black', 'Latino', 'White', 'Other')
eligible_proportions = make_array(0.15, 0.18, 0.12, 0.54, 0.01)
panel_proportions = make_array(0.26, 0.08, 0.08, 0.54, 0.04)

jury = Table().with_columns(
    'Ethnicity', ethnicity_labels,
    'Eligible', eligible_proportions,
    'Panels', panel_proportions,
)

jury

In [None]:
# Visualize the data as a bar chart, using 'Ethnicity' as the label column
jury.barh('Ethnicity')

In [None]:
# Calculate, for each group, its proportion on panels minus its proportion
# in the eligible population (positive means a larger share on panels)
panels_minus_eligible = jury.column('Panels') - jury.column('Eligible')
jury_with_diffs = jury.with_column('Difference', panels_minus_eligible)
jury_with_diffs

In [None]:
# Calculate the absolute value of the differences
difference_magnitudes = np.abs(jury_with_diffs.column('Difference'))
jury_with_diffs = jury_with_diffs.with_column(
    'Absolute Difference', difference_magnitudes
)
jury_with_diffs

In [None]:
# Sum the absolute differences and divide by 2 to get the total variation
# distance (TVD)
absolute_difference_values = jury_with_diffs.column('Absolute Difference')
sum(absolute_difference_values) / 2

In [None]:
# let's create a function that can calculate the TVD
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance (TVD) between two distributions.

    The TVD is half the sum of the absolute differences between
    corresponding entries of the two distributions.

    Parameters
    ----------
    distribution_1, distribution_2 : array-like of numbers
        Values for the same categories, listed in the same order. NumPy
        arrays, lists and tuples are all accepted.

    Returns
    -------
    float
        Half the sum of the element-wise absolute differences.
    """
    # Convert up front so plain Python sequences work too; subtracting two
    # lists directly would raise a TypeError.
    first = np.asarray(distribution_1)
    second = np.asarray(distribution_2)
    # Reduce along the first axis, which yields a single number for the 1-D
    # distributions this function is meant for.
    return np.sum(np.abs(first - second), axis=0) / 2

In [None]:
# Apply the function to our actual data: the TVD between the eligible
# population and the panels
total_variation_distance(jury.column('Eligible'), jury.column('Panels'))

In [None]:
# Let's now extract the eligible juror proportions
eligible = jury.column('Eligible')
eligible

In [None]:
# Generate proportions from one random sample of size 1453 drawn using the
# eligible proportions
sample_distribution = sample_proportions(1453, eligible)
sample_distribution

In [None]:
# Add this random sample to our actual data as a 'Random Sample' column
panels_and_sample = jury.with_column('Random Sample', sample_distribution)
panels_and_sample

In [None]:
# Visualize the one random sample in comparison to our actual data
panels_and_sample.barh('Ethnicity')

In [None]:
# Get the TVD between our random sample and the eligible proportions
total_variation_distance(panels_and_sample.column('Random Sample'), eligible)

In [None]:
# Calculate the observed TVD: the distance between the actual panels and the
# eligible proportions
observed_tvd = total_variation_distance(jury.column('Panels'), eligible)
observed_tvd

In [None]:
# Create an empirical distribution of many randomly generated TVDs: draw
# random samples of size 1453 using the eligible proportions and record each
# sample's TVD from those proportions.
#
# All TVDs are gathered in one pass and turned into an array once; growing
# an array with np.append inside a loop re-copies every element on each
# repetition. Each draw stays local to the comprehension, so no
# notebook-level variable is overwritten along the way.
tvds = np.array([
    total_variation_distance(sample_proportions(1453, eligible), eligible)
    for _ in range(10000)
])
    

In [None]:
# Visualize the distribution of the simulated TVDs
simulated_tvds = Table().with_column('Total Variation Distance', tvds)
simulated_tvds.hist(bins=20)

In [None]:
# Add the actual data to the distribution: the observed TVD is drawn as a
# red dot at height 0 on top of the histogram of simulated TVDs
tvd_histogram_data = Table().with_column('Total Variation Distance', tvds)
tvd_histogram_data.hist(bins=20)
plots.scatter(observed_tvd, 0, color='red', s=30);