In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Statistics

For this first example, our population is the United Airlines flights (national, not international) out of San Francisco airport from 6/1/15 to 8/9/15. `united.csv` stores information on our population, including each flight's destination and how late it departed (minutes).

In [None]:
# Load the flight population, then tag each flight with a 0-based 'Row'
# number and move that column to the front so rows are easy to refer to.
united = Table.read_table('united.csv')
united = united.with_column('Row', np.arange(united.num_rows))
united = united.move_to_start('Row')
united.show(3)

In [None]:
# In this example, we have all the data on the entire population (this is unusual)
# So it's easy to calculate a (Population) parameter such as the median delay
np.median(united.column('Delay'))

In [None]:
# Imagine we only had partial information about our population.
# We could use a (sample) statistic to estimate the unknown parameter
# Use Table.sample(5) to draw a random sample of size 5
# (the `...` below is a leftover live-coding placeholder; it is a no-op expression)
...
united.sample(5)

In [None]:
# Use Table.sample(5) to estimate the median delay for the whole population
# Run this cell MULTIPLE times to get a feeling for the sampling variability
size = 5
sample = united.sample(size).column('Delay')
np.median(sample)  # the sample median is our estimate of the population median

In [None]:
# Use Table.sample(100) to estimate the median delay
# What do you notice about the amount of variability now?
size = 100
sample = united.sample(size).column('Delay')
np.median(sample)

While every random sample has a chance of being a poor estimate, by the Law of Averages we feel more confident in an estimate based on a **large** random sample.

**Back to Slides...**

### Probability & Empirical Distributions of a Statistic

In [None]:
# We want to make an empirical distribution for the 'Delay' variable's sample median.
# Based on repeated sampling (with a fixed sample size), what values do we observe
# for the statistic?

# As usual, we begin coding our simulation by writing a function for one iteration

def sample_median(size):
    '''
    Uses a random sample of size `size` to compute and return a sample median
    from the United flights 'Delay' distribution

    Parameters:
        size: number of flights to draw from the `united` table

    Returns:
        the median of the 'Delay' values in the drawn sample

    Each call draws a fresh random sample, so repeated calls with the same
    `size` generally return different values.
    '''
    delays = united.sample(size).column('Delay')
    return np.median(delays)

In [None]:
# Do a quick test: call the function with size = 5
sample_median(5)

In [None]:
# Choose a number of iterations for creating our empirical distribution
num_simulations = 2000

In [None]:
# Based on num_simulations (2000) random samples of size 5, compute an array of sample medians

sample_size = 5

# Start from an empty array and append one simulated median per iteration
sample_medians = make_array()
for i in np.arange(num_simulations):
    sample_medians = np.append(sample_medians, sample_median(sample_size))

sample_medians  # displays the 2000 sample medians

In [None]:
# Make a 1-column table showing the sample medians
Table().with_column('Sample medians (size=5)', sample_medians)

In [None]:
# Use the table to make a histogram
# Bin edges sit at half-integers (-10.5, -9.5, ..., 40.5), so each bar is one minute wide
mybins = np.arange(-10.5, 40.6, 1)
Table().with_column('Sample medians (size=5)', sample_medians).hist(bins=mybins)
plots.title("Empirical Distribution of the Sample Median (n=5)");  # ';' hides the Text repr

Because we used a large number of samples to create the histogram (`num_simulations = 2000`), this is a good estimate for the probability distribution of the sample median (size=5) over ALL samples of size 5 from the United population.

In [None]:
# Build the empirical distribution of the sample median for sample size 500:
# collect num_simulations simulated medians (the histogram is drawn in the next cell)

sample_size = 500
sample_medians = make_array()

for i in np.arange(num_simulations):
    sample_medians = np.append(sample_medians, sample_median(sample_size))

In [None]:
mybins = np.arange(-10.5, 40.6, 1)  # same bins as before
Table().with_column('Sample medians (size=500)', sample_medians).hist(bins=mybins)
# Title the figure so it stands alone, matching the n=5 histogram
plots.title("Empirical Distribution of the Sample Median (n=500)");

Discuss: Compare the empirical sampling distributions for the sample median delay, size=5 versus size=500. Can you explain why they are so different?

#### Empirical Distributions of a Statistic (Overlayed)

In [None]:
# Here's a more ambitious visualization
# We simulate the sample median at three sample sizes (10, 100, and 1000)
# so their empirical distributions can be compared
num_simulations = 2000

sample_medians_10 = make_array()
sample_medians_100 = make_array()
sample_medians_1000 = make_array()

for i in np.arange(num_simulations):
    # One draw per sample size on every pass, in the order 10, 100, 1000
    sample_medians_10 = np.append(sample_medians_10, sample_median(10))
    sample_medians_100 = np.append(sample_medians_100, sample_median(100))
    sample_medians_1000 = np.append(sample_medians_1000, sample_median(1000))

In [None]:
# Gather the three arrays of simulated medians into one table, one column per sample size.
# NOTE: this rebinds `sample_medians`, which held a plain array in earlier cells, to a Table.
sample_medians = Table().with_columns('Size 10', sample_medians_10, 
                                      'Size 100', sample_medians_100,
                                      'Size 1000', sample_medians_1000)

In [None]:
sample_medians.hist(bins = np.arange(-5, 30))

Questions?

**Back to Slides...**

## Swain vs. Alabama ##

In [None]:
# The population here is the eligible jurors
# Recall, they were 26% Black and 74% White
# Make an array reflecting the population proportions
population_proportions = make_array(.26, .74)
population_proportions

In [None]:
# Now we simulate drawing a sample of size 100 from a population
# where 26% are Black and 74% are White

# Run this several times to observe sampling variability
sample_proportions(100, population_proportions)

In [None]:
# Make a function we can call to simulate one sample proportion
def panel_proportion():
    '''
    draws a random sample of size 100 from population_proportions
    returns the sample proportion for Black

    `population_proportions` lists the Black proportion first, so the
    statistic is the first entry of the simulated sample proportions.
    '''
    return sample_proportions(100, population_proportions).item(0)
    

In [None]:
# Try calling the function several times
panel_proportion()

In [None]:
# Now that we can simulate one iteration, a for loop lets us
# accumulate 10,000 simulated sample proportions (sample size 100)
# in an array named `panels`
num_iters = 10000
panels = make_array()

for i in range(num_iters):
    panels = np.append(panels, panel_proportion())

In [None]:
# Make a 1-column table holding the panels array
# Use the column label 'Number of Black Men on Panel of 100'
# Be sure the values in the column are whole numbers, not proportions.
# Multiplying a float proportion by 100 can miss the whole number by a hair
# (e.g. 0.29 * 100 == 28.999999999999996), so round before converting to int.
panel_tbl = Table().with_column(
    'Number of Black Men on Panel of 100', np.round(panels * 100).astype(int)
)
panel_tbl.show(3)

In [None]:
# visualize the simulated counts in a histogram
# bin edges run 5.5, 6.5, ..., 39.5, so every whole-number count gets its own bar
panel_tbl.hist(bins=np.arange(5.5,40.))

# Plotting details
# the y-axis starts slightly below 0, presumably so the dot drawn at y=0 is not clipped
plots.ylim(-0.002, 0.09)
plots.scatter(8, 0, color='red', s=30);  # makes a RED dot at (8, 0), the observed value

**Back to Slides...**

## Mendel and Pea Flowers ##

In [None]:
# Actual Data: Mendel had 929 plants, of which 709 had purple flowers
observed_purples = 709 / 929
observed_purples

We don't know that 0.7632 is statistically close to 0.75 in the current context. With such a large sample size, maybe it would be extremely rare for 76% of the plants in a sample of size 929 to be purple-flowering when the population is actually 75% purple-flowering. We need a histogram to help us assess the model.

In [None]:
# Make one simulated sample proportion when population proportion is 75%
predicted_proportions = make_array(.75, .25)
sample_proportions(929, predicted_proportions)

In [None]:
def purple_flowers():
    '''
    Simulates one random sample of 929 plants under `predicted_proportions`
    and returns the share of purple-flowering plants in that sample
    as a percentage (not a proportion)
    '''
    simulated = sample_proportions(929, predicted_proportions)
    purple_proportion = simulated.item(0)  # first entry of the simulated proportions
    return purple_proportion * 100

In [None]:
# Test run
purple_flowers()

In [None]:
# Use a for loop to accumulate an array of 10000 simulated sample percentages
# (purple_flowers() returns a percentage, not a proportion)
purples = make_array()

for i in np.arange(10000):
    purples = np.append(purples, purple_flowers())

In [None]:
purples_tbl = Table().with_column('Percent of purple flowers in sample of 929', purples)
purples_tbl

In [None]:
# Under the assumption of the model, here's an estimate for the
# sample proportion distribution
purples_tbl.hist(bins=20)
plots.title("Empirical Distribution of the Sample Proportion for Purple");

Now we need to show the distribution of our "test statistic", the absolute deviation of the purple percentage from 75. Make a 1-column table showing those absolute deviations, then draw a histogram.

In [None]:
abs_devs = Table().with_column('Discrepancy in sample of 929 if the model is true', abs(purples-75))
abs_devs.hist(bins=20)

In [None]:
# Recall that we know the observed proportion of purples from the actual data.
# We can convert that to a percentage and get its absolute deviation from 75
actual = abs(observed_purples * 100 - 75)

In [None]:
# Like we did in the previous example, we can add a red dot for the observed
# value of the test statistic (`actual`), to help us assess the believability of the model
abs_devs.hist(bins=20)
plots.scatter(actual, 0, color='red', s=80);  

**Back to Slides...**