In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

## A. Distribution of the Sample Average

In [None]:
# Load the flight data; this table is the population sampled from below.
# Delay column holds numerical data (number of minutes) for flights from SF
united = Table.read_table('united.csv')
# Histogram bin edges in 10-minute steps: -20, -10, ..., 290
# (np.arange excludes the stop value, 300).
united_bins = np.arange(-20, 300, 10)
# Histogram of the Delay column using those bins.
united.hist('Delay', bins=united_bins)

In [None]:
# We saw these functions last time...
def one_sample_mean(sample_size):
    """Return the mean delay of one random sample of flights.

    Draws `sample_size` rows from the `united` table with `Table.sample`
    and averages the values in their 'Delay' column.
    """
    delays = united.sample(sample_size).column('Delay')
    return np.mean(delays)

In [None]:
def ten_thousand_sample_means(sample_size):
    """Simulate 10,000 sample means to approximate their sampling distribution.

    Each repetition calls `one_sample_mean(sample_size)`, which draws a fresh
    random sample from the population table and averages its delays. This is
    direct sampling from the population, not a bootstrap: nothing is resampled
    from a single observed sample.

    Returns a NumPy array containing the 10,000 simulated means.
    """
    # Collect the means in a list and convert once; np.append inside the loop
    # would allocate and copy a new array on every iteration.
    return np.array([one_sample_mean(sample_size) for _ in range(10000)])

In [None]:
# Get an array for estimating the sampling distribution when n = 400
# We know ahead of time that the result will be bell-shaped (CLT)
sample_means_400 = ten_thousand_sample_means(400)
# Put the simulated means in a one-column table and draw their histogram (bins=20).
Table().with_column('Means of 400 flight delays', sample_means_400).hist(bins=20)

In [None]:
# Number of rows in the table, i.e. how many flights are in the population.
united.num_rows

In [None]:
# How many possible samples of size 400 are there (each one yields a sample mean)?
# 400 times, we have 13,825 options for which flight will be chosen,
# so, 13825 to the 400th power...
# NOTE(review): this counts ordered draws in which a flight may repeat, so it is
# an upper bound on the number of distinct sample means — confirm this matches
# how the sample is drawn.
united.num_rows ** 400

We approximated the distribution of all these means by simulating 10000 of them. It would be even easier just to use math to understand the probability distribution.

**Back to Slides**

## B. CLT: Center

In [None]:
delay_mean = np.average(united.column('Delay'))    # population mean
# Re-run the n = 400 simulation; these are fresh random draws, so the
# histogram can differ slightly from any earlier run.
sample_means_400 = ten_thousand_sample_means(400)
Table().with_column('Means of 400 flight delays', sample_means_400).hist(bins=20)
# Red triangle marking the population mean on the horizontal axis; y = -0.01
# puts the marker just under the histogram's baseline.
plots.scatter(delay_mean, -0.01, marker='^', color='red', s=100)
print('Population Average:', delay_mean)  # red triangle

**Back to slides...**

## C. CLT: Spread
To investigate the effect of sample size on the spread of the sampling distribution, let's make an empirical distribution for sample size 900.

In [None]:
# Same simulation as for n = 400, but each of the 10,000 samples has 900 flights.
sample_means_900 = ten_thousand_sample_means(900)

In [None]:
# One column of simulated means per sample size; the column labels are the
# sample sizes, so the two distributions can be drawn on one histogram.
means_tbl = Table().with_columns(
    '400', sample_means_400,
    '900', sample_means_900,
)

In [None]:
# Shared bin edges for both columns: 5, 5.5, ..., 30.5 (width 0.5; np.arange
# excludes the stop value, 31).
means_tbl.hist(bins = np.arange(5, 31, 0.5))
plots.title('Distribution of 10,000 Sample Averages');
# Red triangle marking the population mean; y = -0.01 puts it just under the
# histogram's baseline.
plots.scatter(delay_mean, -0.01, marker='^', color='red', s=100)
print('Population Average:', delay_mean)  # red triangle

**Back to Slides...**

## D. Quantifying the Effect of Sample Size

In [None]:
"""Empirical distribution of random sample means"""

def plot_and_summarize_sample_means(sample_size):
    """Simulate sample means for `sample_size`, print a summary, and plot them.

    Prints the sample size, the mean and SD of the population 'Delay' column,
    the average and SD of the simulated sample means, and the ratio
    (population SD) / (SD of sample means). Then draws a histogram of the
    simulated means (bins=20) titled with the sample size.
    """
    sample_means = ten_thousand_sample_means(sample_size)
    sample_means_tbl = Table().with_column('Sample Means', sample_means)

    # Compute both population parameters here rather than reading a global set
    # in another cell, so the function also works on a freshly started kernel.
    population_delays = united.column('Delay')
    population_mean = np.average(population_delays)
    population_sd = np.std(population_delays)

    # SD of the simulated means, computed once and reused for the ratio.
    sample_means_sd = np.std(sample_means)

    # Print some information about the distribution of the sample means
    print("Sample size: ", sample_size)
    print("Population mean:", population_mean)
    print("Average of sample means: ", np.mean(sample_means))
    print("\nPopulation SD:", population_sd)
    print("SD of sample means:", sample_means_sd)
    print("Ratio:", population_sd / sample_means_sd)

    # Plot a histogram of the sample means
    sample_means_tbl.hist(bins=20)
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))

In [None]:
# n = 100: compare the printed ratio (population SD / SD of sample means) with sqrt(100) = 10.
plot_and_summarize_sample_means(100)

In [None]:
# n = 400: compare the printed ratio (population SD / SD of sample means) with sqrt(400) = 20.
plot_and_summarize_sample_means(400)

In [None]:
# n = 900: compare the printed ratio (population SD / SD of sample means) with sqrt(900) = 30.
plot_and_summarize_sample_means(900)

Is there a pattern here? Let n denote sample size.
  - When n = 100, the ratio POP_SD / Sample_Means_SD ~ 10
  - When n = 400, the ratio POP_SD / Sample_Means_SD ~ 20
  - When n = 900, the ratio POP_SD / Sample_Means_SD ~ 30

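The theory behind the CLT guarantees that the SD of all possible sample means is the population SD divided by `sqrt(n)`, so the ratio is `sqrt(n)`; the simulated ratios above are only approximately 10, 20, and 30 because they come from 10,000 random samples. That is, without doing any simulation at all, we can know the sampling variation for a given sample size!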

In [None]:
# Population SD of the delays (np.std defaults to ddof=0, the population formula).
delay_sd = np.std(united.column('Delay'))

# For sample size 100, the SD of the sample means is
# population SD / sqrt(100) = population SD / 10, with no simulation needed.
delay_sd / np.sqrt(100)

In [None]:
# For sample size 400, the SD of the sample means is
# population SD / sqrt(400) = population SD / 20.
delay_sd / np.sqrt(400)

In [None]:
# For sample size 900, the SD of the sample means is
# population SD / sqrt(900) = population SD / 30.
delay_sd / np.sqrt(900)

**Back to slides...**