In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from scipy.stats import norm
plots.style.use('fivethirtyeight')

# Lecture 28

## Central Limit Theorem and Simulating Sample Mean ##

In [None]:
# Get the data set of flight delays.
# The path is relative, so the CSV must be in the notebook's working directory.
# Later cells read the 'Delay' and 'Destination' columns of this table.
united = Table.read_table('united_summer2015.csv')
united

In [None]:
# Population parameters: the mean (mu) and SD (sigma) of every delay in the data set

delays = united.column('Delay')
mean_delay = delays.mean()
sd_delay = delays.std()

# show (mu, sigma)
mean_delay, sd_delay


In [None]:
# visualize the data (which is not normal!)
# bin edges run from -20 to 290 in steps of 10
united.hist('Delay', bins = np.arange(-20, 300, 10))
# vertical red line at the population mean; the 0.042 line height is hard-coded
plots.plot([mean_delay, mean_delay], [0, .042], color = "red", lw = 2);

In [None]:
# Take a random sample of size sample_size from the population and record its mean.
# Repeat many times to get the empirical distribution of the sample average.

sample_size = 400
repetitions = 10000

# Preallocate the results: np.append copies the whole array on every call,
# so growing the array inside the loop is quadratic in the number of repetitions.
means_400 = np.zeros(repetitions)

for i in np.arange(repetitions):
    # Table.sample draws with replacement unless told otherwise
    sampled_flights = united.sample(sample_size)
    sample_mean = np.mean(sampled_flights.column('Delay'))
    means_400[i] = sample_mean

In [None]:
# visualize the distribution of sample means
Table().with_columns('Sample Mean', means_400).hist(bins = 20)
# vertical red line at the average of the simulated sample means; the 0.22 line height is hard-coded
plots.plot([np.mean(means_400), np.mean(means_400)], [0, .22], color = "red", lw = 2);

# the title uses the current value of sample_size, so run this cell right after
# the n = 400 simulation (a later cell reassigns sample_size)
plots.title('Sample Size ' + str(sample_size))
plots.xlabel('Sample Average')


In [None]:
# Compare the population mean (mu) to the mean of the simulated sampling distribution.
# The sample mean is an unbiased estimator, so the two values should be close.
mean_delay, np.mean(means_400)

In [None]:
# Compare the population standard deviation (sigma) to the standard deviation
# of the sampling distribution (the standard error, SE)
sd_delay, np.std(means_400)

## Variability of the Sample Average ##

In [None]:
# Let's examine a distribution of sample means, where each mean comes from a larger sample size (n = 900)

sample_size = 900
repetitions = 10000

# Preallocate the results: np.append copies the whole array on every call,
# so growing the array inside the loop is quadratic in the number of repetitions.
means_900 = np.zeros(repetitions)

for i in np.arange(repetitions):
    # Table.sample draws with replacement unless told otherwise
    sampled_flights = united.sample(sample_size)
    sample_mean = np.mean(sampled_flights.column('Delay'))
    means_900[i] = sample_mean

In [None]:
# Put the two simulated sampling distributions (n = 400 and n = 900) side by side,
# one column per sample size, so they can be plotted together
means_tbl = Table().with_column('400', means_400).with_column('900', means_900)

In [None]:
# histogram of both columns of means_tbl on common bins of width 0.5 (edges from 5 to 30.5)
means_tbl.hist(bins = np.arange(5, 31, 0.5))
plots.title('Distribution of Sample Average');

### Exploring the relationship between sample size (n) and the spread of the sampling distribution (SE)

In [None]:
# a function that computes the sampling distribution of sample means for the United flight delays

def sample_means(sample_size, repetitions=10000):
    """Simulate and summarize the empirical distribution of random sample means.

    Repeatedly draws a random sample of flights from the global `united` table,
    records the mean of its 'Delay' column, then draws a histogram of the
    simulated means and prints them next to the population quantities.

    Parameters
    ----------
    sample_size : int
        Number of flights drawn in each random sample.
    repetitions : int, optional
        Number of samples to simulate (default 10000).

    Returns
    -------
    None
        The results are shown as a histogram and printed text.
    """
    # Population quantities are the same for every repetition, so compute them once.
    population_delays = united.column('Delay')
    population_mean = np.mean(population_delays)
    population_sd = np.std(population_delays)

    # Preallocate instead of np.append in the loop: np.append copies the whole
    # array on every call, which is quadratic in the number of repetitions.
    means = np.zeros(repetitions)
    for i in range(repetitions):
        sampled_flights = united.sample(sample_size)
        means[i] = np.mean(sampled_flights.column('Delay'))

    # Named so that it does not shadow this function inside its own body.
    means_table = Table().with_column('Sample Means', means)

    # Display empirical histogram and print all relevant quantities
    means_table.hist(bins=np.arange(5, 36, 1))
    plots.xlabel('Sample Means')
    plots.title('Sample Size ' + str(sample_size))
    print("Sample size: ", sample_size)
    print("\nPopulation mean (mu):", population_mean)
    print("Average of sample means: ", np.mean(means))
    print("\nPopulation SD (sigma):", population_sd)
    print("SD of sample means (SE):", np.std(means))
    # Label corrected from "sqr(n)": the value printed is sigma divided by the square ROOT of n.
    print("Population SD divided by sqrt of the sample size (sigma/sqrt(n)):", population_sd/np.sqrt(sample_size))

In [None]:
# run the function for a sample size of n = 100
# (draws the histogram of simulated sample means and prints the summary quantities)
sample_means(100)

In [None]:
# run the function for a sample size of n = 400
# (compare the printed SE with the n = 100 run to see how the spread changes with n)
sample_means(400)

In [None]:
# run the function for a sample size of n = 625
# (sqrt(625) = 25, so the printed sigma/sqrt(n) value is sigma / 25)
sample_means(625)

In [None]:
# compare standard error estimates for a sequence of sample sizes
sample_sizes = np.arange(50, 401, 50)   # n = 50, 100, ..., 400
repetitions = 10000

# One empirical SE per sample size. Both arrays are preallocated because
# np.append copies the whole array on every call, which is quadratic in a loop.
sd_of_sample_means = np.zeros(len(sample_sizes))

for k, n in enumerate(sample_sizes):
    means = np.zeros(repetitions)
    for i in np.arange(repetitions):
        means[i] = np.mean(united.sample(n).column('Delay'))
    # the SD of the simulated sample means is the empirical standard error for this n
    sd_of_sample_means[k] = np.std(means)

In [None]:
# collect the results in a table: for each sample size n, the empirical SE
# (SD of the simulated sample means) next to the theoretical SE, sigma / sqrt(n)
# (sd_delay is the population SD computed near the top of the notebook)
sd_comparison = Table().with_columns(
    'Sample Size n', sample_sizes,
    'SD of 10,000 Sample Means', sd_of_sample_means,
    'Population_SD/sqrt(n)', sd_delay/np.sqrt(sample_sizes)
)

In [None]:
# display the comparison table
sd_comparison

In [None]:
# visualize the comparison of theoretical SEs and empirical SEs
# (both SE columns are plotted against 'Sample Size n')
sd_comparison.scatter('Sample Size n')

## Confidence interval using the SE formula

In [None]:
# Draw one random sample of n = 100 flights, without replacement,
# and keep only the delays
sample_size = 100
my_sample = united.sample(sample_size, with_replacement = False).column("Delay")
my_sample

In [None]:
# Summary statistics of the one sample drawn above: its mean and its SD
my_mean = my_sample.mean()
my_sd = my_sample.std()

In [None]:
# calculate an approximate standard error: the sample SD stands in for the unknown
# population SD (sigma) in sigma / sqrt(n)
# (because sigma is estimated, an exact interval would really use a t-distribution)
approx_SE = my_sd/np.sqrt(sample_size)


In [None]:
# calculate an approximate 95% confidence interval using:  mean +/- 2 * SE
# (about 95% of a normal distribution lies within 2 SDs of its center)
(my_mean - 2 * approx_SE, my_mean + 2 * approx_SE)


In [None]:
# Did it capture the population mean mu?
# (compare this value with the endpoints of the interval computed above)
np.mean(delays)

## Confidence interval for a proportion

In [None]:
# Let's look at the flights flying to JFK airport:
# count the flights per destination, then keep only the JFK row
united.group("Destination").where("Destination", "JFK")


In [None]:
# calculate the population proportion (pi) of flights going to JFK
# (the mean of a True/False array is the fraction of True values)
pop_proportion = np.mean(united.column('Destination') == "JFK")
pop_proportion

In [None]:
# Simulate the sampling distribution of the sample proportion of JFK flights:
# draw a random sample of size sample_size, record the proportion, and repeat.
sample_size = 900
repetitions = 10000

# Preallocate the results: np.append copies the whole array on every call,
# so growing the array inside the loop is quadratic in the number of repetitions.
proportions_900 = np.zeros(repetitions)

for i in np.arange(repetitions):
    sampled_flights = united.sample(sample_size)
    # the mean of a True/False array is the fraction of True values
    sample_proportion = np.mean(sampled_flights.column('Destination') == "JFK")
    proportions_900[i] = sample_proportion

In [None]:
# visualize the simulated sampling distribution of the sample proportion (n = 900)
Table().with_column('Sample Proportions', proportions_900).hist()

In [None]:
# calculate an approximate 95% confidence interval for the population proportion
# the normal approximation is only valid when n * proportion and n * (1 - proportion)
# are both at least about 10

sample_size = 900

one_sample = united.sample(sample_size)
sample_proportion = np.mean(one_sample.column('Destination') == "JFK")

# conservative standard error: sqrt(p * (1 - p)) is never larger than 0.5,
# so 0.5 / sqrt(n) is an upper bound on the SE of a sample proportion
SE_prop = .5/np.sqrt(sample_size)

# sample proportion +/- 2 * SE
(sample_proportion - 2 * SE_prop, sample_proportion + 2 * SE_prop)


In [None]:
# does the confidence interval capture the population proportion of flights going to JFK?
# (compare this value with the endpoints of the interval computed above)
pop_proportion