In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

# Confidence Intervals

## Another Example: Mean Maternal Age

In [None]:
# This time we have a sample, but no population data!
# Read the sample from baby.csv into the table `births` and show 5 of its rows.
births = Table.read_table('baby.csv')
births.show(5)

In [None]:
# sample distribution: histogram of the 'Maternal Age' column of births

births.hist('Maternal Age')

In [None]:
# Mean maternal age in our sample; shown as the cell's output.

maternal_ages = births.column('Maternal Age')
mean_age = np.mean(maternal_ages)
mean_age

In [None]:
def one_bootstrap_mean():
    """Return the mean maternal age of one bootstrap sample.

    The bootstrap sample is drawn with `births.sample()`, i.e. by sampling
    with replacement from our original sample.
    """
    resampled_births = births.sample()
    resampled_ages = resampled_births.column('Maternal Age')
    return np.mean(resampled_ages)

In [None]:
# Simulate 1000 bootstrap sample means.
# The values are collected in a list and converted to an array once, because
# np.append copies the entire array every time it is called inside a loop.
bootstrap_means = np.array(
    [one_bootstrap_mean() for _ in np.arange(1000)],
    dtype=float,
)

# Determine the bounds for the middle 95% of bootstrap sample mean maternal ages
left = percentile(2.5, bootstrap_means)
right = percentile(97.5, bootstrap_means)

In [None]:
# Histogram of the bootstrap means. The plot calls below draw onto the same
# figure, so the histogram has to be created first.
Table().with_column('Bootstrap means', bootstrap_means).hist()

# Gold bar: the interval from `left` to `right` along the horizontal axis.
plots.plot([left, right], [0, 0], color="gold", linewidth=10, zorder=1);
# Blue line: the mean maternal age of the original sample.
plots.plot([mean_age, mean_age], [0, 2.5], color="blue", linewidth=3, zorder=1);
plots.title('Bootstrap Means (1K Bootstraps from our Sample)');

# Report the interval endpoints rounded to 3 decimal places.
print(
    "We have 95% confidence that the mean maternal age is in the interval from",
    np.round(left, 3),
    "to",
    np.round(right, 3),
    'years old.',
)

## Back to an example with population data

In [None]:
# Treat united.csv as the population and record the median of its 'Delay' column.
united = Table.read_table('united.csv')
population_delays = united.column('Delay')
pop_median = np.median(population_delays)

# Draw 100 rows without replacement as "our sample" and take its median delay.
our_sample = united.sample(100, with_replacement=False)
sample_delays = our_sample.column('Delay')
our_sample_median = np.median(sample_delays)

# Show both medians side by side.
(pop_median, our_sample_median)

In [None]:
def one_bootstrap_median():
    """Return the median 'Delay' of one resample drawn from `our_sample`."""
    return np.median(our_sample.sample().column('Delay'))

# Simulate 1000 bootstrap sample medians.
# The values are collected in a list and converted to an array once, because
# np.append copies the entire array every time it is called inside a loop.
bootstrap_medians = np.array(
    [one_bootstrap_median() for _ in np.arange(1000)],
    dtype=float,
)

# Bounds for the middle 95% of the bootstrap medians.
left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

In [None]:
# Histogram bin edges at the integers from -3 through 10.
sampling_bins = np.arange(-3, 11)

# Histogram of the bootstrap medians. The plot calls below draw onto the same
# figure, so the histogram has to be created first.
bootstrap_median_table = Table().with_column('Bootstrap Medians', bootstrap_medians)
bootstrap_median_table.hist('Bootstrap Medians', bins=sampling_bins)

# Gold bar: the interval from `left` to `right` along the horizontal axis.
plots.plot([left, right], [0, 0], color="gold", linewidth=10, zorder=1);
# Red line: population median. Blue line: median of our sample.
plots.plot([pop_median, pop_median], [0, 0.3], color="red", linewidth=2, zorder=1);
plots.plot([our_sample_median, our_sample_median], [0, 0.3], color="blue", linewidth=2, zorder=1);

## Simulation of Simulations

In [None]:
# Empty arrays that will collect the left and right endpoint of each interval.
left_ends, right_ends = make_array(), make_array()

# Keep only the 'Delay' column of the population table.
total_delays = united.select('Delay')

def bootstrap_median(original_sample, label, replications):
    """Returns an array of bootstrapped sample medians:
    original_sample: table containing the original sample
    label: label of column containing the variable
    replications: number of bootstrap samples

    Each median is computed as percentile(50, ...) of one resample drawn with
    sample() from the selected column. The result is a float array with one
    entry per replication (an empty array when replications is 0).
    """
    just_one_column = original_sample.select(label)

    def one_resampled_median():
        # One bootstrap replication: resample the column, take its 50th percentile.
        return percentile(50, just_one_column.sample().column(0))

    # Collect the medians in a list and convert once; growing an array with
    # np.append copies the whole array on every iteration.
    # dtype=float makes the result a float array even when the column holds
    # integers or when there are no replications.
    return np.array(
        [one_resampled_median() for _ in np.arange(replications)],
        dtype=float,
    )

In [None]:
# Generate 100 intervals, in the table intervals

# Reset the accumulators in this cell so that re-running it always produces
# exactly 100 intervals, instead of appending another 100 endpoints to the
# results of a previous run.
left_ends = make_array()
right_ends = make_array()

for i in np.arange(100):
    # A fresh sample of 100 delays, drawn without replacement from the population.
    first_sample = total_delays.sample(100, with_replacement=False)
    # 1000 bootstrap medians from that sample; keep the middle 95% as the interval.
    medians = bootstrap_median(first_sample, 'Delay', 1000)
    left_ends = np.append(left_ends, percentile(2.5, medians))
    right_ends = np.append(right_ends, percentile(97.5, medians))

intervals = Table().with_columns(
    'Left', left_ends,
    'Right', right_ends
)

In [None]:
# Display the table of interval endpoints (columns 'Left' and 'Right').
intervals

In [None]:
# Transpose the intervals: one column per replication, labeled '1' .. '100',
# with the left endpoints in the first row and the right endpoints in the second.
replication_number = np.arange(1, 101).astype(str)
endpoint_rows = make_array(left_ends, right_ends)
intervals2 = Table(replication_number).with_rows(endpoint_rows)
intervals2

In [None]:
# make the plot bigger
plots.figure(figsize=(8, 8))

# plot 100 intervals on one plot: the interval in column i is drawn at height i + 1
for i in np.arange(100):
    height = i + 1
    plots.plot(intervals2.column(i), make_array(height, height),
               color='gold', linewidth=2, zorder=2)

# vertical red line at the population median, drawn behind the intervals
plots.plot([pop_median, pop_median], make_array(0, 100), color='red', linewidth=5, zorder=1)
plots.xlabel('Median (minutes)')
plots.ylabel('Replication')
plots.title('Population Median and Intervals of Estimates');