In [1]:
# Confidence Intervals and Bootstrapping

# To use this tutorial, read the commands and execute the code section by section.

# The learning objective is to understand how to compute confidence 
# intervals (known as credible intervals when using Bayesian approaches),
# which are a fundamental piece of rigorous, quantitative science beacuse
# they describe the uncertainty that you have in estimating some property
# of the world given noisy, finite observations.

# For NGG students, this tutorial is meant to be used in tandem with this
# Discussion on the NGG Canvas site:

#  https://canvas.upenn.edu/courses/1358934/discussion_topics/5464266

# Copyright 2019 by Joshua I. Gold, University of Pennsylvania
# Originally written for Matlab, translated to Python 1.30.20 by CMH

import numpy as np
from scipy.stats import norm
from scipy import stats
from random import choices

# Exercise: Compute confidence/credible intervals for simulated data 
# sampled from a population that is Gaussian distributed with mean mu=10 
# and standard deviation sigma=2, for n=5, 10, 20, 40, 80 at a 
# 95% confidence level.
mu = 10
sigma = 5
alpha = 0.95
NB = 1000 # number of bootstraps

# Loop through the n's
# Note that the different approaches converge on the same answer as n gets large
for n in [5, 10, 20, 40, 80, 160, 1000]: 
   
    #  Simulate some data
    data = np.random.normal(mu, sigma, n)
   
    # Save the mean
    meand = np.mean(data)
   
    # Show the mean, n
    print('N = {0:.2f}, MEAN = {1:.2f}'.format(n, meand))
   
    # METHOD 1: analytic solution assuming Gaussian

    # Get the z-score for the given confidence level (make it negative
    # so we can subtract it to make the lower interval)
    z = norm.ppf((1-alpha)/2)
   
    # 1a. Use the given sigma
    sem = sigma/np.sqrt(n);
    CI_low = meand - (sem*z)
    CI_high = meand + (sem*z)
    print('\t1a: CI = [{0:.2f} {1:.2f}]'.format(CI_low, CI_high))

    # 1b. Use the sample sigma
    # BEST IF n IS LARGE (>30)
    sem = np.std(data)/np.sqrt(n); 
    CI_low = meand - (sem*z)
    CI_high = meand + (sem*z)
    print('\t1b: CI = [{0:.2f} {1:.2f}]'.format(CI_low, CI_high))
   
    # METHOD 2: analytic solution assuming t-distribution
    # BEST IF n IS SMALL (<30) ... note that as n increases, the t
    # distribution approaches a Gaussian and methods 1 and 2 become more
    # and more similar

    # Get the cutoff using the t distribution, which is said to have n-1
    # degrees of freedom
    t = - (stats.t.ppf((1-alpha)/2, n - 1))
    sem = np.std(data)/np.sqrt(n);
    CI_low = meand - (sem*t)
    CI_high = meand + (sem*t)
    print('\t2:  CI = [{0:.1f} {1:.2f}]'.format(CI_low, CI_high))
   
    # METHOD 3: bootstrap!

    # Resample the data with replacement to get new estimates of mu 
    # Note that here we do not make any assumptions about the nature of the real distribution.
    mu_star = np.zeros((NB, 1))
    for ii in range(NB - 1):
        random_sample = np.random.choice(data, n)
        mu_star[ii] = np.mean(random_sample);
   
    # Now report the CI directly from the bootstrapped distribution
    CI_low =  np.percentile(mu_star, 100*(1-alpha)/2)
    CI_high = np.percentile(mu_star, 100*(alpha+(1-alpha)/2))
    print('\t3:  CI = [{0:.2f} {1:.2f}]'.format(CI_low, CI_high))
   
    # Method 4: Credible interval
    # See the Canvas discussion -- under these assumptions (i.e., data
    # generated from a Gaussian distribution with known sigma), the answer
    # is exactly the same as with Method 1, above. Note that this
    # equivalence is NOT true in general, which means that frequentist
    # confidence intervals and Bayesian credible intervals can give
    # different answers for certain distributions.

N = 5.00, MEAN = 10.42
	1a: CI = [14.80 6.03]
	1b: CI = [13.77 7.06]
	2:  CI = [5.7 15.17]
	3:  CI = [6.66 13.17]
N = 10.00, MEAN = 9.14
	1a: CI = [12.24 6.05]
	1b: CI = [11.22 7.07]
	2:  CI = [6.7 11.54]
	3:  CI = [6.85 11.07]
N = 20.00, MEAN = 8.13
	1a: CI = [10.32 5.93]
	1b: CI = [10.23 6.02]
	2:  CI = [5.9 10.38]
	3:  CI = [6.02 10.22]
N = 40.00, MEAN = 9.90
	1a: CI = [11.45 8.35]
	1b: CI = [11.34 8.45]
	2:  CI = [8.4 11.38]
	3:  CI = [8.45 11.26]
N = 80.00, MEAN = 9.27
	1a: CI = [10.36 8.17]
	1b: CI = [10.20 8.34]
	2:  CI = [8.3 10.21]
	3:  CI = [8.33 10.20]
N = 160.00, MEAN = 9.91
	1a: CI = [10.69 9.14]
	1b: CI = [10.76 9.06]
	2:  CI = [9.1 10.77]
	3:  CI = [9.00 10.78]
N = 1000.00, MEAN = 10.07
	1a: CI = [10.38 9.76]
	1b: CI = [10.38 9.76]
	2:  CI = [9.8 10.38]
	3:  CI = [9.77 10.37]
