In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Lecture 15: Probability, Sampling, and Practice with Simulation 

## Probability

### Question:

Rick and Morty are two people in a group of 100 people. Suppose a sample of size 2 is chosen at random, **without replacement**.

(a) What is the probability that neither Rick nor Morty appears in the sample?

(b) What is the probability that both Rick and Morty appear in the sample?

In [None]:
# How could a simulation estimate the answer to (a)?
# Label the 100 people 0 through 99, with 0 standing for Rick and 1 for Morty
people = np.arange(100)
# Draw a single random sample of two distinct people (no replacement)
np.random.choice(people, size=2, replace=False)

In [None]:
# Report who appears in one random sample
my_sample = np.random.choice(people, size=2, replace=False)
rick_chosen = 0 in my_sample    # 0 represents Rick
morty_chosen = 1 in my_sample   # 1 represents Morty
print("Rick was chosen" if rick_chosen else "Rick was not chosen")
print("Morty was chosen" if morty_chosen else "Morty was not chosen")
    

In [None]:
# Simulate one iteration with a custom function
def run_one_iteration(num_people=100, sample_size=2):
    '''Draw one random sample and report whether neither Rick nor Morty is in it.

    The people are labeled 0 through num_people - 1; 0 represents Rick and
    1 represents Morty.

    num_people: size of the group the sample is drawn from (default 100)
    sample_size: number of people drawn, without replacement (default 2)

    Returns True if neither Rick nor Morty is chosen; otherwise, False.
    NumPy raises a ValueError if sample_size is larger than num_people.
    '''
    people = np.arange(num_people)
    my_sample = np.random.choice(people, sample_size, replace=False)
    # Each `not in` test is already a bool, so return the condition directly
    return 0 not in my_sample and 1 not in my_sample

run_one_iteration()

In [None]:
# Repeat the experiment 10,000 times with a for loop
num_iters = 10000
results = make_array()  # starts empty; each iteration's outcome is appended
for iteration in np.arange(num_iters):
    results = np.append(results, run_one_iteration())

results

In [None]:
# Because we started from an empty array, the outcomes were stored with the wrong data type
# `astype()` converts the values in an array to a different type
results = results.astype(bool)
results

In [None]:
# Put the outcomes in a table, then count how many iterations gave each outcome
outcomes_tbl = Table().with_column('Neither Chosen', results)
results_tbl = outcomes_tbl.group('Neither Chosen')
results_tbl.show()

In [None]:
# Proportion of iterations in which neither Rick nor Morty was chosen
neither_chosen_rows = results_tbl.where('Neither Chosen', True)
neither_chosen_count = neither_chosen_rows.column('count').item(0)
neither_chosen_proportion = neither_chosen_count / num_iters
print("Estimated Probability:", neither_chosen_proportion)

**The theoretical calculation**:

  - If we want to get neither Rick nor Morty, then we need the first person chosen to be one of the 98 other people. That has a probability of 98/100.
  - Given that the first person chosen was neither Rick nor Morty, the probability that the second person chosen is neither Rick nor Morty is 97 / 99 (why?).
  - By the Multiplication Rule, the probability that the sample includes neither Rick nor Morty is $(98/100)(97/99)$.

In [None]:
# Exact answer to (a) by the Multiplication Rule; compare with the simulated estimate
(98/100) * (97/99)

In [None]:
# Using a simulation, how would we estimate the answer to (b)?

# Recall, we are wondering about the probability that both Rick and Morty are chosen
# Redefine run_one_iteration to return True only when both are chosen
# Caution: this replaces the part (a) definition of the same name, so re-running
# the part (a) loop after this cell would simulate (b) instead

def run_one_iteration(num_people=100, sample_size=2):
    '''Draw one random sample and report whether both Rick and Morty are in it.

    The people are labeled 0 through num_people - 1; 0 represents Rick and
    1 represents Morty.

    num_people: size of the group the sample is drawn from (default 100)
    sample_size: number of people drawn, without replacement (default 2)

    Returns True if both Rick and Morty are chosen; otherwise, False.
    NumPy raises a ValueError if sample_size is larger than num_people.
    '''
    people = np.arange(num_people)
    my_sample = np.random.choice(people, sample_size, replace=False)
    # Each `in` test is already a bool, so return the condition directly
    return 0 in my_sample and 1 in my_sample

run_one_iteration()


In [None]:
# Now use a for loop to run 10,000 iterations
num_iters = 10000
results = make_array()  # to accumulate the results
for i in range(num_iters):
    # True only when Rick and Morty are both in this iteration's sample
    both_chosen = run_one_iteration()
    results = np.append(results, both_chosen)

# As before, convert the accumulated values to booleans
results = results.astype('bool')
results

In [None]:
# Make a table to show the results, and group by 'Both Chosen'
results_tbl = Table().with_column('Both Chosen', results).group('Both Chosen')
results_tbl.show()

# Both being chosen is rare (about 2 in 10,000 iterations), so some runs contain
# no True outcome at all. The grouped table then has no True row, and looking up
# its count with `.where('Both Chosen', True).column('count').item(0)` raises an
# IndexError. Counting the True values in `results` directly gives 0 in that case.
both_chosen_proportion = np.count_nonzero(results) / num_iters
print("Estimated Probability:", both_chosen_proportion)

**The theoretical calculation**:

  - If we want to get *both* Rick and Morty, then we need the first person chosen to be Rick or Morty; that probability is 2/100.
  - Given that the first person chosen was Rick or Morty, what is the probability that the second person chosen is the other one of the two? 1/99 (why?).
  - By the Multiplication Rule, the probability that both Rick and Morty are chosen is  $(2/100)(1/99)$.

In [None]:
# Exact answer to (b) by the Multiplication Rule; compare with the simulated estimate
(2/100)*(1/99)

## Random Sampling ##

We load a dataset of all domestic United flights out of San Francisco from 6/1/15 to 8/9/15, with each flight's destination and how long its departure was delayed (in minutes).

In [None]:
# Read the flight data from the CSV file
flights = Table.read_table('united.csv')

# Label each row with its index using `np.arange`, and make the labels the first column
row_labels = np.arange(flights.num_rows)
united = flights.with_column('Row', row_labels).move_to_start('Row')
united

# Note: Delay is measured in minutes

Some deterministic samples:

In [None]:
# A deterministic sample: every flight whose destination is JFK
# Repeating this selection always returns exactly the same rows
united.where('Destination', are.equal_to('JFK'))

In [None]:
# Another deterministic sample: rows 0, 1000, 2000, ...
every_thousandth_row = np.arange(0, united.num_rows, 1000)
united.take(every_thousandth_row)

In [None]:
# One more deterministic sample: five rows picked out by hand
chosen_rows = make_array(34, 6321, 10040, 12345, 175)
united.take(chosen_rows)

A **random sample** is chosen with the help of a random process.

In [None]:
# A random sample that is "systematic" rather than "simple":
# pick a random starting row among the first 1000, then take every 1000th flight from there
# Re-running this cell gives a different sample
start = np.random.choice(np.arange(1000))
sampled_rows = np.arange(start, united.num_rows, 1000)
systematic_sample = united.take(sampled_rows)
systematic_sample.show()

## Distributions ##

In [None]:
# A simpler population than the plane flights: the six faces of a die
faces = np.arange(1, 7)
die = Table().with_column('Face', faces)
die

In [None]:
# With the unique values in a 1-column table, the `sample()` method simulates rolls:
# here, 10 rows drawn with replacement
die.sample(10, with_replacement=True)

In [None]:
# The theoretical distribution for one die roll
# Bins are one unit wide, with edges at -0.5, 0.5, ..., 6.5
my_bins = np.arange(8) - 0.5
die.hist(bins=my_bins)

In [None]:
# An empirical histogram from 10 trials
ten_rolls = die.sample(10)
ten_rolls.hist(bins=my_bins)

In [None]:
# An empirical histogram from 100 trials
hundred_rolls = die.sample(100)
hundred_rolls.hist(bins=my_bins)

In [None]:
# An empirical histogram from 10000 trials
ten_thousand_rolls = die.sample(10000)
ten_thousand_rolls.hist(bins=my_bins)

## Large Random Samples ##

In [None]:
# Recall our population of plane flights from San Francisco
# (`united` was read from united.csv earlier and given a 'Row' label column)
united 

In [None]:
# Distribution of delays across the whole population
# Bin edges every 5 minutes, from -20 up to 200
united_bins = np.arange(-20, 201, 5)
united.hist('Delay', bins=united_bins)

In [None]:
# Smallest delay in the population
np.min(united.column('Delay'))

Note: A negative delay means the flight actually departed early.

In [None]:
# Largest delay in the population
np.max(united.column('Delay'))

580 minutes is almost 10 hours late. Sad.

In [None]:
# Mean delay over the whole population
np.mean(united.column('Delay'))

In [None]:
# Now we take a random sample of size 10 and visualize it
# NOTE(review): `sample` appears to draw with replacement by default (see the
# datascience docs), so this is not strictly a *simple* random sample - confirm
small_sample = united.sample(10)
small_sample.hist('Delay', bins=united_bins)

In [None]:
# The same plot for a larger sample of 1000 flights
large_sample = united.sample(1000)
large_sample.hist('Delay', bins=united_bins)

## Simulating Statistics ##

In [None]:
# The median of an array is its middle value once the values are sorted
# Computed from the entire population, a summary value like this is called a **parameter**
# (not a statistic): its value does not depend on sampling
population_delays = united.column('Delay')
np.median(population_delays)

In [None]:
# The median computed from a random sample is a statistic
# Its value varies each time we take a new random sample
sample_delays = united.sample(10).column('Delay')
np.median(sample_delays)

In [None]:
# As sample size increases, the sample median becomes more stable;
# it settles in near the median of the population
# Print three sample medians for each sample size to see how much they vary
for sample_size in np.arange(1, 10000, 1000):
    for repetition in range(3):
        sample_delays = united.sample(sample_size).column('Delay')
        print(np.median(sample_delays))

In [None]:
# A helper function
def sample_median(size):
    '''Return the median delay in one random sample of `size` rows from `united`.'''
    sampled_flights = united.sample(size)
    return np.median(sampled_flights.column('Delay'))

# Collect the statistics: three sample medians for each sample size
sample_size = make_array()
results = make_array()
sizes_to_try = [1, 2, 4, 8, 16, 32, 64, 100, 200, 300, 400, 500, 600, 700]
for size in sizes_to_try:
    for rep in range(1, 4):
        sample_size = np.append(sample_size, size)
        results = np.append(results, sample_median(size))

# Visualize with a scatterplot:
# how does the sample median vary under repeated sampling, compared to sample size?
medians_tbl = Table().with_columns('Sample Size', sample_size, 'Median', results)
medians_tbl.scatter('Sample Size', 'Median')