In [None]:
from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import numpy as np

## for Statements

In [None]:
# for each item in a sequence, we execute the block of code following the colon
for pet in make_array('cat', 'dog', 'rabbit'):
    print('I love my ' + pet)

In [None]:
# Notice that we can "add" two strings
'I love my ' + 'cat'

In [None]:
# Could have created the same output without using a for loop -- but it's tedious
pets = make_array('cat', 'dog', 'rabbit')

pet = pets.item(0)
print('I love my ' + pet)

pet = pets.item(1)
print('I love my ' + pet)

pet = pets.item(2)
print('I love my ' + pet)

In [None]:
# Our for loop can iterate over an array of numbers
# Notice the loop variable `i` loops over the values 0, 1, 2, 3, 4
for i in np.arange(5):
    print(i)

In [None]:
# If we want to do exactly the same thing 5 times in a row, we just 
# ignore the loop variable `i` and do what we want
for i in np.arange(5):
    print('Go Panthers')

In [None]:
# Sometimes we use the variable name `_` for a value that we don't care about
for _ in np.arange(5):
    print('Go Panthers')

In [None]:
# We saw `np.append()` last time, here's a refresher
s = make_array(2, 3)

# `np.append(array, new_value)` DOES NOT MODIFY the array; it returns a NEW array
np.append(s, 4)
s

Question:
  - Why doesn't the new array `[2, 3, 4]` appear in the output?
  - How can we verify the result of appending 4 to s?
  - How can we use `np.append()` to update `s`?

In [None]:
# Similarly, we can add 3 to each element of an array using the addition operator
s + 3
s

In [None]:
# But unless we assign a name to the result of adding 3, that result is lost

In [None]:
s = s + 3
s

In [None]:
s = np.append(s, 4)
s

In [None]:
# A common use of for loops is for accumulating the values of a new array

result = make_array()

for i in np.arange(5):     
    result = np.append(result, i**2 + 1)
    
result

**Back to Slides...**

## Simulating heads in 100 coin tosses

In [None]:
# How would we simulate flipping 100 coins to figure out how many came up "heads"?
coin = make_array('heads', 'tails')
coin

In [None]:
flips = np.random.choice(coin, 100) == 'heads'  # don't need a for loop for this
flips

In [None]:
sum(flips)

A for loop would be interesting here to repeat the "hundred flips" experiment thousands of times, collect up the number of heads in 100 tosses for each iteration, accumulate those numbers in an array, and visualize the results.

In [None]:
# A function to simulate one outcome of the experiment

def num_heads(n):
    """Simulate `n` coin tosses and return how many came up 'heads'.

    Draws `n` values at random (with replacement, each entry equally likely)
    from the notebook-level `coin` array and counts the draws equal to 'heads'.
    """
    tosses = np.random.choice(coin, n)
    # count_nonzero tallies the True entries in one vectorized pass, instead of
    # stepping through the array element by element as the built-in sum() does
    return np.count_nonzero(tosses == 'heads')

In [None]:
# Try calling the function to verify it seems to work
num_heads(100)

Run the previous cell multiple times. Do we always get the same answer? What's going on?

To investigate the amount of "random variation" when flipping a coin 100 times, we repeat the experiment 10,000 times and visualize the results.

In [None]:
# Decide how many times you want to repeat the experiment
repetitions = 10000

In [None]:
# Simulate that many outcomes with a for loop
# Use an initially-empty array, `outcomes`, to accumulate the results
outcomes = make_array()
for _ in np.arange(repetitions):
    # each pass simulates 100 tosses and records the number of heads;
    # np.append returns a new array, so the name must be reassigned
    outcomes = np.append(outcomes, num_heads(100))

Discuss:

  - What kind of object is `outcomes`? 

  - How long is `outcomes`?

  - What kind of elements are in `outcomes`?

  - What's a reasonable way to visualize `outcomes`?

In [None]:
# Make a table
heads = Table().with_column('Heads', outcomes)
heads.show(3)

# Draw the histogram, centering the bins at integers 30, 31, ..., 69, 70
heads.hist(bins = np.arange(29.5, 70.6, 1))

Questions:

  - How often did the value 50 appear in `outcomes`? Is this surprising?
  
  - Redraw the histogram several times after re-running the experiment. Is 50 always the most frequent outcome?

  - What is the shape of the distribution?

Sometimes we can study random variation by having the computer run a simulation; that's what we've done here. We didn't use any rules of probability or fancy mathematics. We just had the computer do 1 million coin tosses for us and visualized the results.

Other times we may want to have a theoretical model which captures the pattern of a certain distribution mathematically. Knowing some probability theory can let us calculate probabilities without a lot of computation. Math and Computing are both fundamental tools for Data Science. 

We will use both simulations and analytical tools (probability theory) to investigate random variation in this course. So, let's get started learning the basics of probability.

**Back to Slides...**

## Chance

In [None]:
# Chance of getting no 6 at all in 4 rolls of a die (the complement of "at least one 6")
prob_no_sixes_4_rolls = (5/6)**4
prob_no_sixes_4_rolls

In [None]:
# Probability of at least one 6 in 4 rolls
1 - prob_no_sixes_4_rolls

In [None]:
# Generalize: probability of at least one 6 in n rolls? 
# Show in a 2-column table
rolls = np.arange(1,41)
results = Table().with_columns(
    'Rolls', rolls,
    'Chance of at Least One 6', 1 - (5/6)**rolls
)
results 

In [None]:
# Visualize results in a scatter plot
results.scatter('Rolls')
plt.ylim(0, 1.1)    # start the y axis at 0

**Back to Slides...**

### The Monty Hall Problem ###

In [None]:
# If contestant chooses a goat, there are two possibilities
goats = make_array('first goat', 'second goat')
goats

In [None]:
# If contestant chooses a goat, Monty will reveal the "other" goat
def other_goat(x):
    """Return the label of the goat that is not `x`.

    'first goat' maps to 'second goat' and vice versa. Any other value
    matches neither label and yields None.
    """
    goat_pairs = (
        ('first goat', 'second goat'),
        ('second goat', 'first goat'),
    )
    for goat, partner in goat_pairs:
        if x == goat:
            return partner
    return None

In [None]:
other_goat('first goat')

In [None]:
other_goat('second goat')

In [None]:
# Make an array with the three options
hidden_behind_doors = np.append(goats, 'car')
hidden_behind_doors

In [None]:
# Write a function that makes a random choice for the contestant and
# returns a list of length 3 showing the contestant's guess, what is revealed
# by Monty, and what remains hidden behind the other door.

def monty_hall_game():
    """Play one round of the Monty Hall game.

    Returns a three-element list:
    [contestant's guess, what Monty reveals, what remains behind the other door]
    """
    # Random pick among the hidden items (original note: "E.L.O." --
    # presumably "equally likely outcomes"; confirm)
    pick = np.random.choice(hidden_behind_doors)

    if pick == 'car':
        # Contestant holds the car: Monty shows one goat at random and the
        # other goat stays hidden behind the remaining door
        shown = np.random.choice(goats)
        return [pick, shown, other_goat(shown)]

    if pick == 'first goat' or pick == 'second goat':
        # Contestant holds a goat: Monty shows the other goat, which leaves
        # the car behind the remaining door
        return [pick, other_goat(pick), 'car']

In [None]:
# One round of the Monty Hall game
monty_hall_game()

In [None]:
# Simulate 10000 rounds of the game, storing each round as one table row

games = Table(['Guess', 'Revealed', 'Remaining'])   # three column labels, no rows yet

for _ in np.arange(10000):
    # the result of append is not reassigned: each call adds one row to `games`
    games.append(monty_hall_game())


In [None]:
games.show(6)

In [None]:
# Show the distribution of the contestant's guess
original_choice = games.group('Guess')
original_choice

In [None]:
# Show the distribution of the remaining door, the one they can switch to
remaining_door = games.group('Remaining')
remaining_door

In [None]:
# Join the two tables on the 'Guess' and 'Remaining' columns
joined = original_choice.join('Guess', remaining_door, 'Remaining')
print("joined result")
joined.show()

# Relabel the columns for clarity
combined = joined.relabeled(0, 'Item').relabeled(1, 'Original Door').relabeled(2, 'Remaining Door')
print("combined result")
combined

In [None]:
# Make a bar chart to visualize
combined.barh('Item')

If the contestant stays with her original door choice, she has a clear 1/3 chance of winning the car. But, if she switches to the remaining door, she has a 2/3 chance of winning the car.