# Curriculum Review


In [13]:
%matplotlib inline
import numpy as np
import pandas as pd

import viz # curriculum example visualizations

np.random.seed(29)

## Carnival Dice Rolls
    You are at a carnival and come across a person in a booth offering you a game of "chance" (as people in booths at carnivals tend to do).
    You pay 5 dollars and roll 3 dice. If the sum of the dice rolls is greater than 12, you get 15 dollars. If it's less than or equal to 12, you get nothing.
    Assuming the dice are fair, should you play this game? How would this change if the winning condition was a sum greater than or equal to 12?

In [14]:
n_trials = nrows = 10_000
n_dice = ncols = 3

rolls = np.random.choice([1, 2, 3, 4, 5, 6], n_trials * n_dice).reshape(nrows, ncols)
rolls

array([[6, 4, 5],
       [6, 3, 1],
       [1, 2, 2],
       ...,
       [6, 2, 1],
       [3, 4, 3],
       [4, 2, 4]])

In [15]:
sums_by_trial = rolls.sum(axis=1)
sums_by_trial

array([15, 10,  5, ...,  9, 10, 10])

In [17]:
wins = sums_by_trial > 12
wins

array([ True, False, False, ..., False, False, False])

In [18]:
wins.mean()

0.2633

In [19]:
# not necessary, just do mean and it will treat Trues and 1 and False as 0
win_rate = wins.astype(int).mean()
win_rate

0.2633

In [20]:
expected_winnings = win_rate * 15
cost = 5
expected_profit = expected_winnings - cost
expected_profit

-1.0505000000000004

In [21]:
wins = sums_by_trial >= 12
win_rate = wins.astype(int).mean()
expected_winnings = win_rate * 15
cost = 5
expected_profit = expected_winnings - cost
expected_profit

0.5860000000000003

## No Rest or Relaxation
    There's a 30% chance my son takes a nap on any given weekend day. What is the chance that he takes a nap at least one day this weekend? What is the probability that he doesn't nap at all? 

In [22]:
p_nap = .3
ndays = ncols = 2
n_simulated_weekends = nrows = 10**5

In [23]:
data = np.random.random((nrows, ncols))
data

array([[0.46762045, 0.70078355],
       [0.18897809, 0.54312897],
       [0.253291  , 0.43836437],
       ...,
       [0.15008559, 0.37577491],
       [0.34690321, 0.58934311],
       [0.97135998, 0.57219933]])

In [24]:
naps = data < p_nap
naps

array([[False, False],
       [ True, False],
       [ True, False],
       ...,
       [ True, False],
       [False, False],
       [False, False]])

In [25]:
naps.sum(axis=1)

array([0, 1, 1, ..., 1, 0, 0])

In [26]:
(naps.sum(axis=1) >= 1).mean()

0.50998

In [27]:
(naps.sum(axis=1) == 0).mean()

0.49002

## One With Dataframes
    Let's take a look at one more problem:
    What is the probability of getting at least one 3 in 3 dice rolls?

In [41]:
n_simulations = nrows = 10**5
n_dice_rolled = ncols = 3

rolls = np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols).reshape(nrows, ncols)
rolls

array([[3, 3, 6],
       [5, 3, 2],
       [4, 4, 6],
       ...,
       [5, 4, 1],
       [5, 2, 4],
       [1, 1, 6]])

In [42]:
pandarolls = pd.DataFrame(rolls)
pandarolls.head()

Unnamed: 0,0,1,2
0,3,3,6
1,5,3,2
2,4,4,6
3,6,2,1
4,3,6,4


In [43]:
boolpandarolls = pandarolls.apply(lambda row: 3 in row.values, axis=1)
boolpandarolls.head()

0     True
1     True
2    False
3    False
4     True
dtype: bool

In [44]:
boolpandarolls.mean()

0.42251

# Exercises

#### 1. 

How likely is it that you roll doubles when rolling two dice?

In [52]:
# define potential outcomes
outcomes = [1,2,3,4,5,6]
simulations = 1_000_000
rolls1 = np.random.choice(outcomes, simulations)
rolls2 = np.random.choice(outcomes, simulations)

same = rolls1 == rolls2
same.mean()

0.166536

In [53]:
# theoretical
6 / 36

0.16666666666666666

In [55]:
# try with different syntax for np.random.choice
outcomes = [1,2,3,4,5,6]
simulations = 1_000_000
rolls1 = np.random.choice(outcomes, size = simulations)
rolls2 = np.random.choice(outcomes, size = simulations)

same = rolls1 == rolls2
same.mean()
# looks like second argument defaults to being size without having to type "size ="

0.166924

#### 2. 

If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [77]:
# create a matrix that is 8 x 1M where each value represets a flip

# create boolean mask where new array value == 3
outcomes = [1, 0] # 1 is heads and 0 is tails
ncoins = ncols = 8
nsimulations = nrows = 1_000_000

# flips = np.random.choice(outcomes, ncols * nrows).reshape(nrows, ncols) # this is the method shown in the curriculum, but there is a better way
flips = np.random.choice(outcomes, size=(nsimulations, ncols))

flips[0:5]

array([[1, 0, 1, 1, 0, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0]])

In [78]:
# sum each row and add new values to new 1D array
sum_of_flips = flips.sum(axis=1)
sum_of_flips

array([4, 2, 4, ..., 2, 3, 3])

In [79]:
# create boolean mask where new array value == 3
three_heads = sum_of_flips == 3
three_heads

array([False, False, False, ..., False,  True,  True])

In [80]:
# find probability using mean of boolean mask
probability_of_three_heads = three_heads.mean()
probability_of_three_heads

0.218896

In [82]:
# getting more than three heads
# change condition for boolean mask and repeat rest of steps
more_than_three_heads = sum_of_flips >= 3
more_than_three_heads

array([ True, False,  True, ..., False,  True,  True])

In [84]:
probability_more_than_three = more_than_three_heads.mean()
probability_more_than_three

0.855686

#### 3. 

There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [87]:
# will have to use the p argument in np.random.choice to set probabilities
outcomes = [1, 0] #one for data science and 0 for webdev
prob = [0.25, 0.75]
nbillboards = ncols = 2
nsimulations = nrows = 1_000_000

billboard = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)
billboard

array([[0, 0],
       [1, 1],
       [0, 1],
       ...,
       [0, 1],
       [0, 0],
       [0, 0]])

In [89]:
# sum the rows to create new 1d array
sum_of_ds = billboard.sum(axis=1)
sum_of_ds

array([0, 2, 1, ..., 1, 0, 0])

In [91]:
# create boolean mask with condition where == 2
both_ds = sum_of_ds == 2
both_ds

array([False,  True, False, ..., False, False, False])

In [93]:
probability_of_both_ds = both_ds.mean()
probability_of_both_ds

0.062834

In [94]:
# theoretical
0.25 ** 2

0.0625

#### 4.  

Codeup students buy, on average, 3 poptart packages (+- 1.5) a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?

In [103]:
# use np.random.normal to create numbers
ndays = ncols = 5
nsimulations = nrows = 1_000_000

poptarts = np.random.normal(3, 1.5, (nrows, ncols))
poptarts

array([[3.24785696, 5.26920465, 3.80526465, 4.17531891, 3.94814869],
       [3.10775126, 3.61137905, 3.40837398, 4.88380256, 3.65230578],
       [4.74301626, 3.44715995, 4.95185758, 4.21623088, 6.51033716],
       ...,
       [3.73174214, 1.05063008, 4.15103049, 2.71041675, 3.52854683],
       [1.65063811, 2.22448541, 5.54082656, 5.05661653, 6.84130274],
       [1.71960837, 1.22646326, 1.41453264, 1.85094099, 4.06341107]])

In [99]:
# sum rows and compare to 17
sum_of_bought = poptarts.sum(axis=1)
sum_of_bought

array([18.62932646, 12.77864268, 16.08061129, ..., 16.26186138,
       14.5209858 , 19.86396312])

In [101]:
any_left = sum_of_bought > 17
any_left

array([ True, False, False, ..., False, False,  True])

In [102]:
any_left.mean()

0.275339

#### 5. 

Compare Heights
- Men have an average height of 178 cm and standard deviation of 8cm.
- Women have a mean of 170, sd = 6cm.
- Since you have means and standard deviations, you can use np.random.normal to generate observations.
- If a man and woman are chosen at random, P(woman taller than man)?

In [104]:
# use np.random.normal to create numbers 0.21
ncols = 1
nsimulations = nrows = 1_000_000

heights_men = np.random.normal(178, 8, (nrows, ncols))
heights_men

array([[187.84056005],
       [181.51032839],
       [192.83671763],
       ...,
       [172.25600002],
       [175.40716893],
       [194.26125042]])

In [105]:
ncols = 1
nsimulations = nrows = 1_000_000

heights_women = np.random.normal(170, 6, (nrows, ncols))
heights_women

array([[167.64489892],
       [175.16359697],
       [184.35736378],
       ...,
       [172.7989298 ],
       [161.6757952 ],
       [157.60098654]])

In [107]:
taller = heights_women > heights_men
taller

array([[False],
       [False],
       [False],
       ...,
       [ True],
       [False],
       [False]])

In [108]:
taller.mean()

0.212147

#### 6. 

When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?
- What is the probability that we observe an installation issue within the first 150 students that download anaconda?
- How likely is it that 450 students all download anaconda without an issue?

In [118]:
# define probability
outcomes = [0, 1] # zero for success, 1 for issues
prob = [249/250, 1/250]
ndownloads = ncols = 50
nsimulations = nrows = 1_000_000

anaconda = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)
anaconda

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [120]:
#sum rows, if > 0, someone had issues
sum_of_issues = anaconda.sum(axis=1)
sum_of_issues

array([0, 0, 0, ..., 0, 0, 0])

In [124]:
issues = sum_of_issues == 0
issues

array([ True,  True,  True, ...,  True,  True,  True])

In [125]:
issues.mean()

0.819024

In [127]:
# redo using 100 students
outcomes = [0, 1] # zero for success, 1 for issues
prob = [249/250, 1/250]
ndownloads = ncols = 100
nsimulations = nrows = 1_000_000

anaconda = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)
sum_of_issues = anaconda.sum(axis=1)
issues = sum_of_issues == 0
issues.mean()

0.669015

In [130]:
# What is the probability that we observe an installation issue within the first 150 students that download anaconda?
# are looking for issues so logic will change to make the mask

outcomes = [0, 1] # zero for success, 1 for issues
prob = [249/250, 1/250]
ndownloads = ncols = 150
nsimulations = nrows = 1_000_000

anaconda = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)
sum_of_issues = anaconda.sum(axis=1)
issues = sum_of_issues > 0
issues.mean()

0.451561

In [132]:
# How likely is it that 450 students all download anaconda without an issue?
# switching back to condition where there are no issues
outcomes = [0, 1] # one for success, zero for issues
prob = [249/250, 1/250]
ndownloads = ncols = 450
nsimulations = nrows = 1_000_000

anaconda = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)
sum_of_issues = anaconda.sum(axis=1)
issues = sum_of_issues == 0
issues.mean()

0.164046

#### 7. 

There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this? How likely is it that a food truck will show up sometime this week?

In [134]:
outcomes = [1, 0] # 1 for seeing a food truck, 0 for not
prob = [0.7, 0.3]
ndownloads = ncols = 3
nsimulations = nrows = 1_000_000

travis = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)
travis

array([[0, 1, 1],
       [0, 1, 1],
       [1, 1, 0],
       ...,
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])

In [136]:
sum_of_sightings = travis.sum(axis=1)
sum_of_sightings

array([2, 2, 2, ..., 3, 3, 3])

In [138]:
no_sightings = sum_of_sightings == 0
no_sightings

array([False, False, False, ..., False, False, False])

In [139]:
no_sightings.mean()

0.027025

In [142]:
# How likely is it that a food truck will show up sometime this week?
outcomes = [1, 0] # 1 for seeing a food truck, 0 for not
prob = [0.7, 0.3]
ndownloads = ncols = 7
nsimulations = nrows = 1_000_000

travis = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)

sum_of_sightings = travis.sum(axis=1)

sighting_this_week = sum_of_sightings > 0

sighting_this_week.mean()


0.999773

#### 8. 

If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [149]:
# the probability of two people having different birthday is (365/365)*(364/365)
p_of_2_diff = (365/365)*(364/365)
p_of_2_diff

0.9972602739726028

In [154]:
outcomes = [1, 0] # 1 for same, 0 for diff
prob = [1 - p_of_2_diff, p_of_2_diff]
ncols = 23
nrows = 1_000_000

bday = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)
bday[0:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0]])

In [155]:
sum_of_same = bday.sum(axis=1)
sum_of_same

array([0, 0, 0, ..., 0, 0, 0])

In [156]:
twenty_three_same = sum_of_same > 0
twenty_three_same

array([False, False, False, ..., False, False, False])

In [157]:
twenty_three_same.mean()

0.061112

In [158]:
# repeat for 20

outcomes = [1, 0] # 1 for same, 0 for diff
prob = [1 - p_of_2_diff, p_of_2_diff]
ncols = 20
nrows = 1_000_000

bday = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)

sum_of_same = bday.sum(axis=1)

twenty_same = sum_of_same > 0

twenty_same.mean()

0.053123

In [159]:
# repeat for 40

outcomes = [1, 0] # 1 for same, 0 for diff
prob = [1 - p_of_2_diff, p_of_2_diff]
ncols = 40
nrows = 1_000_000

bday = np.random.choice(outcomes, size=(nsimulations, ncols), p=prob)

sum_of_same = bday.sum(axis=1)

forty_same = sum_of_same > 0

forty_same.mean()

0.103588

In [171]:
# this is not even close to approximating the theoretical probability
# the logic above that it is the same as taking 23 trials of 2 people is not correct, since as the number of people increases, so does the probability that 2 will have the same bday since the number of possible bdays is finite
# the probability will change for each trial in the row, so I'm not sure how to use simulation to estimate this one
# add another row or use a lambda function?

In [169]:
# function to find theoretical solution

def samebday(people):
    days = 365
    prob = 1
    for f in range(people): 
        prob *= (days-f) / days
    return 1 - prob

In [174]:
samebday(40)

0.891231809817949