In [42]:
import numpy as np
import pandas as pd

In [43]:
np.random.seed(29)


### 1. How likely is it that you roll doubles when rolling two dice?

In [44]:
#first, create an array representing 1,000,000 rolls of 2 dice
n_trials = nrows = 1_000_000
n_dice = ncols = 2
#here's your "roll": there are six possibilities
rolls = np.random.choice([1,2,3,4,5,6], n_trials * n_dice).reshape(nrows, ncols)
rolls

array([[6, 4],
       [5, 6],
       [3, 1],
       ...,
       [4, 4],
       [5, 3],
       [4, 2]])

In [45]:
rolls.shape

(1000000, 2)

In [46]:
#get it in a DF
rolls = pd.DataFrame(rolls, columns=['first_roll','second_roll'])
rolls.head()

Unnamed: 0,first_roll,second_roll
0,6,4
1,5,6
2,3,1
3,1,2
4,2,1


In [47]:
#in this example, i've created a function to count when they are the same
def count_doubles(row):
    return row.first_roll == row.second_roll
count_doubles
#and here, i create a new column full of booleans for when they are the same
rolls['doubles_count'] = rolls.apply(count_doubles, axis=1)
rolls.sample(20)

Unnamed: 0,first_roll,second_roll,doubles_count
731639,5,5,True
638494,3,5,False
205898,2,6,False
328909,4,5,False
165299,4,5,False
433413,2,4,False
292370,6,4,False
18811,6,3,False
750288,6,2,False
50665,3,6,False


In [48]:
#finally, get the average of where the two are the same
(rolls.doubles_count==True).mean()

0.166917

### 2. If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [58]:
n_trials = 100_000
n_coins = 8
#similar to above ex, but here we're just using a true/false choice and avoiding having to reshape
flips = np.random.choice([0,1], size=(n_trials, n_coins))
flips

array([[0, 1, 0, ..., 1, 1, 1],
       [1, 0, 0, ..., 1, 1, 0],
       [1, 0, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 0, ..., 1, 0, 1],
       [0, 0, 1, ..., 0, 1, 1],
       [0, 1, 0, ..., 1, 1, 0]])

In [59]:
flips.shape

(100000, 8)

In [60]:
flips = pd.DataFrame(flips)
flips.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7
90018,0,0,0,0,0,1,0,1
58825,0,1,1,1,0,1,1,1
41789,0,1,0,0,0,1,1,0
17054,1,0,0,0,0,1,1,1
69216,1,0,1,0,0,1,1,0
32472,1,0,1,1,0,0,1,0
6709,0,0,0,1,1,1,0,1
31932,0,0,1,1,1,0,1,0
59519,0,0,1,1,1,0,1,1
95190,1,1,0,1,1,1,1,1


In [61]:
#add a counter for the number of heads in our series
flips['n_heads'] = flips.sum(axis=1)
flips.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,n_heads
49026,0,1,1,0,0,1,1,1,5
4915,0,1,0,0,0,1,1,0,3
12579,0,0,0,1,0,0,1,0,2
66768,1,1,0,0,1,0,0,0,3
75506,1,0,1,0,0,1,0,0,3
28259,1,1,0,0,0,0,1,1,4
10736,1,1,1,1,0,0,1,1,6
68503,0,1,1,1,0,1,1,0,5
65850,0,0,0,0,1,1,0,0,2
66497,0,0,1,0,1,0,1,1,4


In [62]:
#and get the average
(flips.n_heads == 3).mean()

0.21724

### 3. There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [66]:
n_trials = 100_000
n_billboards = 2

#get our array.  4 choices here; we'll say that '3' == ds student

billboards = np.random.choice([0,1,2,3], size=(n_trials, n_billboards))
billboards

array([[0, 3],
       [3, 2],
       [2, 0],
       ...,
       [2, 0],
       [3, 3],
       [2, 0]])

In [68]:
billboards = pd.DataFrame(billboards, columns=['first_bb','second_bb'])
billboards.head()

Unnamed: 0,first_bb,second_bb
0,0,3
1,3,2
2,2,0
3,2,1
4,3,0


In [69]:
#add a row that returns a boolean if both billboards were a '3'
billboards['ds_count'] = (billboards.first_bb == 3) & (billboards.second_bb == 3)
billboards.sample(10)

Unnamed: 0,first_bb,second_bb,ds_count
50278,1,1,False
7220,1,3,False
11735,2,0,False
82565,2,1,False
9364,1,3,False
8070,2,2,False
92730,0,2,False
97356,3,0,False
81418,3,1,False
53666,3,3,True


In [70]:
#and the average of ds_count == true (meaning both were ds students)
billboards.ds_count.mean()

0.06265

### 4. Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon? (Remember, if you have mean and standard deviation, use the np.random.normal)

In [105]:
#here, we use np.random.normal to get our array, which is 5 (for 5 days) long, each value representing
#how many poptarts were bought that day.

mu, sigma = 3, 1.5
s = np.random.normal(mu,sigma,size = (100_000,5))
s

array([[2.31498356, 3.80299243, 4.04693461, 1.31504636, 4.83958929],
       [2.58405653, 4.00942842, 1.7295788 , 1.66791707, 1.42272179],
       [2.10188089, 3.23729999, 2.46168407, 1.68633274, 3.07175384],
       ...,
       [5.87093105, 3.26776733, 5.77376392, 4.83785905, 1.23505729],
       [2.58090789, 2.9717549 , 3.2056756 , 1.5668861 , 1.8344118 ],
       [2.10320685, 3.26454813, 2.65973007, 2.44223698, 4.29697168]])

In [106]:
#where the sum of them is less than 17 means there should be some left

(s.sum(axis=1) < 17).mean()

0.72479

### 5. Compare Heights

- Men have an average height of 178 cm and standard deviation of 8cm.
- Women have a mean of 170, sd = 6cm.
- Since you have means and standard deviations, you can use np.random.normal to generate observations.
- If a man and woman are chosen at random, P(woman taller than man)?

In [114]:
#similarly: 100_000 samples, but only 1 in each array
mu, sigma = 178, 8
men = np.random.normal(mu,sigma,size = (100_000,1))
men = pd.DataFrame(men)
men

Unnamed: 0,0
0,179.032519
1,170.532081
2,171.730234
3,164.897598
4,177.207476
...,...
99995,188.960693
99996,167.621551
99997,186.370176
99998,174.859755


In [123]:
mu, sigma = 170, 6
women = np.random.normal(mu,sigma,size = (100_000,1))
women = pd.DataFrame(women)
women

Unnamed: 0,0
0,169.440921
1,168.806771
2,174.152739
3,172.184862
4,175.907348
...,...
99995,167.518334
99996,178.472330
99997,178.692360
99998,168.317269


In [124]:
#since the arrays are identical, we can do a direct comparison
(women > men).mean()

0    0.21067
dtype: float64

### 6. When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?


In [185]:
n_trials = 100_000
n_installs = 50
#0 == fails, 1 == succeeds
installs = np.random.choice([0,1], size=(n_trials, n_installs), p=(1/250,249/250))
installs

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [186]:
installs = pd.DataFrame(installs)
installs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
99996,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
99997,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
99998,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [187]:
#get a count of how many times all the installs were fine; 50 means they were all good
installs['count'] = installs.sum(axis=1)
installs.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,count
61632,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
27168,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,49
22634,1,1,1,1,1,1,1,1,1,1,...,1,1,1,0,1,1,1,1,1,48
67055,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
54895,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
82025,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
18801,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
77499,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
52551,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
79923,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50


In [188]:
#the average of where the count was exactly 50, meaning all installs were good

(installs['count'] == 50).mean()

0.81822

In [190]:
#for 100 students:

n_trials = 100_000
n_installs = 100
installs = np.random.choice([0,1], size=(n_trials, n_installs), p=(1/250,249/250))
installs = pd.DataFrame(installs)
installs['count'] = installs.sum(axis=1)

(installs['count'] == 100).mean()


0.6698


### What is the probability that we observe an installation issue within the first 150 students that download anaconda?


In [195]:
#for 150 students:
n_trials = 100_000
n_installs = 150
#0 == fails, 1 == succeeds
installs = np.random.choice([0,1], size=(n_trials, n_installs), p=(1/250,249/250))
installs = pd.DataFrame(installs)
installs['count'] = installs.sum(axis=1)

(installs['count'] != 150).mean()

0.45161

### How likely is it that 450 students all download anaconda without an issue?

In [196]:
#for 450  students:
n_trials = 100_000
n_installs = 450
#0 == fails, 1 == succeeds
installs = np.random.choice([0,1], size=(n_trials, n_installs), p=(1/250,249/250))
installs = pd.DataFrame(installs)
installs['count'] = installs.sum(axis=1)

(installs['count'] == 450).mean()

0.16423

### 7. There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?



In [197]:
n_trials = 100_000
n_days = 3
#0 = no food trucks; 1=yes food trucks
food_truck = np.random.choice([0,1], size=(n_trials, n_days), p=(.3,.7))
food_truck = pd.DataFrame(food_truck)
food_truck.sample(10)
#below, then, is our sample of food truck presence on 3 consecutive days

Unnamed: 0,0,1,2
96301,1,0,1
59874,1,0,1
40172,0,1,0
99049,1,0,1
94450,0,1,1
97831,1,1,1
81747,0,0,1
49055,0,1,1
91015,1,0,1
57343,0,1,1


In [198]:
food_truck['ft_count'] = food_truck.sum(axis=1)
food_truck

Unnamed: 0,0,1,2,ft_count
0,1,1,1,3
1,1,1,1,3
2,0,1,1,2
3,1,0,1,2
4,1,0,1,2
...,...,...,...,...
99995,1,1,0,2
99996,1,1,1,3
99997,1,1,1,3
99998,1,1,0,2


In [199]:
#here's our likelyhood of having no food trucks in 3 consecutive days:
(food_truck.ft_count == 0).mean()

0.02664

### How likely is it that a food truck will show up sometime this week?

In [201]:
n_trials = 100_000
n_days = 7
#0 = no food trucks; 1=yes food trucks
food_truck = np.random.choice([0,1], size=(n_trials, n_days), p=(.3,.7))
food_truck = pd.DataFrame(food_truck)
food_truck['ft_count'] = food_truck.sum(axis=1)
food_truck.sample(10)

#below, then, is our sample of food truck presence on 7 consecutive days

Unnamed: 0,0,1,2,3,4,5,6,ft_count
80288,1,1,1,1,1,1,0,6
53313,0,1,1,1,1,1,1,6
33156,1,1,1,1,1,1,0,6
57762,1,1,1,0,0,1,1,5
61365,1,1,1,1,0,1,1,6
59193,1,1,0,1,1,0,1,5
93762,1,1,1,1,1,0,0,5
18881,1,1,1,1,0,1,1,6
43713,0,0,0,0,1,0,1,2
77925,0,1,0,1,1,1,0,4


In [202]:
#here's our likelyhood of having a food truck on 1 day this week:
(food_truck.ft_count != 0).mean()

0.99987

### 8. If 23 people are in the same room, what are the odds that two of them share a birthday? 

In [203]:
n_trials = 100_000
n_bdays = 23
#0 = don't share bday, 1= share bday
birthday = np.random.choice([0,1], size=(n_trials, n_bdays), p=(364/365,1/365))
birthday = pd.DataFrame(birthday)
birthday['bd_count'] = birthday.sum(axis=1)
birthday.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,bd_count
92709,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93203,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59623,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95271,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42868,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66882,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [204]:
(birthday.bd_count > 0).mean()

0.06282

### What if it's 20 people? 40?

In [205]:
n_trials = 100_000
n_bdays = 20
#0 = don't share bday, 1= share bday
birthday = np.random.choice([0,1], size=(n_trials, n_bdays), p=(364/365,1/365))
birthday = pd.DataFrame(birthday)
birthday['bd_count'] = birthday.sum(axis=1)
(birthday.bd_count > 0).mean()

0.05458

In [206]:
n_trials = 100_000
n_bdays = 40
#0 = don't share bday, 1= share bday
birthday = np.random.choice([0,1], size=(n_trials, n_bdays), p=(364/365,1/365))
birthday = pd.DataFrame(birthday)
birthday['bd_count'] = birthday.sum(axis=1)
(birthday.bd_count > 0).mean()

0.10412