In [1]:
from math import sqrt
from scipy import stats

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

---
# Variance & Standard Deviation

In [2]:
example_one = np.array([-10, 0, 10, 20, 30])
example_one

array([-10,   0,  10,  20,  30])

In [3]:
example_one.mean()

10.0

In [4]:
example_two = np.array([8, 9, 10, 11, 12])
example_two

array([ 8,  9, 10, 11, 12])

In [5]:
example_two.mean()

10.0

In [6]:
# numpy variance
example_one.var()

200.0

In [7]:
# numpy variance
example_two.var()

2.0

In [8]:
# manually calculating variance for example_one
((example_one[0] - example_one.mean()) ** 2 + (example_one[1] - example_one.mean()) ** 2 + (example_one[2] - example_one.mean()) ** 2 + (example_one[3] - example_one.mean()) ** 2 + (example_one[4] - example_one.mean()) ** 2) / len(example_one)

200.0

In [9]:
# manually calculating variance for example_two
((example_two[0] - example_two.mean()) ** 2 + (example_two[1] - example_two.mean()) ** 2 + (example_two[2] - example_two.mean()) ** 2 + (example_two[3] - example_two.mean()) ** 2 + (example_two[4] - example_two.mean()) ** 2) / len(example_two)


2.0

In [10]:
# manually calculating standard deviation for example_one
sqrt(((example_one[0] - example_one.mean()) ** 2 + (example_one[1] - example_one.mean()) ** 2 + (example_one[2] - example_one.mean()) ** 2 + (example_one[3] - example_one.mean()) ** 2 + (example_one[4] - example_one.mean()) ** 2) / len(example_one))

14.142135623730951

In [11]:
# manually calculating standard deviation for example_two
sqrt(((example_two[0] - example_two.mean()) ** 2 + (example_two[1] - example_two.mean()) ** 2 + (example_two[2] - example_two.mean()) ** 2 + (example_two[3] - example_two.mean()) ** 2 + (example_two[4] - example_two.mean()) ** 2) / len(example_two))

1.4142135623730951

---
# Simulation Exercises

#### 1. How likely is it that you roll doubles when rolling two dice?

In [12]:
# represent your data
# create a matrix
n_simulations = 10_000
n_trials = 2 # dice

dice_rolls = pd.DataFrame(np.random.choice(range(1,7), size=(n_simulations, n_trials)))
dice_rolls

Unnamed: 0,0,1
0,2,4
1,2,6
2,1,2
3,2,2
4,6,1
...,...,...
9995,5,5
9996,2,2
9997,3,1
9998,4,2


In [13]:
# aggregate by row
dice_rolls["doubles"] = dice_rolls[0] == dice_rolls[1]
dice_rolls

Unnamed: 0,0,1,doubles
0,2,4,False
1,2,6,False
2,1,2,False
3,2,2,True
4,6,1,False
...,...,...,...
9995,5,5,True
9996,2,2,True
9997,3,1,False
9998,4,2,False


In [14]:
# aggregate for probability
(dice_rolls["doubles"] == True).mean()

0.1604

#### 2. If you flip 8 coins, what is the probability of getting exactly 3 heads?

In [15]:
# represent your data
# create a matrix
n_simulations = 10_000
n_trials = 8 # coins

# 0 represents tails; 1 represents heads
coin_flips = pd.DataFrame(np.random.choice([0, 1], size=(n_simulations, n_trials)))
coin_flips

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,0,1,0,1,0,1,1
1,0,1,0,0,0,1,1,1
2,0,1,0,0,0,1,1,0
3,1,0,0,0,1,0,1,1
4,0,1,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...
9995,1,0,0,0,1,0,0,0
9996,1,1,0,0,0,1,1,1
9997,1,0,1,0,1,0,0,1
9998,1,1,1,1,0,1,1,1


In [16]:
# aggregate by row
coin_flips["total_heads"] = coin_flips.sum(axis=1)
coin_flips

Unnamed: 0,0,1,2,3,4,5,6,7,total_heads
0,0,0,1,0,1,0,1,1,4
1,0,1,0,0,0,1,1,1,4
2,0,1,0,0,0,1,1,0,3
3,1,0,0,0,1,0,1,1,4
4,0,1,0,0,1,1,1,0,4
...,...,...,...,...,...,...,...,...,...
9995,1,0,0,0,1,0,0,0,2
9996,1,1,0,0,0,1,1,1,5
9997,1,0,1,0,1,0,0,1,4
9998,1,1,1,1,0,1,1,1,7


In [17]:
# aggregate for probability
(coin_flips["total_heads"] == 3).mean()

0.218

#### If you flip 8 coins, what is the probability of getting more than 3 heads?

In [18]:
# aggregate for probability
(coin_flips["total_heads"] > 3).mean()

0.6396

#### 3. There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [19]:
# represent data
# create matrix
n_simulations = 10_000
n_trials = 2 # billboards

# 0 represents web dev students; 1 represents data sci students
billboards = pd.DataFrame(np.random.choice([0, 1], size=(n_simulations, n_trials), p=[3/4, 1/4]))
billboards

Unnamed: 0,0,1
0,0,0
1,0,0
2,1,0
3,0,0
4,0,1
...,...,...
9995,1,1
9996,1,0
9997,0,0
9998,0,0


In [20]:
# aggregate by row
billboards["total_data_sci"] = billboards.sum(axis=1)
billboards

Unnamed: 0,0,1,total_data_sci
0,0,0,0
1,0,0,0
2,1,0,1
3,0,0,0
4,0,1,1
...,...,...,...
9995,1,1,2
9996,1,0,1
9997,0,0,0
9998,0,0,0


In [21]:
# aggregate for probability
(billboards["total_data_sci"] == 2).mean()

0.0639

#### 4. Codeup students buy, on average, 3 poptart packages (+- 1.5) a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?

In [35]:
# represent data
# create matrix
mean = 3
std = 1.5
n_simulations = 10_000
n_trials = 5 # weekdays

poptarts = np.round(np.random.normal(mean, std, size=(n_simulations, n_trials)))
poptarts

array([[ 1.,  3.,  2.,  3.,  3.],
       [ 3.,  2.,  3.,  2.,  1.],
       [ 5.,  5.,  4.,  5.,  1.],
       ...,
       [ 3.,  2.,  3.,  1.,  5.],
       [-1.,  4.,  4.,  3.,  4.],
       [ 4.,  2., -1.,  2.,  4.]])

In [36]:
poptarts = pd.DataFrame(poptarts)
poptarts

Unnamed: 0,0,1,2,3,4
0,1.0,3.0,2.0,3.0,3.0
1,3.0,2.0,3.0,2.0,1.0
2,5.0,5.0,4.0,5.0,1.0
3,4.0,4.0,5.0,3.0,3.0
4,3.0,4.0,2.0,4.0,-0.0
...,...,...,...,...,...
9995,3.0,5.0,4.0,4.0,4.0
9996,1.0,5.0,-0.0,5.0,3.0
9997,3.0,2.0,3.0,1.0,5.0
9998,-1.0,4.0,4.0,3.0,4.0


In [38]:
poptarts = pd.DataFrame(np.where(poptarts < 0, 0, poptarts))
poptarts

Unnamed: 0,0,1,2,3,4
0,1.0,3.0,2.0,3.0,3.0
1,3.0,2.0,3.0,2.0,1.0
2,5.0,5.0,4.0,5.0,1.0
3,4.0,4.0,5.0,3.0,3.0
4,3.0,4.0,2.0,4.0,-0.0
...,...,...,...,...,...
9995,3.0,5.0,4.0,4.0,4.0
9996,1.0,5.0,-0.0,5.0,3.0
9997,3.0,2.0,3.0,1.0,5.0
9998,0.0,4.0,4.0,3.0,4.0


In [39]:
# aggregate by row
poptarts["weekly_consumption"] = poptarts.sum(axis=1)
poptarts

Unnamed: 0,0,1,2,3,4,weekly_consumption
0,1.0,3.0,2.0,3.0,3.0,12.0
1,3.0,2.0,3.0,2.0,1.0,11.0
2,5.0,5.0,4.0,5.0,1.0,20.0
3,4.0,4.0,5.0,3.0,3.0,19.0
4,3.0,4.0,2.0,4.0,-0.0,13.0
...,...,...,...,...,...,...
9995,3.0,5.0,4.0,4.0,4.0,20.0
9996,1.0,5.0,-0.0,5.0,3.0,14.0
9997,3.0,2.0,3.0,1.0,5.0,14.0
9998,0.0,4.0,4.0,3.0,4.0,15.0


In [43]:
# aggregate for probabilty
(poptarts["weekly_consumption"] < 17).mean()

0.6644

#### Exercise 5
5. Compare Heights:
    - Men have an average height of 178 cm and standard deviation of 8cm.
    - Women have a mean of 170, sd = 6cm.
    - If a man and woman are chosen at random, P(woman taller than man)?

In [57]:
# represent data
m_mean = 178
m_std = 8

f_mean = 170
f_std = 6

# create matrices
n_simulations = 10_000

men = pd.DataFrame(np.random.normal(m_mean, m_std, size=(n_simulations)))
women = pd.DataFrame(np.random.normal(f_mean, f_std, size=(n_simulations)))

# aggregate by row
women > men

# aggregate for probability
(women > men).mean()

0    0.2081
dtype: float64

#### 6. When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue?

In [61]:
# represent data
# create matrix
n_simulations = 10_000
n_trials = 50 # students

# 0 is fail; 1 is pass
conda_dls = pd.DataFrame(np.random.choice([0, 1], size=(n_simulations, n_trials), p=[1/250, 249/250]))
conda_dls

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9996,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9997,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9998,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [62]:
# aggregate by row
conda_dls["total_dls"] = conda_dls.sum(axis=1)
conda_dls

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,total_dls
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
9996,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,50
9997,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,49
9998,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,49


In [64]:
# aggregate for probability
(conda_dls["total_dls"] == 50).mean()

0.8181

#### 7. There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [67]:
# represent data
# create matrix
n_simulations = 10_000
n_trials = 3 # three days

# 0 is no food trucks; 1 is food trucks
food_trucks = pd.DataFrame(np.random.choice([0, 1], size=(n_simulations, n_trials), p=[.3, .7]))
food_trucks

Unnamed: 0,0,1,2
0,1,1,1
1,0,1,0
2,1,1,0
3,0,1,1
4,0,1,1
...,...,...,...
9995,1,1,1
9996,0,1,1
9997,1,1,0
9998,1,0,0


In [68]:
# aggregate by row
food_trucks["total_trucks_over_three_days"] = food_trucks.sum(axis=1)
food_trucks

Unnamed: 0,0,1,2,total_trucks_over_three_days
0,1,1,1,3
1,0,1,0,1
2,1,1,0,2
3,0,1,1,2
4,0,1,1,2
...,...,...,...,...
9995,1,1,1,3
9996,0,1,1,2
9997,1,1,0,2
9998,1,0,0,1


In [70]:
# aggregate for probability
(food_trucks["total_trucks_over_three_days"] == 0).mean()

0.0277

#### 8. If 23 people are in the same room, what are the odds that two of them share a birthday?

In [78]:
# represent data
# create matrix
n_simulations = 10_000
n_trials = 23 # people

birthdays = pd.DataFrame(np.random.choice(range(0, 365), size=(n_simulations, n_trials)))
birthdays

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,66,131,23,262,79,274,160,167,218,153,...,278,42,262,208,349,105,45,99,350,273
1,331,263,348,163,223,248,177,163,37,93,...,134,320,239,178,279,283,174,66,56,131
2,203,96,247,110,106,3,90,102,127,29,...,174,213,71,128,241,73,82,2,149,353
3,364,155,236,325,47,15,206,327,207,209,...,215,333,146,237,222,208,319,95,97,200
4,163,78,128,276,358,284,213,215,125,294,...,67,22,132,325,362,290,364,323,137,225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,184,139,141,245,227,284,331,323,115,353,...,55,286,167,24,87,172,221,40,125,24
9996,198,246,155,157,187,224,332,16,199,76,...,162,321,210,350,234,44,84,44,156,225
9997,229,147,204,189,78,122,248,73,140,159,...,299,74,240,270,214,235,305,343,1,89
9998,96,307,199,56,201,342,203,194,45,41,...,150,51,73,211,4,124,358,47,88,69


In [79]:
# aggregate by row
birthdays["unique_birthdays"] = birthdays.nunique(axis=1)
birthdays

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,unique_birthdays
0,66,131,23,262,79,274,160,167,218,153,...,42,262,208,349,105,45,99,350,273,22
1,331,263,348,163,223,248,177,163,37,93,...,320,239,178,279,283,174,66,56,131,21
2,203,96,247,110,106,3,90,102,127,29,...,213,71,128,241,73,82,2,149,353,23
3,364,155,236,325,47,15,206,327,207,209,...,333,146,237,222,208,319,95,97,200,23
4,163,78,128,276,358,284,213,215,125,294,...,22,132,325,362,290,364,323,137,225,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,184,139,141,245,227,284,331,323,115,353,...,286,167,24,87,172,221,40,125,24,22
9996,198,246,155,157,187,224,332,16,199,76,...,321,210,350,234,44,84,44,156,225,22
9997,229,147,204,189,78,122,248,73,140,159,...,74,240,270,214,235,305,343,1,89,22
9998,96,307,199,56,201,342,203,194,45,41,...,51,73,211,4,124,358,47,88,69,23


In [80]:
# aggregate for probability
(birthdays["unique_birthdays"] < 23).mean()

0.5095