In [5]:
from random import shuffle 
from math import sqrt

**Part A: Use Monte Carlo Simulation to Estimate P(5 examples in training set)**

In [2]:
# Monte Carlo setup: how many random splits to simulate, the pool of example
# indices (0-99), and the five "special" examples that must land in training.
num_iter = 1000
indexes = list(range(100))
special_examples = {0, 1, 2, 3, 4}

conditions_met = []
for _ in range(num_iter):

    # a fresh in-place permutation defines this trial's 75/25 train/test split
    shuffle(indexes)
    training_indices, testing_indices = indexes[:75], indexes[75:]

    # the trial succeeds when every special example is in the training split
    conditions_met.append(special_examples.issubset(training_indices))

# the fraction of successful trials is the Monte Carlo estimate
estimated_probability = sum(conditions_met) / num_iter
print("Estimated Probability of having special examples in our training split: {}".format(estimated_probability))

Estimated Probability of having special examples in our training split: 0.233


**Part B: Provide an Estimate of the Magnitude of the Error**

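There is (at least) a 95% chance that the actual probability will be within a distance of

$$\frac{1}{\sqrt{N}}$$

from the estimated value, where $N$ is the number of simulated trials. This bound is conservative: the usual 95% half-width $1.96\sqrt{p(1-p)/N}$ is at most $0.98/\sqrt{N}$ because $p(1-p) \le 1/4$.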


In [6]:
# Half-width of the interval, using the 1/sqrt(N) error formula stated above
error = 1 / sqrt(num_iter)

# Interval endpoints: the estimate minus/plus the error, rounded to 3 decimals
lower_bound = round(estimated_probability - error, 3)
upper_bound = round(estimated_probability + error, 3)
CI_95_percent = [lower_bound, upper_bound]

print("A 95% Confidence Interval for the probability is given by: {}".format(CI_95_percent))

A 95% Confidence Interval for the probability is given by: [0.199, 0.262]


**Part C: Compute the Number of Repetitions Needed for the Monte Carlo Estimate to Be Accurate to Three Decimal Places**

We want our estimate to be accurate to the third decimal place, so the error must be no more than half a unit in that place, i.e. at most .0005 (any larger error could change the third digit after rounding). This gives the following inequality:

$$\frac{1}{\sqrt{N}} \le .0005$$

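Solving for $N$ gives $N \ge (1/.0005)^2 = 4{,}000{,}000$, so N must be at least 4,000,000 for us to be confident (at the 95% level) that we're accurate to three decimal places. We confirm this threshold numerically below, and then rerun the Monte Carlo simulation with that many repetitions.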

In [10]:
def smallest_n_for_error(max_error):
    """Return the smallest integer N >= 1 such that 1 / sqrt(N) <= max_error.

    Rearranging the inequality gives N >= (1 / max_error) ** 2, so the answer
    is computed from that closed form instead of testing every N from 1 upward.

    Args:
        max_error: largest acceptable value of 1 / sqrt(N); must be positive.

    Returns:
        The lowest repetition count N that satisfies the inequality.

    Raises:
        ValueError: if max_error is not positive.
    """
    if max_error <= 0:
        raise ValueError("max_error must be positive")

    # closed-form starting guess (never below 1, the smallest valid N)
    candidate = max(1, int((1 / max_error) ** 2))

    # Floating-point rounding can leave the guess slightly off, so settle on
    # the exact threshold using the very same comparison the search relies on.
    while 1 / sqrt(candidate) > max_error:
        candidate += 1
    while candidate > 1 and 1 / sqrt(candidate - 1) <= max_error:
        candidate -= 1

    return candidate


# lowest N whose error bound is small enough for three-decimal accuracy
N = smallest_n_for_error(.0005)
error = 1 / sqrt(N)
print("Lowest N: {}".format(N))

Lowest N: 4000000


In [None]:
# Final estimate: repeat the Part A split simulation, now with N repetitions
# (the lowest N found above whose 1/sqrt(N) error bound is at most .0005).

special_examples = {0, 1, 2, 3, 4}
indexes = list(range(100))
conditions_met = []

for _ in range(N):

    # reshuffle the indices; the first 75 form the training split, the rest the test split
    shuffle(indexes)
    training_indices, testing_indices = indexes[:75], indexes[75:]

    # record whether all five special examples ended up in the training split
    conditions_met.append(special_examples.issubset(training_indices))

# the fraction of successful trials is the Monte Carlo estimate
estimated_probability = sum(conditions_met) / N
print("Estimated Probability of having special examples in our training split: {}".format(estimated_probability))