In [1]:
from math import sqrt
from random import randint

from numpy import array
from numpy.random import default_rng
from numpy.random import shuffle
from scipy.stats import binom
from sklearn.datasets import load_wine

**Part A: Working with "Class 2" of the Wine Dataset**

In [2]:
# Load the wine dataset once; the returned object is indexed by key below.
wine = load_wine()

# X holds the 'data' entry and target the 'target' entry (the class labels
# that the later cells compare against class 2).
X, target = wine['data'], wine['target']

In [3]:
# Share of the data instances labelled as class 2:
# count the matching labels, then divide by the total number of labels.
class_2_count = (target == 2).sum()
fraction_of_2 = class_2_count / len(target)
print("Fraction of Instances belonging to Class 2: {}".format(fraction_of_2))

Fraction of Instances belonging to Class 2: 0.2696629213483146


Suppose we draw a sample of size $n=40$, where each independent draw yields an instance of class 2 with the probability we just determined ($p \approx 0.2697$). Then the number of class 2 instances in the sample, $X$, is modeled by a binomial distribution with expected value

$$E[X] = np = 40(0.2697) \approx 10.79$$

In [4]:
# Integer closest to the expected number of class 2 instances in a sample of size 40
expected_count = fraction_of_2 * 40
closest_int = round(expected_count)

print("Int closest to expected value: {}".format(closest_int))

Int closest to expected value: 11


**Part B: Probability that the sample fraction will be less than 0.05 units from p**

In [5]:
# Open interval of sample proportions that lie strictly within .05 of p
bound1 = fraction_of_2 - .05
bound2 = fraction_of_2 + .05
interval = (bound1, bound2)
print(interval)

# A sample of 40 can contain k = 0, 1, ..., 40 successes, i.e. a proportion of k/40.
# Keep every k whose proportion falls strictly inside the interval.
satisfying_integers = [k for k in range(41) if bound1 < k / 40 < bound2]

print(satisfying_integers)

(0.2196629213483146, 0.3196629213483146)
[9, 10, 11, 12]


In [11]:
# With the integers satisfying the condition in hand, add up the binomial pmf
# (n = 40, p = fraction_of_2) at each of them: the total is the probability of
# observing one of those success counts.
pmf_values = (binom.pmf(k, 40, fraction_of_2) for k in satisfying_integers)
probability_in_range = sum(pmf_values)

print("Probability of being in range: {}".format(round(probability_in_range, 3)))

Probability of being in range: 0.524


**Part C: Use Monte Carlo Simulation to Estimate the probability value in the preceding part**

To choose the number of iterations, note that the magnitude of the Monte Carlo error is on the order of $1/\sqrt{N}$, where $N$ is the number of iterations. We want the estimate to be accurate to three decimal places (an error of at most 0.0005), so we set up the following inequality to determine how many iterations to use:

$$
\frac{1}{\sqrt{N}} \le 0.0005 \implies N \ge 4{,}000{,}000
$$

In [19]:
# Monte Carlo estimate of the Part B probability: repeatedly draw 40 labels
# with replacement from the dataset and count how often the number of class 2
# draws is one of `satisfying_integers`.
num_iter = 4 * 10**6
sample_size = 40
batch_size = 10**5  # trials simulated per vectorized step; bounds peak memory

# Fixed seed so that Restart & Run All reproduces the printed estimate.
rng = default_rng(0)

# Take a real copy of the labels: plain assignment would only alias `target`,
# so any in-place operation on the "copy" would also modify the original.
target_copy = target.copy()
is_class_2 = target_copy == 2

# Lookup table: accepted_count[k] is True when k class 2 draws is in range.
accepted_count = array([k in satisfying_integers for k in range(sample_size + 1)])

successes = 0
for start in range(0, num_iter, batch_size):
    # the final batch may be smaller when batch_size does not divide num_iter
    rows = min(batch_size, num_iter - start)

    # each row holds the dataset indices of one sample of size 40
    # (upper bound is exclusive, so every index is valid)
    indices = rng.integers(0, len(target_copy), size=(rows, sample_size))

    # number of class 2 instances in each simulated sample
    class_2_counts = is_class_2[indices].sum(axis=1)

    successes += int(accepted_count[class_2_counts].sum())

print("Probability of being in range (monte): {}".format(round(successes/num_iter, 3)))


Probability of being in range (monte): 0.524
