In [2]:
import numpy as np
np.random.seed(2023 - 6 - 8)
import pandas as pd

# ___Estimating a Difference in Two Population Proportions With Confidence___
------------------

In [3]:
population_1 = np.random.randint(low = 0, high = 20, size = 1_000_000)
population_2 = np.random.choice(a = [1, 3, 5, 7, 9, 2, 4, 6, 11, 13, 15], size = population_1.size)

# ___Q: What is the difference in population proportions of odd numbers in the two populations?___

In [4]:
# Odd numbers in population 1

(population_1 % 2).sum() / population_1.size

0.50117

In [5]:
# Odd numbers in population 2

(population_2 % 2).sum() / population_2.size

0.726999

In [6]:
# The metric we are interested in is =>

# ___$P_{odd_{Population~1}} - P_{odd_{Population~2}}$___

## ___Population 1___
---------------------

In [13]:
# Proportion of odd numbers in Population 1, with just one sample.

prop_pop_1 = sum(population_1[np.random.randint(0, population_1.size, size = 1000)] % 2) / 1000

In [15]:
prop_pop_1

0.518

In [None]:
# Confidence interval = best estimate +- margin of error
# MoE = multiplier * standard error

In [18]:
# 95% confidence interval 

moerror = 1.96 * np.sqrt(prop_pop_1 * (1 - prop_pop_1) / 1000)
moerror

0.03097023283089748

In [19]:
# 95% confidence interval for population 1

prop_pop_1 - moerror, prop_pop_1 + moerror

(0.4870297671691025, 0.5489702328308975)

In [16]:
# Proportion of odd numbers in population 2, again with just one sample.

prop_pop_2 = (population_2[np.random.randint(0, population_2.size, size = 1000)] % 2).mean()

In [17]:
prop_pop_2

0.726

In [20]:
moerror = 1.96 * np.sqrt(prop_pop_2 * (1 - prop_pop_1) / 1000)
moerror

0.036664680159521366

In [22]:
prop_pop_2 - moerror, prop_pop_2 + moerror

(0.6893353198404786, 0.7626646801595214)

## ___What about the Margin of Error for the Difference?___
---------------------

In [23]:
# The difference is 

np.absolute(prop_pop_1 - prop_pop_2)

0.20799999999999996

In [25]:
# What is the confidence interval for the above difference?
# What is the MoE for the above difference?

# ___$Confidence~interval=Best~estimate+MoE$___
# ___$= (\bar{x} - \bar{y}) \pm \hat{Z} \cdot (x_{stderr} - y_{stderr})$___
# ___$= (\bar{x} - \bar{y}) \pm 1.96 \cdot \sqrt{\frac{\bar{x}(1 - \bar{x})}{n_x} + \frac{\bar{y}(1 - \bar{y})}{n_y}}$___

In [26]:
# Note that the bar notations are primarily used for means, but here there are used to represent proportions.

In [32]:
# best estimate (x_bar - y_bar)

best_est = np.absolute(prop_pop_1 - prop_pop_2)
best_est

0.20799999999999996

In [34]:
moerror = 1.96 * np.sqrt((prop_pop_1 * (1 - prop_pop_1) / 1000) + (prop_pop_2 * (1 - prop_pop_2) / 1000))
moerror

0.041513151651012964

In [33]:
best_est - moerror, best_est + moerror

(0.166486848348987, 0.24951315165101293)

In [35]:
# So what do we make of this?

In [None]:
# With 95% confidence, the population proportion of odd numbers in Population 2 is 16.64% to 24.95% higher than the population proportion of 
# odd numbers in Population 1.