In [4]:
import numpy as np
np.random.seed(2023 - 6 - 29)
import pandas as pd
from scipy.stats import (norm, t)

# ___Setting Up a Test for Differences Between Population Proportions.___
--------------

In [5]:
# Load only the two columns this analysis uses.
data = pd.read_csv(
    "../data/MetroPT3(AirCompressor).csv",
    usecols=["Reservoirs", "Oil_temperature"],
)

In [6]:
# Smallest and largest reservoir pressure readings, used to pick a split point.
data["Reservoirs"].min(), data["Reservoirs"].max()

(0.7119999999999997, 10.3)

In [7]:
# Split the dataset in two at a reservoir pressure of 9.
# The two parts are treated as samples from two different populations.

mask_high = data["Reservoirs"].gt(9)

high_res = data.loc[mask_high, "Oil_temperature"]
low_res = data.loc[~mask_high, "Oil_temperature"]

In [8]:
# low_res  - oil temperatures when the pressure in the reservoir was less than or equal to 9
# high_res - oil temperatures when the pressure in the reservoir was greater than 9

len(low_res), len(high_res)

(526274, 473725)

In [9]:
# The significance level is fixed up front, before the hypotheses are tested,
# so that the decision threshold cannot be influenced by the result.

alpha = 0.1 # 10% significance level

In [10]:
# Q: Is there a significant difference in the proportion of oil temperatures above 60 degrees
# between the low pressure and high pressure reservoir populations?

# Null: the two population proportions are equal.
# prop_hres - prop_lres = 0

# Alternative: the two population proportions differ.
# prop_hres - prop_lres != 0   (two-sided: the direction of the difference does not matter)

# Sample proportion of oil temperatures above 60 degrees in the low pressure sample.
lr_prop = low_res.gt(60).mean()
lr_prop

0.33425744004073926

In [11]:
# In the sample, oil temperatures above 60 degrees are roughly twice as common when the
# reservoir pressure was high. Whether that difference is statistically significant is
# decided by the test further down, not by eyeballing the two proportions.

# Sample proportion of oil temperatures above 60 degrees in the high pressure sample.
hr_prop = high_res.gt(60).mean()
hr_prop

0.670659137685366

In [12]:
# Assumptions: the two samples are random and independent.
# The samples are large enough that the sampling distribution of each proportion is approximately normal.

# WE'LL CONSIDER THE SPLIT DATASETS AS SAMPLES FROM TWO LARGER POPULATIONS.
# Each sample should have at least 10 values representing each of the two classes.

# High reservoir sample: number of temperatures not above 60 and above 60.
# The classes are split with `> 60`, the same threshold used for the sample proportions,
# and counted with the vectorised Series.sum() rather than the builtin sum(), which
# iterates over the Series one element at a time.

(high_res <= 60).sum(), (high_res > 60).sum()

(156017, 317708)

In [13]:
# Low reservoir sample: number of temperatures not above 60 and above 60.
# The classes are split with `> 60`, the same threshold used for the sample proportions,
# and counted with the vectorised Series.sum() rather than the builtin sum().

(low_res <= 60).sum(), (low_res > 60).sum()

(350363, 175911)

In [14]:
# That looks good.

In [15]:
# The best estimate of (prop_hres - prop_lres) is the difference between the two sample proportions.

best_est = hr_prop - lr_prop
best_est

0.33640169764462674

# ___$SE_{combined} = \sqrt{\frac{p_1(1 - p_1)}{n_1} + \frac{p_2(1 - p_2)}{n_2}}$___

or;

# ___$\hat{p} = \frac{(p_1 \cdot n_1) + (p_2 \cdot n_2)}{(n_1 + n_2)}$___
# ___$SE_{combined} = \sqrt{\hat{p}(1 - \hat{p})(\frac{1}{n_1} + \frac{1}{n_2})}$___

In [16]:
# test statistic = (best estimate - hypothesized estimate) / standard error of the estimate

# One way to calculate the combined (unpooled) standard error:
# the sampling variances of the two proportions add, and the standard error is the
# square root of that sum. It is NOT the sum of the two individual standard errors.

var_high = hr_prop * (1 - hr_prop) / high_res.size
var_low = lr_prop * (1 - lr_prop) / low_res.size
stderr_m1 = np.sqrt(var_high + var_low)
stderr_m1

0.0009429167122665274

In [17]:
# Another way uses the common (pooled) proportion across both samples
# (see "What to do When The Data is Not Available?" for more details).

n_high, n_low = high_res.size, low_res.size
p_hat_oil = ((hr_prop * n_high) + (lr_prop * n_low)) / (n_high + n_low)
p_hat_oil

0.49361949361949364

In [18]:
# Pooled standard error: one common proportion, scaled by both sample sizes.
pooled_var = p_hat_oil * (1 - p_hat_oil)
stderr_m2 = np.sqrt(pooled_var * ((1 / high_res.size) + (1 / low_res.size)))
stderr_m2

0.0010013025305188217

In [19]:
# The pooled (stderr_m2) and unpooled (stderr_m1) standard errors differ only slightly here.

print(f"The difference between the standard errors computed in the two different ways is {stderr_m2 - stderr_m1:.10f}")

The difference between the standard errors computed in the two different ways is 0.0000583858


In [21]:
# Observed difference between the sample proportions (high pressure - low pressure).
best_est

0.33640169764462674

In [20]:
# We'll use the unpooled standard error (stderr_m1), which is built from each
# sample's own proportion and size.

hypothesized_diff = 0.0  # value of prop_hres - prop_lres under the Null
z = (best_est - hypothesized_diff) / stderr_m1
z

356.76713888759514

In [106]:
# That is a very large test statistic.
# The sample proportions differ by about 0.34 while the standard error is only about 0.001,
# because both samples are very large.

(hr_prop, high_res.size),  (lr_prop, low_res.size)

((0.670659137685366, 473725), (0.33425744004073926, 526274))

In [22]:
# p value
# The alternative hypothesis is two-sided (prop_hres - prop_lres != 0), so the p value is the
# probability, under the Null, of a standard normal value at least as extreme as z in EITHER tail:
# p = 2 * P(Z >= |z|).
# norm.sf is the survival function (1 - cdf). norm.cdf(z) on its own is only the lower-tail
# area P(Z <= z), which is close to 1 for a large positive z and is not the p value.

2 * norm.sf(abs(z))

0.0

In [23]:
# Decision rule: reject the Null hypothesis when the two-sided p value is below alpha (0.1, set earlier).
# For a two-sided test the p value is 2 * P(Z >= |z|); the lower-tail area norm.cdf(z) is not the p value.
# With z of about 356.8 the two-sided p value is effectively 0, so p value < alpha and we REJECT the Null hypothesis.
# THERE IS A SIGNIFICANT DIFFERENCE IN THE PROPORTION OF OIL TEMPERATURES ABOVE 60 DEGREES IN THE TWO POPULATIONS.

# THIS AGREES WITH THE LARGE GAP BETWEEN THE TWO SAMPLE PROPORTIONS SHOWN BELOW.

lr_prop, hr_prop

(0.33425744004073926, 0.670659137685366)

In [None]:
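# A CAVEAT ABOUT P VALUES AND HUGE SAMPLE SIZES (THIS IS NOT P-HACKING):
# THE STANDARD ERROR SHRINKS AS THE SAMPLE SIZE GROWS, SO WITH HUGE SAMPLES EVEN A TINY DIFFERENCE GIVES A VERY SMALL P VALUE.
# STATISTICAL SIGNIFICANCE IS THEREFORE NOT THE SAME AS PRACTICAL IMPORTANCE - ALWAYS REPORT THE SIZE OF THE DIFFERENCE TOO.
# (P-HACKING MEANS RE-RUNNING OR TWEAKING AN ANALYSIS UNTIL THE P VALUE FALLS BELOW ALPHA.)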

## ___What to do When The Data is Not Available?___
--------

In [19]:
# Above, we have the data. Suppose we do not and the only metrics available at hand are the proportions.
# let's say that we have two samples.
# One sample comes from men and the next from women.
# These samples contain responses to a yes/no question. The only options are yes/no.

# Say, that the proportion of yesses in the men's sample is 0.716 and that in women's sample is 0.699
# Sample sizes are 1204 and 1171 for men and women respectively.

# We need to compute a common proportion p_hat, and make sure that 

# p_hat x size(men) >= 10
# (1 - p_hat) x size(men) >= 10
# p_hat x size(women) >= 10
# (1 - p_hat) x size(women) >= 10

In [20]:
# Note that we are not using the proportions for men and women to compute separate counts.

In [21]:
# Only summary statistics are available: the proportion of "yes" answers
# and the sample size for each group.
prop_yes_men = 0.716
prop_yes_women = 0.699
smen = 1204
swomen = 1171

In [22]:
# Null => No significant difference between the proportion of men and women who answered yes to the given question.
# Alternative => The proportion of men who answered yes to the given question is not equal to the proportion of women who said yes.

# H0: P_m = P_w
# H1: P_m != P_w

In [23]:
# significance level: 10%

alpha = 0.1

In [24]:
# Assumptions:
# Two independent random samples - check.
# Large enough samples - ?

# The common (pooled) sample proportion p_hat is
# total number of yesses / total number of people.

total_yes = (smen * prop_yes_men) + (swomen * prop_yes_women)
p_hat = total_yes / (smen + swomen)
p_hat

0.7076181052631578

In [25]:
# p_hat here is the proportion of people who said yes to the question, regardless of gender.

# men: expected yes and no counts under the pooled proportion must each be at least 10
# (>= 10, matching the rule stated when the check was introduced).
(p_hat * smen) >= 10, ((1 - p_hat) * smen) >= 10

(True, True)

In [26]:
# women: expected yes and no counts under the pooled proportion must each be at least 10
# (>= 10, matching the rule stated when the check was introduced).

(p_hat * swomen) >= 10, ((1 - p_hat) * swomen) >= 10

(True, True)

In [27]:
# Okay!

In [114]:
# Best estimate of (P_m - P_w): the difference between the two sample proportions.
# NOTE: this reuses (overwrites) the name best_est from the oil temperature test.

best_est = prop_yes_men - prop_yes_women
best_est

0.017000000000000015

In [115]:
# Pooled standard error of the difference, from the common proportion p_hat.
pooled_var = p_hat * (1 - p_hat)
stderr = np.sqrt(pooled_var * ((1 / smen) + (1 / swomen)))

# z = (best estimate - hypothesized difference of 0) / standard error
z = (best_est - 0.0) / stderr
z

0.9106135307056814

In [116]:
# The alternative hypothesis is two-sided (P_m != P_w), so the p value is the probability, under the
# Null, of a standard normal value at least as extreme as z in EITHER tail: p = 2 * P(Z >= |z|).
# norm.sf is the survival function (1 - cdf). The cumulative distribution function norm.cdf(z)
# alone gives only P(Z <= z), which is not the p value of a two-sided test.

p = 2 * norm.sf(abs(z))
round(p, 6)

0.362499

In [None]:
# p > alpha

# That p value is greater than our significance level, so we cannot reject the Null hypothesis.
# So, there is not sufficient evidence of a difference in the proportion of men and women who said yes to the question.
# (Failing to reject the Null is not proof that the two proportions are equal.)

In [31]:
# Repeat this for the oil temperature data

In [32]:
# Sample proportions of temperatures above 60 degrees: (high pressure, low pressure).
hr_prop, lr_prop

(0.670659137685366, 0.33425744004073926)

In [33]:
# Pooled proportion of temperatures above 60 degrees across both samples
# (the same formula as the earlier p_hat_oil cell).
p_hat_oil = ((hr_prop * high_res.size) + (lr_prop * low_res.size)) / (high_res.size + low_res.size)
p_hat_oil

0.49361949361949364

In [34]:
# Low pressure sample: expected counts above / not above 60 under the pooled proportion
# (each should be at least 10).
p_hat_oil * low_res.size, (1 - p_hat_oil) * low_res.size

(259779.1053851054, 266494.8946148946)

In [35]:
# High pressure sample: expected counts above / not above 60 under the pooled proportion
# (each should be at least 10).
p_hat_oil * high_res.size, (1 - p_hat_oil) * high_res.size

(233839.8946148946, 239885.1053851054)

In [36]:
# This works too, but when we have the data at hand, it's better to use it.