In [2]:
import numpy as np
# NOTE: `2023 - 6 - 29` is integer subtraction, so the seed actually passed is 1988
# (it is not a date-like value such as 20230629).
np.random.seed(2023 - 6 - 29)
import pandas as pd

# ___Setting Up a Test for Differences Between Population Proportions.___
--------------

In [10]:
# Only the two columns used in this analysis are read from the CSV.
data = pd.read_csv(
    "../data/MetroPT3(AirCompressor).csv",
    usecols=["Reservoirs", "Oil_temperature"],
)

In [14]:
# Smallest and largest value in the Reservoirs column, as a (min, max) pair.
data["Reservoirs"].min(), data["Reservoirs"].max()

(0.7119999999999997, 10.3)

In [27]:
# Split the oil-temperature readings into two groups by the Reservoirs value.
# The two groups are then treated as two samples from two different populations.

mask_high = data["Reservoirs"].gt(9)

high_res = data.loc[mask_high, "Oil_temperature"]
low_res = data.loc[~mask_high, "Oil_temperature"]

In [29]:
# low_res  - oil temperatures when the pressure in the reservoir was less than or equal to 9
# high_res - oil temperatures when the pressure in the reservoir was greater than 9

len(low_res), len(high_res)

(526274, 473725)

In [36]:
# The significance level is fixed up front, before the null and alternative
# hypotheses are even stated.

alpha = 0.10  # i.e. a 10% significance level

In [31]:
# Question: do the low-pressure and high-pressure reservoir populations differ in the
# proportion of oil temperatures above 60 degrees?
#
# Null hypothesis: the two population proportions are equal.
#     prop_hres - prop_lres = 0
#
# Alternative hypothesis: the two population proportions are not equal.
#     prop_hres - prop_lres != 0   (two-sided: the direction, < or >, does not matter)

# Sample proportion of oil temperatures above 60 degrees in the low-pressure sample.
lr_prop = low_res.gt(60).mean()
lr_prop

0.33425744004073926

In [34]:
# Sample proportion of oil temperatures above 60 degrees in the high-pressure sample.
# It is roughly twice the low-pressure proportion (about 0.67 vs 0.33), which suggests the oil
# was hotter at high pressure; whether that difference is statistically significant is what
# the hypothesis test has to establish.

hr_prop = high_res.gt(60).mean()
hr_prop

0.670659137685366

In [43]:
# Assumptions:
#   - The two samples are random and independent.
#   - The samples are large enough for the sampling distribution of the estimate
#     to be approximately normal.
#
# WE'LL CONSIDER THE SPLIT DATASETS AS SAMPLES FROM TWO LARGER POPULATIONS.
# Each sample should have at least 10 values in each of the two classes.

# Counts for the high-reservoir sample: (temperatures <= 60, temperatures > 60).
# The classes are split at "> 60" so they match the proportions computed with `> 60`.
# Series.sum() is vectorized (the builtin sum() iterates element by element in Python);
# int() keeps plain Python integers in the displayed tuple.

int((high_res <= 60).sum()), int((high_res > 60).sum())

(156017, 317708)

In [44]:
# Counts for the low-reservoir sample: (temperatures <= 60, temperatures > 60).
# The classes are split at "> 60" so they match the proportions computed with `> 60`.
# Series.sum() is vectorized (the builtin sum() iterates element by element in Python);
# int() keeps plain Python integers in the displayed tuple.

int((low_res <= 60).sum()), int((low_res > 60).sum())

(350363, 175911)

In [46]:
# That looks good: both samples have far more than 10 values in each of the two classes.

## ___What to do When The Data is Not Available?___
--------

In [47]:
# Above, we have the data. Suppose we do not and the only metrics available at hand are the proportions.
# let's say that we have two samples.
# One sample comes from men and the other from women.
# These samples contain responses to a yes/no question. The only options are yes/no.

# Say, that the proportion of yesses in the men's sample is 0.76 and that in women's sample is 0.619
# Sample sizes are 1204 and 871 for men and women respectively.

# We need to compute a common (pooled) proportion p_hat,
# p_hat = (0.76 x 1204 + 0.619 x 871) / (1204 + 871), which is approximately 0.70,
# and make sure that

# p_hat x size(men) >= 10
# (1 - p_hat) x size(men) >= 10
# p_hat x size(women) >= 10
# (1 - p_hat) x size(women) >= 10

In [None]:
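# Note that these checks use the common proportion p_hat for both groups; we are not using the
# separate proportions for men and women to compute the counts, because under the null hypothesis
# the two population proportions are assumed to be equal.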