In [2]:
import numpy as np
import pandas as pd

# ___Setting Up a Test for Population Proportion___
----------------

In [1]:
# Why do we formulate hypotheses and test them?

# Typically we start with a research question or a claim,
# and we use data to support or refute that claim.
# This is where hypothesis testing comes in.

## ___Could the Mean Oil Temperature be $58.5^{\circ}C$?___
___________

In [8]:
# Read only the oil temperature column and flatten it into a 1-D NumPy array.
oil_temps = (
    pd.read_csv(
        "./../data/MetroPT3(AirCompressor).csv",
        usecols=["Oil_temperature"],
    )
    .to_numpy()
    .reshape(-1)
)

In [9]:
# True mean oil temperature, computed over every reading that was loaded.

np.mean(oil_temps)

60.91453961453962

In [10]:
# Population -> the set of engine oil temperature readings; show how many there are.

np.size(oil_temps)

999999

In [11]:
# Parameter that interests us -> mean oil temperature.
# The hypothesis we want to test is whether the true population mean is significantly different from our proposed mean of 58.5 degrees Celsius.

# ___Hypotheses___
-------------------

## ___Null hypothesis: $H_0$ &rarr; $\mu = 58.5^{\circ}C$___
## ___Alternative hypothesis: $H_1$ &rarr; $\mu \neq 58.5^{\circ}C$___

In [12]:
# A two-sided alternative hypothesis like the one above leaves two possibilities:
# the mean temperature could be either greater than or less than the proposed temperature.
# To make the test more specific, we can restate the alternative hypothesis in terms of just one of those directions.

## ___Null hypothesis: $H_0$ &rarr; $\mu = 58.5^{\circ}C$___
## ___Alternative hypothesis: $H_1$ &rarr; $\mu \gt 58.5^{\circ}C$___

In [13]:
# We also need to set a significance level (alpha); 0.05 is the customary choice.

## ___Assumptions___
-------------

In [15]:
# 1) Our sample is a simple random sample.
# 2) The sample is large enough that the sample statistics follow an approximately normal distribution. (We'll treat the whole dataset as a sample.)

To check that our sample statistics follow an approximately normal distribution, we require:

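# $N \cdot P(H_0) \ge 10 \text{ and } N \cdot P(H_1) \ge 10$

Note: this count-based check is the usual rule for tests on proportions; for a test on a mean, the common requirement is simply a large sample (rule of thumb: $N \ge 30$).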

In [19]:
# Count the readings that are exactly 58.5; the result is an unexpectedly huge number!

np.sum(oil_temps == 58.5)

2881

In [20]:
# Count the readings that are strictly greater than 58.5.
np.sum(oil_temps > 58.5)

577715

In [21]:
# So yes, both counts are well above 10.