In [24]:
import pandas as pd
import numpy as np
np.random.seed(2023 - 6 - 23)
import matplotlib.pyplot as plt
from scipy.stats import t
import os

In [5]:
os.listdir("./../data")

['Cartwheeldata.csv',
 'CommonCLZvalues.png',
 'Data Description_Metro.pdf',
 'EV_Population_Data.csv',
 'MetroPT3(AirCompressor).csv',
 'nap_no_nap.csv',
 'nhanes_2015_2016.csv',
 'USDA_plants_database.csv']

In [6]:
metro = pd.read_csv("./../data/MetroPT3(AirCompressor).csv")

In [9]:
# Let's consider this dataset as our population.
# We are interested in figuring out the mean oil temperature.
# The true population parameter (mean oil temp) is

metro.Oil_temperature.mean()

60.91453961453962

In [12]:
# However, in real world scenarios, we cannot measure this true population parameter of interest.
# The size of our population is,

metro.shape[0]

999999

In [21]:
# We'll make a sample of size 10,000

sample = metro.Oil_temperature[np.random.randint(low = 0, high = metro.shape[0], size = 10_000)].to_numpy()

In [23]:
# Now we have a sample os size 10,000
# Our best estimate is the sample mean,

best_estimate = sample.mean()
best_estimate

60.88413500000001

In [25]:
# Now we need a confidence interval,
# Since we are interested in the population mean, we need to use a t instead of a Z multiplier.

# Let's say that we'd like a 95% confidence interval,
# that implies, if we sampled the population a 100 times, truly randomly, creating simple random samples,
# the confidence intervals computed using the sample parameter (sample mean) will cover the true population parameter (population mean) 95 out of 100 
# times.

# confidence interval = best estimate +- margin of error
# margin of error = multiplier x standard error

# t multiplier, at 95% confidence level, for a sample of size 10,000 is,
# given our sample is succinctly large, which is true because we samples 1/100 th of the whole population, CLT becomes valid,
# which assures that the Z scores will follow a student's t distribution.

# The probability of the needed quantile covering 95% of the density of the t distribution (assumed normal) needs to be computed.
# P(-t < quantile < +t) = 0.95
# This P equals to 1 - ((1 - alpha) / 2), where alpha is the confidence.
# @ 95% confidence alpha = 0.95

P = 1 - ((1 - 0.95) / 2)
P

0.975

In [27]:
t_multiplier = t.ppf(P, df = sample.size - 1)
t_multiplier

1.9602012636213575

In [28]:
# For means standard error = sample standard deviation / square root of sample size

stderr = sample.std() / np.sqrt(sample.size)
stderr

0.06685538940637097

In [29]:
margin_of_err = t_multiplier * stderr
margin_of_err

0.13105001879426628

In [31]:
# And here's the 95% confidence interval, for population mean.

best_estimate - margin_of_err, best_estimate + margin_of_err

(60.75308498120574, 61.01518501879428)

In [56]:
nhanes = pd.read_csv("./../data/nhanes_2015_2016.csv", usecols = ["SMQ020", "BMXBMI", "RIAGENDR"])

In [57]:
nhanes = nhanes.rename({"SMQ020": "smoker", "BMXBMI": "bmi", "RIAGENDR": "gender"}, axis = 1)
nhanes

Unnamed: 0,smoker,gender,bmi
0,1,1,27.8
1,1,1,30.8
2,1,1,28.8
3,2,2,42.4
4,2,2,20.3
...,...,...,...
5730,1,2,21.5
5731,2,1,33.8
5732,1,2,31.0
5733,1,1,26.0


In [58]:
nhanes.gender.unique()

array([1, 2], dtype=int64)

In [59]:
nhanes.smoker.unique()

array([1, 2, 7, 9], dtype=int64)

In [60]:
nhanes.gender = nhanes.gender.apply(lambda g: 'M' if g == 1 else 'F')
nhanes.smoker = nhanes.smoker.apply(lambda s: False if s == 2 else True if s == 1 else np.nan)

In [61]:
nhanes.isna().sum(axis = 0)

smoker    10
gender     0
bmi       73
dtype: int64

In [63]:
nhanes.dropna(axis = 0, inplace = True)

In [64]:
nhanes

Unnamed: 0,smoker,gender,bmi
0,True,M,27.8
1,True,M,30.8
2,True,M,28.8
3,False,F,42.4
4,False,F,20.3
...,...,...,...
5730,True,F,21.5
5731,False,M,33.8
5732,True,F,31.0
5733,True,M,26.0


In [65]:
pd.crosstab(nhanes.gender, nhanes.smoker)

smoker,False,True
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,2044,896
M,1322,1390


In [66]:
nhanes.groupby("gender").mean()

Unnamed: 0_level_0,smoker,bmi
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,0.304762,29.943571
M,0.512537,28.774668


In [67]:
nhanes.groupby(["gender", "smoker"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bmi
gender,smoker,Unnamed: 2_level_1
F,False,29.661057
F,True,30.588058
M,False,28.592511
M,True,28.947914
