In [1]:
import numpy as np
# NOTE: the original argument `2023 - 6 - 1` is integer subtraction, not a date,
# so the seed actually in use is 2016. It is written out here to make that explicit.
np.random.seed(2016)
import pandas as pd
import matplotlib.pyplot as plt

# ___Estimating Population Proportions with Confidence___
-----------------

In [2]:
# Confidence interval = best estimate +- margin of error.
# Best estimate is an unbiased point estimate. (x_bar)

In [3]:
# margin of error -> composed of two parts
# 1) a multiplier representing the needed level of confidence.
# 2) measure of sampling variability in our statistic

In [4]:
# e.g. for 95% confidence interval -> the multiplier is 1.96
# 95% confidence level corresponds to 0.05 (5%) significance.

In [5]:
# We'll use a dataset from C.S. Mott Children's Hospital in Ann Arbor, Michigan.
# This data was collected to record whether parents take the necessary measures to ensure their children's safety when travelling.

## ___The main question we are interested in is: what proportion of parents use a special seat for their toddlers in their cars?___
![Toddler's car seat](./toddler.webp)

In [6]:
# Setting the context for the analyses.

# Population -> All parents with a toddler.
# Statistic of interest -> a proportion

In [2]:
# The Ann Arbor survey data is not available for download, so this notebook
# uses an electric-vehicle (EV) dataset instead and estimates the proportion
# of one brand of EV among all the EVs in it.

EV_DATA_PATH = "../data/EV_Population_Data.csv"
evdata = pd.read_csv(EV_DATA_PATH)

In [3]:
# evdata.to_csv("./EV_Population_Data.csv", index = False)

In [4]:
# (rows, columns) of the loaded EV dataset.
evdata.shape

(130443, 3)

In [5]:
# Distinct values found in the `Make` column.
evdata["Make"].unique()

array(['TESLA', 'HONDA', 'NISSAN', 'FORD', 'AUDI', 'KIA', 'CHEVROLET',
       'SMART', 'BMW', 'TOYOTA', 'JEEP', 'FIAT', 'VOLVO', 'CHRYSLER',
       'LEXUS', 'PORSCHE', 'CADILLAC', 'HYUNDAI', 'MERCEDES-BENZ',
       'RIVIAN', 'VOLKSWAGEN', 'JAGUAR', 'LINCOLN', 'MITSUBISHI', 'MINI',
       'POLESTAR', 'GENESIS', 'SUBARU', 'LUCID', 'TH!NK', 'LAND ROVER',
       'FISKER', 'AZURE DYNAMICS', 'WHEEGO ELECTRIC CARS', 'BENTLEY'],
      dtype=object)

In [6]:
# Let's focus on Nissan EVs.

# Our population -> all the EVs available in our dataset.
# Metric of interest -> the proportion of Nissan EVs.

In [7]:
# Flag the Nissan rows. A vectorised comparison produces the same boolean
# column as the element-wise `apply` it replaces (NaN compares as False in
# both), without calling a Python lambda once per row.
evdata["is_Nissan"] = evdata["Make"] == "NISSAN"

In [8]:
# Number of rows in the dataset, i.e. the size of our "population".

print(f"{len(evdata):,}")

130,443


In [9]:
# Draw 1000 random samples of 1000 cars each (row positions are drawn with
# replacement) and record the proportion of Nissans in every sample.

N_REPEATS = 1000      # number of repeated samples
SAMPLE_SIZE = 1000    # cars per sample
LIMIT = evdata.shape[0]

# Extract the flag column once: the lookup is loop-invariant, and a plain NumPy
# array is indexed by position, which is exactly what `randint` produces.
is_nissan_flags = evdata["is_Nissan"].to_numpy()

# One `randint` call per sample, in the same order as before, so the random
# stream (and therefore every downstream result) is unchanged.
NISSANs_in_sample = [
    is_nissan_flags[np.random.randint(low = 0, high = LIMIT, size = SAMPLE_SIZE)].mean()
    for _ in range(N_REPEATS)
]

In [10]:
# We have now sampled the data 1000 times, with 1000 records in each sample.
# Average the 1000 sample proportions:

np.mean(NISSANs_in_sample)

0.09976100000000002

In [11]:
# Proportion of Nissans in the full dataset, for comparison.
evdata["is_Nissan"].mean()

0.0998367102872519

In [12]:
# The results are pretty close.
# Since we cannot do 1000 repeated samplings in real life, we'd next use a single sample of 20,000 records.

In [13]:
# Draw 20,000 row positions at random (with replacement) and select those rows.
# Integer-array indexing already returns a new DataFrame, so copying the whole
# of `evdata` first is unnecessary; `iloc` makes the positional intent explicit.
sample = evdata.iloc[np.random.randint(low = 0, high = LIMIT, size = 20000)]

In [14]:
# Display the sampled rows.
sample

Unnamed: 0,City,State,Make,is_Nissan
94928,Redmond,WA,TESLA,False
27995,Sammamish,WA,FIAT,False
43418,Seattle,WA,NISSAN,True
40937,Sammamish,WA,TESLA,False
109475,Spokane,WA,TESLA,False
...,...,...,...,...
120087,Langley,WA,BMW,False
54238,Mercer Island,WA,PORSCHE,False
45303,Federal Way,WA,NISSAN,True
58883,Kirkland,WA,TESLA,False


In [15]:
# Number of rows in the sample.

len(sample)

20000

In [21]:
# Share of Nissans among the sampled EVs; this is our point estimate.

x_bar = sample["is_Nissan"].mean()
x_bar

0.099

In [17]:
# Still quite representative!

In [18]:
# Our best estimate statistic is 0.099
# From our sample we propose that of all EVs, 9.9% are Nissans.

In [19]:
# Now that we have the best estimate statistic, how do we compute the margin of error?

# ___$\bar{x}~\pm~multiplier \cdot standard~err$___

In [20]:
# At a 95% confidence level, multiplier = 1.96

# ___$stderr~=~\sqrt{\frac{\bar{x}(1 - \bar{x})}{n}}$___
# ___$\bar{x}$ — the sample statistic (here, the sample proportion)___
# ___$n$ — the sample size___

In [23]:
# Multiplier for a 95% confidence level.
multiplier = 1.96
# Estimated standard error of the sample proportion: sqrt(x_bar * (1 - x_bar) / n).
sample_size = len(sample)
stderr = np.sqrt((x_bar * (1 - x_bar)) / sample_size)

In [25]:
# Best estimate minus / plus the margin of error gives the two ends of the interval.
margin_of_error = multiplier * stderr
ci_lower = x_bar - margin_of_error
ci_upper = x_bar + margin_of_error
print(f"Confidence interval: {ci_lower} - {ci_upper}")

Confidence interval: 0.0948607556341767 - 0.10313924436582331


In [26]:
# A confidence interval will always define a lower bound and an upper bound.
# And the center of this interval will be our best estimate!

## ___What does a confidence interval mean?___
---------------------

In [33]:
# Note: Here we assume the whole population to be our dataset.
# Even though, in reality, our dataset itself is a sample.

# A confidence interval is a range of reasonable values for our parameter.
# The statement printed below expresses the interval computed above as percentages.

# Express both ends of the interval as percentages before formatting.
ci_low_pct = (x_bar - (multiplier * stderr)) * 100
ci_high_pct = (x_bar + (multiplier * stderr)) * 100
print(
    "That is to say, with 95% confidence, the proportion of Nissans among EVs is between "
    f"{ci_low_pct:.4f}% & {ci_high_pct:.4f}%"
)

That is to say, with 95% confidence, the proportion of Nissans among EVs is between 9.4861% & 10.3139%


In [34]:
# This range marks the limits within which we believe the population proportion (our parameter) lies.
# It might be in there, it might not be, we can never be sure. But we are 95% confident that IT IS THERE!