In [21]:
import numpy as np
np.random.seed(2023 - 6 - 6)  # NOTE: this is integer subtraction, not a date literal -> the seed actually used is 2011
import pandas as pd

# ___Conservative Approach & Sample Size Considerations___
--------------

In [1]:
# Generally, a confidence interval is defined as the best estimate of a statistic ± the margin of error.
# The margin of error is defined as multiplier x standard error.

In [2]:
# In the conservative approach, we play it a little safer and define a bigger margin of error.
# We can evaluate how wide or narrow a confidence interval would be based on our sample size.

In [3]:
# For 95% confidence interval 

# ___$\bar{x} \pm (1.96 \cdot \sqrt{\frac{\bar{x}(1-\bar{x})}{n}})$___

In [73]:
# Let's load in the USDA plants database of 2023

plants = pd.read_csv("../data/USDA_plants_database.csv")

In [74]:
plants.columns

Index(['Symbol', 'Synonym Symbol', 'Scientific Name with Author',
       'Common Name', 'Family'],
      dtype='object')

In [76]:
plants.isna().sum(axis = 0)

Symbol                             0
Synonym Symbol                 48994
Scientific Name with Author        0
Common Name                    49381
Family                         44163
dtype: int64

In [27]:
plants.shape

(93157, 5)

In [77]:
plants.loc[~plants.Family.isna(), :].shape

(48994, 5)

In [78]:
# NaNs in Family :(

93157 - 48994

44163

In [79]:
# Keep only the rows whose Family is known, then renumber the index from 0
plants = plants.dropna(subset = ["Family"]).reset_index(drop = True)

In [80]:
plants

Unnamed: 0,Symbol,Synonym Symbol,Scientific Name with Author,Common Name,Family
0,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,shrubby Indian mallow,Malvaceae
1,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,abietinella moss,Thuidiaceae
2,ABAL,,Abronia alpina Brandegee,Ramshaw Meadows sand verbena,Nyctaginaceae
3,ABAL3,,Abies alba Mill.,silver fir,Pinaceae
4,ABAM,,Abies amabilis (Douglas ex Loudon) Douglas ex ...,Pacific silver fir,Pinaceae
...,...,...,...,...,...
48989,ZYRE,,Zygodon reinwardtii (Hornsch.) A. Br.,Reinwardt's zygodon moss,Orthotrichaceae
48990,ZYSE,,Zygophlebia sectifrons (Kunze ex Mett.) Bishop,octopus fern,Grammitidaceae
48991,ZYVI2,,Zygodon viridissimus (Dicks.) Brid.,zygodon moss,Orthotrichaceae
48992,ZYVIR,,Zygodon viridissimus (Dicks.) Brid. var. rupes...,zygodon moss,Orthotrichaceae


In [81]:
plants.Family.value_counts()

Family
Asteraceae          4891
Fabaceae            3809
Poaceae             3072
Rosaceae            1519
Cyperaceae          1432
                    ... 
Pleuroziaceae          2
Trichocomaceae         2
Halymeniaceae          2
Rafflesiaceae          2
Zannichelliaceae       2
Name: count, Length: 548, dtype: int64

In [82]:
# There are 548 Families

In [83]:
# Let's examine the proportion of Fabaceae in samples of size 1,000

lim = plants.shape[0]
n_draws, draw_size = 1000, 1000    # number of repeated samples, observations per sample

# The comparison does not change between draws, so do it once for the whole column.
# A plain NumPy array is indexed by position, so the sampling below does not rely on the
# frame's index labels happening to be 0..lim-1.
is_fabaceae = (plants.Family == "Fabaceae").to_numpy()

# Each draw picks `draw_size` row positions uniformly at random (with replacement)
# and records the share of Fabaceae among them.
sample_props = np.array(
    [is_fabaceae[np.random.randint(0, lim, size = draw_size)].mean() for _ in range(n_draws)]
)

In [86]:
# Share of Fabaceae among all rows of `plants` (the population proportion)

pop_prop = plants.Family.eq("Fabaceae").mean()
pop_prop

0.07774421357717272

In [87]:
sample_props.mean()

0.07818199999999999

In [43]:
def confidence_interval(ssize: int, best_estimate: float, confidence: float) -> tuple[float, float]:
    
    """
    Confidence interval for a proportion, computed as
    best_estimate ± z * sqrt(best_estimate * (1 - best_estimate) / ssize)

    Arguments:
        ssize: int  (Sample size, must be positive)
        best_estimate: float (estimated proportion, must lie in [0, 1])
        confidence: float (confidence as a floating point scalar, e.g 95% confidence => 0.95)
                    Supported levels: 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 0.98, 0.99

    Returns: -> Confidence interval
        (lower_bound, upper_bound): tuple[float, float]

    Raises:
        AssertionError: no Z score is tabulated for the requested confidence
        ValueError: ssize is not positive, or best_estimate is outside [0, 1]
    """
    
    # Z multipliers keyed by confidence level
    cints = {.70: 1.04, .75: 1.15, .80: 1.28, .85: 1.44, .90: 1.645, .95: 1.96, .98: 2.33, .99: 2.58}

    # Match the level with a small tolerance: a computed confidence such as 0.1 * 7 differs from the
    # literal 0.7 in the last bit and would be rejected by an exact dictionary lookup.
    z = next((mult for level, mult in cints.items() if np.isclose(confidence, level, rtol = 0.0, atol = 1e-9)), None)

    # Raised explicitly (instead of via an `assert` statement) so the check is not stripped under `python -O`;
    # the exception type is kept as AssertionError for backward compatibility.
    if z is None:
        raise AssertionError("Z score for the given confidence is not available in this routine!")

    if ssize <= 0:
        raise ValueError(f"ssize must be a positive integer, got {ssize!r}")
    # Outside [0, 1] the term under the square root is negative and the bounds would silently become NaN
    if not (0 <= best_estimate <= 1):
        raise ValueError(f"best_estimate must lie in [0, 1], got {best_estimate!r}")

    stderr = np.sqrt(best_estimate * (1 - best_estimate) / ssize)
    margin = z * stderr
    return (best_estimate - margin, best_estimate + margin)

In [47]:
confidence_interval(1000, sample_props[np.random.randint(0, sample_props.size)], 0.7)

(0.09108999503531912, 0.1109100049646809)

In [48]:
confidence_interval(659, 0.85, .95)

(0.8227373256215749, 0.8772626743784251)

In [49]:
# All is fine, but what if our sample metric isn't accurate?

In [64]:
# See the difference

pop_prop, sample_props[np.random.randint(0, sample_props.size)]

(0.07774421357717272, 0.093)

In [65]:
# If we did a bad job at making a random sample, our sample may not be representative of the population!
# Then the estimate computed using that sample will be inaccurate.

In [66]:
# For this reason, we may want to maximize the standard error and thereby widen our confidence interval, so that it is
# more likely to cover the population metric.

# This is accomplished by plugging in 0.5 for x_bar

In [67]:
# @ x_bar = 0.5, standard error will be.

# ___$=\sqrt{\frac{0.5(1 - 0.5)}{n}}$___
# ___$=\frac{0.5}{\sqrt{n}}$___

In [68]:
# Conservative standard error = 0.5 / sqrt(n)

def conservative_stderr(ssize):
    """
    Conservative (maximal) standard error of a proportion for a sample of size ssize.

    Plugging x_bar = 0.5 into sqrt(x_bar * (1 - x_bar) / n) gives 0.5 / sqrt(n).
    """
    return 0.5 / np.sqrt(ssize)

In [69]:
diff = 1.96 * conservative_stderr(1000)

In [71]:
best = sample_props[np.random.randint(0, sample_props.size)]

best - diff, best + diff

(0.046009678930349886, 0.10799032106965012)

In [72]:
# We end up with a wider confidence interval.
# Here, for a fixed confidence level, our margin of error depends only on the sample size (not on the estimate itself).

In [88]:
# In the conservative approach, the margin of error (MoE) only depends on
    # 1) Confidence level (through the multiplier)
    # 2) Sample size

In [90]:
# What sample size would we need for a 95% conservative confidence interval with a MoE of 3%?

# MoE = 0.03
# multiplier * stderr = 0.03
# since the confidence level is 95%, multiplier = 1.96

stderr = 0.03 / 1.96
stderr

0.015306122448979591

# ___$Confidence~interval = \bar{x} \pm (\hat{Z} \cdot \sqrt{\frac{\bar{x}(1 - \bar{x})}{n}})$___
# ___$MoE = \hat{Z} \cdot \sqrt{\frac{\bar{x}(1 - \bar{x})}{n}}$___
# ___$MoE_{conservative} = \hat{Z} \cdot \frac{0.5}{\sqrt{n}}$___
# ___$n = (\frac{0.5 \cdot \hat{Z}}{MoE_{conservative}})^2$___

In [99]:
# In the conservative approach, MoE = Z * 0.5 / sqrt(n); at 95% confidence (Z = 1.96) this is roughly 1 / sqrt(n)
# n = (0.5 * Z / MoE) ^ 2

((0.5 * 1.96) / 0.03) ** 2

1067.1111111111109

In [100]:
# Since we cannot have sample sizes in decimals, always round the value up.

np.ceil(((0.5 * 1.96) / 0.03) ** 2)

1068.0

In [96]:
# What if we wanted a 99% confidence level with 3% MoE?

# stderr * multiplier = 0.03
# stderr * 2.58 = 0.03           @ 99% confidence level, Z_hat = 2.58

stderr = 0.03 / 2.58
stderr

0.011627906976744186

In [101]:
# stderr = 0.5 / sqrt(n)
# n = (0.5 / stderr) ^ 2

(0.5 / stderr) ** 2

1849.0

In [None]:
# With the same margin of error, if we want to improve the confidence level (95% -> 99%) we need to increase the sample size.