In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#  read the dataset
df = pd.read_csv("2012-sat-results.csv")

print(df.info())
print("")

# convert all values to numeric
df["SAT Critical Reading Avg. Score"] = pd.to_numeric(df["SAT Critical Reading Avg. Score"], errors="coerce")
df["SAT Math Avg. Score"] = pd.to_numeric(df["SAT Math Avg. Score"], errors="coerce")
df["SAT Writing Avg. Score"] = pd.to_numeric(df["SAT Writing Avg. Score"], errors="coerce")

# Drop rows with NaN values
df = df.dropna(subset=["SAT Critical Reading Avg. Score", "SAT Math Avg. Score", "SAT Writing Avg. Score"])

print(df.info())
print("")

# population params
mu = df["SAT Writing Avg. Score"].mean()
tao = df["SAT Writing Avg. Score"].sum()
sigmasq = df["SAT Writing Avg. Score"].var(ddof=0)

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

In [2]:
df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
0,01M292,HENRY STREET SCHOOL FOR INTERNATIONAL STUDIES,29,355.0,404.0,363.0
1,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,91,383.0,423.0,366.0
2,01M450,EAST SIDE COMMUNITY SCHOOL,70,377.0,402.0,370.0
3,01M458,FORSYTH SATELLITE ACADEMY,7,414.0,401.0,359.0
4,01M509,MARTA VALLE HIGH SCHOOL,44,390.0,433.0,384.0


# 1. Divide your population into strata

In [3]:
# Creating Strata based on Number of SAT Test Takers per school

df["Num of SAT Test Takers"] = pd.to_numeric(df["Num of SAT Test Takers"], errors='coerce')

# strata ranges
bins = [0, 50, 100, 200, 400, df["Num of SAT Test Takers"].max()]
labels = [0, 1, 2, 3, 4]

df["Testers_Stratum"] = pd.cut(df["Num of SAT Test Takers"], bins=bins, labels=labels, include_lowest=True)

testers = {
    "Nh": df.groupby("Testers_Stratum", observed=False).size().tolist(),
    "sigma_sq_h": df.groupby("Testers_Stratum", observed=False)["Num of SAT Test Takers"].var(ddof=0).tolist()
}
testers

{'Nh': [148, 173, 50, 26, 24],
 'sigma_sq_h': [191.754930606282,
  166.8447325336631,
  688.8196000000003,
  3789.940828402369,
  46378.74305555556]}

In [4]:
# Creating Strata based on SAT Critical Reading Avg. Score

# Intervals for stratum and labels for the intervals
bins = [350, 375, 400, 425, 450, 515, df["SAT Critical Reading Avg. Score"].max()]
labels = [0, 1, 2, 3, 4, 5]

df["Reading_Stratum"] = pd.cut(df["SAT Critical Reading Avg. Score"], bins=bins, labels=labels, include_lowest=True)

# Grouping by stratum, and calculating variances for each stratum
reading = {
    "Nh": df.groupby("Reading_Stratum", observed=False).size().tolist(),
    "sigma_sq_h": df.groupby("Reading_Stratum", observed=False)["SAT Critical Reading Avg. Score"].var(ddof=0).tolist()
}
reading

{'Nh': [102, 116, 75, 31, 36, 21],
 'sigma_sq_h': [49.47260668973475,
  53.543029131985755,
  51.17582222222231,
  33.33402705515095,
  290.65663580246917,
  2211.9319727891143]}

In [5]:
# Creating Strata based on SAT Math Avg. Score

# Intervals for stratum and labels for the intervals
bins = [350, 375, 400, 425, 450, 515, df["SAT Math Avg. Score"].max()]
labels = [0, 1, 2, 3, 4, 5]

df["Math_Stratum"] = pd.cut(df["SAT Math Avg. Score"], bins=bins, labels=labels, include_lowest=True)

# Grouping by stratum, and calculating variances for each stratum
math = {
    "Nh": df.groupby("Math_Stratum", observed=False).size().tolist(),
    "sigma_sq_h": df.groupby("Math_Stratum", observed=False)["SAT Math Avg. Score"].var(ddof=0).tolist()
}
math

{'Nh': [96, 109, 65, 43, 52, 30],
 'sigma_sq_h': [45.99728732638883,
  52.81440956148471,
  52.962366863905295,
  51.59221200648994,
  280.4715236686394,
  3212.0766666666673]}

# 2. Evaluate Population Stratification

In [6]:
sigma_sq = df["SAT Writing Avg. Score"].var(ddof=0)
N = len(df)
n= 80

Nh = np.array(testers["Nh"])
sigma_sq_h = np.array(testers["sigma_sq_h"])

delta_testers = (N - 1) * sigma_sq - np.sum((Nh - 1) * sigma_sq_h)
delta_testers

np.float64(188464.96477095643)

In [7]:
sigma_sq = df["SAT Writing Avg. Score"].var(ddof=0)
N = len(df)
n= 80

Nh = np.array(reading["Nh"])
sigma_sq_h = np.array(reading["sigma_sq_h"])

delta_reading = (N - 1) * sigma_sq - np.sum((Nh - 1) * sigma_sq_h)
delta_reading

np.float64(1370209.169962898)

In [8]:
sigma_sq = df["SAT Writing Avg. Score"].var(ddof=0)
N = len(df)
n= 80

Nh = np.array(math["Nh"])
sigma_sq_h = np.array(math["sigma_sq_h"])

delta_math = (N - 1) * sigma_sq - np.sum((Nh - 1) * sigma_sq_h)
delta_math

np.float64(1317477.5710010633)

In [9]:
max_delta =  max( max(delta_math, delta_reading), delta_testers )
print(f"Largest delta {max_delta}")

Largest delta 1370209.169962898


#### Use SAT Critical Reading Avg. Score strata from here on

# Sampling Procedures

### 3. Stratified Random Sample with Equal Allocation:
Take stratified random sample with size n (chosen in Report 2) with equal allocation. $n_h = \frac{n}{L}$

In [10]:
# use reading strata divisions
reading

{'Nh': [102, 116, 75, 31, 36, 21],
 'sigma_sq_h': [49.47260668973475,
  53.543029131985755,
  51.17582222222231,
  33.33402705515095,
  290.65663580246917,
  2211.9319727891143]}

In [11]:
# n chosen in report 2 is 80.
N_h = np.array(reading['Nh'])
sigma_sq_h = np.array(reading['sigma_sq_h'])

n = 80
L = len(reading["Nh"])
n_h = int(np.ceil(n / L))
n_h

14

In [12]:
# take samples
stratified_samples = []

for stratum_label in range(L):
    stratum = df[df["Reading_Stratum"] == stratum_label]
    stratified_samples.append(stratum.sample(n=n_h, replace=False))
stratified_sample_df = pd.concat(stratified_samples).reset_index(drop=True)
stratified_sample_df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,Testers_Stratum,Reading_Stratum,Math_Stratum
0,13K350,URBAN ASSEMBLY HIGH SCHOOL OF MUSIC AND ART,52,360.0,364.0,356.0,1,0,0
1,18K569,KURT HAHN EXPEDITIONARY LEARNING SCHOOL,43,368.0,378.0,346.0,0,0,1
2,02M305,"URBAN ASSEMBLY ACADEMY OF GOVERNMENT AND LAW, THE",50,375.0,388.0,385.0,0,0,1
3,15K520,PACIFIC HIGH SCHOOL,9,352.0,341.0,300.0,0,0,
4,05M685,BREAD & ROSES INTEGRATED ARTS HIGH SCHOOL,61,369.0,361.0,355.0,1,0,0
...,...,...,...,...,...,...,...,...,...
79,30Q580,BACCALAUREATE SCHOOL FOR GLOBAL EDUCATION,61,524.0,568.0,544.0,1,5,5
80,22K535,LEON M. GOLDSTEIN HIGH SCHOOL FOR THE SCIENCES,259,524.0,561.0,542.0,3,5,5
81,02M418,MILLENNIUM HIGH SCHOOL,144,528.0,553.0,533.0,2,5,5
82,31R605,STATEN ISLAND TECHNICAL HIGH SCHOOL,227,635.0,682.0,636.0,3,5,5


# 4. Parameter Estimation and Variance

### a) Estimate your parameter of interest using an unbiased estimator
$\hat{\tau_{\text{st}}} = \sum_{h=1}^L N_h\bar{y}_h$

$\hat{\mu_{\text{st}}} = \frac{1}{N}\hat{\tau_{\text{st}}}$

In [13]:
# mu_st_hat = tau_st_hat / N
# tau_st_hat = sum from 1 to L of (N_h * ybar_h)
# ybar_h = (sum from 1 to n_h of (y_hi)) / n_h

ybar_h = stratified_sample_df.groupby("Reading_Stratum", observed=False)["SAT Writing Avg. Score"].mean().tolist()
tau_st_hat = sum(N_h * ybar_h)
mu_st_hat = tau_st_hat / N
round(float(mu_st_hat), 2)

359.87

### b) Estimate its variance and provide a confidence interval at the alpha level chosen in Report 2.
$\hat{\text{var}}(\hat{\tau_{\text{st}}}) = \sum_{h=1}^L N_h(N_h - n_h)\frac{\sigma^2_h}{n_h}$

$\hat{\text{var}}(\hat{\mu_{\text{st}}}) = \frac{1}{N^2}\hat{\text{var}}(\hat{\tau_{\text{st}}})$

In [14]:
# variance
# var_hat_mu_st_hat = (1/N^2) * var_hat_tau_st_hat
# var_hat_tau_st_hat = sum from 1 to L ((N_h * (N_h - n_h) * (sigma_h^2 / n_h)))

var_hat_tau_st_hat = sum(N_h * (N_h - n_h) * (sigma_sq_h / n_h))
var_hat_mu_st_hat = (1 / N**2) * var_hat_tau_st_hat
round(float(var_hat_mu_st_hat), 2)

0.76

### c) Use the Satterthwaite formula for adjusted degrees of freedom
$\large a_h = \frac{N_h(N_h - n_h)}{n_h}$

$\large d = \frac{(\sum_{h=1}^L a_h s^2_h)^2}{\sum_{h=1}^L \frac{(a_h s^2_h)^2}{n_h - 1}}$

In [15]:
# adjusted degrees of freedom
# d = (sum of 1 to L of (a_h * s_h^2))
#      / (sum of 1 to L of (a_h * s_h^2) / (n_h - 1))
# a_h = (N_h * (N_h - n_h)) / n_h
s_h_squared = (
    stratified_sample_df
    .groupby("Reading_Stratum", observed=False)["SAT Writing Avg. Score"]
    .var(ddof=1)
    .tolist()
)
s_h_squared = np.array(s_h_squared)
a_h = N_h * (N_h - n_h) / n_h
d = sum(a_h * s_h_squared)**2 / sum((a_h * s_h_squared)**2 / (n_h - 1))
float(d)

42.581783064386144

In [16]:
from scipy.stats import t
# Confidence Interval with alpha = .05
# mu_st_hat +- t(alpha/2, d) * sqrt(var_hat_mu_st_hat)
alpha = .05
t_crit = t.ppf(1-(alpha/2), d)
SDE = t_crit * np.sqrt(var_hat_mu_st_hat)

CI = mu_st_hat - SDE, mu_st_hat + SDE
print(f"95% CI for mu with Stratified Random Sample with Equal Allocation is: ({CI[0].round(2)}, {CI[1].round(2)}) \n")

95% CI for mu with Stratified Random Sample with Equal Allocation is: (358.11, 361.62) 



# 5. Stratified Random Sample with Proportional Allocation

$n_h = \frac{nN_h}{N}$

In [17]:
nh_proportional = [
    int(round((n * Nh) / N))
    for Nh in reading["Nh"]
]
nh_proportional

[19, 22, 14, 6, 7, 4]

In [18]:
while sum(nh_proportional) != n:
    if sum(nh_proportional) < n:
        nh_proportional[nh_proportional.index(min(nh_proportional))] += 1
    else:
        nh_proportional[nh_proportional.index(max(nh_proportional))] -= 1

stratified_samples_prop = []
for stratum_label in range(L):
    stratum_data = df[df["Reading_Stratum"] == stratum_label]
    sample_size = nh_proportional[stratum_label]

    if sample_size > 0:
        sample = stratum_data.sample(n=sample_size, replace=False)
        stratified_samples_prop.append(sample)
stratified_sample_prop_df = pd.concat(stratified_samples_prop).reset_index(drop=True)

stratified_sample_prop_df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,Testers_Stratum,Reading_Stratum,Math_Stratum
0,13K350,URBAN ASSEMBLY HIGH SCHOOL OF MUSIC AND ART,52,360.0,364.0,356.0,1,0,0
1,07X670,HEALTH OPPORTUNITIES HIGH SCHOOL,114,363.0,368.0,364.0,2,0,0
2,02M620,NORMAN THOMAS HIGH SCHOOL,154,370.0,379.0,359.0,2,0,1
3,02M303,"FACING HISTORY SCHOOL, THE",76,353.0,358.0,340.0,1,0,0
4,02M460,WASHINGTON IRVING HIGH SCHOOL,151,373.0,387.0,360.0,2,0,1
...,...,...,...,...,...,...,...,...,...
75,25Q525,TOWNSEND HARRIS HIGH SCHOOL,278,621.0,651.0,638.0,3,5,5
76,14K449,"BROOKLYN LATIN SCHOOL, THE",72,586.0,584.0,570.0,1,5,5
77,03M485,FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A...,531,566.0,564.0,577.0,4,5,5
78,10X445,BRONX HIGH SCHOOL OF SCIENCE,731,632.0,688.0,649.0,4,5,5


 ## 6. Parameter Estimation and Variance

### a) Estimate your parameter of interest using an unbiased estimator
$\hat{\tau_{\text{st}}} = \sum_{h=1}^L N_h\bar{y}_h$

$\hat{\mu_{\text{st}}} = \frac{1}{N}\hat{\tau_{\text{st}}}$

In [19]:
ybar_h_prop= stratified_sample_prop_df.groupby("Reading_Stratum", observed=False)["SAT Writing Avg. Score"].mean().tolist()

tau_st_hat_prop=sum(N_h * ybar_h_prop)
mu_st_hat_prop = tau_st_hat_prop/N

round(float(mu_st_hat_prop), 2)

364.34

### b) Estimate its variance and provide a confidence interval at the α level chosen in Report 2.
$\hat{\text{var}}(\hat{\tau_{\text{st}}}) = \sum_{h=1}^L N_h(N_h - n_h)\frac{\sigma^2_h}{n_h}$

$\hat{\text{var}}(\hat{\mu_{\text{st}}}) = \frac{1}{N^2}\hat{\text{var}}(\hat{\tau_{\text{st}}})$

In [20]:
var_hat_tau_st_hat_prop = sum(N_h * (N_h - nh_proportional) * (sigma_sq_h / nh_proportional))
var_hat_mu_st_hat_prop = (1/(N**2)) * var_hat_tau_st_hat_prop

round(float(var_hat_mu_st_hat_prop), 2)

1.02

### c) Use the Satterthwaite formula for adjusted degrees of freedom.
$\large a_h = \frac{N_h(N_h - n_h)}{n_h}$

$\large d = \frac{(\sum_{h=1}^L a_h s^2_h)^2}{\sum_{h=1}^L \frac{(a_h s^2_h)^2}{n_h - 1}}$

In [21]:
s_h_squared_prop = (
    stratified_sample_prop_df
    .groupby("Reading_Stratum", observed=False)["SAT Writing Avg. Score"]
    .var(ddof=1)
    .tolist()
)
s_h_squared_prop = np.array(s_h_squared_prop)
nh_proportional = np.array(nh_proportional)
a_h = N_h * (N_h - nh_proportional) / nh_proportional
d_prop = sum(a_h * s_h_squared_prop)**2 / sum((a_h * s_h_squared_prop)**2 / (nh_proportional - 1))
float(d_prop)

60.804577631331036

In [30]:
from scipy.stats import t

alpha = 0.05
t_crit = t.ppf(1-(alpha/2), d_prop)

SDE = t_crit*np.sqrt(var_hat_mu_st_hat_prop)
CI_prop = mu_st_hat_prop - SDE, mu_st_hat_prop + SDE
print(f"95% CI for mu with Stratified Random Sample with Proportional Allocation is: ({CI_prop[0].round(2)}, {CI_prop[1].round(2)}) \n")

95% CI for mu with Stratified Random Sample with Proportional Allocation is: (362.32, 366.35) 



# 7. Stratified Random Sample with Optimum Allocation

$n_h = \frac{nN_h\sigma_h}{\sum_{h=1}^L N_h\sigma_h}$

In [23]:
N_h = np.array(reading['Nh'])
sigma2_h = np.array(reading['sigma_sq_h'])
sigma_h = np.sqrt(sigma2_h)

N = len(df)
L = len(N_h)
n = 80

n_h_opt = ((n * N_h * sigma_h) / sum(N_h * sigma_h)).round()
n_h_opt = n_h_opt.astype(int).tolist()
n_h_opt

[15, 17, 11, 4, 13, 20]

In [24]:
stratified_samples_opt = []
for strata in range(L):
    stratum_data = df[df["Reading_Stratum"] == strata]
    n = n_h_opt[strata]

    if n > 0:
        sample = stratum_data.sample(n=n, replace=False)
        stratified_samples_opt.append(sample)

stratified_sample_opt_df = pd.concat(stratified_samples_opt).reset_index(drop=True)
stratified_sample_opt_df

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,Testers_Stratum,Reading_Stratum,Math_Stratum
0,10X667,GRACE DODGE YABC,10,350.0,356.0,317.0,0,0,0
1,14K474,PROGRESS HIGH SCHOOL FOR PROFESSIONAL CAREERS,144,364.0,379.0,371.0,2,0,1
2,02M460,WASHINGTON IRVING HIGH SCHOOL,151,373.0,387.0,360.0,2,0,1
3,23K514,FREDERICK DOUGLASS ACADEMY VII HIGH SCHOOL,55,361.0,371.0,359.0,1,0,0
4,12X690,MONROE ACADEMY FOR BUSINESS/LAW,31,375.0,387.0,376.0,0,0,1
...,...,...,...,...,...,...,...,...,...
75,02M418,MILLENNIUM HIGH SCHOOL,144,528.0,553.0,533.0,2,5,5
76,31R605,STATEN ISLAND TECHNICAL HIGH SCHOOL,227,635.0,682.0,636.0,3,5,5
77,03M479,BEACON HIGH SCHOOL,261,577.0,575.0,592.0,3,5,5
78,22K535,LEON M. GOLDSTEIN HIGH SCHOOL FOR THE SCIENCES,259,524.0,561.0,542.0,3,5,5


### 7a) Parameter Estimation and Variance

### b) Estimate your parameter of interest using an unbiased estimator
$\hat{\tau_{\text{st}}} = \sum_{h=1}^L N_h\bar{y}_h$

$\hat{\mu_{\text{st}}} = \frac{1}{N}\hat{\tau_{\text{st}}}$

In [25]:
y_bar_h_opt = stratified_sample_opt_df.groupby("Reading_Stratum", observed=False)["SAT Writing Avg. Score"].mean().tolist()

tau_st_hat_opt = sum(N_h * y_bar_h_opt)
mu_st_hat_opt = tau_st_hat_opt / N

round(float(mu_st_hat_prop), 2)

364.34

### c) Estimate its variance and provide a confidence interval at the $\alpha$ level chosen in Report 2.
$\hat{\text{var}}(\hat{\tau_{\text{st}}}) = \sum_{h=1}^L N_h(N_h - n_h)\frac{\sigma^2_h}{n_h}$

$\hat{\text{var}}(\hat{\mu_{\text{st}}}) = \frac{1}{N^2}\hat{\text{var}}(\hat{\tau_{\text{st}}})$

In [26]:
var_hat_tau_st_hat_opt = sum(N_h * (N_h - n_h_opt) * (sigma2_h / n_h_opt))

var_hat_mu_st_hat_opt = (1 / (N**2)) * var_hat_tau_st_hat_opt

round(float(var_hat_mu_st_hat_opt), 2)

0.65

### d) Use the Satterthwaite formula for adjusted degrees of freedom
$\large a_h = \frac{N_h(N_h - n_h)}{n_h}$

$\large d = \frac{(\sum_{h=1}^L a_h s^2_h)^2}{\sum_{h=1}^L \frac{(a_h s^2_h)^2}{n_h - 1}}$

In [27]:
s2_h_opt = (
    stratified_sample_opt_df
    .groupby("Reading_Stratum", observed=False)["SAT Writing Avg. Score"]
    .var(ddof=1)
    .tolist()
)
s2_h_opt = np.array(s2_h_opt)
n_h_opt = np.array(n_h_opt)

a_h_opt = N_h * (N_h - n_h_opt) / n_h_opt

d_opt = sum(a_h * s2_h_opt)**2 / sum((a_h * s2_h_opt)**2 / (n_h_opt - 1))
float(d_opt)

63.87955638584482

In [28]:
from scipy.stats import t

alpha = 0.05
t_crit = t.ppf(1-alpha/2, d_opt)

SDE = t_crit * float(np.sqrt(var_hat_mu_st_hat_opt))

CI_opt = mu_st_hat_opt - SDE, mu_st_hat_opt + SDE
print(f"95% CI for mu with Stratified Random Sample with Optimum Allocation is: ({CI_opt[0].round(2)}, {CI_opt[1].round(2)}) \n")

95% CI for mu with Stratified Random Sample with Optimum Allocation is: (360.3, 363.53) 



# 8. Best Estimator Selection

In [29]:
print(CI[0].round(2).astype(float), CI[1].round(2).astype(float))
print(CI_prop[0].round(2).astype(float), CI_prop[1].round(2).astype(float))
print(CI_opt[0].round(2).astype(float), CI_opt[1].round(2).astype(float))
print()
print("Variance of mu estimate for Equal allocation:", round(var_hat_mu_st_hat, 3))
print("Variance of mu estimate for Proportional allocation:", round(var_hat_mu_st_hat_prop, 3))
print("Variance of mu estimate for Optimal allocation:", round(var_hat_mu_st_hat_opt, 3))

358.11 361.62
362.32 366.35
360.3 363.53

Variance of mu estimate for Equal allocation: 0.76
Variance of mu estimate for Proportional allocation: 1.015
Variance of mu estimate for Optimal allocation: 0.652


Of the three estimators, the estimator for $\mu$ under Stratified Random Sampling via Optimal Allocation has the lowest variance of $0.652$. Therefore it is the best Estimator of the three.