In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

#  read the dataset
df = pd.read_csv("2012-sat-results.csv")

print(df.info())
print("")

# convert all values to numeric
df["SAT Critical Reading Avg. Score"] = pd.to_numeric(df["SAT Critical Reading Avg. Score"], errors="coerce")
df["SAT Math Avg. Score"] = pd.to_numeric(df["SAT Math Avg. Score"], errors="coerce")
df["SAT Writing Avg. Score"] = pd.to_numeric(df["SAT Writing Avg. Score"], errors="coerce")

# Drop rows with NaN values
df = df.dropna(subset=["SAT Critical Reading Avg. Score", "SAT Math Avg. Score", "SAT Writing Avg. Score"])

print(df.info())
print("")

# population params
mu = df["SAT Writing Avg. Score"].mean()
tao = df["SAT Writing Avg. Score"].sum()
sigmasq = df["SAT Writing Avg. Score"].var(ddof=0)

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

#### M = 421
#### ${\mu}$ = 393.9857
#### sample_size = 80

### Problem 1: Divide your population into N (3 to 5) primary units with different number of secondary units (Nh should not be equal).

In [541]:
# N = 5 groups

bins = [280, 370, 410, 460, 500, df["SAT Critical Reading Avg. Score"].max()]
labels = [0, 1, 2, 3, 4]

# Create the Reading_Stratum column
df["Reading_Stratum"] = pd.cut(df["SAT Critical Reading Avg. Score"], bins=bins, labels=labels, include_lowest=True)

# Calculate Nh and sigma_sq_h
reading = {
    "Nh": df.groupby("Reading_Stratum", observed=False).size().tolist(),
    "sigma_sq_h": df.groupby("Reading_Stratum", observed=False)["SAT Critical Reading Avg. Score"].var(ddof=1).tolist()
}
reading

{'Nh': [120, 178, 75, 23, 24],
 'sigma_sq_h': [359.5436974789913,
  125.25550688757683,
  192.81981981981997,
  153.30039525691697,
  2583.644927536234]}

In [565]:
np.random.seed(422)

N = 5
n = 4
sample_size = 80

primary_grps = np.random.choice(labels, size=n, replace=False)
primary_grps

array([0, 1, 4, 3])

# Problem 2: Perform two stage design with SRS at each stage

In [587]:
df_selected = df[df["Reading_Stratum"].isin(primary_grps)]

Mi = df_selected.groupby("Reading_Stratum").size()

# mi proportional to size of each primary unit
m_i = (Mi / Mi.sum() * sample_size).round().astype(int)

# Make sure that m_i is exactly 80
while m_i.sum() != sample_size:
    diff = sample_size - m_i.sum()
    idx = np.random.choice(m_i.index)
    m_i[idx] += diff

sampled_df_list = []

# Sample mi[i] units for primary_group[i]
for group, num_samples in m_i.items():
    group_df = df_selected[df_selected["Reading_Stratum"] == group]
    sampled_group = group_df.sample(n=num_samples, random_state=5577) #70 #5577
    sampled_group["Selected_Primary_Group"] = group
    sampled_df_list.append(sampled_group)

final_sampled_df = pd.concat(sampled_df_list)

  Mi = df_selected.groupby("Reading_Stratum").size()


In [589]:
final_sampled_df.head()

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score,Reading_Stratum,Selected_Primary_Group
291,16K393,FREDERICK DOUGLASS ACADEMY IV SECONDARY SCHOOL,20,355.0,355.0,358.0,0,0
383,24Q530,INTERNATIONAL HIGH SCHOOL AT LAGUARDIA COMMUNI...,69,326.0,409.0,329.0,0,0
170,09X403,BRONX INTERNATIONAL HIGH SCHOOL,49,314.0,312.0,339.0,0,0
300,17K524,INTERNATIONAL HIGH SCHOOL AT PROSPECT HEIGHTS,71,287.0,335.0,291.0,0,0
115,06M467,HIGH SCHOOL FOR LAW AND PUBLIC SERVICE,94,363.0,378.0,361.0,0,0


## Problem 3: Estimate your parameter of interest by unbiased estimator. Estimate its variance and standard deviation.

In [592]:
ybar_i = final_sampled_df.groupby("Selected_Primary_Group")["SAT Critical Reading Avg. Score"].mean()

Mi_selected = Mi.loc[ybar_i.index]
mi_selected = m_i.loc[ybar_i.index]
yhat_i = Mi_selected * ybar_i

In [594]:
yhat_i

Selected_Primary_Group
0    41091.428571
1    69003.219512
3    10819.200000
4    12944.000000
dtype: float64

In [596]:
ybar_i

Selected_Primary_Group
0    342.428571
1    387.658537
3    470.400000
4    539.333333
Name: SAT Critical Reading Avg. Score, dtype: float64

In [598]:
M = 421
mu_hat = N * sum(yhat_i) / (n * M)
mu_hat

397.4401665190727

In [600]:
M = 421
mu_hat_1 = 1/n * sum(yhat_i)

su_sq = ((yhat_i - mu_hat_1)**2).sum() / (n - 1)
si_sq = final_sampled_df.groupby("Selected_Primary_Group")["SAT Critical Reading Avg. Score"].var(ddof=1)
var_hat_tau_hat = N*(N-n)*su_sq/n + (N/n)*(Mi_selected*(Mi_selected - mi_selected)*si_sq/mi_selected).sum()
var_hat_mu_hat = var_hat_tau_hat / M**2

In [602]:
var_hat_mu_hat

5303.92544410031