In [127]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import t

# Read the dataset; every column is loaded as a string (object dtype).
df = pd.read_csv("2012-sat-results.csv")

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly -- wrapping it in print() would append a stray "None" line.
df.info()
print("")

# The three score columns the analysis relies on.
score_cols = [
    "SAT Critical Reading Avg. Score",
    "SAT Math Avg. Score",
    "SAT Writing Avg. Score",
]

# Convert the score columns to numeric; entries that cannot be parsed as a
# number become NaN (errors="coerce") instead of raising.
for col in score_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Drop rows with a NaN in any score column. The surviving rows keep their
# original index labels, so the index is no longer a contiguous 0..len-1 range.
df = df.dropna(subset=score_cols)

df.info()
print("")

# Population parameters of the writing score:
#   mu      - population mean
#   tao     - population total (tau)
#   sigmasq - population variance (ddof=0, i.e. divide by the population size)
mu = df["SAT Writing Avg. Score"].mean()
tao = df["SAT Writing Avg. Score"].sum()
sigmasq = df["SAT Writing Avg. Score"].var(ddof=0)

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

In [128]:
# Fix the seed of Python's `random` module so the draws made with it below are
# reproducible from a fresh kernel.
SEED = 420
random.seed(SEED)

In [129]:
# Making length of the DF not a prime number by randomly removing one entry.
# `idx` is a row *position* in 0..len(df)-1. DataFrame.drop(index=...) expects
# an index *label*, and after the earlier dropna the labels have gaps, so the
# position is translated to its label first. Passing the raw position could
# raise KeyError (label already removed) and could never select rows whose
# label is >= len(df).
idx = random.randint(0, len(df) - 1)
df = df.drop(index=df.index[idx]).reset_index(drop=True)

In [130]:
# Summary of the population after cleaning.
# The column name is quoted with single quotes: reusing the f-string's own
# double quotes inside a replacement field is only valid on Python 3.12+.
print(f"mu: {df['SAT Writing Avg. Score'].mean()}")
print(f"Population size M: {len(df)}")
# No placeholders in this message, so a plain string literal is sufficient.
print("sample size n: 80")

mu: 393.8952380952381
Population size M: 420
sample size n: 80


## 1. Perform a systematic sample with (approximately) the same total sample size as you used in previous Reports

In [131]:
# 4-in-21 systematic sample: 4 random starting positions are drawn from the
# first 21 rows, and every 21st row after each start is taken.

N = 21             # sampling interval (number of possible starting positions)
n = 4              # number of starting positions drawn
M = len(df)        # population size
sample_size = 80

Mi = M // N  # rows taken per starting position; n * Mi rows in total

# Random starting positions, drawn without replacement from 0..N-1
s1 = random.sample(range(N), n)

writing_scores = df['SAT Writing Avg. Score']

# One list of row positions per starting position: start, start + N, ...
# (Mi positions each). The `< M` filter guards against running past the end
# of the frame.
indices = [
    [pos for pos in range(start, start + Mi * N, N) if pos < M]
    for start in s1
]

# The writing scores found at those positions, in the same nested layout.
samples = [[writing_scores.iloc[pos] for pos in unit] for unit in indices]

samples

[[np.float64(378.0),
  np.float64(428.0),
  np.float64(477.0),
  np.float64(402.0),
  np.float64(416.0),
  np.float64(407.0),
  np.float64(357.0),
  np.float64(394.0),
  np.float64(356.0),
  np.float64(341.0),
  np.float64(389.0),
  np.float64(368.0),
  np.float64(388.0),
  np.float64(386.0),
  np.float64(411.0),
  np.float64(331.0),
  np.float64(395.0),
  np.float64(392.0),
  np.float64(377.0),
  np.float64(352.0)],
 [np.float64(628.0),
  np.float64(592.0),
  np.float64(431.0),
  np.float64(358.0),
  np.float64(404.0),
  np.float64(363.0),
  np.float64(376.0),
  np.float64(371.0),
  np.float64(414.0),
  np.float64(365.0),
  np.float64(395.0),
  np.float64(380.0),
  np.float64(351.0),
  np.float64(379.0),
  np.float64(374.0),
  np.float64(440.0),
  np.float64(318.0),
  np.float64(360.0),
  np.float64(394.0),
  np.float64(397.0)],
 [np.float64(362.0),
  np.float64(391.0),
  np.float64(512.0),
  np.float64(411.0),
  np.float64(367.0),
  np.float64(364.0),
  np.float64(415.0),
  np.float6

## 2. Display the indexes of your sampled units. When systematic sampling is executed correctly, the indexes of the sampled units will follow a distinct pattern

In [132]:
# One inner list per starting position; consecutive positions inside a list
# are N rows apart.
label = "Sampled units indices:"
print(label, indices)

Sampled units indices: [[11, 32, 53, 74, 95, 116, 137, 158, 179, 200, 221, 242, 263, 284, 305, 326, 347, 368, 389, 410], [8, 29, 50, 71, 92, 113, 134, 155, 176, 197, 218, 239, 260, 281, 302, 323, 344, 365, 386, 407], [12, 33, 54, 75, 96, 117, 138, 159, 180, 201, 222, 243, 264, 285, 306, 327, 348, 369, 390, 411], [3, 24, 45, 66, 87, 108, 129, 150, 171, 192, 213, 234, 255, 276, 297, 318, 339, 360, 381, 402]]


## 3. Estimate your parameter of interest by an unbiased estimator. Estimate its variance and give a confidence interval of α level chosen in Report 2.

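- $\hat{\mu} = \hat{\tau} / M$
- $\hat{\tau} = N\,\bar{y} = \dfrac{N}{n}\sum_{i=1}^{n} y_i$
- where $y_i$ is the total of the y-values in the $i$-th sampled primary unit, $N$ is the number of primary units in the population, $n$ is the number of primary units sampled, and $M$ is the total number of secondary units (schools) in the population.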



In [133]:
# Total of the y-values inside each sampled primary unit.
yi = list(map(sum, samples))

# Expand the mean primary-unit total to the N primary units to estimate the
# population total, then divide by the population size M for the mean.
mean_unit_total = sum(yi) / n
tau_hat = N * mean_unit_total
mu_hat = tau_hat / M
mu_hat

np.float64(388.7375)

##### Estimated Variance
- $\widehat{\operatorname{Var}}(\hat{\mu}) = \dfrac{1}{M^2}\,\widehat{\operatorname{Var}}(\hat{\tau})$
- $\widehat{\operatorname{Var}}(\hat{\tau}) = N(N-n)\,\dfrac{s_u^2}{n}$
- $s_u^2 = \dfrac{1}{n-1}\sum_{i=1}^{n}(y_i - \bar{y})^2$

In [134]:
# Work on an explicit array of the primary-unit totals rather than relying on
# NumPy coercing the list during the subtraction.
unit_totals = np.asarray(yi)
y_bar = unit_totals.mean()

# Sample variance of the primary-unit totals (n - 1 denominator).
squared_devs = (unit_totals - y_bar) ** 2
s_u_squared = squared_devs.sum() / (n - 1)

# Estimated variance of tau_hat, rescaled by 1 / M^2 for mu_hat.
var_hat_tau_hat = N * (N - n) * s_u_squared / n
var_hat_mu_hat = var_hat_tau_hat / M ** 2
var_hat_mu_hat

np.float64(24.407269345238095)

##### Confidence Interval of alpha=.05
- $\hat{\mu} \pm t_{n-1,\,\alpha/2}\,\sqrt{\widehat{\operatorname{Var}}(\hat{\mu})}$

In [135]:
# Two-sided t confidence interval for mu with n - 1 degrees of freedom.
# (`t` is scipy.stats.t, imported with the other dependencies at the top of
# the notebook.)
alpha = .05
t_crit = t.ppf(1 - (alpha / 2), n - 1)

# SDE holds the half-width (margin of error) of the interval:
# critical value times the estimated standard error of mu_hat.
SDE = t_crit * np.sqrt(var_hat_mu_hat)

CI = mu_hat - SDE, mu_hat + SDE
# The confidence level in the label is derived from alpha so the two cannot
# drift apart; for alpha = .05 it renders as "95%".
print(f"{1 - alpha:.0%} CI for mu: ({CI[0].round(2)}, {CI[1].round(2)}) \n")

95% CI for mu: (373.02, 404.46) 

