In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

#  read the dataset
df = pd.read_csv("2012-sat-results.csv")

print(df.info())
print("")

# convert all values to numeric
df["SAT Critical Reading Avg. Score"] = pd.to_numeric(df["SAT Critical Reading Avg. Score"], errors="coerce")
df["SAT Math Avg. Score"] = pd.to_numeric(df["SAT Math Avg. Score"], errors="coerce")
df["SAT Writing Avg. Score"] = pd.to_numeric(df["SAT Writing Avg. Score"], errors="coerce")

# Drop rows with NaN values
df = df.dropna(subset=["SAT Critical Reading Avg. Score", "SAT Math Avg. Score", "SAT Writing Avg. Score"])

print(df.info())
print("")

# population params
mu = df["SAT Writing Avg. Score"].mean()
tao = df["SAT Writing Avg. Score"].sum()
sigmasq = df["SAT Writing Avg. Score"].var(ddof=0)

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

In [46]:
random.seed(420)

In [47]:
# Making length of the DF not a prime number by randomly removing one entry
idx = random.randint(0, len(df) - 1)
df = df.drop(index=idx).reset_index(drop=True)

In [48]:
print(f"mu: {df['SAT Writing Avg. Score'].mean()}")
print(f"Population size M: {len(df)}")
print(f"sample size n: 80")

mu: 393.8952380952381
Population size M: 420
sample size n: 80


## 1. Perform a systematic sample with (approximately) the same total sample size as you used in previous Reports

In [49]:
# Performing a 4 - in - 21 systematic sample

N = 21
n = 4
M = len(df)
sample_size = 80

Mi = M // N # 20 groups (4 * Mi) = 80

# random starting indices
s1 = random.sample(range(N), n)

# sampling from each group
indices = [[] for _ in range(n)]
samples = [[] for _ in range(n)]

# Systematic sampling with fixed offset
for i in range(Mi):  # for each group
    s = [pos + i * N for pos in s1]  # apply offset to each of the starting pos
    for j in range(n):
        if s[j] < M:  # make sure index is within bounds
            indices[j].append(s[j])
            samples[j].append(df['SAT Writing Avg. Score'].iloc[s[j]])

samples

[[378.0,
  428.0,
  477.0,
  402.0,
  416.0,
  407.0,
  357.0,
  394.0,
  356.0,
  341.0,
  389.0,
  368.0,
  388.0,
  386.0,
  411.0,
  331.0,
  395.0,
  392.0,
  377.0,
  352.0],
 [628.0,
  592.0,
  431.0,
  358.0,
  404.0,
  363.0,
  376.0,
  371.0,
  414.0,
  365.0,
  395.0,
  380.0,
  351.0,
  379.0,
  374.0,
  440.0,
  318.0,
  360.0,
  394.0,
  397.0],
 [362.0,
  391.0,
  512.0,
  411.0,
  367.0,
  364.0,
  415.0,
  382.0,
  317.0,
  360.0,
  368.0,
  341.0,
  359.0,
  381.0,
  375.0,
  311.0,
  382.0,
  425.0,
  405.0,
  361.0],
 [359.0,
  523.0,
  400.0,
  408.0,
  398.0,
  357.0,
  365.0,
  402.0,
  352.0,
  368.0,
  349.0,
  371.0,
  358.0,
  355.0,
  380.0,
  373.0,
  380.0,
  401.0,
  359.0,
  417.0]]

## 2. Display the indexes of your sampled units. When systematic sampling is executed correctly, the indexes of the sampled units will follow a distinct pattern

In [50]:
print("Sampled units indices:", indices)

Sampled units indices: [[11, 32, 53, 74, 95, 116, 137, 158, 179, 200, 221, 242, 263, 284, 305, 326, 347, 368, 389, 410], [8, 29, 50, 71, 92, 113, 134, 155, 176, 197, 218, 239, 260, 281, 302, 323, 344, 365, 386, 407], [12, 33, 54, 75, 96, 117, 138, 159, 180, 201, 222, 243, 264, 285, 306, 327, 348, 369, 390, 411], [3, 24, 45, 66, 87, 108, 129, 150, 171, 192, 213, 234, 255, 276, 297, 318, 339, 360, 381, 402]]


## 3. Estimate your parameter of interest by an unbiased estimator. Estimate its variance and give a confidence interval of α level chosen in Report 2.

- $\hat\mu\ = \frac{\hat\tau}{M}$
- $\hat\tau = N\bar{y} = N\sum_{i=1}^n\frac{y_i}{n}$ where $y_i$ is the total of $y$-values in the $i$-th primary unit



In [51]:
yi = [sum(sample) for sample in samples]
tau_hat = N * (sum(yi) / n)
mu_hat = tau_hat / M
mu_hat

388.7375

##### Estimated Variance
- $\hat{\text{var}}(\hat\mu) = \frac{1}{M^2}\hat{\text{var}}({\hat\tau})$
- $\hat{\text{var}}({\hat\tau}) = N(N-n)\frac{s^2_u}{n}$
- $s^2_u = \frac{1}{n-1} \sum_{i=1}^n (y_i - \bar{y})^2$

In [52]:
y_bar = np.mean(yi)
s_u_squared = np.sum((yi - y_bar) ** 2) / (n - 1)
var_hat_tau_hat = N * (N - n) * s_u_squared / n
var_hat_mu_hat = var_hat_tau_hat / (M ** 2)
var_hat_mu_hat

24.407269345238095

##### Confidence Interval of $\alpha=0.05$
- $\hat\mu \pm t_{n-1, \frac{\alpha}{2}}\sqrt{\hat{\text{var}}(\hat\mu)}$

In [53]:
from scipy.stats import t
# Confidence Interval with alpha = .05
alpha = .05
t_crit = t.ppf(1-(alpha/2), n-1)
SDE = t_crit * np.sqrt(var_hat_mu_hat)

CI = mu_hat - SDE, mu_hat + SDE
print(f"95% CI for mu: ({CI[0].round(2)}, {CI[1].round(2)}) \n")

95% CI for mu: (373.02, 404.46) 



# 4. Perform another systematic sampling approach, but this time, let’s change the value of $N$ that you used before. 
(As result, your $n$ will bechanged also).

In [54]:
M = 420
for i in range(1,60):
    n = 80 / (M / i)
    print(i, n)

1 0.19047619047619047
2 0.38095238095238093
3 0.5714285714285714
4 0.7619047619047619
5 0.9523809523809523
6 1.1428571428571428
7 1.3333333333333333
8 1.5238095238095237
9 1.7142857142857144
10 1.9047619047619047
11 2.0952380952380953
12 2.2857142857142856
13 2.4761904761904763
14 2.6666666666666665
15 2.857142857142857
16 3.0476190476190474
17 3.238095238095238
18 3.428571428571429
19 3.619047619047619
20 3.8095238095238093
21 4.0
22 4.190476190476191
23 4.380952380952381
24 4.571428571428571
25 4.761904761904762
26 4.9523809523809526
27 5.142857142857143
28 5.333333333333333
29 5.523809523809524
30 5.714285714285714
31 5.904761904761904
32 6.095238095238095
33 6.2857142857142865
34 6.476190476190476
35 6.666666666666667
36 6.857142857142858
37 7.047619047619048
38 7.238095238095238
39 7.428571428571428
40 7.619047619047619
41 7.809523809523809
42 8.0
43 8.190476190476192
44 8.380952380952381
45 8.571428571428571
46 8.761904761904763
47 8.952380952380953
48 9.142857142857142
49 9.3333

In [55]:
# Performing a 5 - in - 25 systematic sample

N = 42
n = 8
M = len(df)
sample_size = 80

Mi = M // N # 20 groups (4 * Mi) = 80

# random starting indices
s1 = random.sample(range(N), n)

# sampling from each group
indices = [[] for _ in range(n)]
samples = [[] for _ in range(n)]

# Systematic sampling with fixed offset
for i in range(Mi):  # for each group
    s = [pos + i * N for pos in s1]  # apply offset to each of the starting pos
    for j in range(n):
        if s[j] < M:  # make sure index is within bounds
            indices[j].append(s[j])
            samples[j].append(df['SAT Writing Avg. Score'].iloc[s[j]])

samples

[[411.0, 335.0, 442.0, 399.0, 339.0, 384.0, 369.0, 363.0, 443.0, 363.0],
 [366.0, 475.0, 345.0, 393.0, 372.0, 362.0, 411.0, 370.0, 385.0, 384.0],
 [391.0, 411.0, 364.0, 382.0, 360.0, 341.0, 381.0, 311.0, 425.0, 361.0],
 [479.0, 359.0, 326.0, 297.0, 383.0, 359.0, 373.0, 365.0, 384.0, 450.0],
 [381.0, 577.0, 383.0, 365.0, 354.0, 350.0, 359.0, 359.0, 368.0, 370.0],
 [682.0, 395.0, 375.0, 330.0, 398.0, 385.0, 333.0, 550.0, 442.0, 426.0],
 [628.0, 431.0, 404.0, 376.0, 414.0, 395.0, 351.0, 374.0, 318.0, 394.0],
 [349.0, 351.0, 370.0, 370.0, 360.0, 383.0, 393.0, 332.0, 400.0, 334.0]]

# 5. Estimate your parameter of interest by an unbiased estimator. Estimate its variance and give a confidence interval of $\alpha$ level chosen in Report 2.

- $\hat\mu\ = \frac{\hat\tau}{M}$
- $\hat\tau = N\bar{y} = N\sum_{i=1}^n\frac{y_i}{n}$ where $y_i$ is the total of $y$-values in the $i$-th primary unit

In [56]:
yi = [sum(sample) for sample in samples]
tau_hat = N * (sum(yi) / n)
mu_hat = tau_hat / M
float(mu_hat)


389.025

##### Estimated Variance
- $\hat{\text{var}}(\hat\mu) = \frac{1}{M^2}\hat{\text{var}}({\hat\tau})$
- $\hat{\text{var}}({\hat\tau}) = N(N-n)\frac{s^2_u}{n}$
- $s^2_u = \frac{1}{n-1} \sum_{i=1}^n (y_i - \bar{y})^2$

In [57]:
y_bar = np.mean(yi)
s_u_squared = np.sum((yi - y_bar) ** 2) / (n - 1)
var_hat_tau_hat = N * (N - n) * s_u_squared / n
var_hat_mu_hat = var_hat_tau_hat / (M ** 2)
float(var_hat_mu_hat)

46.81758078231293

##### Confidence Interval of $\alpha=0.05$
- $\hat\mu \pm t_{n-1, \frac{\alpha}{2}}\sqrt{\hat{\text{var}}(\hat\mu)}$

In [58]:
from scipy.stats import t
# Confidence Interval with alpha = .05
alpha = .05
t_crit = t.ppf(1-(alpha/2), n-1)
SDE = t_crit * np.sqrt(var_hat_mu_hat)

CI = mu_hat - SDE, mu_hat + SDE
print(f"95% CI for mu: ({CI[0].round(2)}, {CI[1].round(2)}) \n")

95% CI for mu: (372.85, 405.2) 



# 6. Order your population with respect to y (variable of interest). Repeat steps 1-4 for the ordered data

In [59]:
df_ordered = df.sort_values(by='SAT Writing Avg. Score').reset_index(drop=True)
df_ordered

Unnamed: 0,DBN,SCHOOL NAME,Num of SAT Test Takers,SAT Critical Reading Avg. Score,SAT Math Avg. Score,SAT Writing Avg. Score
0,19K583,MULTICULTURAL HIGH SCHOOL,29,279.0,322.0,286.0
1,17K524,INTERNATIONAL HIGH SCHOOL AT PROSPECT HEIGHTS,71,287.0,335.0,291.0
2,09X365,ACADEMY FOR LANGUAGE AND TECHNOLOGY,54,315.0,339.0,297.0
3,12X388,PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE,30,321.0,351.0,298.0
4,15K520,PACIFIC HIGH SCHOOL,9,352.0,341.0,300.0
...,...,...,...,...,...,...
415,10X696,HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE,92,636.0,648.0,636.0
416,31R605,STATEN ISLAND TECHNICAL HIGH SCHOOL,227,635.0,682.0,636.0
417,25Q525,TOWNSEND HARRIS HIGH SCHOOL,278,621.0,651.0,638.0
418,10X445,BRONX HIGH SCHOOL OF SCIENCE,731,632.0,688.0,649.0


In [60]:
# Performing a 4 - in - 21 systematic sample

N = 21
n = 4
M = len(df_ordered)
sample_size = 80

Mi = M // N # 20 groups (4 * Mi) = 80

# random starting indices
s1 = random.sample(range(N), n)

# sampling from each group
indices = [[] for _ in range(n)]
samples = [[] for _ in range(n)]

# Systematic sampling with fixed offset
for i in range(Mi):  # for each group
    s = [pos + i * N for pos in s1]  # apply offset to each of the starting pos
    for j in range(n):
        if s[j] < M:  # make sure index is within bounds
            indices[j].append(s[j])
            samples[j].append(df_ordered['SAT Writing Avg. Score'].iloc[s[j]])

samples

[[312.0,
  339.0,
  349.0,
  354.0,
  358.0,
  362.0,
  367.0,
  370.0,
  375.0,
  379.0,
  383.0,
  388.0,
  393.0,
  398.0,
  405.0,
  414.0,
  426.0,
  442.0,
  477.0,
  587.0],
 [316.0,
  340.0,
  350.0,
  354.0,
  359.0,
  363.0,
  367.0,
  370.0,
  376.0,
  380.0,
  384.0,
  388.0,
  393.0,
  399.0,
  407.0,
  416.0,
  428.0,
  448.0,
  481.0,
  592.0],
 [311.0,
  335.0,
  349.0,
  354.0,
  358.0,
  362.0,
  366.0,
  370.0,
  374.0,
  378.0,
  383.0,
  387.0,
  392.0,
  396.0,
  404.0,
  414.0,
  425.0,
  442.0,
  475.0,
  570.0],
 [298.0,
  333.0,
  345.0,
  352.0,
  357.0,
  361.0,
  365.0,
  369.0,
  373.0,
  377.0,
  382.0,
  385.0,
  391.0,
  395.0,
  402.0,
  411.0,
  423.0,
  440.0,
  467.0,
  533.0]]

In [61]:
print("Sampled units indices:", indices)

Sampled units indices: [[10, 31, 52, 73, 94, 115, 136, 157, 178, 199, 220, 241, 262, 283, 304, 325, 346, 367, 388, 409], [13, 34, 55, 76, 97, 118, 139, 160, 181, 202, 223, 244, 265, 286, 307, 328, 349, 370, 391, 412], [8, 29, 50, 71, 92, 113, 134, 155, 176, 197, 218, 239, 260, 281, 302, 323, 344, 365, 386, 407], [3, 24, 45, 66, 87, 108, 129, 150, 171, 192, 213, 234, 255, 276, 297, 318, 339, 360, 381, 402]]


In [62]:
yi = [sum(sample) for sample in samples]
tau_hat = N * (sum(yi) / n)
mu_hat = tau_hat / M
mu_hat

392.4125

In [63]:
y_bar = np.mean(yi)
s_u_squared = np.sum((yi - y_bar) ** 2) / (n - 1)
var_hat_tau_hat = N * (N - n) * s_u_squared / n
var_hat_mu_hat = var_hat_tau_hat / (M ** 2)
var_hat_mu_hat

2.158519345238095

In [64]:
from scipy.stats import t
# Confidence Interval with alpha = .05
alpha = .05
t_crit = t.ppf(1-(alpha/2), n-1)
SDE = t_crit * np.sqrt(var_hat_mu_hat)

CI = mu_hat - SDE, mu_hat + SDE
print(f"95% CI for mu: ({CI[0].round(2)}, {CI[1].round(2)}) \n")

95% CI for mu: (387.74, 397.09) 



In [65]:
M = 420
for i in range(1,60):
    n = 80 / (M / i)
    print(i, n)

1 0.19047619047619047
2 0.38095238095238093
3 0.5714285714285714
4 0.7619047619047619
5 0.9523809523809523
6 1.1428571428571428
7 1.3333333333333333
8 1.5238095238095237
9 1.7142857142857144
10 1.9047619047619047
11 2.0952380952380953
12 2.2857142857142856
13 2.4761904761904763
14 2.6666666666666665
15 2.857142857142857
16 3.0476190476190474
17 3.238095238095238
18 3.428571428571429
19 3.619047619047619
20 3.8095238095238093
21 4.0
22 4.190476190476191
23 4.380952380952381
24 4.571428571428571
25 4.761904761904762
26 4.9523809523809526
27 5.142857142857143
28 5.333333333333333
29 5.523809523809524
30 5.714285714285714
31 5.904761904761904
32 6.095238095238095
33 6.2857142857142865
34 6.476190476190476
35 6.666666666666667
36 6.857142857142858
37 7.047619047619048
38 7.238095238095238
39 7.428571428571428
40 7.619047619047619
41 7.809523809523809
42 8.0
43 8.190476190476192
44 8.380952380952381
45 8.571428571428571
46 8.761904761904763
47 8.952380952380953
48 9.142857142857142
49 9.3333

In [66]:
# Performing a 5 - in - 25 systematic sample

N = 42
n = 8
M = len(df_ordered)
sample_size = 80

Mi = M // N # 20 groups (4 * Mi) = 80

# random starting indices
s1 = random.sample(range(N), n)

# sampling from each group
indices = [[] for _ in range(n)]
samples = [[] for _ in range(n)]

# Systematic sampling with fixed offset
for i in range(Mi):  # for each group
    s = [pos + i * N for pos in s1]  # apply offset to each of the starting pos
    for j in range(n):
        if s[j] < M:  # make sure index is within bounds
            indices[j].append(s[j])
            samples[j].append(df_ordered['SAT Writing Avg. Score'].iloc[s[j]])

samples

[[330.0, 351.0, 360.0, 368.0, 377.0, 385.0, 394.0, 411.0, 433.0, 518.0],
 [326.0, 351.0, 359.0, 368.0, 376.0, 384.0, 394.0, 410.0, 431.0, 496.0],
 [311.0, 349.0, 358.0, 367.0, 375.0, 383.0, 392.0, 404.0, 425.0, 476.0],
 [302.0, 348.0, 358.0, 365.0, 374.0, 382.0, 392.0, 403.0, 424.0, 467.0],
 [318.0, 350.0, 359.0, 368.0, 376.0, 384.0, 394.0, 408.0, 430.0, 494.0],
 [339.0, 354.0, 362.0, 370.0, 379.0, 388.0, 398.0, 414.0, 442.0, 587.0],
 [312.0, 349.0, 359.0, 367.0, 375.0, 383.0, 393.0, 405.0, 426.0, 479.0],
 [335.0, 354.0, 362.0, 370.0, 379.0, 387.0, 397.0, 414.0, 442.0, 577.0]]

# 7. Choose the best estimator of your parameter based on the smallest estimated variance
The var_hat_mu_hat from the ordered systematic sample produced a smaller estimated variance (2.158519345238095) than the periodic one. Therefore, the var_hat_mu_hat from the ordered population is better.