In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Read the dataset
df = pd.read_csv("2012-sat-results.csv")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
df.info()
print("")

# Convert the three score columns to numeric. errors="coerce" turns any entry
# that cannot be parsed as a number into NaN instead of raising.
score_cols = [
    "SAT Critical Reading Avg. Score",
    "SAT Math Avg. Score",
    "SAT Writing Avg. Score",
]
df[score_cols] = df[score_cols].apply(pd.to_numeric, errors="coerce")

# Drop rows with NaN in any score column.
# NOTE: the index is not reset here, so the row labels keep gaps where rows were removed.
df = df.dropna(subset=score_cols)

df.info()
print("")

# Population parameters of the writing score
writing_scores = df["SAT Writing Avg. Score"]
mu = writing_scores.mean()             # population mean
tao = writing_scores.sum()             # population total
sigmasq = writing_scores.var(ddof=0)   # ddof=0 -> population (not sample) variance

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

In [2]:
# Seed Python's built-in RNG so the random draws in the cells below are reproducible.
random.seed(a=420)

In [3]:
# Making length of the DF not a prime number by randomly removing one entry.
# randint() yields a *position* in [0, len(df) - 1], whereas DataFrame.drop(index=...)
# expects an index *label*. The earlier dropna() left gaps in df's labels, so the
# position is translated to its label first; otherwise the drop could raise KeyError
# or remove a row other than the one at the drawn position.
# NOTE: re-running this cell removes one more row each time.
idx = random.randint(0, len(df) - 1)
df = df.drop(index=df.index[idx]).reset_index(drop=True)

In [4]:
# Report the population mean and size after cleaning.
# The column key uses single quotes because reusing the enclosing double quotes
# inside an f-string is a SyntaxError on Python versions before 3.12.
print(f"mu: {df['SAT Writing Avg. Score'].mean()}")
print(f"Population size M: {len(df)}")
print("sample size n: 80")  # constant text, so no f-string prefix is needed

mu: 393.8952380952381
Population size M: 420
sample size n: 80


## 1. Perform a systematic sample with (approximately) the same total sample size as you used in previous Reports

In [16]:
# 4-in-21 systematic sample: pick 4 random starting positions inside the first
# block of 21 units, then repeat those positions in every following block.

N = 21            # block length (stride between consecutive picks of one start)
n = 4             # number of random starts
M = len(df)       # population size
sample_size = 80

Mi = M // N  # number of complete blocks; each start contributes Mi units

# random starting indices
s1 = random.sample(range(N), n)

writing_scores = df['SAT Writing Avg. Score']

# One inner list per random start: start, start + N, start + 2N, ... (Mi terms),
# keeping only positions that fall inside the population.
indices = [
    [start + block * N for block in range(Mi) if start + block * N < M]
    for start in s1
]

# Positional lookup of the writing score at every sampled position,
# grouped the same way as `indices`.
samples = [[writing_scores.iloc[pos] for pos in picks] for picks in indices]

samples

[[359.0,
  523.0,
  400.0,
  408.0,
  398.0,
  357.0,
  365.0,
  402.0,
  352.0,
  368.0,
  349.0,
  371.0,
  358.0,
  355.0,
  380.0,
  373.0,
  380.0,
  401.0,
  359.0,
  417.0],
 [363.0,
  357.0,
  351.0,
  376.0,
  357.0,
  348.0,
  376.0,
  339.0,
  400.0,
  391.0,
  341.0,
  381.0,
  368.0,
  335.0,
  383.0,
  356.0,
  396.0,
  464.0,
  596.0,
  385.0],
 [405.0,
  388.0,
  429.0,
  373.0,
  349.0,
  389.0,
  356.0,
  301.0,
  367.0,
  364.0,
  431.0,
  381.0,
  441.0,
  374.0,
  378.0,
  400.0,
  441.0,
  454.0,
  398.0,
  350.0],
 [392.0,
  381.0,
  384.0,
  577.0,
  588.0,
  383.0,
  374.0,
  365.0,
  364.0,
  354.0,
  312.0,
  350.0,
  291.0,
  359.0,
  424.0,
  359.0,
  481.0,
  368.0,
  359.0,
  370.0]]

## 2. Display the indexes of your sampled units. When systematic sampling is executed correctly, the indexes of the sampled units will follow a distinct pattern

In [8]:
# Show the sampled positions: one inner list per random start.
print("Sampled units indices:", end=" ")
print(indices)

Sampled units indices: [[11, 32, 53, 74, 95, 116, 137, 158, 179, 200, 221, 242, 263, 284, 305, 326, 347, 368, 389, 410], [8, 29, 50, 71, 92, 113, 134, 155, 176, 197, 218, 239, 260, 281, 302, 323, 344, 365, 386, 407], [12, 33, 54, 75, 96, 117, 138, 159, 180, 201, 222, 243, 264, 285, 306, 327, 348, 369, 390, 411], [3, 24, 45, 66, 87, 108, 129, 150, 171, 192, 213, 234, 255, 276, 297, 318, 339, 360, 381, 402]]
