In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Load the raw 2012 SAT results.
df = pd.read_csv("2012-sat-results.csv")

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line to the output.
df.info()
print("")

# The three score columns are read in as object dtype. Coerce them to numeric;
# errors="coerce" turns any entry that cannot be parsed into NaN.
score_cols = [
    "SAT Critical Reading Avg. Score",
    "SAT Math Avg. Score",
    "SAT Writing Avg. Score",
]
for col in score_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep only the rows that have a numeric value in every score column.
df = df.dropna(subset=score_cols)

df.info()
print("")

# Population parameters of the study variable (SAT Writing Avg. Score),
# computed over every remaining row.
mu = df["SAT Writing Avg. Score"].mean()             # population mean
tao = df["SAT Writing Avg. Score"].sum()             # population total
sigmasq = df["SAT Writing Avg. Score"].var(ddof=0)   # ddof=0 -> divide by N, not N - 1

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

${\mu}$: 393.9857

N: 421

n: 80

# Problem 1. Select Same First Auxiliary Variable as Report 4

In [4]:
# Selecting: SAT Math Avg. Score

# Problem 2. Perform Double Sampling with SRS

In [6]:
# Two-phase (double) sampling with simple random sampling at each phase.
seed = 422
np.random.seed(seed)

n = 80        # second-phase sample size
N = len(df)   # population size, taken from the frame actually being sampled
              # (421 rows in the recorded run) instead of a hard-coded literal
npr = n * 2   # first-phase sample size n'

# Phase 1: SRS without replacement of n' units from the population.
# Phase 2: SRS without replacement of n units from the phase-1 sample.
# NOTE(review): both phases are seeded with the same random_state, so the two
# draws are not driven by independent random streams. Left unchanged so the
# recorded results stay reproducible -- confirm this is intended.
first_phase_sample = df.sample(n=npr, replace=False, random_state=seed)
second_phase_sample = first_phase_sample.sample(n=n, replace=False, random_state=seed)

# Problem 3. Perform a Diagnostic Analysis to Check Ratio Estimator Assumptions

In [8]:
# Diagnostic regression for the ratio estimator: regress the study variable
# (SAT Writing) on the auxiliary variable (SAT Math) using the second-phase
# sample. statsmodels (`sm`) is imported with the other dependencies at the top
# of the notebook.
X = second_phase_sample["SAT Math Avg. Score"]
y = second_phase_sample["SAT Writing Avg. Score"]

# sm.OLS does not add an intercept on its own; add the constant column so the
# fitted model includes an intercept term whose significance can be inspected.
X = sm.add_constant(X)

# Fit ordinary least squares: y ~ const + SAT Math Avg. Score
model = sm.OLS(y, X).fit()

# Print the full regression table (coefficients, standard errors, p-values).
print(model.summary())

                              OLS Regression Results                              
Dep. Variable:     SAT Writing Avg. Score   R-squared:                       0.871
Model:                                OLS   Adj. R-squared:                  0.869
Method:                     Least Squares   F-statistic:                     525.8
Date:                    Sat, 03 May 2025   Prob (F-statistic):           2.09e-36
Time:                            12:28:37   Log-Likelihood:                -361.92
No. Observations:                      80   AIC:                             727.8
Df Residuals:                          78   BIC:                             732.6
Df Model:                               1                                         
Covariance Type:                nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

### Since the p-value for the slope is approximately 0 (< 0.05), we reject the null hypothesis that the slope is zero, i.e., that there is no linear relationship between SAT Math Avg. Score and SAT Writing Avg. Score. Since the p-value of the intercept is 0.025 < 0.05, we also reject the null hypothesis that the intercept is zero, i.e., that the regression line passes through the origin.

# Problem 4. Make conclusion about appropriateness of using ratio estimator based on the result of 3. Does your conclusion agree with part 7 of Report 4 (Regression output based on whole population)?