In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the 2012 SAT results. The recorded info() output shows every column
# arriving as `object`, so the score columns are converted explicitly below.
df = pd.read_csv("2012-sat-results.csv")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("")

# Coerce the three score columns to numeric; entries that cannot be parsed
# become NaN (errors="coerce").
score_columns = [
    "SAT Critical Reading Avg. Score",
    "SAT Math Avg. Score",
    "SAT Writing Avg. Score",
]
for column in score_columns:
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Keep only the rows that have all three scores.
df = df.dropna(subset=score_columns)

df.info()
print("")

# Population parameters of the variable of interest (SAT Writing Avg. Score).
writing_scores = df["SAT Writing Avg. Score"]
mu = writing_scores.mean()            # population mean
tao = writing_scores.sum()            # population total
sigmasq = writing_scores.var(ddof=0)  # population variance (divisor N, not N - 1)

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

${\mu}$: 393.9857

N: 421

n: 80

# Problem 1. Select Same First Auxiliary Variable as Report 4

In [2]:
# Selecting: SAT Math Avg. Score

# Problem 2. Perform Double Sampling with SRS

In [3]:
# Sampling design: population size N, second-phase size n, and a first-phase
# size n' (npr) that is twice the second-phase size.
N = 421
n = 80
npr = 2 * n

# Seed NumPy's global generator and reuse the same seed for each draw so the
# samples are reproducible.
# NOTE(review): both phases are drawn with the same integer random_state, so
# the two draws do not come from independent random streams -- confirm this is
# intended.
seed = 422
np.random.seed(seed)

# Phase 1: simple random sample without replacement of n' rows from df.
first_phase_sample = df.sample(n=npr, random_state=seed, replace=False)
# Phase 2: simple random sample without replacement of n rows from phase 1.
second_phase_sample = first_phase_sample.sample(n=n, random_state=seed, replace=False)

# Problem 3. Perform a Diagnostic Analysis to Check Ratio Estimator Assumptions

In [4]:
import statsmodels.api as sm

# Regress the variable of interest (Writing) on the auxiliary variable (Math)
# using the second-phase sample.
y = second_phase_sample["SAT Writing Avg. Score"]

# add_constant adds an intercept column, so the fitted model reports an
# intercept term alongside the slope.
X = sm.add_constant(second_phase_sample["SAT Math Avg. Score"])

# Ordinary least squares fit, then the full regression table.
model = sm.OLS(y, X).fit()
print(model.summary())

                              OLS Regression Results                              
Dep. Variable:     SAT Writing Avg. Score   R-squared:                       0.871
Model:                                OLS   Adj. R-squared:                  0.869
Method:                     Least Squares   F-statistic:                     525.8
Date:                    Mon, 05 May 2025   Prob (F-statistic):           2.09e-36
Time:                            12:20:48   Log-Likelihood:                -361.92
No. Observations:                      80   AIC:                             727.8
Df Residuals:                          78   BIC:                             732.6
Df Model:                               1                                         
Covariance Type:                nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

### Since the p-value for the slope is approximately 0 (< 0.05), we reject the null hypothesis that there is no linear relationship between SAT Math Avg. Score and SAT Writing Avg. Score. Since the p-value for the intercept is 0.025 < 0.05, we also reject the null hypothesis that the intercept is zero, i.e., that the regression line passes through the origin.

# Problem 4. Make conclusion about appropriateness of using ratio estimator based on the result of 3. Does your conclusion agree with part 7 of Report 4 (Regression output based on whole population)?

The ratio estimator assumes that the variable of interest is linearly related to the auxiliary variable, and in our case there is a significant linear relationship between SAT Math Avg. Score and SAT Writing Avg. Score. Additionally, the ratio estimator assumes that the variable of interest tends to zero when the auxiliary variable tends to zero, which is not satisfied here, since the fitted regression line does not pass through the origin, as shown above. Since one of the assumptions isn't met, the ratio estimator isn't the best fit here.

This agrees with Report 4, where the ratio estimator gave a poorer estimate than the linear regression estimator. In that report's regression output for the whole population, we observed that linearity holds but the origin-passing condition is not satisfied. This matches the conclusions we just reached using double sampling with SAT Math Avg. Score as the auxiliary variable.

# 5. Estimate your parameter of interest by ratio estimator. Estimate its variance and standard deviation.

$\Large r = \frac{\sum_{i=1}^n y_i}{\sum_{i=1}^n x_i}$

$\Large \hat{\tau_x} = \frac{N}{n'}\sum_{i=1}^{n'} x_i$

$\Large \hat{\tau_r} = r\hat{\tau_x}$

$\Large \hat{\mu_r} = \frac{1}{N}\hat{\tau_r}$

In [5]:
# Second-phase sample (i = 1..n): variable of interest y and auxiliary x.
y_i = second_phase_sample['SAT Writing Avg. Score']
x_i = second_phase_sample['SAT Math Avg. Score']
# First-phase sample (i = 1..n'): only the auxiliary variable is used.
x_i_pr = first_phase_sample['SAT Math Avg. Score']

# Sample ratio r = sum(y_i) / sum(x_i). Series.sum() is the vectorized pandas
# reduction; the builtin sum() iterated the Series element by element.
r = y_i.sum() / x_i.sum()
# Expansion estimate of the population total of x from the first phase.
tau_hat_x = (N / npr) * x_i_pr.sum()

# Ratio estimates of the population total and mean of y.
tau_hat_r = r * tau_hat_x
mu_hat_r = (1 / N) * tau_hat_r
round(mu_hat_r, 2)

396.35

$\Large \bar{y} = \frac{1}{n}\sum_{i=1}^ny_i$

$\Large s^2 = \frac{1}{n - 1}\sum_{i=1}^n(y_i - \bar{y})^2$

$\Large \hat{\text{var}}(\hat{\tau_r}) = N(N-n')\frac{s^2}{n'} + N^2\frac{n'-n}{n'n(n-1)}\sum_{i=1}^n(y_i-rx_i)^2$

$\Large \hat{\text{var}}(\hat{\mu_r}) = \frac{1}{N^2}\hat{\text{var}}(\hat{\tau_r})$

In [6]:
# Sample mean and sample variance (divisor n - 1) of y in the second-phase
# sample, using the pandas reductions instead of hand-rolled builtin sum() loops.
ybar = y_i.mean()
s2 = y_i.var(ddof=1)

# Sum of squared residuals about the ratio line y = r * x.
residual_ss = ((y_i - r * x_i) ** 2).sum()

# Estimated variance of the ratio estimator of the total under double sampling:
#   N(N - n') s^2 / n'  +  N^2 (n' - n) / (n' n (n - 1)) * sum (y_i - r x_i)^2
var_hat_tau_hat_r = (N * (N - npr) * (s2 / npr)) + (N**2 * ((npr - n) / (npr * n * (n - 1))) * residual_ss)

# Rescale from the total to the mean: var(mu_hat) = var(tau_hat) / N^2.
var_hat_mu_hat_r = (1 / N**2) * var_hat_tau_hat_r
round(var_hat_mu_hat_r, 2)

18.49

In [7]:
# Estimated standard deviation of mu_hat_r: square root of its estimated
# variance, unwrapped from a NumPy scalar to a plain Python float.
stddev_mu_hat_r = np.sqrt(var_hat_mu_hat_r).item()
round(stddev_mu_hat_r, 2)

4.3

# 6. Choose the second auxiliary variable x the same as in Report 4.

In [8]:
# Select SAT average reading score as second auxiliary variable.

# 7. Repeat steps 2–5.

In [13]:
# Double sampling for the second auxiliary variable.
# NOTE(review): seed2, n2 and npr2 equal the values used for the first
# auxiliary variable, so on an unchanged df these draws select the same rows
# as before -- confirm that reusing the same sample is intended.
seed2 = 422
np.random.seed(seed2)

n2 = 80
N2 = 421
# First-phase size is twice this cell's own second-phase size. The original
# used `n` from an earlier cell here, which only worked because n == n2.
npr2 = n2 * 2

# Phase 1: simple random sample without replacement of n' rows from df.
first_phase_sample_2 = df.sample(n=npr2, replace=False, random_state=seed2)
# Phase 2: simple random sample without replacement of n2 rows from phase 1.
second_phase_sample_2 = first_phase_sample_2.sample(n=n2, replace=False, random_state=seed2)

In [14]:

# Regress the variable of interest (Writing) on the second auxiliary variable
# (Critical Reading) using the second-phase sample.
y2 = second_phase_sample_2["SAT Writing Avg. Score"]

# add_constant adds an intercept column, so the fitted model reports an
# intercept term alongside the slope.
X2 = sm.add_constant(second_phase_sample_2["SAT Critical Reading Avg. Score"])

# Ordinary least squares fit, then the full regression table.
model2 = sm.OLS(y2, X2).fit()
print(model2.summary())

                              OLS Regression Results                              
Dep. Variable:     SAT Writing Avg. Score   R-squared:                       0.931
Model:                                OLS   Adj. R-squared:                  0.930
Method:                     Least Squares   F-statistic:                     1059.
Date:                    Mon, 05 May 2025   Prob (F-statistic):           3.92e-47
Time:                            12:23:33   Log-Likelihood:                -336.62
No. Observations:                      80   AIC:                             677.2
Df Residuals:                          78   BIC:                             682.0
Df Model:                               1                                         
Covariance Type:                nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------