In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#  read the dataset
df = pd.read_csv("2012-sat-results.csv")

print(df.info())
print("")

# convert all values to numeric
df["SAT Critical Reading Avg. Score"] = pd.to_numeric(df["SAT Critical Reading Avg. Score"], errors="coerce")
df["SAT Math Avg. Score"] = pd.to_numeric(df["SAT Math Avg. Score"], errors="coerce")
df["SAT Writing Avg. Score"] = pd.to_numeric(df["SAT Writing Avg. Score"], errors="coerce")

# Drop rows with NaN values
df = df.dropna(subset=["SAT Critical Reading Avg. Score", "SAT Math Avg. Score", "SAT Writing Avg. Score"])

print(df.info())
print("")

# population params
mu = df["SAT Writing Avg. Score"].mean()
tao = df["SAT Writing Avg. Score"].sum()
sigmasq = df["SAT Writing Avg. Score"].var(ddof=0)

print(f"The mu is: {mu}")
print(f"The tao is: {tao}")
print(f"The sigma^2 is: {sigmasq}")

print("")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   DBN                              478 non-null    object
 1   SCHOOL NAME                      478 non-null    object
 2   Num of SAT Test Takers           478 non-null    object
 3   SAT Critical Reading Avg. Score  478 non-null    object
 4   SAT Math Avg. Score              478 non-null    object
 5   SAT Writing Avg. Score           478 non-null    object
dtypes: object(6)
memory usage: 22.5+ KB
None

<class 'pandas.core.frame.DataFrame'>
Index: 421 entries, 0 to 477
Data columns (total 6 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   DBN                              421 non-null    object 
 1   SCHOOL NAME                      421 non-null    object 
 2   Num of SA

#### Choose an auxiliary variable x that should be related to your variable of interest y. Take a SRS of size n (the same size as in Report 2)

In [8]:
# Correlation With an Auxilary Variable
# Our variable of interest is SAT Writing Avg. Score. A related variable we are using is SAT Math Avg. Score. The SAT Math Avg. Score
# has a 0.8885 correlation with SAT Writing Avg. Score. 
df["SAT Math Avg. Score"].corr(df["SAT Writing Avg. Score"])

n = 80
seed = 440
sampled_df = df.sample(n=n, replace=True, random_state=seed)

#### Perform a diagnostic analysis to determine if x and y have a linear relationship based on the sample data. Do regression analysis y ∼ x

In [12]:
import statsmodels.api as sm

X = df["SAT Math Avg. Score"]
y = df["SAT Writing Avg. Score"]

# Add a constant (intercept)
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Get the summary
print(model.summary())

                              OLS Regression Results                              
Dep. Variable:     SAT Writing Avg. Score   R-squared:                       0.789
Model:                                OLS   Adj. R-squared:                  0.789
Method:                     Least Squares   F-statistic:                     1570.
Date:                    Sat, 29 Mar 2025   Prob (F-statistic):          8.43e-144
Time:                            11:41:20   Log-Likelihood:                -1983.0
No. Observations:                     421   AIC:                             3970.
Df Residuals:                         419   BIC:                             3978.
Df Model:                               1                                         
Covariance Type:                nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------

In [None]:
# Since the p-value for the slope is 0, this means that you reject the null hypothesis and conclude there is a 
# significant linear relationship between SAT math score and SAT writing score.

#### Based on the results of the regression analysis, make a conclusion about the appropriateness of using ratio and regression estimators.

In [16]:
# Since the p-value of the intercept is 0, this means that you reject the null hypothesis and conclude the intercept is
# significantly different from 0. As seen in the previous part, there is a statistically significant linear linearship between 
# SAT math score and SAT writing scores. 

# Ratio estimators assume that the line has an intercept of 0, but this assumption is not met in this case. In contrast, the 
# linear regression model does not require the intercept to be 0. Therefore, with this data, it is more appropriate to use the 
# regression estimator rather than a ratio estimator.