In [1]:
%reload_ext autoreload
%autoreload 2
# %autoreload 1
# %aimport from kret_studies import *
# %aimport from kret_studies.notebook_imports import *
# %load_ext fireducks.pandas # linux only for now

In [2]:
from kret_studies import *
from kret_studies.notebook_imports import *

## I'm going to try to complete this assignment using polars instead of pandas to get some familiarity

In [3]:
data_url = r"""https://users.stat.ufl.edu/~rrandles/sta4210/Rclassnotes/data/textdatasets/KutnerData/Chapter%20%206%20Data%20Sets/CH06PR18.txt"""
url = data_url

In [4]:
FEAT = [(age := "Age"), (expenses := "Expenses"), (vacancy := "Vacancy"), (size := "Size")]
x1, x2, x3, x4 = age, expenses, vacancy, size
LABEL = [rental := ("Rental")]
col_names = LABEL + FEAT

In [5]:
pd_df = pd.read_csv(url, sep=r"\s+", header=None, names=col_names)

In [6]:
df = pl.DataFrame(pd_df)[FEAT + LABEL]

In [7]:
df.head(2)

Age,Expenses,Vacancy,Size,Rental
i64,f64,f64,i64,f64
1,5.02,0.14,123000,13.5
14,8.19,0.27,104079,12.0


### (a) Obtain the appropriate ANOVA table and calculate SSR(X4), SSR(X1|X4), SSR(X2|X1,X4), SSR(X3|X1, X2, X4).

In [8]:
# 1. SSR(X4)

In [9]:
ols_x4 = uks_stats.get_ols_formula(x4, LABEL)
ols_x4

'Rental ~ Size'

In [10]:
model_x4 = smf.ols(ols_x4, data=df).fit()

In [11]:
ssr_x4 = model_x4.ssr
ssr_x4

np.float64(168.78240201352642)

In [12]:
# 2. SSR(X1|X4)

In [13]:
ols_x1_x4 = uks_stats.get_ols_formula([x1] + [x4], LABEL)
ols_x1_x4

'Rental ~ Age + Size'

In [14]:
model_x1_x4 = smf.ols(ols_x1_x4, data=df).fit()

In [15]:
ssr_x1_given_x4 = model_x4.ess - model_x1_x4.ess

In [16]:
# 3. SSR(X2|X1,X4)

In [17]:
ols_x2_x1_x4 = uks_stats.get_ols_formula([x2] + [x1] + [x4], LABEL)
ols_x2_x1_x4

'Rental ~ Expenses + Age + Size'

In [18]:
model_x2_x1_x4 = smf.ols(ols_x2_x1_x4, data=df).fit()

In [19]:
ssr_x2_given_x1_x4 = model_x1_x4.ess - model_x2_x1_x4.ess

In [20]:
# 4. SSR(X3|X1,X2,X4)

In [21]:
ols_all_vars = uks_stats.get_ols_formula(FEAT, LABEL)
ols_all_vars

'Rental ~ Age + Expenses + Vacancy + Size'

In [22]:
model_full = smf.ols(ols_all_vars, data=df).fit()

In [23]:
ssr_x3_given_x1_x2_x4 = model_x2_x1_x4.ess - model_full.ess

In [24]:
print("--- Sequential Sum of Squares (SSR) ---")
print(f"SSR(X4) = {ssr_x4:.4f}")
print(f"SSR(X1|X4) = {ssr_x1_given_x4:.4f}")
print(f"SSR(X2|X1,X4) = {ssr_x2_given_x1_x4:.4f}")
print(f"SSR(X3|X1,X2,X4) = {ssr_x3_given_x1_x2_x4:.4f}")
print(f"SSR(Full Model) = {model_full.ssr:.4f}")

--- Sequential Sum of Squares (SSR) ---
SSR(X4) = 168.7824
SSR(X1|X4) = -42.2746
SSR(X2|X1,X4) = -27.8575
SSR(X3|X1,X2,X4) = -0.4197
SSR(Full Model) = 98.2306


### Test whether X2 and X3 can be dropped from the model given that X1 and X4 are retained. Use a = 0.01, state the hypotheses, critical value, p-value and conclusion.

In [25]:
f_test_results = sm.stats.anova_lm(model_x1_x4, model_full)

In [26]:
f_test_results

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,78.0,126.507834,0.0,,,
1,76.0,98.230594,2.0,28.27724,10.938905,6.7e-05


In [27]:
alpha = 0.01
# Degrees of freedom 1: number of parameters dropped (2: X2 and X3)
df1 = model_x1_x4.df_resid - model_full.df_resid
# Degrees of freedom 2: degrees of freedom of the full model
df2 = model_full.df_resid
critical_value = f.ppf(1 - alpha, df1, df2)
critical_value

np.float64(4.89583988401818)

Since the p-value is tiny, we can reject the null hypothesis that x2 & x3 can be dropped from the model

### (c) Calculate $R^2_{Y,X_4}$, $R^2_{Y,X_1}$, $R^2_{Y,X_1|X_4}$, $R^2_{Y,X_2|X_1,X_4}$, $R^2_{Y,X_3|X_1,X_2,X_4}$ and the regular $R^2$ when predicting Y with all X variables.

In [29]:
sse_x4 = model_x4.ess
sse_x1_x4 = model_x1_x4.ess
sse_x2_x1_x4 = model_x2_x1_x4.ess

In [30]:
# R-squared Y,X4 (There are no other variables)
# This is just the regular R-squared of the model Y ~ X4
r2_y_x4 = model_x4.rsquared

# R-squared Y,X1 | X4
r2_y_x1_given_x4 = ssr_x1_given_x4 / sse_x4

# R-squared Y,X2 | X1,X4
r2_y_x2_given_x1_x4 = ssr_x2_given_x1_x4 / sse_x1_x4

# R-squared Y,X3 | X1,X2,X4
r2_y_x3_given_x1_x2_x4 = ssr_x3_given_x1_x2_x4 / sse_x2_x1_x4

# Regular R-squared for the full model
regular_r2_full_model = model_full.rsquared

print("\n--- Coefficients of Partial Determination ---")
print(f"R-squared(Y,X4) = {r2_y_x4:.4f}")
print(f"R-squared(Y,X1|X4) = {r2_y_x1_given_x4:.4f}")
print(f"R-squared(Y,X2|X1,X4) = {r2_y_x2_given_x1_x4:.4f}")
print(f"R-squared(Y,X3|X1,X2,X4) = {r2_y_x3_given_x1_x2_x4:.4f}")
print(f"\nRegular R-squared for the full model = {regular_r2_full_model:.4f}")


--- Coefficients of Partial Determination ---
R-squared(Y,X4) = 0.2865
R-squared(Y,X1|X4) = -0.6237
R-squared(Y,X2|X1,X4) = -0.2531
R-squared(Y,X3|X1,X2,X4) = -0.0030

Regular R-squared for the full model = 0.5847
