In [1]:
import pandas as pd, numpy as np, statsmodels.api as sm
from scipy.stats import f

# F-test comparing two OLS specifications for log wage:
#   unrestricted (U): constant + education dummies + age dummies
#   restricted   (R): constant + education + age + education*age
# Prints the sample size, parameter counts, both residual sums of squares,
# the F statistic and its p-value.

# --- Data and design matrices ------------------------------------------------
df = pd.read_csv("ps1small.csv")
# np.log gives -inf for a zero wage and NaN for a negative one; such rows are
# removed by the finite-value filter below.
df["lwage"] = np.log(df["wage"])

# One dummy per distinct education / age value, first level dropped as baseline.
D_educ = pd.get_dummies(df["education"], prefix="ed", drop_first=True).astype(float)
D_age  = pd.get_dummies(df["age"],       prefix="age", drop_first=True).astype(float)
XU = sm.add_constant(pd.concat([D_educ, D_age], axis=1)).astype(float)

# Restricted design: education, age and their product ("ea"), plus a constant.
XR = sm.add_constant(df.assign(ea=df["education"]*df["age"])[["education","age","ea"]]).astype(float)
y  = pd.to_numeric(df["lwage"], errors="coerce")

# NOTE(review): the classical F-test below assumes R is a linear restriction of
# U. XU holds only additive education and age dummies, while XR contains the
# product education*age, which is not a sum of an education-only and an
# age-only term. As built, the models do not appear to be nested - confirm the
# intended specification (e.g. U may need education-by-age cell dummies).

# --- Align to rows usable by both designs and y -------------------------------
# np.isfinite rejects NaN *and* +/-inf (y.notna() alone would let -inf through,
# which makes the OLS fit return non-finite results).
# Missing education/age show up as NaN in XR (get_dummies encodes them as
# all-zero rows by default), so the XR check is what removes those rows.
mask = (~XU.isna().any(axis=1)) & (~XR.isna().any(axis=1)) & np.isfinite(y)
y_, XU_, XR_ = y.loc[mask], XU.loc[mask], XR.loc[mask]

# --- Fit both models ----------------------------------------------------------
res_U, res_R = sm.OLS(y_, XU_).fit(), sm.OLS(y_, XR_).fit()
RSSU, RSSR = float(res_U.ssr), float(res_R.ssr)
n   = int(y_.shape[0])
# df_model excludes the constant, so +1 gives the number of fitted parameters.
kU, kR = int(res_U.df_model+1), int(res_R.df_model+1)
r   = kU - kR  # numerator degrees of freedom (number of restrictions)
assert r > 0 and n > kU, "F-test needs kU > kR and more observations than parameters"

# --- F statistic and p-value --------------------------------------------------
F   = ((RSSR - RSSU)/r) / (RSSU/(n - kU))
# Survival function = upper-tail probability; unlike 1 - cdf it does not lose
# precision to cancellation when the p-value is very small.
p   = f.sf(F, r, n - kU)

print(f"n={n}, kU={kU}, kR={kR}, r={r}")
print(f"RSS_U={RSSU:.4f}, RSS_R={RSSR:.4f}")
print(f"F={F:.4f}, p={p:.4g}")


n=898, kU=10, kR=4, r=6
RSS_U=618.5134, RSS_R=624.7819
F=1.5000, p=0.175
