In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

# --- Config ---
IN_PROCESSED = "artifacts/districts_processed.parquet"
IN_MODEL = "artifacts/lr_compactness.joblib"
IN_SPLIT = "artifacts/train_test_indices.npz"

race_cols = ["pct_white", "pct_black", "pct_asian", "pct_hispanic"]

# --- Load data and artifacts ---
df = pd.read_parquet(IN_PROCESSED)
lr = joblib.load(IN_MODEL)
split = np.load(IN_SPLIT)

train_idx = split["train_idx"]
test_idx = split["test_idx"]

X = df[race_cols].copy()
y = df["compactness_weighted_pca"].copy()

X_test = X.loc[test_idx]
y_test = y.loc[test_idx]

# --- Predict & Score ---
y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print("Reloaded model performance on holdout:")
print("R^2:", r2)
print("RMSE:", rmse)
print("Coefficients:", dict(zip(race_cols, lr.coef_)))
print("Intercept:", lr.intercept_)

# --- Plot: y_true vs y_pred scatter ---
plt.figure(figsize=(6,6))
plt.scatter(y_test, y_pred, alpha=0.8, edgecolor="k")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], linestyle="--")
plt.xlabel("True PCA-weighted compactness")
plt.ylabel("Predicted")
plt.title("Linear Regression: True vs Predicted")
plt.tight_layout()
plt.show()

# --- Plot: Partial relationships for each race feature ---
import seaborn as sns
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, feat in zip(axes.flatten(), race_cols):
    sns.regplot(x=df[feat], y=df["compactness_weighted_pca"], ax=ax,
                scatter_kws={"s": 60, "alpha": 0.7}, line_kws={"color": "red"})
    coef = lr.coef_[race_cols.index(feat)]
    ax.set_title(f"{feat} vs Composite Compactness\nCoef = {coef:.2f}")
    ax.set_xlabel(feat)
    ax.set_ylabel("Composite Compactness")
plt.tight_layout()
plt.show()

TypeError: got an unexpected keyword argument 'squared'