In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import joblib
from pathlib import Path

# --- Config ---
# Everything a reader might want to tune lives here. All paths are relative.

# Input table read by the loading step.
IN_PROCESSED: str = "artifacts/districts_processed.parquet"

# Output locations for the fitted estimator and the saved train/test index arrays.
OUT_MODEL: str = "artifacts/lr_compactness.joblib"
OUT_SPLIT: str = "artifacts/train_test_indices.npz"

# Hold out 30% of the rows; the fixed seed makes the shuffle repeatable.
TEST_SIZE: float = 0.30
RANDOM_STATE: int = 42

# --- Load processed data ---
# Read the already-processed table from the parquet file named in the config.
df = pd.read_parquet(IN_PROCESSED)

# --- Features/Target ---
# Predictors are the four pct_* columns; the response is compactness_weighted_pca.
# Both are copied so later edits to X / y cannot touch df.
# NOTE(review): if these four shares sum to ~1 per row they are nearly collinear
# with the intercept, which makes individual coefficients unstable -- confirm.
race_cols = ["pct_white", "pct_black", "pct_asian", "pct_hispanic"]
X = df.loc[:, race_cols].copy()
y = df.loc[:, "compactness_weighted_pca"].copy()

# --- Split ---
# Random train/holdout partition; proportion and seed come from the config.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Store which rows landed on each side so a later evaluation can rebuild the
# exact same partition. These are the DataFrame's index labels (not positions).
train_idx, test_idx = X_train.index.values, X_test.index.values
np.savez(OUT_SPLIT, train_idx=train_idx, test_idx=test_idx)

# --- Train model ---
# Linear regression with default settings, fitted on the training rows only.
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# --- Evaluate on holdout ---
# Score the fitted model on the held-out rows only.
y_pred = lr.predict(X_test)

# R^2 on unseen data can be negative: that means the predictions are worse
# than always guessing the mean of y_test.
r2 = r2_score(y_test, y_pred)

# RMSE is taken as the square root of the MSE. This deliberately avoids the
# `squared=` keyword of mean_squared_error, which scikit-learn deprecated and
# later removed, so the result is the same on every version without a
# version-probing try/except (and without a deprecation warning).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# --- Persist model and report metrics ---
# Only the fitted estimator is written to disk; the metrics are printed, not saved.
joblib.dump(lr, OUT_MODEL)

# Pair each feature name with its fitted coefficient for a readable printout.
coef_by_feature = dict(zip(race_cols, lr.coef_))

# One "label value" line per item, in a fixed order.
for label, value in (
    ("Saved model:", OUT_MODEL),
    ("R^2:", r2),
    ("RMSE:", rmse),
    ("Coefficients:", coef_by_feature),
    ("Intercept:", lr.intercept_),
):
    print(label, value)


Saved model: artifacts/lr_compactness.joblib
R^2: -0.16570264038323135
RMSE: 0.09577339220389268
Coefficients: {'pct_white': np.float64(5.342920635061273), 'pct_black': np.float64(5.2040671329626), 'pct_asian': np.float64(3.9533860723838683), 'pct_hispanic': np.float64(4.9298684872016025)}
Intercept: -4.573312255836616
