## Import Needed Libraries and Filepaths

In [3]:
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from texas_gerrymandering_hb4.config import IMAGES_DIR

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.base import clone

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Artifact directory, resolved relative to the kernel's working directory.
# Later cells read their inputs from it and write their outputs to it, so it is
# created up front (no error if it already exists).
ART_DIR = Path("artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)


## Load Data and Metadata

In [4]:
# Training features and the binary "party" target stored under ART_DIR.
X_train = pd.read_parquet(ART_DIR / "X_train.parquet")
y_train = pd.read_parquet(ART_DIR / "y_train.parquet")["party"]

# Split metadata: which columns are numeric / categorical, plus optional
# details about the compactness PCA.
meta = json.loads((ART_DIR / "split_meta.json").read_text())
NUMERIC, CATEGORICAL = meta["numeric"], meta["categorical"]
COMPACTNESS_PCA_META = meta.get("compactness_pca", {})

if not COMPACTNESS_PCA_META:
    print("No compactness PCA metadata found in split_meta.json")
else:
    print("Loaded compactness PCA metadata:")
    print(json.dumps(COMPACTNESS_PCA_META, indent=2))


Loaded compactness PCA metadata:
{
  "metrics": [
    "polsby_popper",
    "schwartzberg",
    "convex_hull_ratio",
    "reock"
  ],
  "scaler_mean": [
    0.221836988782935,
    0.4626842641033208,
    0.6869649492932969,
    0.3447026021018883
  ],
  "scaler_scale": [
    0.08658260237780978,
    0.08809234095029718,
    0.10691193073924808,
    0.10252565155680432
  ],
  "pca_components": [
    0.5326604170627146,
    0.5420231196260704,
    0.48495683113594357,
    0.43278249712757744
  ],
  "explained_variance_ratio": 0.7908192155948008,
  "sign_correction": 1.0
}


In [5]:
# --- Cell 3: Preprocessor ---
POLY_DEGREE = 2


def make_preprocessor(use_polynomial_features: bool = False):
    """Build an unfitted ColumnTransformer for the CATEGORICAL and NUMERIC columns.

    Categorical columns are one-hot encoded with the first level dropped and
    ``handle_unknown="ignore"``. Numeric columns are standardized; when
    ``use_polynomial_features`` is true they are instead standardized, expanded
    with ``PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)`` and
    standardized again.

    Returns:
        ColumnTransformer with the transformers named "cat" and "num", in that
        order (so encoded categorical columns come first in the output).
    """
    if use_polynomial_features:
        numeric_steps = [
            ("scale_in", StandardScaler()),
            ("poly", PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)),
            ("scale_out", StandardScaler()),
        ]
    else:
        numeric_steps = [("scale", StandardScaler())]

    categorical_encoder = OneHotEncoder(
        drop="first", sparse_output=False, handle_unknown="ignore"
    )
    return ColumnTransformer(
        [
            ("cat", categorical_encoder, CATEGORICAL),
            ("num", Pipeline(numeric_steps), NUMERIC),
        ]
    )

# Helper to recover feature names after fit
def get_feature_names(fitted_preprocessor):
    """Return the output column names of a fitted preprocessor.

    Args:
        fitted_preprocessor: fitted ColumnTransformer exposing "cat" and "num"
            entries in ``named_transformers_``.

    Returns:
        Tuple ``(ohe_names, feat_names)``: the one-hot column names on their
        own, and the full list (one-hot names followed by numeric names). The
        numeric names come from the "poly" step when the numeric transformer
        has one, otherwise they are NUMERIC unchanged.
    """
    transformers = fitted_preprocessor.named_transformers_
    ohe_names = list(transformers["cat"].get_feature_names_out(CATEGORICAL))

    # A transformer without `named_steps` is treated like one without "poly".
    numeric_steps = getattr(transformers["num"], "named_steps", {})
    if "poly" in numeric_steps:
        num_names = list(numeric_steps["poly"].get_feature_names_out(NUMERIC))
    else:
        num_names = list(NUMERIC)

    return ohe_names, ohe_names + num_names


## Tuning Threshold

In [6]:
def pick_threshold(y_true, scores, metric="balanced_accuracy"):
    """Pick the decision threshold on ``scores`` that maximizes ``metric``.

    Thresholds are scanned over 201 evenly spaced values in [0, 1]; a sample is
    predicted positive when ``score >= threshold``.

    Args:
        y_true: true binary labels (0/1), aligned with ``scores``.
        scores: continuous scores, expected to lie in [0, 1].
        metric: ``"balanced_accuracy"`` (default) or a callable
            ``metric(y_true, y_pred) -> float`` where higher is better.

    Returns:
        ``(best_threshold, best_metric_value)`` as Python floats. On ties the
        lowest threshold wins, because only a strictly better value replaces
        the current best.

    Raises:
        ValueError: if ``metric`` is neither a callable nor a supported name.
            Previously an unknown name silently scored every threshold as 0.0
            and returned ``(0.0, 0.0)``.
    """
    # Resolve the scoring function up front so a typo fails loudly instead of
    # producing a meaningless threshold.
    if callable(metric):
        score_fn = metric
    elif metric == "balanced_accuracy":
        score_fn = balanced_accuracy_score
    else:
        raise ValueError(
            f"Unsupported metric {metric!r}; use 'balanced_accuracy' or pass a callable."
        )

    scores = np.asarray(scores)
    grid = np.linspace(0.0, 1.0, 201)
    best_thr, best_val = 0.5, -1.0
    for thr in grid:
        y_hat = (scores >= thr).astype(int)
        val = score_fn(y_true, y_hat)
        if val > best_val:
            best_val, best_thr = val, thr
    return float(best_thr), float(best_val)

## K-Fold Cross Validation

In [7]:
# --- Cell 5: Cross-val evaluator for regression-as-classifier ---
def cv_bal_acc_for_reg_pipeline(pipeline, X, y, n_splits=5, random_state=42):
    """Cross-validate a regression pipeline used as a binary classifier.

    For every fold a fresh clone of ``pipeline`` is fitted on the fold's
    training part, a decision threshold is tuned with ``pick_threshold`` on
    that training part only, and balanced accuracy is measured on the held-out
    part with that threshold.

    Args:
        pipeline: unfitted scikit-learn estimator/pipeline with ``fit`` and
            ``predict``. It is cloned per fold and never fitted itself.
        X: feature DataFrame (indexed positionally via ``.iloc``).
        y: target Series of 0/1 labels aligned with ``X``.
        n_splits: number of shuffled K-fold splits.
        random_state: seed for the fold shuffle.

    Returns:
        ``(mean, std)`` of the per-fold balanced accuracies as Python floats.
    """
    # NOTE(review): KFold does not stratify on y; with few samples a validation
    # fold may hold a single class, which makes its balanced accuracy noisy.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    for tr_idx, va_idx in kf.split(X):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        # Fit a clone so the caller's pipeline is not left fitted on the last
        # fold as a side effect of evaluation.
        fold_model = clone(pipeline)
        fold_model.fit(X_tr, y_tr)
        # tune threshold on fold-train ONLY (no peeking at the validation part)
        y_tr_score = fold_model.predict(X_tr).clip(0, 1)
        thr, _ = pick_threshold(y_tr, y_tr_score, "balanced_accuracy")
        # evaluate on fold-val with the threshold chosen above
        y_va_score = fold_model.predict(X_va).clip(0, 1)
        y_va_pred = (y_va_score >= thr).astype(int)
        scores.append(balanced_accuracy_score(y_va, y_va_pred))
    return float(np.mean(scores)), float(np.std(scores))

## Define Model Candidates

In [8]:
# Regularization strengths shared by every *CV regressor below.
alphas = np.logspace(-3, 3, 50)

# Candidate regressors, each with and without polynomial features.
# Insertion order is the evaluation order; on a tie in CV score the earlier
# entry is kept because only a strictly better mean replaces the current best.
candidate_specs = {
    "ols": {
        "reg": LinearRegression(),
        "use_polynomial_features": False,
    },
    "ols_poly": {
        "reg": LinearRegression(),
        "use_polynomial_features": True,
    },
    "ridge": {
        "reg": RidgeCV(alphas=alphas, cv=5, scoring=None),
        "use_polynomial_features": False,
    },
    "lasso": {
        "reg": LassoCV(alphas=alphas, cv=5, max_iter=10000, n_jobs=-1),
        "use_polynomial_features": False,
    },
    "elasticnet": {
        "reg": ElasticNetCV(l1_ratio=[0.2, 0.5, 0.8, 1.0], alphas=alphas, cv=5, max_iter=10000, n_jobs=-1),
        "use_polynomial_features": False,
    },
    "ridge_poly": {
        "reg": RidgeCV(alphas=alphas, cv=5, scoring=None),
        "use_polynomial_features": True,
    },
    "lasso_poly": {
        "reg": LassoCV(alphas=alphas, cv=5, max_iter=10000, n_jobs=-1),
        "use_polynomial_features": True,
    },
    "elasticnet_poly": {
        "reg": ElasticNetCV(l1_ratio=[0.2, 0.5, 0.8, 1.0], alphas=alphas, cv=5, max_iter=10000, n_jobs=-1),
        "use_polynomial_features": True,
    },
}

results = {}
best_name, best_cv, best_use_poly = None, -np.inf, False

for name, spec in candidate_specs.items():
    use_poly = spec["use_polynomial_features"]
    # Fresh preprocessor + cloned regressor so candidates never share state.
    pipe = Pipeline([
        ("pre", make_preprocessor(use_poly)),
        ("reg", clone(spec["reg"])),
    ])
    mean_bal, std_bal = cv_bal_acc_for_reg_pipeline(
        pipe, X_train, y_train, n_splits=5, random_state=42
    )
    results[name] = {
        "cv_balanced_accuracy_mean": mean_bal,
        "cv_balanced_accuracy_std": std_bal,
        "use_polynomial_features": use_poly,
        "polynomial_degree": POLY_DEGREE if use_poly else 1,
    }
    tag = "with poly" if use_poly else "linear-only"
    print(f"[CV] {name} ({tag}): mean={mean_bal:.3f} ± {std_bal:.3f}")
    if mean_bal > best_cv:
        best_name, best_cv, best_use_poly = name, mean_bal, use_poly

# Unfitted copy of the winning regressor, to be refit on the full training set.
best_spec = candidate_specs[best_name]
best_reg = clone(best_spec["reg"])
print(f"[SELECT] Best by CV balanced accuracy: {best_name} (mean={best_cv:.3f}, poly={best_use_poly})")




[CV] ols (linear-only): mean=0.833 ± 0.158




[CV] ols_poly (with poly): mean=0.833 ± 0.149




[CV] ridge (linear-only): mean=0.833 ± 0.158




[CV] lasso (linear-only): mean=0.917 ± 0.167




[CV] elasticnet (linear-only): mean=0.867 ± 0.172




[CV] ridge_poly (with poly): mean=0.733 ± 0.097




[CV] lasso_poly (with poly): mean=0.750 ± 0.091




[CV] elasticnet_poly (with poly): mean=0.800 ± 0.041
[SELECT] Best by CV balanced accuracy: lasso (mean=0.917, poly=False)


In [9]:
# --- Cell 7: Fit best on full train & tune final threshold ---
best_pipeline = Pipeline(
    [
        ("pre", make_preprocessor(best_use_poly)),
        ("reg", best_reg),
    ]
)
best_pipeline.fit(X_train, y_train)

# Regression outputs are clipped to [0, 1] before being thresholded.
# The balanced accuracy reported here is measured on the same data the
# threshold was tuned on, so it is an in-sample figure.
y_train_score = np.clip(best_pipeline.predict(X_train), 0, 1)
final_thr, bal_train = pick_threshold(y_train, y_train_score, "balanced_accuracy")
print(f"[TRAIN] Final threshold={final_thr:.3f}, balanced-acc(train)={bal_train:.3f}")


[TRAIN] Final threshold=0.330, balanced-acc(train)=0.917


In [10]:
# --- Cell 8: Save model, threshold, CV results, coefficients plot ---
joblib.dump(best_pipeline, ART_DIR / "active_model.pkl")

# Threshold record: which variant won and the cut-off tuned on the training set.
threshold_record = {
    "variant": best_name,
    "threshold": final_thr,
    "balanced_accuracy_on_train": bal_train,
    "use_polynomial_features": best_use_poly,
    "polynomial_degree": POLY_DEGREE if best_use_poly else 1,
}
with open(ART_DIR / "train_threshold.json", "w") as f:
    json.dump(threshold_record, f, indent=2)

with open(ART_DIR / "cv_results.json", "w") as f:
    json.dump(results, f, indent=2)

# coefficients, ordered by absolute magnitude (largest first)
ohe_names, feat_names = get_feature_names(best_pipeline.named_steps["pre"])
coefs = best_pipeline.named_steps["reg"].coef_
coef_df = pd.DataFrame({"feature": feat_names, "coef": coefs})
coef_df = coef_df.sort_values("coef", key=lambda s: s.abs(), ascending=False)
coef_df.to_csv(ART_DIR / "linear_regression_coefficients_active.csv", index=False)

fig, ax = plt.subplots()
ax.barh(coef_df["feature"], coef_df["coef"])
ax.set_title(f"Linear Regression Coefficients ({best_name})")
ax.set_xlabel("Coefficient Value")
fig.tight_layout()
fig.savefig(IMAGES_DIR / "linear_regression_coefficients_active.png", dpi=200)
plt.close(fig)


## Model Diagnostics

In [11]:
# Fit a separate OLS baseline for diagnostics only
# (the saved model may be regularized; plain OLS is refit here on the same
# preprocessing variant so that statsmodels inference can be reported).
diagnostic_preprocessor = make_preprocessor(best_use_poly)
ols_pipe = Pipeline([("pre", diagnostic_preprocessor), ("reg", LinearRegression())])
ols_pipe.fit(X_train, y_train)
X_design = ols_pipe.named_steps["pre"].transform(X_train)  # dense array
ohe_names_diag, feat_names_diag = get_feature_names(ols_pipe.named_steps["pre"])

# statsmodels OLS
# Rebuild the design matrix as a labelled DataFrame and add an explicit
# intercept column, since sm.OLS does not add one on its own.
X_sm_df = pd.DataFrame(X_design, columns=feat_names_diag)
X_sm_df = sm.add_constant(X_sm_df)
ols_sm = sm.OLS(y_train.values, X_sm_df).fit()

summary_text = ols_sm.summary().as_text()
print(summary_text)

# Render the text summary onto an empty figure so it can be saved as an image.
fig = plt.figure(figsize=(12, 8))
plt.axis("off")
plt.text(0.0, 1.0, summary_text, fontsize=8, fontfamily="monospace", va="top")
plt.tight_layout()
fig.savefig(IMAGES_DIR / "ols_summary.png", dpi=200)
plt.close(fig)

# Two-column table (term, p-value) covering every exogenous term of the fit.
pvalues_df = (
    pd.Series(ols_sm.pvalues, index=ols_sm.model.exog_names, name="p_value")
    .reset_index()
    .rename(columns={"index": "term"})
)
print(pvalues_df)

# Figure height grows with the number of terms (at least 2 inches).
fig, ax = plt.subplots(figsize=(8, max(2, 0.3 * len(pvalues_df))))
ax.axis("off")
table = ax.table(
    cellText=[[row.term, f"{row.p_value:.4g}"] for row in pvalues_df.itertuples()],
    colLabels=["Term", "p-value"],
    loc="center",
    cellLoc="left",
)
# Fix the font size explicitly instead of letting matplotlib auto-scale it.
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 1.4)
fig.tight_layout()
fig.savefig(IMAGES_DIR / "ols_pvalues.png", dpi=200)
plt.close(fig)

# VIF
# variance_inflation_factor regresses the selected column on all other columns
# of the matrix it is given and does not add an intercept itself. The constant
# therefore has to stay in the matrix: stripping it (as was done before) makes
# the auxiliary regressions intercept-free and distorts the VIFs, notably for
# the uncentred one-hot columns. Only the intercept's own VIF is skipped.
X_sm_values = X_sm_df.values
vif_rows = [
    (term, float(variance_inflation_factor(X_sm_values, col_idx)))
    for col_idx, term in enumerate(X_sm_df.columns)
    if term != "const"
]
vif_df = pd.DataFrame(vif_rows, columns=["feature", "VIF"]).sort_values("VIF", ascending=False)
vif_df.to_csv(ART_DIR / "vif_ols.csv", index=False)

plt.figure()
plt.barh(vif_df["feature"], vif_df["VIF"])
plt.title("VIF (diagnostic OLS, Train)")
plt.xlabel("VIF")
plt.tight_layout()
plt.savefig(IMAGES_DIR / "vif_ols.png", dpi=200)
plt.close()

# residuals & QQ
# Residuals come from the scikit-learn OLS pipeline fitted on the training set.
y_fit = ols_pipe.predict(X_train)
resid = y_train.values - y_fit

# Residuals vs fitted values, with a dashed zero line for reference.
plt.figure(); plt.scatter(y_fit, resid); plt.axhline(0, ls="--")
plt.title("Residuals vs Fitted (diagnostic OLS)")
plt.xlabel("Fitted"); plt.ylabel("Residual"); plt.tight_layout()
plt.savefig(IMAGES_DIR / "residuals_vs_fitted_ols.png", dpi=200); plt.close()

# Histogram of the residuals (12 bins).
plt.figure(); plt.hist(resid, bins=12); plt.title("Residuals Histogram (diagnostic OLS)")
plt.tight_layout(); plt.savefig(IMAGES_DIR / "residuals_hist_ols.png", dpi=200); plt.close()

# QQ plot with a 45-degree reference line; sm.qqplot returns the figure, and the
# pyplot calls that follow act on it as the current figure.
fig = sm.qqplot(resid, line='45', fit=True); plt.title("QQ Plot (diagnostic OLS)")
plt.tight_layout(); fig.savefig(IMAGES_DIR / "qqplot_residuals_ols.png", dpi=200); plt.close()

# Influence / Cook's D
influence = ols_sm.get_influence()
cooks_d = influence.cooks_distance[0]  # first element holds the distances
leverage = influence.hat_matrix_diag

# One row per training observation, keyed by the original X_train index.
infl_df = pd.DataFrame(
    {"cooks_d": cooks_d, "leverage": leverage, "residual": resid},
    index=X_train.index.rename("index"),
)
infl_df.sort_values("cooks_d", ascending=False).to_csv(ART_DIR / "influence_ols.csv")

fig, ax = plt.subplots()
ax.scatter(leverage, resid)
ax.set_title("Leverage vs Residuals (diagnostic OLS)")
ax.set_xlabel("Leverage")
ax.set_ylabel("Residual")
fig.tight_layout()
fig.savefig(IMAGES_DIR / "leverage_vs_residuals_ols.png", dpi=200)
plt.close(fig)


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.646
Model:                            OLS   Adj. R-squared:                  0.545
Method:                 Least Squares   F-statistic:                     6.383
Date:                Fri, 10 Oct 2025   Prob (F-statistic):           0.000609
Time:                        20:02:20   Log-Likelihood:               -0.25450
No. Observations:                  28   AIC:                             14.51
Df Residuals:                      21   BIC:                             23.83
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

## Save Metadata

In [12]:
# Single JSON record of the run: selected model, tuned threshold, per-candidate
# CV scores, and the PCA metadata loaded from split_meta.json.
training_summary = {
    "selected_variant": best_name,
    "train_balanced_accuracy": bal_train,
    "final_threshold": final_thr,
    "cv_results": results,
    "compactness_pca": COMPACTNESS_PCA_META,
    "use_polynomial_features": best_use_poly,
    "polynomial_degree": POLY_DEGREE if best_use_poly else 1,
}
with open(ART_DIR / "training_summary.json", "w") as f:
    json.dump(training_summary, f, indent=2)

print("Training complete. Saved active_model.pkl, threshold, CV results, coefficients, and diagnostics.")


Training complete. Saved active_model.pkl, threshold, CV results, coefficients, and diagnostics.
