In [None]:
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score

from texas_gerrymandering_hb4.config import LINEAR_REGRESSION_ARTIFACTS, IMAGES_DIR


In [None]:
# Read the training features, the "party" target column, and the split
# metadata from the linear-regression artifacts directory.
features_path = LINEAR_REGRESSION_ARTIFACTS / "X_train.parquet"
target_path = LINEAR_REGRESSION_ARTIFACTS / "y_train.parquet"
meta_path = LINEAR_REGRESSION_ARTIFACTS / "split_meta.json"

X_train = pd.read_parquet(features_path)
y_train = pd.read_parquet(target_path)["party"]

with open(meta_path) as f:
    meta = json.load(f)

# Column names per role, as recorded under the "numeric" and "categorical"
# keys of the metadata file; used below to route columns to transformers.
NUMERIC, CATEGORICAL = meta["numeric"], meta["categorical"]


In [None]:
# Column-wise preprocessing: categoricals are one-hot encoded with the first
# level of each feature dropped, numerics are standardized. The "cat" block
# comes first, so its output columns precede the "num" columns.
categorical_step = ("cat", OneHotEncoder(drop="first"), CATEGORICAL)
numeric_step = ("num", StandardScaler(), NUMERIC)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

# Preprocessing followed by ordinary least squares on the transformed matrix.
pipeline = Pipeline(steps=[("pre", preprocessor), ("reg", LinearRegression())])


In [None]:
pipeline.fit(X_train, y_train)

# In-sample regression outputs, clamped to [0, 1] so they can be compared
# against thresholds in that range during threshold tuning.
train_scores_raw = pipeline.predict(X_train)
y_pred_train = np.clip(train_scores_raw, 0, 1)


In [None]:
def pick_threshold(y_true, scores, metric="balanced_accuracy", grid=None):
    """Grid-search the score cut-off that maximizes a classification metric.

    Every candidate threshold ``thr`` converts ``scores`` into hard 0/1
    predictions via ``scores >= thr``; the predictions are scored against
    ``y_true`` and the best-scoring threshold is kept. Ties resolve to the
    earliest candidate in the grid, because only a strictly larger metric
    value replaces the current best.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. Predictions are produced as integers 0/1, so
        labels are presumably encoded the same way -- confirm with the caller.
    scores : array-like
        Continuous scores, one per sample; converted with ``np.asarray``.
    metric : str, default "balanced_accuracy"
        ``"balanced_accuracy"`` selects ``balanced_accuracy_score``. Any other
        value selects ``f1_score`` with ``zero_division=0``.
    grid : array-like of float, optional
        Candidate thresholds, evaluated in the given order. When ``None``
        (default) 201 evenly spaced values covering [0, 1] are used.

    Returns
    -------
    tuple[float, float]
        ``(best_threshold, best_metric_value)``. If no candidate yields a
        metric value greater than -1.0 (for example an empty grid), the
        initial ``(0.5, -1.0)`` is returned.
    """
    scores = np.asarray(scores)
    if grid is None:
        candidates = np.linspace(0.0, 1.0, 201)
    else:
        candidates = np.atleast_1d(np.asarray(grid, dtype=float))

    # Resolve the scoring function once instead of re-deciding (and
    # re-importing) on every iteration of the search loop.
    if metric == "balanced_accuracy":
        score_fn = balanced_accuracy_score
    else:
        # Imported locally: only balanced_accuracy_score is imported at the
        # top of the notebook.
        from sklearn.metrics import f1_score

        def score_fn(truth, predicted):
            return f1_score(truth, predicted, zero_division=0)

    best_thr, best_val = 0.5, -1.0
    for thr in candidates:
        y_hat = (scores >= thr).astype(int)
        val = score_fn(y_true, y_hat)
        if val > best_val:
            best_val, best_thr = val, thr
    return float(best_thr), float(best_val)

# Tune the cut-off on the clipped training scores; the trailing tuple is the
# cell's displayed output.
threshold, bal_on_train = pick_threshold(
    y_true=y_train,
    scores=y_pred_train,
    metric="balanced_accuracy",
)
(threshold, bal_on_train)


In [None]:
import os
# Persist the fitted pipeline and the tuned threshold next to each other in
# the linear-regression artifacts directory.
model_path = LINEAR_REGRESSION_ARTIFACTS / "linear_regression_pipeline.pkl"
threshold_path = LINEAR_REGRESSION_ARTIFACTS / "train_threshold.json"

joblib.dump(pipeline, model_path)
with open(threshold_path, "w") as f:
    json.dump({"threshold": threshold, "balanced_accuracy_on_train": bal_on_train}, f, indent=2)

# Report the paths that were actually written; a hard-coded location in the
# message can drift from wherever LINEAR_REGRESSION_ARTIFACTS points.
print(f"Saved model to {model_path}")
print(f"Saved threshold to {threshold_path}")
print(f"Tuned threshold on train: {threshold:.3f} (balanced-acc={bal_on_train:.3f})")


In [None]:
# Rebuild the feature names in the order the ColumnTransformer emits them:
# the one-hot columns of the "cat" transformer (fitted with drop="first")
# followed by the numeric column names.
ohe = pipeline.named_steps["pre"].named_transformers_["cat"]
ohe_names = list(ohe.get_feature_names_out(CATEGORICAL))
feat_names = ohe_names + NUMERIC

# Pair each feature with its fitted coefficient and order the rows by
# absolute coefficient value, largest first.
coefs = pipeline.named_steps["reg"].coef_
coef_df = pd.DataFrame({"feature": feat_names, "coef": coefs})
coef_df = coef_df.sort_values("coef", key=pd.Series.abs, ascending=False)

coef_csv_path = LINEAR_REGRESSION_ARTIFACTS / "linear_regression_coefficients.csv"
coef_df.to_csv(coef_csv_path, index=False)

# Horizontal bar chart of the signed coefficients. The figure is written to
# disk and then closed, so it is saved rather than shown.
fig, ax = plt.subplots()
ax.barh(coef_df["feature"], coef_df["coef"])
ax.set_title("Linear Regression Coefficients")
ax.set_xlabel("Coefficient Value")
fig.tight_layout()
fig.savefig(IMAGES_DIR / "linear_regression_coefficients.png", dpi=200)
plt.close(fig)
