In [1]:
# --- Imports ---
import json, os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score

from texas_gerrymandering_hb4.config import FINAL_CSV

# --- Load raw again (use same auto-target logic as preprocess) ---
df = pd.read_csv(FINAL_CSV)

# Re-create the same target as 01_preprocess did
import re
def pick_positive_label(labels):
    labels_norm = [str(l).lower() for l in labels]
    for gl in ["rep", "republican", "gop", "r"]:
        for lab, labn in zip(labels, labels_norm):
            if re.search(rf"\b{gl}\b", labn):
                return lab
    for lab, labn in zip(labels, labels_norm):
        if re.search(r"\bdem|democrat|democratic\b", labn):
            return lab
    return sorted(map(str, labels))[-1]

# Drop obvious non-features if present
drop_cols = [c for c in ["district_id","district","DISTRICT","District","geometry","geom","wkt"] if c in df.columns]

cat_target_candidates = ["party","winner","party_outcome","party_winner","outcome"]
cat_target = next((c for c in cat_target_candidates if c in df.columns), None)

if cat_target is not None and df[cat_target].notna().any():
    y_raw = df[cat_target].astype(str)
    top2 = y_raw.value_counts().index.tolist()[:2]
    y_raw = y_raw.where(y_raw.isin(top2)).dropna()
    df = df.loc[y_raw.index].copy()
    pos_label = pick_positive_label(top2)
    y = (y_raw == pos_label).astype(int).to_numpy()
else:
    # shares
    for col, lab in [("rep_share","Republican"),("gop_share","Republican"),("republican_share","Republican"),
                     ("r_share","Republican"),("dem_share","Democrat"),("democratic_share","Democrat"),("d_share","Democrat")]:
        if col in df.columns:
            pos_label = lab
            y = (df[col] >= 0.5).astype(int).to_numpy()
            break
    else:
        raise ValueError("No target found (need party/winner or *_share column).")

X = df.drop(columns=[col for col in df.columns if col in drop_cols or col == cat_target or col.endswith("_share") and col in ["rep_share","gop_share","republican_share","r_share","dem_share","democratic_share","d_share"]], errors="ignore")
categorical_cols = X.select_dtypes(include=["object","category"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["number","bool"]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

pipe = Pipeline(steps=[("prep", preprocessor), ("reg", LinearRegression())])

# --- Cross-validation ---
X_vals = X.reset_index(drop=True)
y_vals = y.astype(int)

skf = StratifiedKFold(n_splits=min(5, np.bincount(y_vals).min()), shuffle=True, random_state=42)

accs, aucs = [], []
for tr_idx, te_idx in skf.split(X_vals, y_vals):
    X_tr, X_te = X_vals.iloc[tr_idx], X_vals.iloc[te_idx]
    y_tr, y_te = y_vals[tr_idx], y_vals[te_idx]
    pipe.fit(X_tr, y_tr)
    y_cont = pipe.predict(X_te)
    y_pred = (y_cont >= 0.5).astype(int)
    accs.append(accuracy_score(y_te, y_pred))
    try:
        aucs.append(roc_auc_score(y_te, y_cont))
    except ValueError:
        aucs.append(np.nan)

# Majority baseline
majority = int(np.bincount(y_vals).argmax())
baseline_acc = np.mean(y_vals == majority)

print("=== Stratified CV (LinearRegression-as-classifier) ===")
print(f"Folds: {skf.get_n_splits()} | Positive label: {pos_label}")
print(f"Accuracy: mean={np.nanmean(accs):.4f} ± {np.nanstd(accs):.4f}")
print(f"ROC AUC:  mean={np.nanmean(aucs):.4f} ± {np.nanstd(aucs):.4f}")
print(f"Majority baseline accuracy: {baseline_acc:.4f}")


[32m2025-09-30 16:01:52.780[0m | [1mINFO    [0m | [36mtexas_gerrymandering_hb4.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4[0m


ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=0.