In [None]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

from texas_gerrymandering_hb4.config import FINAL_CSV, LINEAR_REGRESSION_ARTIFACTS

# Split configuration: seed and held-out fraction passed to train_test_split.
RANDOM_STATE = 42
TEST_SIZE = 0.25

# Feature groups. The racial-share columns are listed once and reused so the
# numeric feature list cannot drift from the columns used to derive
# "majority_race".
RACE_COLS = ["pct_white", "pct_black", "pct_asian", "pct_hispanic"]
NUMERIC = [
    "polsby_popper",
    "schwartzberg",
    "convex_hull_ratio",
    "reock",
    *RACE_COLS,
]
CATEGORICAL = ["majority_race"]

## Helper Functions

In [None]:
def robust_party_labels(df: pd.DataFrame) -> pd.Series:
    """Derive a 0/1 party label for every row from its vote shares.

    Three strategies are tried in order; the first one that produces both
    classes is returned:

    1. ``1`` where ``dem_share > rep_share``, else ``0``.
    2. ``1`` where ``dem_share`` is strictly above its own median.
    3. ``dem_share`` plus seeded (42) Gaussian noise scaled by ``1e-9``,
       split at the median of the perturbed values. This last result is
       returned unconditionally.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``dem_share`` and ``rep_share``.

    Returns
    -------
    pd.Series
        Integer labels sharing ``df``'s index.

    Raises
    ------
    ValueError
        If ``dem_share`` or ``rep_share`` is missing from ``df``.
    """
    required = {"dem_share", "rep_share"}
    if not required.issubset(df.columns):
        raise ValueError("dem_share and rep_share are required.")

    dem = df["dem_share"]

    # Strategy 1: direct head-to-head comparison of the two shares.
    head_to_head = (dem > df["rep_share"]).astype(int)
    if head_to_head.nunique() == 2:
        return head_to_head

    # Strategy 2: every row fell on one side, so split at the median instead.
    above_median = (dem > dem.median()).astype(int)
    if above_median.nunique() == 2:
        return above_median

    # Strategy 3: break ties with a tiny, reproducible jitter before taking
    # the median split.
    jitter = np.random.RandomState(42).randn(len(df)) * 1e-9
    perturbed = dem.values + jitter
    cutoff = np.median(perturbed)
    return pd.Series((perturbed > cutoff).astype(int), index=df.index)

def build_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the ``party`` and ``majority_race`` columns added.

    ``party`` comes from ``robust_party_labels``. ``majority_race`` holds, for
    each row, the name of the ``RACE_COLS`` column with the largest value.
    The input frame is not modified.

    Raises
    ------
    ValueError
        If a column listed in ``RACE_COLS`` is absent (the first missing one
        is reported), or if ``robust_party_labels`` rejects the frame.
    """
    enriched = df.copy()
    enriched["party"] = robust_party_labels(enriched)

    # Report the first racial-share column that is not present, if any.
    first_missing = next((c for c in RACE_COLS if c not in enriched.columns), None)
    if first_missing is not None:
        raise ValueError(f"Missing racial column: {first_missing}")

    enriched["majority_race"] = enriched[RACE_COLS].idxmax(axis=1)
    return enriched

## Loading and Constructing Features

In [None]:
# Read the final CSV and add the derived "party" / "majority_race" columns.
df = build_dataset(pd.read_csv(FINAL_CSV))

# Persist the column order we'll expect later.
feature_list = NUMERIC + CATEGORICAL

# Feature matrix (numeric first, then categorical) and binary target.
X = df[feature_list].copy()
y = df["party"].copy()


## Train/Test Split

In [None]:
# Stratify on the label only when it has exactly two classes; otherwise fall
# back to a plain random split.
strat = y if y.nunique() == 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=strat
)

# Make sure the artifact directory exists before anything is written into it.
Path(LINEAR_REGRESSION_ARTIFACTS).mkdir(parents=True, exist_ok=True)

# Save splits (Parquet for speed/precision). index=True keeps the original row
# labels so the splits can be matched back to the source frame.
# All four files go to the same artifact directory; X_test previously targeted
# an undefined `OUT` name, which raised NameError on a fresh kernel.
X_train.to_parquet(LINEAR_REGRESSION_ARTIFACTS / "X_train.parquet", index=True)
X_test.to_parquet(LINEAR_REGRESSION_ARTIFACTS / "X_test.parquet", index=True)
y_train.to_frame("party").to_parquet(LINEAR_REGRESSION_ARTIFACTS / "y_train.parquet", index=True)
y_test.to_frame("party").to_parquet(LINEAR_REGRESSION_ARTIFACTS / "y_test.parquet", index=True)

# Save split meta (so 02/03 can recreate exact behavior)
meta = {
    "random_state": RANDOM_STATE,
    "test_size": TEST_SIZE,
    "feature_list": feature_list,
    "numeric": NUMERIC,
    "categorical": CATEGORICAL,
    "train_index": X_train.index.tolist(),
    "test_index": X_test.index.tolist()
}
with open(LINEAR_REGRESSION_ARTIFACTS / "split_meta.json", "w") as f:
    json.dump(meta, f, indent=2)
