In [None]:
# --- Cell 1: Imports & constants ---
import json
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Reproducibility knobs for the train/test split.
RANDOM_STATE = 42
TEST_SIZE = 0.25

# Input CSV and the directory that receives the interim artifacts.
# The output directory is created up front (including parents) so later
# cells can write into it without checking.
RAW = Path("../data/raw/districts_final.csv")
OUT = Path("../data/interim")
OUT.mkdir(parents=True, exist_ok=True)

# Demographic share columns; also used to derive `majority_race`.
RACE_COLS = ["pct_white", "pct_black", "pct_asian", "pct_hispanic"]

# All numeric model inputs: the geometry-derived columns (names suggest
# compactness scores -- confirm against the data dictionary) followed by
# the demographic shares.
NUMERIC = [
    "polsby_popper",
    "schwartzberg",
    "convex_hull_ratio",
    "reock",
    *RACE_COLS,
]

# Categorical model inputs (derived in `build_dataset`).
CATEGORICAL = ["majority_race"]

In [None]:
# --- Cell 2: Helpers ---
def robust_party_labels(
    df: pd.DataFrame, random_state: int = 42, jitter: float = 1e-9
) -> pd.Series:
    """Derive a binary 0/1 label from ``dem_share`` / ``rep_share``.

    Three strategies are tried in order; the first one that yields both
    classes wins:

    1. ``dem_share > rep_share`` (1 where the Democratic share is larger).
    2. ``dem_share`` strictly above its own median.
    3. ``dem_share`` plus tiny seeded Gaussian noise, split at the median of
       the noisy values (breaks ties when many rows share the same value).

    Strategies 2 and 3 change what the label *means* (it is no longer "which
    share is larger"), so a ``RuntimeWarning`` is emitted whenever one of them
    is used instead of switching silently.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``dem_share`` and ``rep_share`` columns.
    random_state : int, default 42
        Seed for the noise used by strategy 3.
    jitter : float, default 1e-9
        Standard deviation of the noise used by strategy 3.

    Returns
    -------
    pd.Series
        Integer 0/1 labels aligned to ``df.index``. The result can still hold
        a single class (e.g. a one-row frame); callers must handle that.

    Notes
    -----
    Comparisons involving NaN evaluate to False, so rows with a missing share
    end up labelled 0.
    """
    import warnings  # local import: only needed on the fallback paths

    dem = df["dem_share"]

    labels = (dem > df["rep_share"]).astype(int)
    if labels.nunique() == 2:
        return labels

    warnings.warn(
        "dem_share > rep_share produced a single class; falling back to a "
        "median split of dem_share, so the label no longer encodes which "
        "share is larger.",
        RuntimeWarning,
        stacklevel=2,
    )
    labels = (dem > dem.median()).astype(int)
    if labels.nunique() == 2:
        return labels

    warnings.warn(
        "Median split of dem_share produced a single class; breaking ties "
        "with seeded random jitter.",
        RuntimeWarning,
        stacklevel=2,
    )
    rng = np.random.RandomState(random_state)
    noisy = dem.values + rng.randn(len(df)) * jitter
    return pd.Series((noisy > np.median(noisy)).astype(int), index=df.index)

def build_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``party`` and ``majority_race`` columns added.

    ``party`` comes from ``robust_party_labels``; ``majority_race`` is the name
    of the ``RACE_COLS`` column holding the largest value in each row.

    Raises
    ------
    ValueError
        If ``dem_share`` / ``rep_share`` are absent, or if any column listed
        in ``RACE_COLS`` is absent (the first missing one is reported).
    """
    required_shares = {"dem_share", "rep_share"}
    if not required_shares <= set(df.columns):
        raise ValueError("dem_share and rep_share are required.")

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()
    out["party"] = robust_party_labels(out)

    # Report the first absent demographic column, in RACE_COLS order.
    first_missing = next(
        (name for name in RACE_COLS if name not in out.columns), None
    )
    if first_missing is not None:
        raise ValueError(f"Missing racial column: {first_missing}")

    out["majority_race"] = out[RACE_COLS].idxmax(axis=1)
    return out

In [None]:
# --- Cell 3: Load & features ---
# Read the raw CSV and add the derived `party` / `majority_race` columns.
df = build_dataset(pd.read_csv(RAW))

# Model inputs: numeric columns first, then categorical ones.
feature_list = [*NUMERIC, *CATEGORICAL]

# Copies keep the feature matrix and target independent of `df`.
X = df[feature_list].copy()
y = df["party"].copy()

In [None]:
# --- Cell 4: Split & persist ---
# Stratify only when it is feasible: exactly two classes, each with at least
# two members. train_test_split raises ValueError when a stratification class
# has a single member, so such label vectors fall back to a plain split
# instead of crashing the notebook.
class_counts = y.value_counts()
can_stratify = len(class_counts) == 2 and class_counts.min() >= 2
if not can_stratify:
    print("Note: class counts do not allow stratification; using an unstratified split.")
strat = y if can_stratify else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=strat
)

# Sanity check: the split must partition the rows without loss.
assert len(X_train) + len(X_test) == len(X)

# Persist the splits; the index is kept so rows can be traced back to `df`.
X_train.to_parquet(OUT / "X_train.parquet", index=True)
X_test.to_parquet(OUT / "X_test.parquet", index=True)
y_train.to_frame("party").to_parquet(OUT / "y_train.parquet", index=True)
y_test.to_frame("party").to_parquet(OUT / "y_test.parquet", index=True)

# Record the split parameters and row membership next to the data files.
meta = {
    "random_state": RANDOM_STATE,
    "test_size": TEST_SIZE,
    "feature_list": feature_list,
    "numeric": NUMERIC,
    "categorical": CATEGORICAL,
    "train_index": X_train.index.tolist(),
    "test_index": X_test.index.tolist()
}
with open(OUT / "split_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("Preprocessing complete. Saved splits and metadata.")