## Import Needed Libraries/Filepaths & Set Constants

In [9]:
import os, json
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from texas_gerrymandering_hb4.config import FINAL_CSV

# Directory that receives the saved splits and metadata; created if absent.
ART_DIR = Path("artifacts")
ART_DIR.mkdir(parents=True, exist_ok=True)

# Split configuration shared by the PCA step and train_test_split.
RANDOM_STATE = 42
TEST_SIZE = 0.25

# Geometric compactness measures that are collapsed into one PCA score.
COMPACTNESS_METRICS = [
    "polsby_popper",
    "schwartzberg",
    "convex_hull_ratio",
    "reock",
]

# All racial-share columns; used to derive the "majority_race" category.
RACE_COLS = ["pct_white", "pct_black", "pct_asian", "pct_hispanic"]

# Numeric model features: the PCA compactness score plus every racial share
# except pct_white, which is deliberately dropped.
# NOTE(review): presumably dropped as the baseline because the shares are
# near-collinear -- confirm.
NUMERIC = [
    "compactness_pca_score",
    *[col for col in RACE_COLS if col != "pct_white"],
]

# Categorical model features.
CATEGORICAL = ["majority_race"]


## Helper Functions

In [10]:
def robust_party_labels(df: pd.DataFrame) -> pd.Series:
    """Derive a binary party label, falling back until two classes appear.

    Three strategies are tried in order, and the first one that produces
    both classes is returned:

    1. ``1`` where ``dem_share`` exceeds ``rep_share``, else ``0``.
    2. ``1`` where ``dem_share`` exceeds its own median, else ``0``.
    3. A median split of ``dem_share`` after adding tiny (1e-9 scale)
       Gaussian jitter from a fixed seed, so ties are broken the same way
       on every run.

    Args:
        df: Frame containing ``dem_share`` and ``rep_share`` columns.

    Returns:
        Integer 0/1 Series aligned to ``df.index``. The last fallback is
        returned unconditionally, so it may still hold a single class
        (for example when ``df`` has only one row).
    """
    dem = df["dem_share"]

    # Preferred definition: Democratic share beats Republican share.
    labels = (dem > df["rep_share"]).astype(int)
    if labels.nunique() == 2:
        return labels

    # One-sided data: split around the median Democratic share instead.
    labels = (dem > dem.median()).astype(int)
    if labels.nunique() == 2:
        return labels

    # Nothing lies strictly above the median (heavy ties): jitter the
    # values deterministically and split on the jittered median.
    noise = np.random.RandomState(42).randn(len(df)) * 1e-9
    jittered = dem.values + noise
    above = jittered > np.median(jittered)
    return pd.Series(above.astype(int), index=df.index)


def compute_compactness_pca(df: pd.DataFrame):
    """Collapse the compactness metrics into one sign-aligned PCA score.

    The columns named in ``COMPACTNESS_METRICS`` are cast to float,
    mean-imputed if any value is missing, standardized, and projected onto
    their first principal component. The component's sign is arbitrary, so
    it is flipped when needed to correlate non-negatively with the row-wise
    mean of the standardized metrics.

    Args:
        df: Frame containing every column listed in ``COMPACTNESS_METRICS``.

    Returns:
        A ``(composite, meta)`` tuple. ``composite`` is a 1-D array with one
        score per row of ``df``. ``meta`` is a JSON-serializable dict holding
        the metric names, scaler mean/scale, sign-corrected PCA loadings,
        explained variance ratio, and the applied sign.

    Raises:
        ValueError: If any compactness column is absent from ``df``.
    """
    missing = [name for name in COMPACTNESS_METRICS if name not in df.columns]
    if missing:
        raise ValueError(f"Missing compactness columns: {missing}")

    metrics = df[COMPACTNESS_METRICS].astype(float)
    if metrics.isna().to_numpy().any():
        # Impute each gap with its column mean so PCA sees no NaNs.
        metrics = metrics.fillna(metrics.mean())

    scaler = StandardScaler()
    standardized = scaler.fit_transform(metrics)

    pca = PCA(n_components=1, random_state=RANDOM_STATE)
    first_pc = pca.fit_transform(standardized).ravel()

    # Orient the component against the plain average of the standardized
    # metrics; a correlation needs at least two rows to be defined.
    row_mean = standardized.mean(axis=1)
    if len(row_mean) > 1:
        corr = np.corrcoef(first_pc, row_mean)[0, 1]
    else:
        corr = 1.0
    if np.isnan(corr):
        # Undefined correlation (e.g. zero variance): keep the orientation.
        corr = 1.0
    sign = -1.0 if corr < 0 else 1.0

    composite = first_pc * sign
    loadings = (pca.components_[0] * sign).tolist()
    meta = {
        "metrics": COMPACTNESS_METRICS,
        "scaler_mean": scaler.mean_.tolist(),
        "scaler_scale": scaler.scale_.tolist(),
        "pca_components": loadings,
        "explained_variance_ratio": float(pca.explained_variance_ratio_[0]),
        "sign_correction": float(sign),
    }
    return composite, meta


def build_dataset(df: pd.DataFrame):
    """Add the label and engineered feature columns to a copy of ``df``.

    The copy gains three columns: ``party`` (binary label from
    ``robust_party_labels``), ``majority_race`` (name of the racial-share
    column with the largest value in each row), and
    ``compactness_pca_score`` (from ``compute_compactness_pca``).

    Args:
        df: Raw frame with ``dem_share``, ``rep_share``, every column in
            ``RACE_COLS``, and the compactness metric columns.

    Returns:
        A ``(frame, compact_meta)`` tuple: the augmented copy and the PCA
        metadata dict. The input frame is not modified.

    Raises:
        ValueError: If the vote-share columns or a racial column is missing.
    """
    df = df.copy()

    if "dem_share" not in df.columns or "rep_share" not in df.columns:
        raise ValueError("dem_share and rep_share are required.")
    df["party"] = robust_party_labels(df)

    # Report the first racial column (in RACE_COLS order) that is absent.
    col = next((name for name in RACE_COLS if name not in df.columns), None)
    if col is not None:
        raise ValueError(f"Missing racial column: {col}")
    df["majority_race"] = df[RACE_COLS].idxmax(axis=1)

    scores, compact_meta = compute_compactness_pca(df)
    df["compactness_pca_score"] = scores
    return df, compact_meta


## Load and Features

In [11]:
# --- Cell 3: Load & features ---
# Model inputs are the numeric features followed by the categorical ones.
feature_list = NUMERIC + CATEGORICAL

# Read the final CSV and derive the label plus engineered feature columns.
df, compact_meta = build_dataset(pd.read_csv(FINAL_CSV))

# Independent copies, so later edits to X / y never write back into df.
X = df[feature_list].copy()
y = df["party"].copy()


## Split

In [12]:
# Stratify on the label only when both classes are present; otherwise fall
# back to a plain random split.
strat = None if y.nunique() != 2 else y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=strat,
)

# Persist each split as parquet, keeping the original row index so rows can
# be traced back to the source frame. Labels are stored as a one-column
# frame named "party".
parquet_outputs = {
    "X_train.parquet": X_train,
    "X_test.parquet": X_test,
    "y_train.parquet": y_train.to_frame("party"),
    "y_test.parquet": y_test.to_frame("party"),
}
for filename, frame in parquet_outputs.items():
    frame.to_parquet(ART_DIR / filename, index=True)

# Record everything needed to reproduce or audit the split.
meta = {
    "random_state": RANDOM_STATE,
    "test_size": TEST_SIZE,
    "feature_list": feature_list,
    "numeric": NUMERIC,
    "categorical": CATEGORICAL,
    "compactness_pca": compact_meta,
    "train_index": X_train.index.tolist(),
    "test_index": X_test.index.tolist(),
}
(ART_DIR / "split_meta.json").write_text(json.dumps(meta, indent=2))
# The PCA metadata is also written on its own for consumers that only need it.
(ART_DIR / "compactness_pca.json").write_text(json.dumps(compact_meta, indent=2))

print("Preprocessing complete. Saved splits and metadata.")


Preprocessing complete. Saved splits and metadata.
