In [5]:
# Add near the top (after imports)
import numpy as np
import pathlib
import json

class SafeJSONEncoder(json.JSONEncoder):
    def default(self, o):
        # pathlib.Path -> str
        if isinstance(o, pathlib.Path):
            return str(o)
        # NumPy scalars -> native Python
        if isinstance(o, (np.integer, np.floating, np.bool_)):
            return o.item()
        # NumPy arrays -> lists
        if isinstance(o, np.ndarray):
            return o.tolist()
        return super().default(o)


In [6]:
# --- Imports ---
import json, os, re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib
from scipy import sparse
import numpy as np

from texas_gerrymandering_hb4.config import FINAL_CSV

# --- Paths ---
ARTIFACT_DIR = "artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

XTR_PATH = os.path.join(ARTIFACT_DIR, "X_train.npz")
XTE_PATH = os.path.join(ARTIFACT_DIR, "X_test.npz")
YTR_PATH = os.path.join(ARTIFACT_DIR, "y_train.csv")
YTE_PATH = os.path.join(ARTIFACT_DIR, "y_test.csv")
TRANSFORMER_PATH = os.path.join(ARTIFACT_DIR, "preprocessor.joblib")
META_PATH = os.path.join(ARTIFACT_DIR, "meta.json")

# --- Load data ---
df = pd.read_csv(FINAL_CSV)
print("Loaded:", df.shape, "columns:", list(df.columns))

# --- Helper: drop obvious non-features if present ---
drop_cols = []
for c in ["district_id", "district", "DISTRICT", "District", "geometry", "geom", "wkt"]:
    if c in df.columns:
        drop_cols.append(c)

# --- Auto-detect/construct the binary target ---
# Preferred categorical targets (exact or close match)
cat_target_candidates = [
    "party", "winner", "party_outcome", "party_winner", "outcome"
]
cat_target = next((c for c in cat_target_candidates if c in df.columns), None)

def pick_positive_label(labels):
    """Prefer a GOP-ish label if present; else pick the lexicographically last for stability."""
    labels_norm = [str(l).lower() for l in labels]
    gop_like = ["rep", "republican", "gop", "r"]
    for gl in gop_like:
        for lab, labn in zip(labels, labels_norm):
            if re.search(rf"\b{gl}\b", labn):
                return lab
    # fallback: if "Democrat" appears, use that; else lexicographic max
    for lab, labn in zip(labels, labels_norm):
        if re.search(r"\bdem|democrat|democratic\b", labn):
            return lab
    return sorted(map(str, labels))[-1]

y_series = None
origin_target_col = None
positive_label = None
derived_from = None

if cat_target is not None and df[cat_target].notna().any():
    origin_target_col = cat_target
    y_raw = df[cat_target].astype(str)
    labels = sorted(y_raw.dropna().unique().tolist())
    if len(labels) < 2:
        raise ValueError(f"Categorical target '{cat_target}' has <2 unique values: {labels}")
    # Reduce to binary if more than 2 classes by mapping the most common two
    top2 = y_raw.value_counts().index.tolist()[:2]
    y_raw = y_raw.where(y_raw.isin(top2))
    y_raw = y_raw.dropna()
    df = df.loc[y_raw.index].copy()
    positive_label = pick_positive_label(top2)
    y_series = (y_raw == positive_label).astype(int)
    derived_from = f"categorical:{cat_target}:{top2}"
else:
    # Try numeric share columns
    share_cols_priority = [
        ("rep_share",  "Republican"),
        ("gop_share",  "Republican"),
        ("republican_share", "Republican"),
        ("r_share",    "Republican"),
        ("dem_share",  "Democrat"),
        ("democratic_share", "Democrat"),
        ("d_share",    "Democrat"),
    ]
    found = None
    for col, pos_lab in share_cols_priority:
        if col in df.columns:
            found = (col, pos_lab); break
    if not found:
        raise ValueError(
            "Could not find a target. Provide a categorical party column "
            "(e.g., 'party'/'winner') or a share column (e.g., 'rep_share' or 'dem_share')."
        )
    col, pos_lab = found
    origin_target_col = col
    # Build binary: pos_lab (Republican or Democrat) means 1 if that share >= 0.5
    if pos_lab == "Republican":
        y_series = (df[col] >= 0.5).astype(int)
        positive_label = "Republican"
    else:
        # pos_lab == "Democrat": 1 means Democrat wins; we still want a single "positive_label"
        y_series = (df[col] >= 0.5).astype(int)
        positive_label = "Democrat"
    derived_from = f"numeric_share:{col}"

# Align feature matrix with y index (in case we filtered)
X = df.drop(columns=[c for c in [origin_target_col] if c in df.columns] + drop_cols)
y = y_series

# --- Identify feature types ---
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()

print("Dropping non-features:", drop_cols)
print("Categorical features:", categorical_cols)
print("Numeric features:", numeric_cols)
print("Target positive label:", positive_label, "| derived_from:", derived_from)

# --- Split (force both classes in test if possible) ---
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

y_arr = np.asarray(y, dtype=int)
unique, counts = np.unique(y_arr, return_counts=True)
class_counts = dict(zip(unique.tolist(), counts.tolist()))
print("Class counts (all data):", class_counts)

# Aim for ~ 25â€“33% test, but ensure at least 1 of each class in test if both classes exist
test_size = 0.33 if len(unique) == 2 and min(counts) >= 3 else 0.25

sss = StratifiedShuffleSplit(n_splits=100, test_size=test_size, random_state=42)
X_train = X_test = y_train = y_test = None

for tr_idx, te_idx in sss.split(X, y_arr):
    # ensure both classes appear in test
    if len(np.unique(y_arr[te_idx])) == len(unique):
        X_train, X_test = X.iloc[tr_idx], X.iloc[te_idx]
        y_train, y_test = y_arr[tr_idx], y_arr[te_idx]
        break

# Fallback: if we still couldn't get both classes (e.g., extremely imbalanced tiny data)
if X_train is None:
    print("WARNING: Could not form a test split containing both classes. Proceeding with best stratified split.")
    tr_idx, te_idx = next(iter(StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=43).split(X, y_arr)))
    X_train, X_test = X.iloc[tr_idx], X.iloc[te_idx]
    y_train, y_test = y_arr[tr_idx], y_arr[te_idx]

print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])
print("Test class counts:", dict(zip(*np.unique(y_test, return_counts=True))))


# --- Preprocessor (fit ONLY on train) ---
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ],
    remainder="drop",
)
preprocessor.fit(X_train)

# --- Transform ---
X_train_t = preprocessor.transform(X_train)
X_test_t  = preprocessor.transform(X_test)

# --- Persist ---
sparse.save_npz(XTR_PATH, sparse.csr_matrix(X_train_t))
sparse.save_npz(XTE_PATH, sparse.csr_matrix(X_test_t))
pd.Series(y_train).to_csv(YTR_PATH, index=False, header=False)
pd.Series(y_test).to_csv(YTE_PATH, index=False, header=False)
joblib.dump(preprocessor, TRANSFORMER_PATH)

# Feature names (best effort)
try:
    cat_names = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols).tolist() if categorical_cols else []
    feat_names = cat_names + numeric_cols
except Exception:
    feat_names = [f"f{i}" for i in range(X_train_t.shape[1])]

meta = {
    "source_csv": str(FINAL_CSV),                     # ensure string
    "origin_target_col": origin_target_col,
    "derived_from": derived_from,
    "positive_label": positive_label,
    "dropped_non_features": drop_cols,
    "categorical_cols": list(map(str, categorical_cols)),
    "numeric_cols": list(map(str, numeric_cols)),
    "n_train_raw": int(X_train.shape[0]),
    "n_test_raw": int(X_test.shape[0]),
    "n_features_transformed": int(X_train_t.shape[1]),
    "feature_names": list(map(str, feat_names)),
}
with open(META_PATH, "w") as f:
    json.dump(meta, f, indent=2, cls=SafeJSONEncoder)  # <-- use custom encoder


print("Preprocessing complete.")
print(f"Train (X) transformed: {X_train_t.shape} | Test (X): {X_test_t.shape}")


Loaded: (38, 11) columns: ['district_id', 'polsby_popper', 'schwartzberg', 'convex_hull_ratio', 'reock', 'pct_white', 'pct_black', 'pct_asian', 'pct_hispanic', 'dem_share', 'rep_share']
Dropping non-features: ['district_id']
Categorical features: []
Numeric features: ['polsby_popper', 'schwartzberg', 'convex_hull_ratio', 'reock', 'pct_white', 'pct_black', 'pct_asian', 'pct_hispanic', 'dem_share']
Target positive label: Republican | derived_from: numeric_share:rep_share
Class counts (all data): {1: 38}
Train size: 28 Test size: 10
Test class counts: {np.int64(1): np.int64(10)}
Preprocessing complete.
Train (X) transformed: (28, 9) | Test (X): (10, 9)
