In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from pathlib import Path
from texas_gerrymandering_hb4.config import FINAL_CSV

# --- Config ---
# Output locations; relative paths are resolved against the current working directory.
OUT_PROCESSED = "artifacts/districts_processed.parquet"
OUT_META = "artifacts/preprocess_meta.npz"  # stores PCA loadings etc.

# Columns that feed the composite compactness score, in the order the weights are reported.
compactness_metrics = ["polsby_popper", "schwartzberg", "convex_hull_ratio", "reock"]
# Not used for any computation in this cell; only recorded in the metadata file.
race_cols = ["pct_white", "pct_black", "pct_asian", "pct_hispanic"]

# --- Load ---
df = pd.read_csv(FINAL_CSV)

# Fail early with a readable message instead of an opaque error from inside pandas/scikit-learn.
missing_cols = [col for col in compactness_metrics if col not in df.columns]
if missing_cols:
    raise KeyError(f"Compactness metric columns missing from input data: {missing_cols}")
if df[compactness_metrics].isna().any().any():
    raise ValueError("Compactness metric columns contain missing values; PCA cannot be fitted.")

# --- PCA on compactness metrics to get weights (PC1 loadings) ---
# NOTE(review): scikit-learn's PCA centers the input but does not rescale it, so metrics
# with larger variance contribute more to PC1 -- TODO confirm whether standardizing the
# metrics first is intended.
pca = PCA(n_components=1, random_state=42)
pc1 = pca.fit_transform(df[compactness_metrics])  # PC1 scores; not used further in this cell

# PCA loadings for PC1. Use absolute values as positive weights, then normalize to sum=1.
# Taking the absolute value discards the sign of each loading.
loadings = np.abs(pca.components_[0])
pca_weights = loadings / loadings.sum()

# --- Build PCA-weighted composite compactness score ---
# Row-wise weighted sum; weights align positionally with `compactness_metrics`.
df["compactness_weighted_pca"] = (df[compactness_metrics] * pca_weights).sum(axis=1)

# (Optional) Keep a simple average too for reference
df["compactness_avg"] = df[compactness_metrics].mean(axis=1)

# --- Save processed dataset ---
# Create the output directories first so a fresh checkout survives Restart & Run All.
for out_path in (OUT_PROCESSED, OUT_META):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

df.to_parquet(OUT_PROCESSED, index=False)

# --- Save metadata (weights, columns) ---
# Column names are stored as fixed-width string arrays rather than object arrays so the
# file can be read back with a plain np.load(...) (object arrays need allow_pickle=True).
np.savez(OUT_META,
         compactness_metrics=np.array(compactness_metrics, dtype=str),
         pca_weights=pca_weights,
         race_cols=np.array(race_cols, dtype=str))

print("Saved:", OUT_PROCESSED)
print("PCA Weights (order matches compactness_metrics):", pca_weights)

2025-09-28 21:15:07.425 | INFO     | texas_gerrymandering_hb4.config:<module>:11 - PROJ_ROOT path is: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4


Saved: artifacts/districts_processed.parquet
PCA Weights (order matches compactness_metrics): [0.23575907 0.24532211 0.27918193 0.23973689]
