In [2]:
# --- Texas Districts: Racial Demographics Maps (GeoParquet version) ---

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from pathlib import Path
import re

from texas_gerrymandering_hb4.config import FINAL_CSV, CLEAN_DISTRICTS_PARQUET




# Map each display label to the source column holding that group's
# percentage/share. A value of None means "auto-detect the column by name";
# otherwise set the exact column name, for example:
#   "White (%)":    "pct_white", "white_share", "white_percent"
#   "Black (%)":    "pct_black", "black_share"
#   "Hispanic (%)": "pct_hispanic", "hispanic_share"
#   "Asian (%)":    "pct_asian", "asian_share"
RACE_COLS = dict.fromkeys(
    ("White (%)", "Black (%)", "Hispanic (%)", "Asian (%)"),
)

# Join-key column names to try first, in priority order. A name is only
# usable when it is present in BOTH the tabular file and the geometry file.
PREFERRED_KEYS = [
    # district identifiers
    "district_id",
    "DISTRICT_ID",
    "district",
    "DISTRICT",
    # Census-style identifiers
    "GEOID",
    "GEOID20",
    "CD",
    "CD118FP",
    "NAME",
]

# ---------- helpers ----------
def best_join_key(left: pd.DataFrame, right: pd.DataFrame):
    """Pick a pair of columns on which ``left`` and ``right`` can be joined.

    Strategy:
      1. Return the first name in ``PREFERRED_KEYS`` that exists, spelled
         identically, in both frames.
      2. Otherwise pair columns whose names match case-insensitively and
         return the pair whose values (compared as strings) overlap most.

    Only columns with integer, float, object or string dtypes are considered,
    and a column named "geometry" is never used.

    Returns:
        A ``(left_column, right_column)`` tuple.

    Raises:
        ValueError: if no preferred key is shared and no case-insensitive
            name match has at least one value in common.
    """
    def candidates(df):
        return [c for c in df.columns if c != "geometry" and df[c].dtype.kind in "iufOSU"]

    lc, rc = candidates(left), candidates(right)

    # 1) Preferred exact-name matches, in priority order.
    for k in PREFERRED_KEYS:
        if k in lc and k in rc:
            return k, k

    # 2) Case-insensitive name matches, ranked by value overlap.
    #    Non-string labels cannot be lower-cased, so they are left out here.
    lmap = {c.lower(): c for c in lc if isinstance(c, str)}
    rmap = {c.lower(): c for c in rc if isinstance(c, str)}

    best = None
    best_overlap = 0  # a pair sharing no values cannot produce a usable join
    # Sorted so that ties are always resolved the same way between runs.
    for lo in sorted(lmap.keys() & rmap.keys()):
        lcol, rcol = lmap[lo], rmap[lo]
        lvals = set(left[lcol].astype(str).unique())
        rvals = set(right[rcol].astype(str).unique())
        ov = len(lvals & rvals)
        if ov > best_overlap:
            best_overlap, best = ov, (lcol, rcol)
    if best is not None:
        return best

    raise ValueError("Couldn't find a common join key. Please set it manually.")

def find_pct_col(df: pd.DataFrame, tokens):
    """Return the column of ``df`` that best matches a demographic group.

    Column labels are lower-cased (non-string labels are converted with
    ``str`` first) and searched with each regex in ``tokens``.

    Args:
        df: frame whose column labels are inspected; values are not read.
        tokens: regex strings identifying the group, e.g.
            ``[r'hisp', r'latino']``. They are matched against lower-cased
            names, so they should be written in lower case.

    Returns:
        The first matching column (in column order) whose name also contains
        'pct', 'percent' or 'share'. If there is none, the first column that
        matches a token at all (which may hold counts rather than shares).
        ``None`` when nothing matches.
    """
    patterns = [re.compile(t) for t in tokens]
    pct_hints = ("pct", "percent", "share")

    first_token_match = None
    for c in df.columns:
        # str() keeps integer/tuple column labels from raising AttributeError.
        name = str(c).lower()
        if not any(p.search(name) for p in patterns):
            continue
        if any(hint in name for hint in pct_hints):
            return c  # percentage-like names always win
        if first_token_match is None:
            first_token_match = c  # remembered as the fallback
    return first_token_match

def ensure_percent(series: pd.Series) -> pd.Series:
    """Return ``series`` expressed on a 0-100 scale.

    Heuristic: when the largest non-missing value is <= 1.01 the data is
    treated as proportions and multiplied by 100; otherwise the input is
    returned unchanged. Note that a genuine percent column whose values are
    all <= 1.01 is indistinguishable from proportions and is scaled too.

    The input is returned unchanged when it cannot be converted to float,
    when it is empty, or when every value is missing.
    """
    try:
        numeric = series.astype(float)
    except (TypeError, ValueError):
        return series
    if numeric.empty or numeric.isna().all():
        return series  # nothing to measure the scale from
    if float(numeric.max()) <= 1.01:
        # Scale the float view: multiplying the raw series would raise
        # TypeError when the values are numeric-looking strings.
        return numeric * 100.0
    return series

def pretty_bounds(ax, gdf):
    """Fit ``ax`` to the extent of ``gdf`` and remove the axis decorations.

    The x/y limits are set from ``gdf.total_bounds`` (xmin, ymin, xmax, ymax),
    the aspect ratio is forced to equal, and the axis frame is turned off.
    If any bound is NaN or infinite the limits are left untouched, because
    non-finite values are not usable as axis limits.
    """
    xmin, ymin, xmax, ymax = gdf.total_bounds
    if np.isfinite([xmin, ymin, xmax, ymax]).all():
        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)
    ax.set_aspect("equal", adjustable="box")
    ax.axis("off")

# ---------- load and join ----------
tab = pd.read_csv(FINAL_CSV)
geo = gpd.read_parquet(CLEAN_DISTRICTS_PARQUET)  # GeoParquet read (needs pyarrow installed)

# If the GeoParquet stored the ID as a named index, bring it back as a column
# so it can take part in join-key detection.
if geo.index.name and geo.index.name not in geo.columns:
    geo = geo.reset_index()

# Find a join key (override here if you prefer explicit keys).
left_key, right_key = best_join_key(tab, geo)

# pandas will not merge a numeric key against a string key. When exactly one
# side is numeric, compare both as strings (the same basis best_join_key uses
# to measure value overlap).
if pd.api.types.is_numeric_dtype(tab[left_key]) != pd.api.types.is_numeric_dtype(geo[right_key]):
    tab[left_key] = tab[left_key].astype(str)
    geo[right_key] = geo[right_key].astype(str)

# Left merge from the geometry side: every district shape is kept, and
# districts without a tabular match get missing values.
merged = geo.merge(tab, left_on=right_key, right_on=left_key, how="left")

# Reproject for a nicer TX look. Best effort: skipped when the CRS is missing,
# and a failed reprojection is reported instead of stopping the run.
if merged.crs:
    try:
        merged = merged.to_crs(3083)  # NAD83 / Texas Centric Albers Equal Area
    except Exception as exc:
        print(f"Warning: could not reproject to EPSG:3083 ({exc}); plotting in the original CRS.")

# Auto-detect race columns for every label still set to None.
# Patterns are matched against lower-cased column names.
auto_specs = {
    # "nhw" must stand alone between non-letters. \b is not used because "_"
    # is a word character, so \bnhw\b would miss names such as "pct_nhw".
    "White (%)":    [r"white", r"(?<![a-z])nhw(?![a-z])", r"anglo"],
    "Black (%)":    [r"black", r"african"],
    "Hispanic (%)": [r"hisp", r"latino", r"latinx"],
    "Asian (%)":    [r"asian"],
}
for label, spec in auto_specs.items():
    if RACE_COLS.get(label) is None:
        RACE_COLS[label] = find_pct_col(merged, spec)

# Validate that every label resolved to a column that really exists.
missing = [lbl for lbl, col in RACE_COLS.items() if col is None or col not in merged.columns]
if missing:
    raise KeyError(
        "Could not find these columns: "
        + ", ".join(missing)
        + "\nPlease edit RACE_COLS to your exact column names."
    )

# Stage one plotting column per display label, scaled to 0-100.
for lbl, col in list(RACE_COLS.items()):
    merged[lbl] = ensure_percent(merged[col])

# ---------- 2×2 small multiples ----------
# One choropleth panel per display label, all on a single figure that is
# saved to disk and then shown.
grid_png = "tx_districts_race_small_multiples.png"

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.ravel()  # flatten the 2x2 grid so panels pair up with labels

for ax, label in zip(axes, RACE_COLS):
    # Rows without a value are drawn hatched in light grey.
    no_data_style = {"color": "#f0f0f0", "hatch": "///", "label": "No data"}
    merged.plot(
        column=label,
        cmap="OrRd",
        legend=True,
        linewidth=0.5,
        edgecolor="black",
        missing_kwds=no_data_style,
        ax=ax,
    )
    ax.set_title(label, fontsize=14)
    pretty_bounds(ax, merged)

fig.suptitle("Texas Congressional Districts — Racial Demographics", fontsize=16, y=0.98)
plt.tight_layout()
plt.savefig(grid_png, dpi=220, bbox_inches="tight")
plt.show()

print("Saved small multiples:", grid_png)

# ---------- Individual PNGs per group ----------
# One standalone map per display label. The file name is derived from the
# label: lower-cased, with every run of non-letters collapsed to "_".
for label in RACE_COLS:
    slug = re.sub('[^a-z]+', '_', label.lower()).strip('_')
    out = f"tx_districts_{slug}.png"

    fig, ax = plt.subplots(figsize=(10, 10))
    no_data_style = {"color": "#f0f0f0", "hatch": "///", "label": "No data"}
    merged.plot(
        column=label,
        cmap="OrRd",
        legend=True,
        linewidth=0.5,
        edgecolor="black",
        missing_kwds=no_data_style,
        ax=ax,
    )
    ax.set_title(f"Texas Congressional Districts — {label}", fontsize=16)
    pretty_bounds(ax, merged)

    plt.savefig(out, dpi=220, bbox_inches="tight")
    plt.close(fig)  # release the figure before building the next one
    print("Saved:", out)


ImportError: Missing optional dependency 'pyarrow.parquet'. pyarrow is required for Parquet support. Use pip or conda to install pyarrow.