In [None]:
# ----------------------------- 00. Imports -----------------------------------
import gzip, pickle, pathlib, collections, itertools
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
from   scipy.stats import entropy

In [None]:
# ----------------------------- 01. I/O  --------------------------------------
# Input: gzipped pickle of the already-cleaned voter table.   << adjust path if needed
DATA_F = pathlib.Path("nc_voter_cleaned_2022.pkl.gz")

# Output: every figure produced below is written into this folder.
OUTDIR = pathlib.Path("eda_plots")
OUTDIR.mkdir(exist_ok=True)

# NOTE(review): unpickling can execute arbitrary code – only load files you
# produced yourself or otherwise trust.
with gzip.open(DATA_F, mode="rb") as voter_fh:
    df = pickle.load(voter_fh)   # ~7M rows after prior cleaning

In [None]:
# Structural overview (column names, dtypes, memory use).  The previous
# `print(df.value_counts)` never called the method, so it printed the
# bound-method repr – i.e. a dump of individual voter rows – instead of a summary.
df.info()

<bound method DataFrame.value_counts of          county_id county_desc  zip_code      ncid     surname      first  \
2                1    ALAMANCE   27302.0  AA216996  AARMSTRONG    TIMOTHY   
3                1    ALAMANCE   27215.0   AA98377       AARON  CHRISTINA   
4                1    ALAMANCE   27215.0   AA69747       AARON    CLAUDIA   
5                1    ALAMANCE   27253.0  AA170513       AARON      JAMES   
7                1    ALAMANCE   27215.0   AA91549       AARON     NATHAN   
...            ...         ...       ...       ...         ...        ...   
8781277        100      YANCEY   28714.0   ES26136      ZITNEY     LAUREN   
8781278        100      YANCEY   28714.0   ES17204      ZITNEY   VICTORIA   
8781279        100      YANCEY   28714.0   ES26341      ZULICK       DAWN   
8781280        100      YANCEY   28714.0   ES26380   ZULLINGER     PAMELA   
8781281        100      YANCEY   28740.0  AL300586    ZURAWSKI    ANTHONY   

            middle race_code ethnic

In [None]:
# ----------------------------- 02. Helper: race bucket -----------------------
"""
We collapse the NC race × ethnicity codes into the five buckets used
throughout the study:

    White / Black / Hispanic / AAPI / Other

• Hispanic gets priority over race, matching BISG practice.
• AAPI = Asian (‘A’) or Native Hawaiian / Pacific-Islander (‘P’).
• “Other” lumps together AIAN (‘I’), Multi-race (‘M’) and Undesignated
  (‘U’) once non-Hispanic.

If your column names differ, change them here.
"""
def _row_code(row, names) -> str:
    """Return the first of `names` present in `row`, stripped and upper-cased."""
    for name in names:
        if name in row:                   # works for dicts and for pandas Series (index labels)
            return str(row[name]).strip().upper()
    raise KeyError(f"none of the columns {list(names)} found in row")


def to_bucket(row) -> str:
    """Map one voter record to one of the five evaluation buckets.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide ``ethnic_code`` and either ``race`` or ``race_code``.

    Returns
    -------
    str
        ``"Hispanic"`` when the ethnicity code is ``HL`` (takes priority over
        race); otherwise ``"White"`` (W), ``"Black"`` (B), ``"AAPI"`` (A or P)
        or ``"Other"`` for every remaining value, including missing ones.

    Raises
    ------
    KeyError
        If a required column is absent from ``row``.
    """
    eth = _row_code(row, ("ethnic_code",))
    if eth == "HL":                       # Hispanic or Latino
        return "Hispanic"

    # Accept both spellings of the race column; codes are compared
    # case-insensitively and with surrounding whitespace removed.
    race = _row_code(row, ("race", "race_code"))
    if race == "W":
        return "White"
    if race == "B":
        return "Black"
    if race in {"A", "P"}:
        return "AAPI"
    return "Other"

# Label every voter row with its evaluation bucket.
df["bucket"] = df.apply(to_bucket, axis="columns")

# Display order and bar colour of each bucket, kept side by side so the two
# lists cannot drift apart.
_bucket_palette = {
    "White":    "#4C78A8",
    "Black":    "#F58518",
    "Hispanic": "#54A24B",
    "AAPI":     "#B279A2",
    "Other":    "#9D9D9D",
}
bucket_order  = list(_bucket_palette)
bucket_colors = list(_bucket_palette.values())

# ----------------------------- 03-A.  Race Buckets ---------------------------
# Bar chart of each bucket's share of all voters, saved as race_buckets.png.
fig, ax = plt.subplots(figsize=(6,4))

# fill_value=0: a bucket with no voters is drawn as 0 % rather than NaN
# (NaN would produce an invisible bar labelled "nan%").
counts = df["bucket"].value_counts().reindex(bucket_order, fill_value=0)
pct    = counts / counts.sum() * 100
bars   = ax.bar(bucket_order, pct, color=bucket_colors)

ax.set_title("Class Imbalance Across Evaluation Buckets", weight="bold")
ax.set_ylabel("Share of voters (%)")
ax.set_ylim(0, pct.max()*1.15)            # headroom so the bar labels fit
ax.bar_label(bars, fmt="%.1f%%", padding=4)
fig.tight_layout()
fig.savefig(OUTDIR / "race_buckets.png", dpi=300)
plt.close(fig)                            # release this figure explicitly


# ----------------------------- 03-B.  ZIP Entropy (Map) ----------------------------
"""
We now visualise geographic diversity by mapping the Shannon entropy of race buckets
across North Carolina ZCTAs.  Higher entropy ⇒ more heterogeneous racial composition.

Adapted from `make_geo.py`; code is inlined here so the notebook is self‑contained.
"""

import geopandas as gpd
from   matplotlib import colors
from   mpl_toolkits.axes_grid1.inset_locator import inset_axes

# --- paths ---
ZCTA_DIR = pathlib.Path("tl_2022_us_zcta520")        # folder with TIGER/Line shapes

# Sort so the choice is deterministic when several shapefiles are present, and
# fail with an actionable message (instead of a bare StopIteration) when none is.
_shapefiles = sorted(ZCTA_DIR.glob("*.shp"))
if not _shapefiles:
    raise FileNotFoundError(
        f"No .shp file found in '{ZCTA_DIR}' - unzip the ZCTA shapefile there first."
    )
OUT_SHP  = _shapefiles[0]                            # NB: despite the name, this is the *input* shapefile
PNG_F    = OUTDIR / "zip_entropy_map.png"

# --- helper: Shannon entropy ---
def shannon_entropy(p: np.ndarray) -> float:
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    p : array-like of float
        Probabilities (NumPy array, pandas Series or plain sequence).  Entries
        that are zero, negative or NaN are ignored, following the convention
        0 * log(0) = 0.  The values are used as given – no re-normalisation.

    Returns
    -------
    float
        ``-sum(p_i * log2(p_i))`` over the positive entries; ``0.0`` when no
        positive entry exists.
    """
    probs = np.asarray(p, dtype=float)
    probs = probs[probs > 0]
    if probs.size == 0:
        return 0.0
    bits = float(-(probs * np.log2(probs)).sum())
    # A single-class distribution yields IEEE negative zero (-0.0); report +0.0.
    return 0.0 if bits == 0.0 else bits

# --- voter file → entropy per ZCTA ---
# Voters per (ZCTA, bucket), pivoted to one row per ZCTA and one column per bucket.
zip_counts = (df.groupby(["zcta", "bucket"], observed=True)
                .size()
                .unstack(fill_value=0))
# Row-normalise the counts to within-ZCTA shares, then reduce each row to its entropy.
zip_probs  = zip_counts.div(zip_counts.sum(axis=1), axis=0)
zip_entropy = zip_probs.apply(shannon_entropy, axis=1)
zip_entropy.name = "entropy"

# Normalise the key to a 5-character string.  If the codes are stored as floats
# they stringify as "27302.0", which would never match the shapefile keys, so a
# trailing ".0" is removed before zero-padding.  The index name is set
# explicitly because the later merge joins on it.
zip_entropy.index = pd.Index(
    zip_entropy.index.astype(str)
               .str.strip()
               .str.replace(r"\.0+$", "", regex=True)
               .str.zfill(5),
    name="zcta",
)

# --- TIGER/Line shapes ---
# Read the ZCTA polygons and reproject them to Web Mercator (EPSG:3857).
shp = gpd.read_file(OUT_SHP).to_crs("EPSG:3857")

# The ZCTA code column is named differently across TIGER/Line vintages; take
# the first candidate that exists and fail clearly if none does.
_zcta_candidates = ("ZCTA5CE20","ZCTA5CE10","ZCTA5CE","GEOID20","GEOID10")
geo_col = next((col for col in _zcta_candidates if col in shp.columns), None)
if geo_col is None:
    raise KeyError(
        f"No ZCTA code column found in {OUT_SHP}; looked for {list(_zcta_candidates)}."
    )
shp = (shp.rename(columns={geo_col: "zcta"})
           .assign(zcta=lambda d: d["zcta"].str.zfill(5)))

# Inner join keeps only the polygons for which an entropy value was computed.
gdf = shp.merge(zip_entropy, on="zcta", how="inner")
if gdf.empty:
    # Otherwise the failure only surfaces later as an IndexError while plotting.
    raise ValueError(
        "No ZCTA in the voter file matched the shapefile - "
        "check that both sides use 5-digit string codes."
    )

# --- plot ---
# Choropleth of per-ZCTA entropy with a slim colour bar placed right of the map.
fig, ax = plt.subplots(figsize=(7,6))
map_ax = gdf.plot(column="entropy", cmap="viridis",
                  linewidth=0.25, edgecolor="white", ax=ax)

ax.set_axis_off()
ax.set_title("ZIP‑Level Shannon Entropy of Race Buckets (NC, 2022)",
             fontweight="bold", pad=10)

# colour bar: reuse the colormap and normalisation of the drawn polygons so the
# bar matches the map exactly
choropleth = map_ax.collections[0]
sm = plt.cm.ScalarMappable(cmap=choropleth.cmap, norm=choropleth.norm)
sm.set_array([])                          # public API instead of poking the private `_A`
cax = inset_axes(ax, width="3%", height="80%", loc="lower left",
                 bbox_to_anchor=(1.02,0.1,1,1),
                 bbox_transform=ax.transAxes, borderpad=0)
cb = fig.colorbar(sm, cax=cax, orientation="vertical")
cb.set_label("Shannon Entropy (bits)", fontsize=11)
cb.ax.tick_params(labelsize=9)

fig.tight_layout()
fig.savefig(PNG_F, dpi=300, bbox_inches="tight")
plt.close(fig)                            # release this figure explicitly

print(f"✓  Map saved to {PNG_F}")
# ----------------------------- 03-C.  Surname Coverage -----------------------
"""
Estimate how many surnames in the voter file are *covered* by the 2010
Census surname table (which lists surnames occurring ≥ 100 times).

Download the table once from:
  https://www2.census.gov/topics/genealogy/2010surnames/names.zip
and unzip so that 'Names_2010Census.csv' sits next to this script.
If you haven’t done that yet, we’ll skip the figure instead of crashing.
"""
# Share of voters, per bucket, whose upper-cased surname appears in the Census
# surname table.  The figure is skipped when the table has not been downloaded.
census_file = pathlib.Path("Names_2010Census.csv")
if census_file.is_file():
    # keep_default_na=False: pandas would otherwise parse surnames spelled
    # "NA", "NULL", "NAN", ... as missing values and drop them from the lookup.
    census_names = pd.read_csv(census_file, usecols=["NAME"],
                               dtype=str, keep_default_na=False)

    # The surname column may be called `last_name` or `surname`, depending on
    # how the voter file was cleaned; use whichever is present.
    surname_col = next((c for c in ("last_name", "surname") if c in df.columns), None)
    if surname_col is None:
        raise KeyError("Voter file has neither a 'last_name' nor a 'surname' column.")

    covered = df[surname_col].str.upper().isin(census_names["NAME"])
    # Group the boolean flags directly by bucket – avoids copying the whole
    # frame just to attach one column.
    cov_pct = (covered.groupby(df["bucket"])
                      .mean()
                      .reindex(bucket_order))

    fig, ax = plt.subplots(figsize=(6,4))
    bars = ax.bar(bucket_order, cov_pct*100, color=bucket_colors)
    ax.set_ylim(0, 105)                   # a little headroom above 100 % for labels
    ax.set_ylabel("Surname in Census table (%)")
    ax.set_title("Surname Table Coverage by Race Group", weight="bold")
    ax.bar_label(bars, fmt="%.1f%%", padding=3)
    fig.tight_layout()
    fig.savefig(OUTDIR / "surname_coverage.png", dpi=300)
    plt.close(fig)
else:
    print("›› Skip surname-coverage plot – census file not found.")


# Point at the configured output folder; the notebook writes two or three PNGs
# (the surname-coverage plot is optional), so no fixed count is claimed.
print(f"✓  All finished – check the '{OUTDIR}/' folder for the PNGs.")