## Import Needed Libraries and Filepaths

In [1]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, CLEAN_ELECTION_RESULTS, CLEAN_VTD_GEO
import pandas as pd
import geopandas as gpd
import os
import re
import math
import sys
import pyarrow

[32m2025-09-20 10:28:28.243[0m | [1mINFO    [0m | [36mtexas_gerrymandering_hb4.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4[0m


In [2]:
inner_parquet = INTERIM_DATA_DIR/"inner_join_vtds.parquet"
left_parquet = INTERIM_DATA_DIR/"left_join_vtds.parquet"

## Helper Functions

In [3]:
# ------------------------ Helpers ------------------------
def to_str_or_none(x):
    if x is None:
        return None
    if isinstance(x, float) and math.isnan(x):
        return None
    return str(x)

def clean_digits(x):
    s = to_str_or_none(x)
    if s is None:
        return None
    s = s.strip()
    s = re.sub(r"\D", "", s)   # keep digits only
    return s if s else None

def build_cntyvtd_norm_from_fips_vtd(fips, vtd):
    f = clean_digits(fips)
    v = clean_digits(vtd)
    if not f or not v:
        return None
    return f.zfill(3) + v.zfill(4)   # CCC + VVVV

def normalize_cntyvtd_string(raw, fallback_fips=None):
    """
    Normalize any CNTYVTD-like string into 7-digit CCCVVVV.
    If it's 7 digits already, keep it.
    If it's 6 => assume 3+3 and pad VTD to 4.
    If it's 5 => often CC+VVV; upgrade to CCC+VVVV by zfilling.
    If it's 4 => VTD only -> need fallback_fips to build.
    Otherwise try to coerce by padding left.
    """
    s = clean_digits(raw)
    if s is None:
        # try fallback with vtd-only branch below by passing raw again
        if fallback_fips is not None:
            return build_cntyvtd_norm_from_fips_vtd(fallback_fips, raw)
        return None

    if len(s) == 7:
        return s
    if len(s) == 6:
        return s[:3] + s[3:].zfill(4)
    if len(s) == 5:
        return s[:2].zfill(3) + s[2:].zfill(4)
    if len(s) == 4:
        if fallback_fips is not None:
            return build_cntyvtd_norm_from_fips_vtd(fallback_fips, s)
        return None
    # odd lengths: pad to 7 on the left as last resort
    return s.zfill(7)

def freq_len(series):
    return series.dropna().astype(str).str.len().value_counts().sort_index()

## Load Data

In [4]:
# Election Results
df = pd.read_csv(CLEAN_ELECTION_RESULTS)

In [5]:
# VTD Geospatial Data
gdf = gpd.read_parquet(CLEAN_VTD_GEO)

In [6]:
# ------------------------ Column cleanup ------------------------
gdf.columns = [c.strip().lower() for c in gdf.columns]
df.columns  = [c.strip().lower() for c in df.columns]


In [7]:
# ------------------------ Build normalized keys ------------------------
geo_fips_candidates = ["cnty", "cntykey", "county_fips", "countyfp", "fips"]
geo_vtd_candidates = ["vtd", "vtdkey", "precinct", "pct"]
geo_fips_col = next((c for c in geo_fips_candidates if c in gdf.columns), None)
geo_vtd_col = next((c for c in geo_vtd_candidates if c in gdf.columns), None)

if geo_fips_col and geo_vtd_col:
    gdf["cntyvtd_norm"] = [
        build_cntyvtd_norm_from_fips_vtd(f, v)
        for f, v in zip(gdf[geo_fips_col], gdf[geo_vtd_col])
    ]
elif "cntyvtd" in gdf.columns:
    fallback = gdf[geo_fips_col] if geo_fips_col else None
    fallback_iter = fallback if fallback is not None else [None] * len(gdf)
    gdf["cntyvtd_norm"] = [
        normalize_cntyvtd_string(val, fallback_fips=(None if pd.isna(fb) else fb))
        for val, fb in zip(gdf["cntyvtd"], fallback_iter)
    ]
else:
    raise KeyError("Geo file missing both (cnty,vtd) and cntyvtd columns needed to build the join key.")


In [8]:
# RESULTS side: build from fips + vtd whenever possible (most reliable)
res_fips_candidates = ["fips", "county_fips", "countyfp", "cnty"]
res_vtd_candidates = ["vtd", "vtdkey", "precinct", "pct"]
res_fips_col = next((c for c in res_fips_candidates if c in df.columns), None)
res_vtd_col = next((c for c in res_vtd_candidates if c in df.columns), None)

if res_fips_col and res_vtd_col:
    df["cntyvtd_norm"] = [
        build_cntyvtd_norm_from_fips_vtd(f, v)
        for f, v in zip(df[res_fips_col], df[res_vtd_col])
    ]
elif "cntyvtd" in df.columns:
    fallback = df[res_fips_col] if res_fips_col else None
    fallback_iter = fallback if fallback is not None else [None] * len(df)
    df["cntyvtd_norm"] = [
        normalize_cntyvtd_string(val, fallback_fips=(None if pd.isna(fb) else fb))
        for val, fb in zip(df["cntyvtd"], fallback_iter)
    ]
else:
    raise KeyError("Results file missing both (fips,vtd) and cntyvtd. Need one of those to build the key.")


In [9]:
# ------------------------ Diagnostics ------------------------
print("Geo key length distribution:\n", freq_len(gdf["cntyvtd_norm"]))
print("Res key length distribution:\n", freq_len(df["cntyvtd_norm"]))

geo_null = gdf["cntyvtd_norm"].isna().sum()
res_null = df["cntyvtd_norm"].isna().sum()
print(f"Null geo keys: {geo_null}/{len(gdf)}")
print(f"Null res keys: {res_null}/{len(df)}")

geo_keys = set(gdf["cntyvtd_norm"].dropna().unique())
res_keys = set(df["cntyvtd_norm"].dropna().unique())

only_in_geo = sorted(geo_keys - res_keys)[:15]
only_in_res = sorted(res_keys - geo_keys)[:15]
print("Only-in-geo sample:", only_in_geo)
print("Only-in-results sample:", only_in_res)

Geo key length distribution:
 cntyvtd_norm
7    9712
Name: count, dtype: int64
Res key length distribution:
 cntyvtd_norm
7    9712
Name: count, dtype: int64
Null geo keys: 0/9712
Null res keys: 0/9712
Only-in-geo sample: []
Only-in-results sample: []


In [10]:
# If duplicates in results, aggregate sensibly
vote_cols = [c for c in ["dem_votes","rep_votes","third_party_votes","total_votes","dem_share"] if c in df.columns]
if df["cntyvtd_norm"].duplicated().any():
    agg_map = {c: "sum" for c in vote_cols if c != "dem_share"}
    df_agg = df.groupby("cntyvtd_norm", as_index=False).agg(agg_map)
    if {"dem_votes","total_votes"}.issubset(df_agg.columns):
        df_agg["dem_share"] = df_agg["dem_votes"] / df_agg["total_votes"].replace({0: pd.NA})
else:
    df_agg = df.copy()

In [11]:
# ------------------------ Joins ------------------------
inner = gdf.merge(df_agg, on="cntyvtd_norm", how="inner")
left  = gdf.merge(df_agg, on="cntyvtd_norm", how="left")

# Fill vote NaNs with 0 on left; recompute share if possible
for c in vote_cols:
    if c in left.columns and c != "dem_share":
        left[c] = left[c].fillna(0).astype("Int64")
if {"dem_votes","total_votes"}.issubset(left.columns):
    left["dem_share"] = (left["dem_votes"].astype("Int64").fillna(0) /
                         left["total_votes"].replace({0: pd.NA}))

print(f"Rows — geo: {len(gdf)}, results(agg): {len(df_agg)}, inner: {len(inner)}, left: {len(left)}")

Rows — geo: 9712, results(agg): 9389, inner: 9712, left: 9712


In [12]:
# Exports
#inner.drop(columns=["geometry"], errors="ignore").to_csv(INTERIM_DATA_DIR/"inner_join_vtds.csv", index=False)
#left.drop(columns=["geometry"], errors="ignore").to_csv(INTERIM_DATA_DIR/"left_join_vtds.csv", index=False)

# GeoPackage + GeoParquet
#inner.to_file(INTERIM_DATA_DIR/"vtd_merged.gpkg", layer="vtd_merged_inner", driver="GPKG")
#left.to_file(INTERIM_DATA_DIR/"vtd_merged.gpkg", layer="vtd_merged_left", driver="GPKG")

# If parquet environment is ready:
inner.to_parquet(inner_parquet)
left.to_parquet(left_parquet)