## Import Needed Libraries & Filepaths

In [11]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, VTDS_SHP_FILE
import geopandas as gpd
import pandas as pd
import numpy as np
import shapely
from shapely.geometry import Polygon, MultiPolygon, GeometryCollection

INPUT_LAYER = None
OUT_GEOPACKAGE = INTERIM_DATA_DIR/"vtds_geo_clean.gpkg"

In [12]:
# --- Load (pyogrio is faster & avoids Fiona/Shapely version quirks) ---
# If your file has multiple layers, set INPUT_LAYER to that layer name.
# Otherwise, GeoPandas will pick the first layer.
read_kwargs = {}
if INPUT_LAYER:
    read_kwargs["layer"] = INPUT_LAYER

gdf = gpd.read_file(VTDS_SHP_FILE, engine="pyogrio", **read_kwargs)

print("Original shape:", gdf.shape)
print("Original CRS:", gdf.crs)
print("Columns:", list(gdf.columns)[:20], "..." if len(gdf.columns) > 20 else "")

Original shape: (9712, 9)
Original CRS: PROJCS["NAD_1983_Lambert_Conformal_Conic",GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["latitude_of_origin",31.1666666666667],PARAMETER["central_meridian",-100],PARAMETER["standard_parallel_1",27.4166666666667],PARAMETER["standard_parallel_2",34.9166666666667],PARAMETER["false_easting",1000000],PARAMETER["false_northing",1000000],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]
Columns: ['CNTY', 'COLOR', 'VTD', 'CNTYKEY', 'VTDKEY', 'CNTYVTD', 'Shape_area', 'Shape_len', 'geometry'] 


In [13]:
# --- Normalize column names and string values ---
gdf.columns = [c.strip() for c in gdf.columns]
obj_cols = [c for c in gdf.columns if c != "geometry" and gdf[c].dtype == "object"]
for c in obj_cols:
    gdf[c] = gdf[c].astype(str).str.strip()

In [14]:
# --- Ensure CRS (Census is commonly NAD83 / EPSG:4269). If missing, set conservative default. ---
if gdf.crs is None:
    gdf = gdf.set_crs(4269)

In [15]:
# --- Geometry fix: make_valid if available (Shapely 2.x), else buffer(0) fallback ---
def safe_make_valid(geom):
    if geom is None:
        return None
    mv = getattr(shapely, "make_valid", None)
    if mv is not None:
        try:
            return mv(geom)
        except Exception:
            pass
    try:
        return geom.buffer(0)
    except Exception:
        return None

def to_polygonal(geom):
    if geom is None:
        return None
    if isinstance(geom, GeometryCollection):
        polys = [g for g in geom.geoms if isinstance(g, (Polygon, MultiPolygon))]
        if not polys:
            return None
        return MultiPolygon(polys) if len(polys) > 1 else polys[0]
    if isinstance(geom, (Polygon, MultiPolygon)):
        return geom
    return None

gdf["geometry"] = gdf.geometry.apply(safe_make_valid).apply(to_polygonal)
gdf = gdf[gdf.geometry.notna() & (~gdf.geometry.is_empty)]
gdf = gdf[gdf.geometry.area > 0]  # remove zero-area slivers

In [16]:
# --- (Optional) Build a stable ID (won't be used yet; just cleaning) ---
id_col = None
for cand in ["GEOID20","GEOID","GEOID_VTD","CNTYVTD","VTDID","VTD","PRECINCT","CNTY_VTD"]:
    if cand in gdf.columns:
        id_col = cand
        break

if id_col is None:
    county_keys = [c for c in gdf.columns if c.upper() in {"COUNTYFP20","COUNTYFP","COUNTY","COUNTYFIPS","CNTYFIPS","COUNTY_FIPS"}]
    vtd_keys    = [c for c in gdf.columns if c.upper() in {"VTDST20","VTDST","VTD","VTD_CODE","PRECINCT","PCT"}]
    if county_keys and vtd_keys:
        ckey, vkey = county_keys[0], vtd_keys[0]
        zp = lambda s, n: ("" if pd.isna(s) else str(s)).zfill(n)
        gdf["CNTYVTD"] = gdf[ckey].map(lambda x: zp(x, 3)) + gdf[vkey].map(lambda x: zp(x, 6))
        id_col = "CNTYVTD"

if id_col is None:
    # choose any unique non-null column as a fallback id
    for c in gdf.columns:
        if c == "geometry":
            continue
        if gdf[c].notna().all() and gdf[c].is_unique:
            id_col = c
            break
if id_col is None:
    id_col = "VTD_UID"
    gdf[id_col] = np.arange(len(gdf), dtype=int)

In [17]:
# Deduplicate if necessary
if not gdf[id_col].is_unique:
    gdf = gdf.drop_duplicates(subset=[id_col], keep="first")

In [18]:
# --- Curate column order (ID first, common keys next, geometry last) ---
preferred = [id_col]
for c in ["STATEFP20","COUNTYFP20","VTDST20","NAME20","FUNCSTAT20","MTFCC20","GEOID20","GEOID"]:
    if c in gdf.columns and c not in preferred:
        preferred.append(c)
rest = [c for c in gdf.columns if c not in preferred + ["geometry"]]
gdf = gdf[preferred + rest + ["geometry"]]

In [19]:
gdf.columns = gdf.columns.str.lower()  # enforce lowercase column names

In [20]:
# --- Exports ---

# 2) Convert to WGS84 for GeoParquet (widely compatible & performant)
gdf.to_crs(4326).to_file(OUT_GEOPACKAGE, driver="GPKG")

print("\nCleaned shape:", gdf.shape)
print("Cleaned CRS:", gdf.crs)
print("ID column used:", id_col)
print("Wrote:", OUT_GEOPACKAGE)
display(gdf.drop(columns="geometry").head(10))


Cleaned shape: (9712, 9)
Cleaned CRS: PROJCS["NAD_1983_Lambert_Conformal_Conic",GEOGCS["NAD83",DATUM["North_American_Datum_1983",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6269"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic_2SP"],PARAMETER["latitude_of_origin",31.1666666666667],PARAMETER["central_meridian",-100],PARAMETER["standard_parallel_1",27.4166666666667],PARAMETER["standard_parallel_2",34.9166666666667],PARAMETER["false_easting",1000000],PARAMETER["false_northing",1000000],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]
ID column used: CNTYVTD
Wrote: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4/data/interim/vtds_geo_clean.gpkg


Unnamed: 0,cntyvtd,cnty,color,vtd,cntykey,vtdkey,shape_area,shape_len
0,10001,1,4,1,1,1.0,5666216.0,15288.088777
1,10002,1,3,2,1,2.0,256212900.0,94434.420881
2,10003,1,6,3,1,3.0,70722280.0,55660.372406
3,10004,1,2,4,1,4.0,241066200.0,91319.549282
4,10005,1,1,5,1,5.0,168985400.0,86937.648556
5,10006,1,2,6,1,6.0,6746992.0,16108.198116
6,10007,1,6,7,1,7.0,7234059.0,18668.086181
7,10008,1,1,8,1,8.0,2926144.0,8016.037372
8,10009,1,4,9,1,9.0,148057300.0,95396.760958
9,10010,1,1,10,1,10.0,114737900.0,80814.935429
