# Connecting to the Congressional District Geospatial Data

### Import Needed Filepaths and Libraries

In [1]:
import math

# import geopandas to simplify working with geospatial data in the district shapefile
import geopandas as gpd

# Locate Shapely's make_valid. It is a top-level export in Shapely 2.x and lives in
# shapely.validation in Shapely 1.8. Only ImportError is caught, so an unrelated
# failure is not silently turned into the weaker buffer(0) fallback used later.
try:
    from shapely import make_valid
    HAS_MAKE_VALID = True
except ImportError:
    try:
        from shapely.validation import make_valid
        HAS_MAKE_VALID = True
    except ImportError:
        # No make_valid available; the geometry-fix cell falls back to buffer(0)
        HAS_MAKE_VALID = False

# import filepath of congressional district geospatial data from config.py
from texas_gerrymandering_hb4.config import PLANC2333_SHP_FILE, INTERIM_DATA_DIR

[32m2025-09-07 18:17:44.627[0m | [1mINFO    [0m | [36mtexas_gerrymandering_hb4.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4[0m


### Load Congressional Districts Shapefile

In [3]:
# Load the congressional district shapefile referenced by PLANC2333_SHP_FILE
# (path comes from the project's config module)
gdf = gpd.read_file(PLANC2333_SHP_FILE)
# Report the number of features read as a quick sanity check
print(f"Loaded {len(gdf)} rows")
# Preview the first rows
gdf.head()

Loaded 38 rows


Unnamed: 0,District,geometry
0,1,"POLYGON ((1558608.508 1007368.924, 1558424.339..."
1,2,"POLYGON ((1430214.842 865650.638, 1430191.262 ..."
2,3,"POLYGON ((1494406.98 1201348.426, 1494404.818 ..."
3,4,"POLYGON ((1294445.052 1205704.041, 1294442.732..."
4,5,"POLYGON ((1403804.511 1045105.488, 1403812.104..."


### Fix Invalid Geometries
When Shapely's `make_valid` is available, it is used to repair invalid geometries; otherwise we fall back to `buffer(0)`.

In [None]:
# Repair invalid geometries before any measurement is taken.
# Use Shapely's make_valid when the import cell found it; otherwise fall back to a
# zero-distance buffer on every geometry.
if HAS_MAKE_VALID:
    gdf["geometry"] = gdf.geometry.apply(make_valid)
else:
    gdf["geometry"] = gdf.buffer(0)
# The original line had a stray "." after the string literal, which is a SyntaxError
print("Fixed geometry errors (if any).")

### Set Coordinate Reference System
We reproject the shapefile into WGS84 (EPSG:4326) because it is the de facto standard coordinate reference system for geospatial data exchange.

In [None]:
# Bring the districts into WGS84 (EPSG:4326).
# A layer that carries no CRS metadata is first tagged as EPSG:4269 so that it can
# be reprojected at all.
# NOTE(review): the coordinates shown by gdf.head() look projected rather than
# lon/lat, so the EPSG:4269 assumption in this fallback should be confirmed.
if gdf.crs is None:
    gdf = gdf.set_crs(4269, allow_override=True)
gdf = gdf.to_crs(4326)
gdf.crs

## Compute Derived Columns
The derived columns computed here (area, perimeter, and the Polsby-Popper score $4\pi A / P^2$) help us measure the compactness of each congressional district. This matters because compactness is a widely used way of quantifying gerrymandering.

In [None]:
# Measure on a copy reprojected to EPSG:3081 and write the results back onto gdf.
# The /1e6 and /1e3 conversions assume that CRS uses metres.
# NOTE(review): confirm EPSG:3081 is the intended projection for area work; if it
# is conformal rather than equal-area, an equal-area CRS may be more appropriate.
g_area = gdf.to_crs(3081)
area_m2 = g_area.area
perimeter_m = g_area.length

gdf["area_sq_km"] = area_m2 / 1e6
gdf["perimeter_km"] = perimeter_m / 1e3
# Polsby-Popper: 4*pi*A / P^2; undefined (NaN) results are recorded as 0
gdf["polsby_popper"] = (4 * math.pi * area_m2 / (perimeter_m ** 2)).fillna(0)
gdf[["area_sq_km","perimeter_km","polsby_popper"]].describe()

In [None]:

# Keep identifier-like columns (matched case-insensitively), the computed metrics
# and the geometry; everything else is dropped.
id_like = {
    "statefp","geoid","name","namelsad","cd116fp","cd118fp","cd119fp","gid",
    "district","dist","cd","stusps","state","id"
}
required = {"area_sq_km","perimeter_km","polsby_popper","geometry"}
keep = {col for col in gdf.columns if col.lower() in id_like} | required
# Order: geometry first, then the remaining columns alphabetically
keep = sorted(keep, key=lambda col: (col != "geometry", col))
gdf = gdf[keep]
gdf.head()

In [None]:

# Save outputs: write the cleaned districts as a GeoPackage layer and as Parquet.
# Create the output directory first so a fresh checkout (where the interim folder
# may not exist yet) does not fail on the first write.
# NOTE(review): assumes INTERIM_DATA_DIR is a pathlib.Path, as its use with "/" suggests.
INTERIM_DATA_DIR.mkdir(parents=True, exist_ok=True)

gpkg_path = INTERIM_DATA_DIR / "districts_clean.gpkg"
gdf.to_file(gpkg_path, driver="GPKG", layer="districts")
parquet_path = INTERIM_DATA_DIR / "districts_clean.parquet"
gdf.to_parquet(parquet_path, index=False)
print("Wrote:", gpkg_path)
print("Wrote:", parquet_path)