# Connecting to the Congressional District Geospatial Data

### Import Needed Filepaths and Libraries
* Math Library
Importing the math library provides us with the mathematical functions and constants necessary to compute compactness scores.
* Geopandas
* Shapely


In [7]:
import math

# import geopandas to simplify working with geospatial data in the district shapefile
import geopandas as gpd

try:
    from shapely import make_valid
    HAS_MAKE_VALID = True
except Exception:
    HAS_MAKE_VALID = False

# import filepath of congressional district geospatial data from config.py
from texas_gerrymandering_hb4.config import PLANC2333_SHP_FILE, INTERIM_DATA_DIR

### Load Congressional Districts Shapefile

In [8]:
# load PLANC2308 Congressional district shapefile
gdf = gpd.read_file(PLANC2333_SHP_FILE)
print(f"Loaded {len(gdf)} rows")
gdf.head()

Loaded 38 rows


Unnamed: 0,District,geometry
0,1,"POLYGON ((1558608.508 1007368.924, 1558424.339..."
1,2,"POLYGON ((1430214.842 865650.638, 1430191.262 ..."
2,3,"POLYGON ((1494406.98 1201348.426, 1494404.818 ..."
3,4,"POLYGON ((1294445.052 1205704.041, 1294442.732..."
4,5,"POLYGON ((1403804.511 1045105.488, 1403812.104..."


### Fix Invalid Geometries
Shapely handles most topology errors; otherwise buffer(0) is used.

In [9]:
if HAS_MAKE_VALID:
    gdf["geometry"] = gdf.geometry.apply(make_valid)
else:
    gdf["geometry"] = gdf.buffer(0)
print("Fixed geometry errors (if any)")

Fixed geometry errors (if any)


### Set Coordinate Reference System
The reason we reproject project shapefiles into WGS84 is because it is the de facto standard coordinate reference system for geospatial data exchange.

In [10]:
# Ensure CRS WGS84
if gdf.crs is None:
    gdf = gdf.set_crs(4269, allow_override=True).to_crs(4326)
else:
    gdf = gdf.to_crs(4326)
gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

## Base Metrics (Area, Perimeter, Polsby-Popper)

In [11]:
g_area = gdf.to_crs(3081)
gdf["area_sq_km"] = g_area.area / 1e6
gdf["perimeter_km"] = g_area.length / 1e3
gdf["polsby_popper"] = (4 * math.pi * g_area.area / (g_area.length ** 2)).fillna(0)
gdf[["area_sq_km","perimeter_km","polsby_popper"]].describe()

Unnamed: 0,area_sq_km,perimeter_km,polsby_popper
count,38.0,38.0,38.0
mean,18256.180498,814.491214,0.22176
std,29044.347085,610.690399,0.087689
min,322.366561,174.028083,0.103462
25%,1029.879509,262.65718,0.148006
50%,6931.063918,693.429901,0.207416
75%,22694.428001,1209.865036,0.286433
max,143150.81066,2883.147415,0.53169


## Reock and Convex Hull Compactness

In [12]:
# --- Reock (area / min enclosing circle area) & Convex Hull ratio (area / hull area) ---

# Try Shapely's minimum bounding circle; else fallback to Welzl on hull vertices
try:
    from shapely import minimum_bounding_circle as _mbc  # Shapely 2.x
except Exception:
    try:
        from shapely.algorithms.minimum_bounding_circle import minimum_bounding_circle as _mbc
    except Exception:
        _mbc = None

def _welzl_minidisk(points):
    """Minimal enclosing circle for 2D points. Returns (cx, cy, r)."""
    import random
    P = points[:]
    random.shuffle(P)

    def _circle_two(A, B):
        cx = (A[0] + B[0]) / 2.0
        cy = (A[1] + B[1]) / 2.0
        r = ((A[0]-cx)**2 + (A[1]-cy)**2) ** 0.5
        return (cx, cy, r)

    def _circle_three(A, B, C):
        ax, ay = A; bx, by = B; cx, cy = C
        d = 2 * (ax*(by-cy) + bx*(cy-ay) + cx*(ay-by))
        if d == 0:
            return None
        ux = ((ax**2+ay**2)*(by-cy) + (bx**2+by**2)*(cy-ay) + (cx**2+cy**2)*(ay-by)) / d
        uy = ((ax**2+ay**2)*(cx-bx) + (bx**2+by**2)*(ax-cx) + (cx**2+cy**2)*(bx-ax)) / d
        r = ((ax-ux)**2 + (ay-uy)**2) ** 0.5
        return (ux, uy, r)

    def _contains(circ, pts):
        cx, cy, r = circ
        r2 = r*r + 1e-9
        for (x, y) in pts:
            if (x-cx)*(x-cx) + (y-cy)*(y-cy) > r2:
                return False
        return True

    c = None
    for i, p in enumerate(P):
        if (c is not None) and _contains(c, [p]):
            continue
        c = (p[0], p[1], 0.0)
        for j, q in enumerate(P[:i]):
            if _contains(c, [q]):
                continue
            c = _circle_two(p, q)
            for k, r in enumerate(P[:j]):
                if _contains(c, [r]):
                    continue
                ct = _circle_three(p, q, r)
                if ct is not None:
                    c = ct
    return c

def _minimum_enclosing_circle_polygon(poly_3081):
    """Return (cx, cy, r) for minimal enclosing circle of a (Multi)Polygon (EPSG:3081)."""
    if _mbc is not None:
        circ = _mbc(poly_3081)
        area = float(circ.area)
        r = (area / math.pi) ** 0.5 if area > 0 else 0.0
        cx, cy = float(circ.centroid.x), float(circ.centroid.y)
        return (cx, cy, r)
    hull = poly_3081.convex_hull
    if hull.is_empty:
        return (0.0, 0.0, 0.0)
    coords = list(hull.exterior.coords)
    pts = [(float(x), float(y)) for (x, y) in (coords[:-1] if len(coords)>=2 and coords[0]==coords[-1] else coords)]
    if len(pts) == 1:
        return (pts[0][0], pts[0][1], 0.0)
    if len(pts) == 2:
        cx = (pts[0][0]+pts[1][0])/2.0
        cy = (pts[0][1]+pts[1][1])/2.0
        r = ((pts[0][0]-cx)**2 + (pts[0][1]-cy)**2) ** 0.5
        return (cx, cy, r)
    return _welzl_minidisk(pts)

def reock_score(geom_3081):
    """Reock = area(geom)/area(min enclosing circle)."""
    if geom_3081.is_empty:
        return 0.0
    area_poly = float(geom_3081.area)
    cx, cy, r = _minimum_enclosing_circle_polygon(geom_3081)
    if r <= 0:
        return 0.0
    area_circ = math.pi * r * r
    return float(area_poly / area_circ) if area_circ > 0 else 0.0

def convex_hull_ratio(geom_3081):
    """Convex hull compactness = area(geom)/area(convex_hull)."""
    if geom_3081.is_empty:
        return 0.0
    hull = geom_3081.convex_hull
    area_hull = float(hull.area)
    if area_hull == 0:
        return 0.0
    return float(geom_3081.area / area_hull)

# Compute on the projected copy (EPSG:3081), store results on the WGS84 gdf
gdf["reock"] = g_area.geometry.apply(reock_score)
gdf["convex_hull_ratio"] = g_area.geometry.apply(convex_hull_ratio)

# Optional sanity column
gdf["convex_hull_area_sq_km"] = g_area.convex_hull.area / 1e6


In [15]:
gdf["reock"].describe()

count    38.000000
mean      0.346595
std       0.104512
min       0.165608
25%       0.269492
50%       0.332582
75%       0.445556
max       0.508818
Name: reock, dtype: float64

In [16]:
gdf["convex_hull_ratio"].describe()

count    38.000000
mean      0.686943
std       0.108351
min       0.454285
25%       0.612685
50%       0.712999
75%       0.762905
max       0.838869
Name: convex_hull_ratio, dtype: float64

### Schwartzberg Compactness

In [13]:
# Schwartzberg compactness (dimensionless, (0,1], circle=1)
# Definition: circumference of a circle with the same area divided by the polygon perimeter
# S = (2 * sqrt(pi * A)) / P  (A in m^2, P in m), so we use g_area for A and P
A = g_area.area              # m^2
P = g_area.length            # m
circle_circumference = 2.0 * (math.pi * A).pow(0.5)  # vectorized sqrt via pandas/GeoSeries
# Beware of division by zero:
gdf["schwartzberg"] = (circle_circumference / P).fillna(0).replace([float("inf")], 0)
gdf["schwartzberg"].describe()


count    38.000000
mean      0.462608
std       0.089240
min       0.321654
25%       0.384672
50%       0.455395
75%       0.535195
max       0.729171
Name: schwartzberg, dtype: float64

## Keep Tidy Columns and Preview

In [14]:
# Select tidy columns to keep (only those present; avoids KeyErrors)
keep = [c for c in gdf.columns if c.lower() in {
    "statefp","geoid","name","namelsad","cd116fp","cd118fp","cd119fp","gid",
    "district","dist","cd","stusps","state","id"
}]

# Always include derived metrics + geometry
keep = sorted(set(keep + [
    "area_sq_km","perimeter_km","polsby_popper",
    "reock","convex_hull_ratio","convex_hull_area_sq_km",
    "schwartzberg",
    "geometry"
]), key=lambda x: (x != "geometry", x))

gdf = gdf[keep]
gdf.head()


Unnamed: 0,geometry,District,area_sq_km,convex_hull_area_sq_km,convex_hull_ratio,perimeter_km,polsby_popper,reock,schwartzberg
0,"POLYGON ((-94.12963 31.09928, -94.13149 31.100...",1,24872.423988,34407.648538,0.722875,1270.431901,0.193653,0.410968,0.440061
1,"POLYGON ((-95.53725 29.87388, -95.53732 29.877...",2,1310.539397,2095.991145,0.62526,315.656491,0.165284,0.420622,0.406551
2,"POLYGON ((-94.70576 32.87916, -94.70579 32.879...",3,8726.001927,10833.55985,0.80546,711.485635,0.216617,0.274956,0.465422
3,"POLYGON ((-96.84412 32.98744, -96.8441 32.9889...",4,12114.918945,21097.061568,0.574247,1213.041827,0.103462,0.223692,0.321654
4,"POLYGON ((-95.73928 31.50406, -95.7392 31.5041...",5,9956.758283,12095.544056,0.823176,823.654724,0.184432,0.312211,0.429456


### Save Outputs

In [18]:

# Save outputs
gpkg_path = INTERIM_DATA_DIR / "districts_clean.gpkg"
gdf.to_file(gpkg_path, driver="GPKG", layer="districts")
print("Wrote:", gpkg_path)

Wrote: /home/aimlexpert/Documents/GitHub/texas-gerrymandering-HB4/data/interim/districts_clean.gpkg
