## Import Needed Filepaths and Libraries

In [1]:
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR, CENSUS_GEO_SHP_FILE
import geopandas as gpd

[32m2025-09-16 17:09:50.235[0m | [1mINFO    [0m | [36mtexas_gerrymandering_hb4.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/mle/Documents/GitHub/texas-gerrymandering-hb4[0m


In [2]:
# Destination of the cleaned block layer, inside the project's interim data directory.
gpkg_path = INTERIM_DATA_DIR / "texas_census_blocks_clean.gpkg"

In [3]:
# Load the raw census block layer from the path configured in the project config.
gdf = gpd.read_file(filename=CENSUS_GEO_SHP_FILE)

## Initial EDA

In [5]:
# (number of rows, number of columns) of the raw layer.
print((len(gdf), len(gdf.columns)))

(668757, 18)


In [6]:
# Column names of the raw layer, in file order.
print(gdf.columns.tolist())

['STATEFP20', 'COUNTYFP20', 'TRACTCE20', 'BLOCKCE20', 'GEOID20', 'NAME20', 'MTFCC20', 'UR20', 'UACE20', 'UATYPE20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20', 'HOUSING20', 'POP20', 'geometry']


In [7]:
# Count missing values per column (summing the boolean mask down the rows).
gdf.isna().sum(axis=0)

STATEFP20          0
COUNTYFP20         0
TRACTCE20          0
BLOCKCE20          0
GEOID20            0
NAME20             0
MTFCC20            0
UR20               0
UACE20        295915
UATYPE20      295915
FUNCSTAT20         0
ALAND20            0
AWATER20           0
INTPTLAT20         0
INTPTLON20         0
HOUSING20          0
POP20              0
geometry           0
dtype: int64

## Only Keep Relevant Columns for Joins

In [8]:
# Columns retained for downstream joins: the state/county/tract/block codes,
# the block GEOID and name, land and water area, the internal-point
# coordinates, and the geometry itself. Every other raw column is dropped.
keep_cols = [
    "STATEFP20",
    "COUNTYFP20",
    "TRACTCE20",
    "BLOCKCE20",
    "GEOID20",
    "NAME20",
    "ALAND20",
    "AWATER20",
    "INTPTLAT20",
    "INTPTLON20",
    "geometry",
]

gdf = gdf.loc[:, keep_cols]

## Enforce Correct Coordinate Reference System

In [9]:
# Make sure the layer ends up in EPSG:4269.
# - If the file carries no CRS, label it (set_crs only attaches metadata).
# - If it already declares a CRS, reproject with to_crs instead of overriding
#   the label: overriding leaves the coordinates untouched, so a layer stored
#   in any other CRS would be silently mis-georeferenced.
if gdf.crs is None:
    gdf = gdf.set_crs("EPSG:4269")
else:
    gdf = gdf.to_crs("EPSG:4269")

## Drop Duplicates

In [10]:
# Keep only the first row encountered for each GEOID20 value.
gdf = gdf.loc[~gdf.duplicated(subset=["GEOID20"], keep="first")]

## Export to Geopackage File

In [11]:
# Create the output directory first so the export also works on a fresh
# checkout where the interim data folder does not exist yet.
gpkg_path.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned blocks to the "blocks" layer of the GeoPackage.
gdf.to_file(gpkg_path, driver="GPKG", layer="blocks")
print("✅ Cleaned shapefile exported to:", gpkg_path)

✅ Cleaned shapefile exported to: /home/mle/Documents/GitHub/texas-gerrymandering-hb4/data/interim/texas_census_blocks_clean.gpkg


## Raw Data Schema

Taken directly from the <a href="https://www2.census.gov/geo/pdfs/maps-data/data/tiger/tgrshp2024/TGRSHP2024_TechDoc.pdf">Census TIGER/LINE Shapefile Technical Documentation</a>, here is the record layout for the block shapefiles:

| Field      | Type | Description                                                                                                                        |
|------------|--|------------------------------------------------------------------------------------------------------------------------------------|
| STATEFP20  | String | 2020 Census State FIPS code                                                                                                        |
| COUNTYFP20 | String | 2020 Census County FIPS code                                                                                                       |
| TRACTCE20  | String | 2020 Census tract code                                                                                                             |
| BLOCKCE20  | String | 2020 Census tabulation block number                                                                                                |
| GEOID20    | String | Census block identifier, a concatenation of 2020 Census state FIPS code, 2020 Census county FIPS code, 2020 Census tract code, and 2020 Census tabulation block number |
| NAME20     | String | 2020 Census tabulation block name, the word "Block" followed by block number
| MTFCC20    | String | MAF/TIGER feature class code
| UR20       | String | 2020 Census urban/rural indicator
| UACE20     | String | 2020 Census urban area code
| UATYPE20   | String | 2020 Census urban area type
| FUNCSTAT20 | String | 2020 functional status
| ALAND20    | Number | 2020 Census land area
| AWATER20   | Number | 2020 Census water area
| INTPTLAT20 | String | 2020 Census latitude of the internal point
| INTPTLON20 | String | 2020 Census longitude of the internal point
| HOUSING20  | Number | 2020 Census housing unit count
| POP20      | Number | 2020 Population
| geometry   |  |