# Connecting to the Congressional District Geospatial Data

In [6]:
# geopandas handles reading the district shapefile and its geometries
import geopandas as gpd

# location of the PLANC2308 congressional district shapefile, defined in config.py
from texas_gerrymandering_hb4.config import PLANC2308_SHP_FILE

# read the PLANC2308 congressional districts into a GeoDataFrame
gdf = gpd.read_file(PLANC2308_SHP_FILE)

# Converting the Congressional District Shapefile Into a Geopackage File

In [9]:
# import filepath of the interim data directory from config.py
from texas_gerrymandering_hb4.config import INTERIM_DATA_DIR

# write the districts out as a GeoPackage inside the interim data directory
gpkg_path = INTERIM_DATA_DIR / "districts.gpkg"
gdf.to_file(gpkg_path, driver="GPKG")

# Initial EDA on District Geospatial Data

In [12]:
# read back the districts.gpkg file written to the interim data directory;
# no driver argument is passed because read_file has no such parameter (extra
# keyword arguments are forwarded to the I/O engine) and the format is detected
# from the file itself
districts = gpd.read_file(INTERIM_DATA_DIR / "districts.gpkg")

# viewing the first 5 rows of the dataset
districts.head()

   District                                           geometry
0         1  POLYGON ((1558608.508 1007368.924, 1558424.339...
1         2  POLYGON ((1430214.842 865650.638, 1430191.262 ...
2         3  POLYGON ((1494406.980 1201348.426, 1494404.818...
3         4  POLYGON ((1294445.052 1205704.041, 1294442.732...
4         5  POLYGON ((1403804.511 1045105.488, 1403812.104...


In [14]:
# show the final 5 rows of the dataset
districts.tail(5)

    District                                           geometry
33        34  POLYGON ((1261750.064 412835.286, 1261695.727 ...
34        35  POLYGON ((1216875.807 725701.423, 1216868.212 ...
35        36  POLYGON ((1468189.541 825548.094, 1468131.384 ...
36        37  POLYGON ((1215880.409 884311.182, 1215642.827 ...
37        38  POLYGON ((1433143.468 848921.024, 1433142.046 ...


In [16]:
# summarize the dataset: index range, column names, non-null counts, dtypes, and memory usage
districts.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   District  38 non-null     int64   
 1   geometry  38 non-null     geometry
dtypes: geometry(1), int64(1)
memory usage: 740.0 bytes


# Converting the District Data to a New Coordinate Reference System (CRS)

In [18]:
# reproject the districts to EPSG:4326 (WGS 84 longitude/latitude), the
# coordinate reference system commonly used for global web maps
gdf = gdf.to_crs("EPSG:4326")

# Lowercasing Column Names

In [19]:
# normalize every column name to lowercase
gdf.columns = [column_name.lower() for column_name in gdf.columns]

# Checking for Invalid Geometry

In [21]:
# collect every row whose geometry fails the validity check and report
# either the offending index labels or an all-clear message
invalid_geometry = gdf.loc[~gdf.is_valid]
if invalid_geometry.empty:
    print("No invalid geometry occurs in the dataset.")
else:
    print("Invalid geometry is found at these indices:")
    print(invalid_geometry.index)

No invalid geometry occurs in the dataset.


# Sorting and Indexing Congressional Districts

In [22]:
# sort the rows by district number, then discard the old index and adopt the
# default integer index; drop=True keeps the old index from being added back
# as a column
gdf = gdf.sort_values("district").reset_index(drop=True)