In [28]:
import geopandas as gpd
import pandas as pd

# Load the three tract-level attribute tables from parquet.
# Judging by the file names these are the Census (ACS), CDC Social Vulnerability
# Index and FEMA National Risk Index extracts; later cells join them on GEOID.
acs = pd.read_parquet("/data/CensusSouthFlorida_dataset CLEANED.parquet")
cdc = pd.read_parquet("/data/CDCSocialVulnerabilityIndexFlorida_dataset CLEANED.parquet")
nri = pd.read_parquet("/data/FEMANationalRiskIndex_dataset CLEANED.parquet")

In [29]:
# Report whether each attribute table has a GEOID column, then rewrite that column
# as a string left-padded with zeros to 11 characters so the merge keys line up.
# The frames are modified in place.
for name, df in {"acs": acs, "cdc": cdc, "nri": nri}.items():
    has_geoid = "GEOID" in df.columns
    print(name, "GEOID in cols?", has_geoid)
    geoid_text = df["GEOID"].astype(str)
    df["GEOID"] = geoid_text.str.zfill(11)

acs GEOID in cols? True
cdc GEOID in cols? True
nri GEOID in cols? True


In [30]:
# Load the tract geometries as a GeoDataFrame.
# NOTE(review): this is an absolute path inside one user's home directory, unlike the
# other inputs which are read from /data/ — the notebook will not run on another
# machine until this file is relocated or the path is made configurable.
tracts_geom = gpd.read_parquet("/Users/acsoteldo/Desktop/datasets/Data Project 12 Dec 2025/data/SouthFlorida_tracts_geometry.parquet")

In [31]:
# The tract identifier is expected in a column named GEOID. When that column is
# absent, fall back to the first of GEOID10 / GEOID20 that exists and rename it.
if "GEOID" not in tracts_geom.columns:
    fallback = next(
        (col for col in ("GEOID10", "GEOID20") if col in tracts_geom.columns),
        None,
    )
    if fallback is not None:
        tracts_geom = tracts_geom.rename(columns={fallback: "GEOID"})

# Store the identifier as a string left-padded with zeros to 11 characters,
# the same form used for the attribute tables.
geoid_text = tracts_geom["GEOID"].astype(str)
tracts_geom["GEOID"] = geoid_text.str.zfill(11)


In [32]:
# Filter to South Florida counties.
# The county code is taken from COUNTYFP when that column exists (cast to text and
# zero-padded to 3 characters so integer-typed codes still match). Otherwise it is
# derived from the tract GEOID, whose layout is state (2) + county (3) + tract (6).
# Previously a missing COUNTYFP column meant the filter was skipped without notice.
# NOTE(review): only the county code is checked, not the state — this assumes the
# geometry file holds Florida tracts only; confirm.
sf_counties = {"011", "086", "087", "099"}  # Broward, Miami-Dade, Monroe, Palm Beach
if "COUNTYFP" in tracts_geom.columns:
    county_fips = tracts_geom["COUNTYFP"].astype(str).str.zfill(3)
else:
    # Re-pad here so this cell does not depend on GEOID already being normalised.
    county_fips = tracts_geom["GEOID"].astype(str).str.zfill(11).str[2:5]
tracts_geom = tracts_geom[county_fips.isin(sf_counties)].copy()

In [33]:
# Combine the three attribute tables into one tract-level table keyed on GEOID.
# acs is the base table: both joins are left joins, so every acs row is kept and
# tracts missing from cdc or nri get NaN in those columns.
# NOTE(review): neither merge validates key uniqueness — duplicate GEOIDs in cdc or
# nri would multiply rows here; confirm the keys are unique.
acs_with_cdc = acs.merge(cdc, how="left", on="GEOID")
tracts_tbl = acs_with_cdc.merge(nri, how="left", on="GEOID")

In [34]:
# Attach the combined attribute table to the tract geometries.
# tracts_geom is the left side of a left join, so every tract geometry is kept and
# tracts with no matching GEOID in tracts_tbl get NaN attributes.
tracts_bridge = tracts_geom.merge(tracts_tbl, on="GEOID", how="left")

In [35]:
# Sanity checks on the merged table: its type, its row count, and the share of
# missing values in two key score columns (or "no col" when a column is absent).
print(type(tracts_bridge))  # should be GeoDataFrame
print("Rows:", len(tracts_bridge))
for label, column in (
    ("Missing CDC SVI:", "svi_overall_pctile"),
    ("Missing NRI:", "nri_score"),
):
    if column in tracts_bridge.columns:
        missing_share = tracts_bridge[column].isna().mean()
    else:
        missing_share = "no col"
    print(label, missing_share)

<class 'geopandas.geodataframe.GeoDataFrame'>
Rows: 1526
Missing CDC SVI: 0.015727391874180863
Missing NRI: 0.00327653997378768


In [36]:
# Load the flood hazard polygons as a GeoDataFrame.
# NOTE(review): "NFHL" presumably refers to FEMA's National Flood Hazard Layer —
# confirm against the data source. Later cells read its FLD_ZONE and geometry columns.
nfhl = gpd.read_parquet("/data/ARCGISFloodHazardSouthFlorida_dataset CLEANED.parquet")

In [37]:
# Reproject the tract layer into the flood layer's coordinate reference system so
# both layers share one CRS before they are spatially joined.
flood_crs = nfhl.crs
tracts_bridge = tracts_bridge.to_crs(flood_crs)

In [38]:
# Spatially join tracts to flood polygons, carrying over only the zone label.
# This is a left join on the "intersects" predicate: every tract is kept, a tract
# that intersects several polygons appears once per polygon, and a tract that
# intersects none gets a missing FLD_ZONE.
flood_zones = nfhl[["FLD_ZONE", "geometry"]]
flood_join = gpd.sjoin(tracts_bridge, flood_zones, how="left", predicate="intersects")

In [39]:
# Collapse the one-row-per-(tract, polygon) join result to one boolean per tract:
# True when at least one joined row for that GEOID has a non-missing FLD_ZONE.
# NOTE(review): any non-missing zone label counts, whatever its value — confirm the
# flood layer contains only zones that should be treated as floodplain.
has_zone = flood_join["FLD_ZONE"].notna()
flagged_rows = flood_join.assign(in_floodplain=has_zone)
flood_flag = flagged_rows.groupby("GEOID", as_index=False)["in_floodplain"].max()

# Bring the flag back onto the tract table; tracts with no entry in flood_flag
# default to False.
tracts_final = tracts_bridge.merge(flood_flag, how="left", on="GEOID")
flag_filled = tracts_final["in_floodplain"].fillna(False)
tracts_final["in_floodplain"] = flag_filled

In [None]:
# Replace NaN with 0 in numeric attribute columns only.
# A blanket fillna(0) over the whole GeoDataFrame also targets text columns (writing
# the integer 0 among strings) and the geometry column; restricting the fill to
# numeric dtypes leaves those columns untouched. The result is assigned rather than
# applied with inplace=True so re-running the cell is harmless.
# NOTE(review): zero-filling score columns makes a missing score indistinguishable
# from a genuine 0 in the exported files — confirm this is intended downstream.
numeric_cols = tracts_final.select_dtypes(include="number").columns
tracts_final = tracts_final.fillna({col: 0 for col in numeric_cols})

In [40]:
# Export the final tract table twice: a CSV holding the attributes without the
# geometry column, and a GeoJSON file holding attributes plus geometry.
# NOTE(review): the GeoJSON is written in whatever CRS tracts_final currently has
# (set earlier from the flood layer) — confirm consumers expect that CRS.
attributes_only = tracts_final.drop(columns="geometry")
attributes_only.to_csv("/data/MASTER_dataset.csv", index=False)
tracts_final.to_file("/data/MASTER_dataset.geojson", driver="GeoJSON")