In [28]:
import os

import pandas as pd
import requests

# Census API key, read from the CENSUS_API_KEY environment variable so the
# secret is never stored in the notebook itself. When the variable is unset
# this is None; requests omits None-valued entries from the query string, so
# the request is then sent without a `key` parameter.
API_KEY = os.environ.get("CENSUS_API_KEY")

# Base URL for ACS 5-Year Estimates (2022)
BASE_URL = "https://api.census.gov/data/2022/acs/acs5"

# ACS variable codes to request (one column per code in the response)
variables = [
    "B19013_001E",  # Median household income
    "B01003_001E",  # Total population
    "B11001_001E",  # Total households
    "B25010_001E",  # Average household size
]

# South Florida counties, keyed by readable name, with their 3-digit county
# FIPS codes kept as strings so the leading zeros survive.
counties = {
    "Miami-Dade": "086",
    "Broward": "011",
    "Palm Beach": "099",
    "Monroe": "087",
}

# Upper bound (in seconds) on how long a single API call may block.
# requests applies no timeout unless one is passed, so without this a stalled
# connection would hang the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# One DataFrame per county, combined after the loop
all_counties = []

for county_name, county_fips in counties.items():

    # Query parameters: every tract ("tract:*") within state 12 and this county
    params = {
        "get": ",".join(variables),
        "for": "tract:*",
        "in": f"state:12 county:{county_fips}",
        "key": API_KEY,
    }

    # Send request to Census API; raise on any HTTP error status
    response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    # The JSON payload is a list of rows:
    # - first row contains column names
    # - remaining rows contain data
    data = response.json()
    temp_df = pd.DataFrame(data[1:], columns=data[0])

    # Add readable county name
    temp_df["county_name"] = county_name

    # Create GEOID by concatenating state + county + tract.
    # This key will be used to join with:
    # - FEMA National Risk Index
    # - CDC Social Vulnerability Index
    # - Census shapefiles
    temp_df["GEOID"] = temp_df["state"] + temp_df["county"] + temp_df["tract"]

    all_counties.append(temp_df)

# Combine all county DataFrames into one frame with a fresh 0..n-1 index
acs_sf = pd.concat(all_counties, ignore_index=True)

In [29]:
# Map raw ACS variable codes to readable column names
ACS_COLUMN_NAMES = {
    "B19013_001E": "acs_median_hh_income",
    "B01003_001E": "acs_population",
    "B11001_001E": "acs_households",
    "B25010_001E": "acs_avg_hh_size",
}

# Rename first, then convert. DataFrame.rename ignores labels that are not
# present, so re-running this cell after the columns have already been renamed
# is a no-op instead of a KeyError on the old ACS codes.
acs_sf = acs_sf.rename(columns=ACS_COLUMN_NAMES)

# Convert numeric fields from strings to numbers.
# Values that cannot be parsed are coerced to NaN.
for col in ACS_COLUMN_NAMES.values():
    acs_sf[col] = pd.to_numeric(acs_sf[col], errors="coerce")

# Basic data validation
acs_sf.describe()

Unnamed: 0,acs_median_hh_income,acs_population,acs_households,acs_avg_hh_size
count,1526.0,1526.0,1526.0,1526.0
mean,-15214340.0,4066.836828,1519.420052,-10484930.0
std,99843600.0,1800.882076,629.017472,82973070.0
min,-666666700.0,0.0,0.0,-666666700.0
25%,50196.75,2863.0,1084.0,2.18
50%,69710.5,3923.5,1485.5,2.68
75%,94958.75,5136.75,1888.0,3.1
max,250001.0,23706.0,6561.0,4.77


In [30]:
# Replace known sentinel codes with NaN
import numpy as np

# Large negative placeholder codes that are treated as missing values
sentinels = [-666666666, -666666667, -999999999]

numeric_cols = (
    "acs_median_hh_income",
    "acs_population",
    "acs_households",
    "acs_avg_hh_size",
)

# Swap every placeholder code for NaN, one column at a time
for col in numeric_cols:
    without_sentinels = acs_sf[col].replace(sentinels, np.nan)
    acs_sf[col] = without_sentinels

In [31]:
# Income, population, households and average household size must all be
# strictly positive; any zero or negative entry is blanked out to NaN.
for positive_col in (
    "acs_median_hh_income",
    "acs_population",
    "acs_households",
    "acs_avg_hh_size",
):
    is_non_positive = acs_sf[positive_col] <= 0
    acs_sf.loc[is_non_positive, positive_col] = np.nan

In [32]:
# Number of rows (tracts) per county, to confirm every county is present
acs_sf.loc[:, "county_name"].value_counts()

county_name
Miami-Dade    707
Broward       417
Palm Beach    373
Monroe         29
Name: count, dtype: int64

In [33]:
# Distribution of GEOID string lengths (every value should be 11 characters)
geoid_lengths = acs_sf["GEOID"].str.len()
geoid_lengths.value_counts()

GEOID
11    1526
Name: count, dtype: int64

In [35]:
# Share of missing values in each cleaned numeric column
checked_cols = [
    "acs_median_hh_income",
    "acs_population",
    "acs_households",
    "acs_avg_hh_size",
]
acs_sf[checked_cols].isna().mean()

acs_median_hh_income    0.022936
acs_population          0.013106
acs_households          0.015727
acs_avg_hh_size         0.015727
dtype: float64

In [36]:
# Summary statistics after cleaning (quartiles spelled out; same as the default)
acs_sf.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,acs_median_hh_income,acs_population,acs_households,acs_avg_hh_size
count,1491.0,1506.0,1502.0,1502.0
mean,77972.244802,4120.845286,1543.698402,2.676458
std,37691.587164,1750.304706,603.726719,0.617109
min,16441.0,1.0,4.0,1.16
25%,51849.5,2914.25,1098.25,2.2
50%,70461.0,3953.0,1498.5,2.7
75%,95589.0,5146.0,1894.75,3.11
max,250001.0,23706.0,6561.0,4.77


In [37]:
# Preview the first five rows of the cleaned frame
acs_sf.head(n=5)

Unnamed: 0,acs_median_hh_income,acs_population,acs_households,acs_avg_hh_size,state,county,tract,county_name,GEOID
0,54811.0,3013.0,1584.0,1.9,12,86,107,Miami-Dade,12086000107
1,55179.0,3187.0,1283.0,2.45,12,86,109,Miami-Dade,12086000109
2,97847.0,1788.0,746.0,2.4,12,86,115,Miami-Dade,12086000115
3,98824.0,1208.0,492.0,2.46,12,86,118,Miami-Dade,12086000118
4,73939.0,4175.0,1988.0,2.1,12,86,120,Miami-Dade,12086000120


In [38]:
# Write the cleaned frame to disk in both Parquet and CSV form, without the index
parquet_path = "/data/CensusSouthFlorida_dataset CLEANED.parquet"
csv_path = "/data/CensusSouthFlorida_dataset CLEANED.csv"

acs_sf.to_parquet(parquet_path, index=False)
acs_sf.to_csv(csv_path, index=False)