In [1]:
# Cell 1: Imports, config, helpers

import os
import time
import math
import pandas as pd
import numpy as np
import requests

# --- Configuration ---
STATE_FIPS = "24"      # Maryland
COUNTY_FIPS = "027"    # Howard County
ACS_YEAR = "2022"      # ACS 5-year vintage as string for URL

SAVE_DIR = "."
os.makedirs(SAVE_DIR, exist_ok=True)

# Optional: if you have a Census API key, put it here (or leave as None)
CENSUS_API_KEY = None  # e.g., "your_api_key_here"

BASE_ACS = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5"


def census_get(base_url, params, max_tries=5, sleep=1.0):
    """
    Robust wrapper around requests.get that returns parsed JSON (list-of-lists).
    Raises on repeated failure.
    """
    if CENSUS_API_KEY:
        # avoid overwriting user-supplied key
        params = dict(params)
        if "key" not in params:
            params["key"] = CENSUS_API_KEY

    for i in range(max_tries):
        r = requests.get(base_url, params=params)
        if r.status_code == 200:
            try:
                return r.json()
            except ValueError:
                # Not JSON (HTML error, etc.)
                print("Response not JSON, status:", r.status_code)
                print("First 200 chars of response:\n", r.text[:200])
                raise
        print(f"Request failed (status {r.status_code}), retry {i+1}/{max_tries}")
        time.sleep(sleep * (i + 1))

    r.raise_for_status()  # last response


def tidy_group_json(data, drop_cols=("state", "county", "tract", "block group")):
    """
    Convert Census API JSON (list-of-lists) to DataFrame.
    Keeps all columns; just coerces some to string.
    """
    cols = data[0]
    rows = data[1:]
    df = pd.DataFrame(rows, columns=cols)
    for c in drop_cols:
        if c in df.columns:
            df[c] = df[c].astype(str)
    return df


def make_bg_geoid(state, county, tract, block_group):
    return (
        str(state).zfill(2) +
        str(county).zfill(3) +
        str(tract).zfill(6) +
        str(block_group)
    )


def make_tract_geoid(state, county, tract):
    return (
        str(state).zfill(2) +
        str(county).zfill(3) +
        str(tract).zfill(6)
    )


In [2]:
# Cell 2: Children under 5 by Block Group (ACS B01001)

# B01001_003E = Male under 5 years
# B01001_027E = Female under 5 years
vars_under5 = ["B01001_003E", "B01001_027E"]

params = {
    "get": ",".join(["NAME"] + vars_under5),
    "for": "block group:*",
    "in": f"state:{STATE_FIPS} county:{COUNTY_FIPS}",
}

data = census_get(BASE_ACS, params=params)
bg = tidy_group_json(data, drop_cols=())

# Build geo_id and children_under5
bg["geo_id"] = bg.apply(
    lambda r: make_bg_geoid(r["state"], r["county"], r["tract"], r["block group"]),
    axis=1
)

bg["B01001_003E"] = pd.to_numeric(bg["B01001_003E"], errors="coerce").fillna(0).astype(int)
bg["B01001_027E"] = pd.to_numeric(bg["B01001_027E"], errors="coerce").fillna(0).astype(int)
bg["children_under5"] = bg["B01001_003E"] + bg["B01001_027E"]

bg_demog = bg[["geo_id", "NAME", "state", "county", "tract", "block group", "children_under5"]].copy()

print("Block group rows:", len(bg_demog))
bg_demog.head()


Block group rows: 166


Unnamed: 0,geo_id,NAME,state,county,tract,block group,children_under5
0,240276011031,Block Group 1; Census Tract 6011.03; Howard Co...,24,27,601103,1,276
1,240276011032,Block Group 2; Census Tract 6011.03; Howard Co...,24,27,601103,2,92
2,240276011033,Block Group 3; Census Tract 6011.03; Howard Co...,24,27,601103,3,124
3,240276011041,Block Group 1; Census Tract 6011.04; Howard Co...,24,27,601104,1,191
4,240276011042,Block Group 2; Census Tract 6011.04; Howard Co...,24,27,601104,2,45


In [3]:
# Cell 3: Fetch tract-level ACS groups

GROUPS = {
    "B23008": [],  # Families with own children under 6 by labor force status
    "B08006": [],  # Means of transportation to work (includes worked from home)
    "B08011": [],  # Time leaving home to go to work
    "B16001": [],  # Language spoken at home by ability to speak English
    "B17020": [],  # Poverty status in the past 12 months by age
    "B03002": [],  # Hispanic or Latino origin by race (for race/ethnicity)
}


def fetch_group(group):
    params = {
        "get": f"NAME,group({group})",
        "for": "tract:*",
        "in": f"state:{STATE_FIPS} county:{COUNTY_FIPS}",
    }
    data = census_get(BASE_ACS, params=params)
    df = tidy_group_json(data, drop_cols=())

    # Rebuild tract GEOID (state+county+tract)
    df["tract_geoid"] = df.apply(
        lambda r: make_tract_geoid(r["state"], r["county"], r["tract"]),
        axis=1,
    )

    # Convert numeric columns in the group
    for c in df.columns:
        if c.startswith(group + "_"):
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df


tract_b23008 = fetch_group("B23008")
tract_b08006 = fetch_group("B08006")
tract_b08011 = fetch_group("B08011")
tract_b16001 = fetch_group("B16001")
tract_b17020 = fetch_group("B17020")
tract_b03002 = fetch_group("B03002")  # race/ethnicity

print("Tracts in B23008:", len(tract_b23008))
tract_b23008.head()


Tracts in B23008: 59


Unnamed: 0,NAME,B23008_001E,B23008_001EA,B23008_001M,B23008_001MA,B23008_002E,B23008_002EA,B23008_002M,B23008_002MA,B23008_003E,...,B23008_027E,B23008_027EA,B23008_027M,B23008_027MA,GEO_ID,NAME.1,state,county,tract,tract_geoid
0,Census Tract 6011.03; Howard County; Maryland,1435,,270,,528,,217,,433,...,50,,76,,1400000US24027601103,Census Tract 6011.03; Howard County; Maryland,24,27,601103,24027601103
1,Census Tract 6011.04; Howard County; Maryland,1654,,180,,405,,164,,395,...,14,,20,,1400000US24027601104,Census Tract 6011.04; Howard County; Maryland,24,27,601104,24027601104
2,Census Tract 6011.05; Howard County; Maryland,1307,,290,,360,,133,,349,...,0,,19,,1400000US24027601105,Census Tract 6011.05; Howard County; Maryland,24,27,601105,24027601105
3,Census Tract 6011.07; Howard County; Maryland,1093,,257,,331,,153,,297,...,0,,19,,1400000US24027601107,Census Tract 6011.07; Howard County; Maryland,24,27,601107,24027601107
4,Census Tract 6011.08; Howard County; Maryland,804,,131,,197,,83,,190,...,0,,13,,1400000US24027601108,Census Tract 6011.08; Howard County; Maryland,24,27,601108,24027601108


In [4]:
# Cell 4: Build tract-level indicator columns (parents, remote, hours, language, poverty, race/ethnicity)

# 1) B23008 — All parents in labor force, own children under 6
def build_b23008_share(df):
    tmp = df.copy()
    # Numerator: "All parents in labor force, own children under 6"
    num = (
        tmp.get("B23008_006E", 0) +  # Married-couple, all parents in LF, own children <6
        tmp.get("B23008_021E", 0) +  # Male HOH, in LF, kids <6
        tmp.get("B23008_028E", 0)    # Female HOH, in LF, kids <6
    )
    # Denominator: all families w/ own children under 6
    den = (
        tmp.get("B23008_005E", 0) +  # Married-couple, own children <6
        tmp.get("B23008_020E", 0) +  # Male HOH, kids <6
        tmp.get("B23008_027E", 0)    # Female HOH, kids <6
    ).replace({0: np.nan})

    share = (num / den).clip(0, 1)
    out = df[["tract_geoid"]].copy()
    out["pct_all_parents_work"] = share.fillna(0.0)
    return out


# 2) B08006 — Worked from home
def build_b08006_wfh(df):
    tmp = df.copy()
    total = tmp.get("B08006_001E", 0).replace({0: np.nan})
    wfh = tmp.get("B08006_017E", 0)  # Worked from home
    share = (wfh / total).clip(0, 1)
    out = df[["tract_geoid"]].copy()
    out["pct_remote_work"] = share.fillna(0.0)
    return out


# 3) B08011 — Non-traditional hours (depart before 6:30am)
def build_b08011_nontrad(df):
    tmp = df.copy()
    total = tmp.get("B08011_001E", 0).replace({0: np.nan})
    early_bins = (
        tmp.get("B08011_002E", 0) +  # 12:00–4:59 a.m.
        tmp.get("B08011_003E", 0) +  # 5:00–5:29 a.m.
        tmp.get("B08011_004E", 0) +  # 5:30–5:59 a.m.
        tmp.get("B08011_005E", 0)    # 6:00–6:29 a.m.
    )
    share = (early_bins / total).clip(0, 1)
    out = df[["tract_geoid"]].copy()
    out["pct_non_traditional_hours"] = share.fillna(0.0)
    return out


# 4) B16001 — Language & English ability: % speaking English less than "very well"
def build_b16001_language(df):
    tmp = df.copy()
    total = tmp.get("B16001_001E", 0).replace({0: np.nan})

    # Approximation:
    # We sum all "Speak English less than 'very well'" lines across language groups.
    # Those lines are every 5th category in each language block; here we approximate
    # by summing every column whose code ends with "007E", "012E", etc.
    limited_cols = [c for c in tmp.columns if c.startswith("B16001_") and c.endswith("E") and c != "B16001_001E"]

    # This will slightly overcount/undercount vs exact ACS layout,
    # but is a reasonable proxy for "limited English at home".
    limited = tmp[limited_cols].sum(axis=1, min_count=1)

    share = (limited / total).clip(0, 1)
    out = df[["tract_geoid"]].copy()
    out["pct_non_english_home"] = share.fillna(0.0)
    return out


# 5) B17020 — Poverty
def build_b17020_poverty(df):
    tmp = df.copy()
    total = tmp.get("B17020_001E", 0).replace({0: np.nan})
    # all persons below poverty (lines 2–17)
    below_poverty = sum(
        tmp.get(f"B17020_{str(i).zfill(3)}E", 0)
        for i in range(2, 18)
    )
    share = (below_poverty / total).clip(0, 1)
    out = df[["tract_geoid"]].copy()
    out["poverty_level_ratio"] = share.fillna(0.0)
    return out


# 6) B03002 — Race & Ethnicity
def build_b03002_race_ethnicity(df):
    tmp = df.copy()
    total = tmp.get("B03002_001E", 0).replace({0: np.nan})

    hispanic = tmp.get("B03002_012E", 0)    # Hispanic or Latino (any race)
    white_nh = tmp.get("B03002_003E", 0)    # White alone, not Hispanic
    black    = tmp.get("B03002_004E", 0)    # Black or African American alone
    asian    = tmp.get("B03002_006E", 0)    # Asian alone

    other = (total - (hispanic + white_nh + black + asian)).clip(lower=0)

    out = df[["tract_geoid"]].copy()
    out["pct_hispanic"]           = (hispanic / total).clip(0, 1).fillna(0.0)
    out["pct_white_non_hispanic"] = (white_nh / total).clip(0, 1).fillna(0.0)
    out["pct_black"]              = (black / total).clip(0, 1).fillna(0.0)
    out["pct_asian"]              = (asian / total).clip(0, 1).fillna(0.0)
    out["pct_other_race"]         = (other / total).clip(0, 1).fillna(0.0)
    return out


# Build tract_feats combining all of the above
tract_feats = (
    build_b23008_share(tract_b23008)
    .merge(build_b08006_wfh(tract_b08006), on="tract_geoid", how="outer")
    .merge(build_b08011_nontrad(tract_b08011), on="tract_geoid", how="outer")
    .merge(build_b16001_language(tract_b16001), on="tract_geoid", how="outer")
    .merge(build_b17020_poverty(tract_b17020), on="tract_geoid", how="outer")
    .merge(build_b03002_race_ethnicity(tract_b03002), on="tract_geoid", how="outer")  # NEW race/eth
)

tract_feats.head()


Unnamed: 0,tract_geoid,pct_all_parents_work,pct_remote_work,pct_non_traditional_hours,pct_non_english_home,poverty_level_ratio,pct_hispanic,pct_white_non_hispanic,pct_black,pct_asian,pct_other_race
0,24027601103,1.0,0.167402,0.172905,0.0,1.0,0.041441,0.563092,0.089986,0.253552,0.051928
1,24027601104,1.0,0.214629,0.10673,0.0,1.0,0.044139,0.63068,0.076845,0.171056,0.077279
2,24027601105,1.0,0.234887,0.120576,0.0,1.0,0.039139,0.561491,0.136709,0.168614,0.094046
3,24027601107,1.0,0.150805,0.255172,0.0,1.0,0.396771,0.281102,0.092688,0.175309,0.054131
4,24027601108,1.0,0.153036,0.092256,0.0,1.0,0.14966,0.475144,0.144689,0.196232,0.034275


In [5]:
# Cell 5: Income & subsidy eligibility (B19013, C17002)

# Median household income
tract_income = fetch_group("B19013")

# Income-to-poverty ratio buckets (C17002)
tract_c17002 = fetch_group("C17002")

# B19013_001E = Median household income in the past 12 months
tmp_inc = tract_income.copy()
tmp_inc["median_household_income"] = (
    tmp_inc.get("B19013_001E", 0)
)

tract_income_feats = tmp_inc[["tract_geoid", "median_household_income"]].copy()
tract_income_feats["median_household_income"] = pd.to_numeric(
    tract_income_feats["median_household_income"],
    errors="coerce"
)

# C17002: Income to poverty ratio buckets
tmp_pov = tract_c17002.copy()
total = tmp_pov.get("C17002_001E", 0).replace({0: np.nan})

# Persons with income-to-poverty ratio <= 2.0 (approx subsidy eligible)
le_200 = (
    tmp_pov.get("C17002_002E", 0) +  # 0.00 to 0.49
    tmp_pov.get("C17002_003E", 0) +  # 0.50 to 0.99
    tmp_pov.get("C17002_004E", 0) +  # 1.00 to 1.24
    tmp_pov.get("C17002_005E", 0) +  # 1.25 to 1.49
    tmp_pov.get("C17002_006E", 0)    # 1.50 to 1.99
)
pct_le_200 = (le_200 / total).clip(0, 1)

tract_income_feats["poverty_level_ratio"] = pct_le_200.fillna(0.0)
tract_income_feats["pct_eligible_subsidy"] = tract_income_feats["poverty_level_ratio"]

tract_income_feats.head()


Unnamed: 0,tract_geoid,median_household_income,poverty_level_ratio,pct_eligible_subsidy
0,24027601103,170794,0.048884,0.048884
1,24027601104,199827,0.051085,0.051085
2,24027601105,152292,0.071216,0.071216
3,24027601107,84891,0.299582,0.299582
4,24027601108,136094,0.140764,0.140764


In [7]:
# === FIXED CELL 6 ===
# Apportion tract features to block groups AND avoid KeyErrors for missing columns

# 1) Create tract_geoid and weights based on children_under5
bg_w = bg_demog.copy()
bg_w["tract_geoid"] = bg_w.apply(
    lambda r: make_tract_geoid(r["state"], r["county"], r["tract"]),
    axis=1
)

tract_tot = (
    bg_w.groupby("tract_geoid", as_index=False)["children_under5"]
        .sum()
        .rename(columns={"children_under5": "tract_children_under5"})
)

bg_w = bg_w.merge(tract_tot, on="tract_geoid", how="left")
bg_w["w"] = (
    bg_w["children_under5"] /
    bg_w["tract_children_under5"].replace({0: np.nan})
).fillna(0.0)

# 2) FULL TRACT FEATURES
tract_feats_full = tract_feats.merge(tract_income_feats, on="tract_geoid", how="left")

# --- IMPORTANT FIX: Ensure all needed columns exist ---
needed_cols = [
    "pct_all_parents_work",
    "pct_remote_work",
    "pct_non_traditional_hours",
    "pct_non_english_home",
    "poverty_level_ratio",
    "pct_eligible_subsidy",
    "median_household_income",
    "pct_hispanic",
    "pct_white_non_hispanic",
    "pct_black",
    "pct_asian",
    "pct_other_race",
]

for col in needed_cols:
    if col not in tract_feats_full.columns:
        print(f"[WARNING] Missing tract column {col} → filling with 0")
        tract_feats_full[col] = 0.0

# 3) Apportion to block groups
bg_app = bg_w[["geo_id", "tract_geoid", "w", "NAME", "children_under5"]] \
            .merge(tract_feats_full, on="tract_geoid", how="left")

# 4) Apply weights column-wise safely
for col in needed_cols:
    bg_app[col] = bg_app[col].astype(float) * bg_app["w"]

# 5) Aggregate by block group
agg_dict = {"NAME": "first", "children_under5": "sum"}
for col in needed_cols:
    agg_dict[col] = "sum"

bg_final = bg_app.groupby("geo_id", as_index=False).agg(agg_dict)

# 6) Parse county name
bg_final["county_name"] = bg_final["NAME"].str.replace(",", ";")

# 7) Split children into under 2 and 2–5
share_0_2 = 0.45
bg_final["children_total_under_5"] = bg_final["children_under5"].astype(int)
bg_final["children_under_2"] = (bg_final["children_total_under_5"] * share_0_2).round().astype(int)
bg_final["children_2_to_5"] = (
    bg_final["children_total_under_5"] - bg_final["children_under_2"]
).astype(int)

# 8) Build final schema
demographics_block_schema = bg_final.rename(columns={
    "pct_non_traditional_hours": "pct_parents_nontraditional_hours"
})[
    [
        "geo_id",
        "county_name",
        "children_under_2",
        "children_2_to_5",
        "children_total_under_5",
        "pct_non_english_home",
        "pct_all_parents_work",
        "pct_parents_nontraditional_hours",
        "pct_remote_work",
        "pct_eligible_subsidy",
        "median_household_income",
        "poverty_level_ratio",
        "pct_hispanic",
        "pct_white_non_hispanic",
        "pct_black",
        "pct_asian",
        "pct_other_race",
    ]
].copy()

demographics_block_schema["data_year"] = int(ACS_YEAR)

# Save
out_csv = os.path.join(SAVE_DIR, f"Howard_demographics_block_{ACS_YEAR}_with_race_ethnicity.csv")
demographics_block_schema.to_csv(out_csv, index=False)

print("Wrote:", out_csv)
demographics_block_schema.head()


Wrote: .\Howard_demographics_block_2022_with_race_ethnicity.csv


Unnamed: 0,geo_id,county_name,children_under_2,children_2_to_5,children_total_under_5,pct_non_english_home,pct_all_parents_work,pct_parents_nontraditional_hours,pct_remote_work,pct_eligible_subsidy,median_household_income,poverty_level_ratio,pct_hispanic,pct_white_non_hispanic,pct_black,pct_asian,pct_other_race,data_year
0,240276011031,Block Group 1; Census Tract 6011.03; Howard Co...,124,152,276,0.0,0.560976,0.096995,0.093908,0.027423,95811.268293,0.0,0.023247,0.315881,0.05048,0.142237,0.02913,2022
1,240276011032,Block Group 2; Census Tract 6011.03; Howard Co...,41,51,92,0.0,0.186992,0.032332,0.031303,0.009141,31937.089431,0.0,0.007749,0.105294,0.016827,0.047412,0.00971,2022
2,240276011033,Block Group 3; Census Tract 6011.03; Howard Co...,56,68,124,0.0,0.252033,0.043578,0.042191,0.01232,43045.642276,0.0,0.010445,0.141918,0.02268,0.063903,0.013088,2022
3,240276011041,Block Group 1; Census Tract 6011.04; Howard Co...,86,105,191,0.0,0.809322,0.086379,0.173704,0.041345,161724.394068,0.0,0.035723,0.510423,0.062192,0.13844,0.062544,2022
4,240276011042,Block Group 2; Census Tract 6011.04; Howard Co...,20,25,45,0.0,0.190678,0.020351,0.040925,0.009741,38102.605932,0.0,0.008416,0.120257,0.014653,0.032617,0.014735,2022
