
# Howard County Childcare Demand: Data Collection (2022)

This notebook collects **demand-side inputs** for a childcare supply‚Äìdemand analysis focused on **Howard County, Maryland**:

- **Step 1 (ACS, 2022)**: Children counts for **0‚Äì2**, **3‚Äì5**, and **Under-5 total** using **ACS B09001**.
- **Step 2 (LODES8 OD, 2022)**: Commuter flows using **`md_od_aux_JT00_2022.csv.gz`** (all jobs), summarized into **inflow / outflow / internal / net** counts and county‚Üîcounty OD totals.


In [42]:

# --- Imports and Configuration ---
import os, io, gzip, requests, pandas as pd

# ---- Year pins (set both to 2022) ----
ACS_YEAR   = 2022   # ACS 5-year vintage (2018‚Äì2022)
LODES_YEAR = 2022   # LODES8 OD year

# ---- Output directory ----
OUT_DIR    = "./outputs_2022"    # where to save CSV outputs
os.makedirs(OUT_DIR, exist_ok=True)

# ---- Toggle for Step 2 if you want to skip LODES (e.g., offline) ----
SKIP_LODES = False

# ---- Optional: Census API key (speeds up ACS requests) ----
CENSUS_API_KEY = os.getenv("CENSUS_API_KEY")  # set in your environment if available

# ---- FIPS codes ----
STATE_FIPS  = "24"        # Maryland
HOWARD_FIPS = "027"       # Howard County
HOWARD_FP5  = STATE_FIPS + HOWARD_FIPS  # '24027'

# ---- ACS (B09001) API base ----
ACS_BASE = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5"
# B09001: Population Under 18 Years in Households by Age
ACS_VARS = {
    "under3": "B09001_003E",   # ages 0-2
    "3and4":  "B09001_004E",   # ages 3-4
    "age5":   "B09001_005E"    # age 5
}

# ---- LODES OD (LODES8, Maryland, aux JT00) ----
# Remote URL (direct download). If offline, set LODES_LOCAL_FILE to a local path instead.
LODES_URL = f"https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT00_{LODES_YEAR}.csv.gz"
LODES_LOCAL_FILE = None  # e.g., r"C:\path\to\md_od_aux_JT00_2022.csv.gz" or a .csv extracted from it

# LODES column layout (no header in file)
LODES_COLS = [
    "h_geocode","w_geocode","S000",
    "SA01","SA02","SA03",
    "SE01","SE02","SE03",
    "SI01","SI02","SI03"
]


In [43]:

# --- Helper Functions ---

def _safe_int(x):
    """Convert Census API values to int safely.
    Handles strings, floats, None, and 'null' by returning 0 for invalid values.
    """
    try:
        # Common case: API returns numeric strings like '1234'
        return int(float(x))
    except (TypeError, ValueError):
        # Try pandas to_numeric for robustness
        v = pd.to_numeric(x, errors="coerce")
        if pd.isna(v):
            return 0
        try:
            return int(v)
        except Exception:
            return 0

def fetch_acs_under5():
    """Fetches ACS B09001 for Howard County (2022) and returns a one-row DataFrame with:
       children_0_2, children_3_5, children_under_5.
       Writes CSV to OUT_DIR.
    """
    params = {
        "get": ",".join(["NAME"] + list(ACS_VARS.values())),
        "for": f"county:{HOWARD_FIPS}",
        "in":  f"state:{STATE_FIPS}"
    }
    if CENSUS_API_KEY:
        params["key"] = CENSUS_API_KEY

    resp = requests.get(ACS_BASE, params=params, timeout=60)
    resp.raise_for_status()
    data = resp.json()
    hdr, row = data[0], data[1]
    rec = dict(zip(hdr, row))

    u3  = _safe_int(rec[ACS_VARS["under3"]])
    a34 = _safe_int(rec[ACS_VARS["3and4"]])
    a5  = _safe_int(rec[ACS_VARS["age5"]])

    out = pd.DataFrame([{
        "acs_year": ACS_YEAR,
        "county_name": rec["NAME"],
        "children_0_2": u3,
        "children_3_5": a34 + a5,
        "children_under_5": u3 + a34 + a5
    }])
    out_path = f"{OUT_DIR}/acs_b09001_howard_{ACS_YEAR}.csv"
    out.to_csv(out_path, index=False)
    print(f"[ACS] Wrote: {out_path}")
    return out

def read_lodes_df_or_chunks(chunksize: int | None = None):
    """Reads LODES OD (aux JT00, 2022) into a DataFrame or an iterator of chunks.
       If LODES_LOCAL_FILE is set, reads from the local file instead of URL.
    """
    # Prefer local file if provided
    if LODES_LOCAL_FILE:
        src = LODES_LOCAL_FILE
        print(f"[LODES] Reading from local file: {src}")
        if str(src).endswith('.gz'):
            with gzip.open(src, "rb") as gz:
                if chunksize is None:
                    return pd.read_csv(gz, header=None, names=LODES_COLS,
                                       dtype={"h_geocode": str, "w_geocode": str})
                else:
                    return pd.read_csv(gz, header=None, names=LODES_COLS,
                                       dtype={"h_geocode": str, "w_geocode": str},
                                       chunksize=chunksize)
        else:
            # plain CSV
            if chunksize is None:
                return pd.read_csv(src, header=None, names=LODES_COLS,
                                   dtype={"h_geocode": str, "w_geocode": str})
            else:
                return pd.read_csv(src, header=None, names=LODES_COLS,
                                   dtype={"h_geocode": str, "w_geocode": str},
                                   chunksize=chunksize)

    # Otherwise, download from URL
    print(f"[LODES] Downloading: {LODES_URL}")
    resp = requests.get(LODES_URL, stream=True, timeout=600)
    resp.raise_for_status()
    raw = io.BytesIO(resp.content)
    if chunksize is None:
        with gzip.open(raw, "rb") as gz:
            return pd.read_csv(gz, header=None, names=LODES_COLS,
                               dtype={"h_geocode": str, "w_geocode": str})
    else:
        gz = gzip.open(raw, "rb")
        return pd.read_csv(gz, header=None, names=LODES_COLS,
                           dtype={"h_geocode": str, "w_geocode": str},
                           chunksize=chunksize)

def _coerce_numeric(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)
    return df

def summarize_commute_full(df: pd.DataFrame):
    df = df.copy()
    df = _coerce_numeric(df, LODES_COLS[2:])  # S000 & breakdowns

    in_h = df["h_geocode"].str.startswith(HOWARD_FP5)
    in_w = df["w_geocode"].str.startswith(HOWARD_FP5)

    internal = int(df.loc[in_h & in_w, "S000"].sum())
    outflow  = int(df.loc[in_h & ~in_w, "S000"].sum())
    inflow   = int(df.loc[~in_h & in_w, "S000"].sum())
    net      = inflow - outflow

    summary = pd.DataFrame([{
        "internal_jobs": internal,
        "outflow_jobs": outflow,
        "inflow_jobs": inflow,
        "net_inflow_jobs": net
    }])

    # County‚ÜîCounty OD
    df["home_county"] = df["h_geocode"].str.slice(0,5)
    df["work_county"] = df["w_geocode"].str.slice(0,5)
    od_all = (df.groupby(["home_county","work_county"], as_index=False)["S000"]
                .sum().sort_values("S000", ascending=False))

    # Convenience subsets
    od_from_howard = od_all[od_all["home_county"] == HOWARD_FP5].reset_index(drop=True)
    od_to_howard   = od_all[od_all["work_county"] == HOWARD_FP5].reset_index(drop=True)

    return summary, od_all, od_from_howard, od_to_howard

def summarize_commute_chunked(chunks_iter):
    internal = outflow = inflow = 0
    parts = []

    for ch in chunks_iter:
        ch = _coerce_numeric(ch, LODES_COLS[2:])
        in_h = ch["h_geocode"].str.startswith(HOWARD_FP5)
        in_w = ch["w_geocode"].str.startswith(HOWARD_FP5)

        internal += int(ch.loc[in_h & in_w, "S000"].sum())
        outflow  += int(ch.loc[in_h & ~in_w, "S000"].sum())
        inflow   += int(ch.loc[~in_h & in_w, "S000"].sum())

        tmp = ch[["h_geocode","w_geocode","S000"]].copy()
        tmp["home_county"] = tmp["h_geocode"].str.slice(0,5)
        tmp["work_county"] = tmp["w_geocode"].str.slice(0,5)
        parts.append(tmp[["home_county","work_county","S000"]])

    od_all = (pd.concat(parts, ignore_index=True)
                .groupby(["home_county","work_county"], as_index=False)["S000"].sum()
                .sort_values("S000", ascending=False))

    summary = pd.DataFrame([{
        "internal_jobs": internal,
        "outflow_jobs": outflow,
        "inflow_jobs": inflow,
        "net_inflow_jobs": inflow - outflow
    }])

    od_from_howard = od_all[od_all["home_county"] == HOWARD_FP5].reset_index(drop=True)
    od_to_howard   = od_all[od_all["work_county"] == HOWARD_FP5].reset_index(drop=True)

    return summary, od_all, od_from_howard, od_to_howard


## Step 1 ‚Äî ACS (B09001, 2022): Children 0‚Äì2, 3‚Äì5, Under-5

In [44]:

acs_df = fetch_acs_under5()
print("ACS Under-5 (Howard, 2022):")
display(acs_df)


[ACS] Wrote: ./outputs_2022/acs_b09001_howard_2022.csv
ACS Under-5 (Howard, 2022):


Unnamed: 0,acs_year,county_name,children_0_2,children_3_5,children_under_5
0,2022,"Howard County, Maryland",10536,12009,22545



## Step 2 ‚Äî LODES8 OD (aux JT00, 2022): Inflow / Outflow / Internal / Net

This step reads **`md_od_aux_JT00_2022.csv.gz`** (all jobs) and computes:
- `internal_jobs` (live & work in Howard),
- `outflow_jobs` (live in Howard, work elsewhere),
- `inflow_jobs` (live elsewhere, work in Howard),
- `net_inflow_jobs` (inflow ‚àí outflow), and
- county‚Üîcounty OD totals.

If you encounter memory constraints, you can process the file in chunks by setting `CHUNKSIZE` to a value like `1_000_000` (1 million rows per chunk).


In [45]:

# Set CHUNKSIZE to None (read all at once) or to an integer for chunked processing.
CHUNKSIZE = None   # e.g., 1_000_000 for low-RAM machines

if not SKIP_LODES:
    data = read_lodes_df_or_chunks(chunksize=CHUNKSIZE)
    if isinstance(data, pd.DataFrame):
        summary, od_all, od_from_howard, od_to_howard = summarize_commute_full(data)
    else:
        summary, od_all, od_from_howard, od_to_howard = summarize_commute_chunked(data)

    # Save outputs
    summary.to_csv(f"{OUT_DIR}/lodes_commute_summary_{LODES_YEAR}.csv", index=False)
    od_all.to_csv(f"{OUT_DIR}/lodes_od_county_all_{LODES_YEAR}.csv", index=False)
    od_from_howard.to_csv(f"{OUT_DIR}/lodes_od_from_howard_{LODES_YEAR}.csv", index=False)
    od_to_howard.to_csv(f"{OUT_DIR}/lodes_od_to_howard_{LODES_YEAR}.csv", index=False)

    print("LODES commute summary (S000 jobs):")
    display(summary)

    print("Top OD flows FROM Howard (by county):")
    display(od_from_howard.head(10))

    print("Top OD flows TO Howard (by county):")
    display(od_to_howard.head(10))
else:
    print("[LODES] Skipped. Set SKIP_LODES = False to run.")


[LODES] Downloading: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT00_2022.csv.gz


  return pd.read_csv(gz, header=None, names=LODES_COLS,


LODES commute summary (S000 jobs):


Unnamed: 0,internal_jobs,outflow_jobs,inflow_jobs,net_inflow_jobs
0,0,0,0,0


Top OD flows FROM Howard (by county):


Unnamed: 0,home_county,work_county,S000


Top OD flows TO Howard (by county):


Unnamed: 0,home_county,work_county,S000


# üß© Demographics Block Builder (ACS 5‚ÄëYear)
This section adds **demand-side** inputs needed for the Supply‚ÄìDemand study. It will fetch and assemble:
- Children under 5 by block group (ACS `B01001`)
- % All parents in labor force (ACS `B23008`) ‚Äî tract-level; apportioned to block groups
- % Worked from home (ACS `B08006`) ‚Äî tract-level; apportioned to block groups
- % Non‚Äëtraditional work hours (ACS `B08011`) ‚Äî tract-level; apportioned to block groups
- Language at home / English ability (ACS `B16001`) ‚Äî tract-level; apportioned to block groups
- Poverty / subsidy proxy (ACS `B17020`) ‚Äî tract-level; apportioned to block groups

> Notes
> - We standardize geo keys to **block group GEOID (12 chars)**.
> - Where ACS detail isn‚Äôt published at block‚Äëgroup, we **apportion tract metrics** down to block groups by population weights (children under 5 as default; fallback to total population if needed).
> - Put your **Census API key** in the `CENSUS_API_KEY` env var or paste inline below.


In [46]:
# --- Config & Imports (ACS Demographics Block) ---
import os
import io
import json
import time
import math
import textwrap
import pandas as pd
import requests

# ====== Set these ======
STATE_FIPS = "24"   # Maryland
COUNTY_FIPS = "027" # Howard County; change as needed
ACS_YEAR = "2022"   # 5-year release vintage
SAVE_DIR = "outputs"

# Optional: provide your Census API key
CENSUS_API_KEY = os.getenv("CENSUS_API_KEY", "")  # or paste here e.g., "YOUR_KEY"

os.makedirs(SAVE_DIR, exist_ok=True)

# Helper: block group & tract GEOID formatters
def make_bg_geoid(state, county, tract, block_group):
    # state (2), county (3), tract (6), block group (1) => 12-char GEOID
    tract6 = tract.zfill(6)
    bg1 = block_group.zfill(1)
    return f"{state}{county}{tract6}{bg1}"

def make_tract_geoid(state, county, tract):
    return f"{state}{county}{tract.zfill(6)}"

def census_get(url, params=None, retries=3, sleep=1.0):
    params = params or {}
    if CENSUS_API_KEY:
        params["key"] = CENSUS_API_KEY
    for i in range(retries):
        r = requests.get(url, params=params, timeout=60)
        if r.status_code == 200:
            return r.json()
        time.sleep(sleep * (i+1))
    r.raise_for_status()

def tidy_group_json(data, drop_cols=("state","county","tract","block group")):
    cols = data[0]
    rows = data[1:]
    df = pd.DataFrame(rows, columns=cols)
    for c in drop_cols:
        if c in df.columns:
            df[c] = df[c].astype(str)
    return df


In [47]:
# --- A) Children under 5 by Block Group (B01001) ---
# Variables: B01001_003E (M under 5), B01001_027E (F under 5)
BASE = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5"
vars_under5 = ["B01001_003E","B01001_027E"]

params = {
    "get": ",".join(["NAME"] + vars_under5),
    "for": "block group:*",
    "in": f"state:{STATE_FIPS} county:{COUNTY_FIPS}"
}
data = census_get(BASE, params=params)
bg = tidy_group_json(data)
bg["geo_id"] = bg.apply(lambda r: make_bg_geoid(r["state"], r["county"], r["tract"], r["block group"]), axis=1)
bg["children_under5"] = bg["B01001_003E"].astype(int) + bg["B01001_027E"].astype(int)
bg_demog = bg[["geo_id","NAME","children_under5"]].copy()

print("Block group rows:", len(bg_demog))
bg_demog.head()


Block group rows: 166


Unnamed: 0,geo_id,NAME,children_under5
0,240276011031,Block Group 1; Census Tract 6011.03; Howard Co...,264
1,240276011032,Block Group 2; Census Tract 6011.03; Howard Co...,84
2,240276011033,Block Group 3; Census Tract 6011.03; Howard Co...,178
3,240276011041,Block Group 1; Census Tract 6011.04; Howard Co...,221
4,240276011042,Block Group 2; Census Tract 6011.04; Howard Co...,46


In [48]:
# --- B) Tract-level tables we will apportion to block groups ---
GROUPS = {
    # All parents in labor force (proxy need care): use shares from B23008
    "B23008": [],
    # Worked from home share: B08006
    "B08006": [],
    # Departure time (non-traditional hours): B08011
    "B08011": [],
    # Language at home / English ability: B16001
    "B16001": [],
    # Poverty status by age: B17020
    "B17020": [],
}

def fetch_group(group):
    params = {
        "get": f"NAME,group({group})",
        "for": "tract:*",
        "in": f"state:{STATE_FIPS} county:{COUNTY_FIPS}",
    }
    data = census_get(BASE, params=params)
    df = tidy_group_json(data, drop_cols=("state","county","tract"))
    df["tract_geoid"] = df.apply(lambda r: make_tract_geoid(r["state"], r["county"], r["tract"]), axis=1)
    # Convert numeric columns
    for c in df.columns:
        if c.startswith(group + "_"):
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

tract_b23008 = fetch_group("B23008")
tract_b08006 = fetch_group("B08006")
tract_b08011 = fetch_group("B08011")
tract_b16001 = fetch_group("B16001")
tract_b17020 = fetch_group("B17020")

print("Tract rows:", len(tract_b23008))
tract_b23008.head(3)


Tract rows: 59


Unnamed: 0,NAME,B23008_001E,B23008_001EA,B23008_001M,B23008_001MA,B23008_002E,B23008_002EA,B23008_002M,B23008_002MA,B23008_003E,...,B23008_027E,B23008_027EA,B23008_027M,B23008_027MA,GEO_ID,NAME.1,state,county,tract,tract_geoid
0,Census Tract 6011.03; Howard County; Maryland,1618,,270,,564,,234,,521,...,42,,56,,1400000US24027601103,Census Tract 6011.03; Howard County; Maryland,24,27,601103,24027601103
1,Census Tract 6011.04; Howard County; Maryland,1648,,192,,394,,168,,388,...,8,,14,,1400000US24027601104,Census Tract 6011.04; Howard County; Maryland,24,27,601104,24027601104
2,Census Tract 6011.05; Howard County; Maryland,1280,,360,,421,,170,,392,...,0,,19,,1400000US24027601105,Census Tract 6011.05; Howard County; Maryland,24,27,601105,24027601105


In [49]:
# --- C) Build tract-level indicator columns ---
# Helper aggregations follow Census table shells.
import numpy as np

# 1) B23008 ‚Äî All parents in labor force, own children under 6
#   We approximate share of households with own children <6 where both parents in labor force.
#   See ACS table layout; here we compute a ratio using male/female-led families and married-couple families.
def build_b23008_share(df):
    cols = [c for c in df.columns if c.startswith("B23008_")]
    tmp = df[["tract_geoid"] + cols].copy()

    # Numerator: "All parents in labor force, own children under 6"
    # A conservative proxy using available lines across family types.
    num = (
        tmp.get("B23008_006E", 0) +  # Married-couple, all parents in labor force, own children under 6
        tmp.get("B23008_021E", 0) +  # Male HOH, in LF, kids <6
        tmp.get("B23008_036E", 0)    # Female HOH, in LF, kids <6
    )

    # Denominator: "Families with own children under 6"
    den = (
        tmp.get("B23008_003E", 0) +  # Married-couple, own children under 6
        tmp.get("B23008_018E", 0) +  # Male HOH, own kids <6
        tmp.get("B23008_033E", 0)    # Female HOH, own kids <6
    ).replace({0: np.nan})

    share = (num / den).clip(0,1)
    out = df[["tract_geoid"]].copy()
    out["pct_all_parents_work"] = share.fillna(0.0)
    return out

# 2) B08006 ‚Äî Worked from home (means of transportation to work)
def build_b08006_wfh(df):
    cols = [c for c in df.columns if c.startswith("B08006_")]
    tmp = df[["tract_geoid"] + cols].copy()
    total = tmp.get("B08006_001E", 0).replace({0: np.nan})
    wfh   = tmp.get("B08006_017E", 0)  # Worked from home
    share = (wfh / total).clip(0,1)
    out = df[["tract_geoid"]].copy()
    out["pct_remote_work"] = share.fillna(0.0)
    return out

# 3) B08011 ‚Äî Time leaving home to go to work
#   Non-traditional hours proxy: departures before 6:30am (bins 12:00‚Äì4:59 + 5:00‚Äì5:29 + 5:30‚Äì5:59 + 6:00‚Äì6:29)
def build_b08011_nontrad(df):
    tmp = df.copy()
    total = tmp.get("B08011_001E", 0).replace({0: np.nan})
    early_bins = (
        tmp.get("B08011_002E", 0) + # 12:00 a.m. to 4:59 a.m.
        tmp.get("B08011_003E", 0) + # 5:00 to 5:29 a.m.
        tmp.get("B08011_004E", 0) + # 5:30 to 5:59 a.m.
        tmp.get("B08011_005E", 0)   # 6:00 to 6:29 a.m.
    )
    share = (early_bins / total).clip(0,1)
    out = df[["tract_geoid"]].copy()
    out["pct_non_traditional_hours"] = share.fillna(0.0)
    return out

# 4) B16001 ‚Äî Language & English ability
#   Proxy: % persons (age 5+) in HH where no one speaks English "very well". We approximate using language counts.
def build_b16001_language(df):
    tmp = df.copy()
    total = tmp.get("B16001_001E", 0).replace({0: np.nan})
    # Approximation: sum of "Speak English less than 'very well'" across language groups.
    limited_cols = [c for c in tmp.columns if c.endswith("007E") or c.endswith("012E") or c.endswith("017E") or c.endswith("022E") or c.endswith("027E") or c.endswith("032E") or c.endswith("037E") or c.endswith("042E") or c.endswith("047E") or c.endswith("052E") or c.endswith("057E") or c.endswith("062E") or c.endswith("067E") or c.endswith("072E") or c.endswith("077E")]
    limited = tmp[limited_cols].sum(axis=1, min_count=1)
    share = (limited / total).clip(0,1)
    out = df[["tract_geoid"]].copy()
    out["pct_non_english_home"] = share.fillna(0.0)
    return out

# 5) B17020 ‚Äî Poverty (income-to-poverty thresholds)
#   Proxy: % persons under poverty. Use total persons below poverty over total universe in table.
def build_b17020_poverty(df):
    tmp = df.copy()
    total = tmp.get("B17020_001E", 0).replace({0: np.nan})
    below_poverty = (
        tmp.get("B17020_002E", 0) + tmp.get("B17020_003E", 0) + tmp.get("B17020_004E", 0) +
        tmp.get("B17020_005E", 0) + tmp.get("B17020_006E", 0) + tmp.get("B17020_007E", 0) +
        tmp.get("B17020_008E", 0) + tmp.get("B17020_009E", 0) + tmp.get("B17020_010E", 0) +
        tmp.get("B17020_011E", 0) + tmp.get("B17020_012E", 0) + tmp.get("B17020_013E", 0) +
        tmp.get("B17020_014E", 0) + tmp.get("B17020_015E", 0) + tmp.get("B17020_016E", 0) +
        tmp.get("B17020_017E", 0)
    )
    share = (below_poverty / total).clip(0,1)
    out = df[["tract_geoid"]].copy()
    out["pct_poverty"] = share.fillna(0.0)
    return out

tract_feats = (
    build_b23008_share(tract_b23008)
    .merge(build_b08006_wfh(tract_b08006), on="tract_geoid", how="outer")
    .merge(build_b08011_nontrad(tract_b08011), on="tract_geoid", how="outer")
    .merge(build_b16001_language(tract_b16001), on="tract_geoid", how="outer")
    .merge(build_b17020_poverty(tract_b17020), on="tract_geoid", how="outer")
)

tract_feats.head(3)


Unnamed: 0,tract_geoid,pct_all_parents_work,pct_remote_work,pct_non_traditional_hours,pct_non_english_home,pct_poverty
0,24027601103,0.273356,0.253573,0.188595,0.0,1.0
1,24027601104,0.205674,0.271357,0.1341,0.0,1.0
2,24027601105,0.221176,0.288391,0.122141,0.0,1.0


In [50]:
# --- D) Apportion tract metrics down to block groups ---
# Weighting by each block group's share of children_under5 within its tract.
bg_w = bg.copy()
bg_w["tract_geoid"] = bg_w.apply(lambda r: make_tract_geoid(r["state"], r["county"], r["tract"]), axis=1)
tract_tot = bg_w.groupby("tract_geoid", as_index=False)["children_under5"].sum().rename(columns={"children_under5":"tract_children_under5"})
bg_w = bg_w.merge(tract_tot, on="tract_geoid", how="left")
bg_w["w"] = (bg_w["children_under5"] / bg_w["tract_children_under5"].replace({0: pd.NA})).fillna(0.0)

# Join tract features and apportion
bg_apport = bg_w[["geo_id","tract_geoid","w"]].merge(tract_feats, on="tract_geoid", how="left")

for col in ["pct_all_parents_work","pct_remote_work","pct_non_traditional_hours","pct_non_english_home","pct_poverty"]:
    bg_apport[col] = bg_apport[col] * bg_apport["w"]

bg_agg = bg_apport.groupby("geo_id", as_index=False)[["pct_all_parents_work","pct_remote_work","pct_non_traditional_hours","pct_non_english_home","pct_poverty"]].sum()

demographics_block = (
    bg_demog
    .merge(bg_agg, on="geo_id", how="left")
    .assign(data_year=int(ACS_YEAR))
)

demographics_block.to_csv(f"{SAVE_DIR}/demographics_block_{COUNTY_FIPS}_{ACS_YEAR}.csv", index=False)
print("Wrote:", f"{SAVE_DIR}/demographics_block_{COUNTY_FIPS}_{ACS_YEAR}.csv")
demographics_block.head()


Wrote: outputs/demographics_block_027_2023.csv


Unnamed: 0,geo_id,NAME,children_under5,pct_all_parents_work,pct_remote_work,pct_non_traditional_hours,pct_non_english_home,pct_poverty,data_year
0,240276011031,Block Group 1; Census Tract 6011.03; Howard Co...,264,0.137198,0.127268,0.094656,0.0,0.501901,2023
1,240276011032,Block Group 2; Census Tract 6011.03; Howard Co...,84,0.043654,0.040494,0.030118,0.0,0.159696,2023
2,240276011033,Block Group 3; Census Tract 6011.03; Howard Co...,178,0.092505,0.08581,0.063821,0.0,0.338403,2023
3,240276011041,Block Group 1; Census Tract 6011.04; Howard Co...,221,0.170239,0.224606,0.110996,0.0,0.827715,2023
4,240276011042,Block Group 2; Census Tract 6011.04; Howard Co...,46,0.035434,0.046751,0.023103,0.0,0.172285,2023


**Next:**  
- If you have **LODES OD** already (this notebook does), you can build a daytime-adjusted demand table by shifting a share of children to job‚Äêsite block groups using worker inflow shares.  
- After that, start the **supply** side: MSDE provider roster + EXCELS + Pre‚ÄëK + CCS/WPA tags, then join by block group.


## ‚ûï Demographics Block ‚Äî Schema-Exact Output

This section extends the build to **exactly** match your requested schema fields:
- `geo_id`, `county_name`
- `children_under_2`, `children_2_to_5`, `children_total_under_5`
- `pct_non_english_home`, `pct_all_parents_work`, `pct_parents_nontraditional_hours`, `pct_remote_work`
- `pct_eligible_subsidy` (proxy: % ‚â§ 200% Federal Poverty Level)
- `median_household_income` (ACS `B19013` ‚Äî tract, apportioned to BG)
- `poverty_level_ratio` (proxy: **weighted mean income-to-poverty ratio** from `C17002` buckets)
- `data_year`

**Notes on methods**  
- ACS publishes only **‚ÄúUnder 5‚Äù** at block-group. To split **0‚Äì2 vs 3‚Äì5**, we compute **state-level shares** from **ACS PUMS** (person microdata) and apportion the block-group `children_total_under_5` by those shares.  
- Tract-only tables are **apportioned to block groups** weighted by each BG‚Äôs share of `children_total_under_5` in its tract.


In [51]:
# --- PUMS-based split: children 0‚Äì2 vs 3‚Äì5 (state-level shares) ---
import pandas as pd
import requests
import time

# Pull 2022 ACS 5-year PUMS (person) for the state; variables: AGEP, PWGTP
# We'll compute weighted shares: ages 0-2 and 3-5 among AGEP < 5.
PUMS_BASE = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5/pums"

def fetch_pums_ages_under5(state_fips):
    params = {
        "get": "AGEP,PWGTP",
        "for": f"state:{state_fips}"
    }
    rows = []
    page = 1
    while True:
        r = requests.get(PUMS_BASE, params=params, timeout=60)
        if r.status_code != 200:
            time.sleep(1.5)
            r = requests.get(PUMS_BASE, params=params, timeout=60)
            r.raise_for_status()
        data = r.json()
        cols = data[0]
        rows = data[1:]
        df = pd.DataFrame(rows, columns=cols)
        # PUMS API returns all rows in one shot (no page), so break.
        break
    df["AGEP"] = pd.to_numeric(df["AGEP"], errors="coerce")
    df["PWGTP"] = pd.to_numeric(df["PWGTP"], errors="coerce")
    return df

pums = fetch_pums_ages_under5(STATE_FIPS)
pums_u5 = pums[pums["AGEP"].between(0,4, inclusive="both")].copy()

w_total = pums_u5["PWGTP"].sum()
w_0_2 = pums_u5.loc[pums_u5["AGEP"].between(0,2, inclusive="both"), "PWGTP"].sum()
w_3_5 = pums_u5.loc[pums_u5["AGEP"].between(3,4, inclusive="both"), "PWGTP"].sum()

share_0_2 = (w_0_2 / w_total) if w_total else 0.6
share_3_5 = (w_3_5 / w_total) if w_total else 0.4

print(f"[PUMS] shares ‚Äî 0‚Äì2: {share_0_2:.3f}, 3‚Äì5: {share_3_5:.3f}")


[PUMS] shares ‚Äî 0‚Äì2: 0.577, 3‚Äì5: 0.423


In [52]:
# --- Additional tract-level pulls: B19013 (Median HH income), C17002 (income-to-poverty ratio buckets) ---
def fetch_group(group):
    params = {
        "get": f"NAME,group({group})",
        "for": "tract:*",
        "in": f"state:{STATE_FIPS} county:{COUNTY_FIPS}",
    }
    data = census_get(BASE, params=params)
    df = tidy_group_json(data, drop_cols=("state","county","tract"))
    # Rebuild tract GEOID (state+county+tract)
    df["tract_geoid"] = df.apply(lambda r: make_tract_geoid(r["state"], r["county"], r["tract"]), axis=1)
    # Coerce numeric columns in the group
    for c in df.columns:
        if c.startswith(group + "_"):
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

# Pull groups
tract_b19013 = fetch_group("B19013")  # Median household income
tract_c17002 = fetch_group("C17002")  # Income-to-poverty ratio buckets

# Median household income (B19013_001E)
tract_income = tract_b19013[["tract_geoid","B19013_001E"]].rename(
    columns={"B19013_001E":"median_household_income"}
)

# ---- Poverty ratio weighted mean (C17002) + % ‚â§ 200% FPL ----
# Define all possible bucket vars with midpoints; we'll intersect with what's actually present
midpoints_all = {
    "C17002_002E": 0.25,  # < 0.50
    "C17002_003E": 0.745, # 0.50‚Äì0.99
    "C17002_004E": 1.12,  # 1.00‚Äì1.24
    "C17002_005E": 1.37,  # 1.25‚Äì1.49
    "C17002_006E": 1.67,  # 1.50‚Äì1.84
    "C17002_007E": 1.92,  # 1.85‚Äì1.99
    "C17002_008E": 2.25,  # 2.00‚Äì2.99
    "C17002_009E": 3.50,  # ‚â• 3.00
}

# Keep only buckets that exist in the dataframe
available_buckets = [c for c in midpoints_all if c in tract_c17002.columns]
if len(available_buckets) < len(midpoints_all):
    missing = sorted(set(midpoints_all) - set(available_buckets))
    print(f"[C17002] Warning: missing buckets in API response: {missing}")

midpoints = {c: midpoints_all[c] for c in available_buckets}

required_cols = ["tract_geoid", "C17002_001E"] + available_buckets
tmp = tract_c17002[required_cols].copy()

# Ensure numeric
tmp["C17002_001E"] = pd.to_numeric(tmp["C17002_001E"], errors="coerce")
for c in available_buckets:
    tmp[c] = pd.to_numeric(tmp[c], errors="coerce")

# Weighted mean poverty ratio across available buckets
den = tmp["C17002_001E"].replace({0: pd.NA})
num = sum(tmp[c] * midpoints[c] for c in available_buckets)
tract_pov_mean = (num / den).to_frame("poverty_level_ratio")
tract_pov_mean["tract_geoid"] = tmp["tract_geoid"].values

# % ‚â§ 200% FPL proxy: sum buckets up to 1.99 (clip to available)
le_200_candidates = ["C17002_002E","C17002_003E","C17002_004E","C17002_005E","C17002_006E","C17002_007E"]
eligible_buckets = [c for c in le_200_candidates if c in available_buckets]
pct_le_200 = tmp[eligible_buckets].sum(axis=1, min_count=1) / den

tract_elig = pd.DataFrame({
    "tract_geoid": tmp["tract_geoid"].values,
    "pct_eligible_subsidy": pct_le_200.clip(0, 1).fillna(0.0)
})

# Merge income + poverty features
tract_income_feats = (
    tract_income
    .merge(tract_pov_mean, on="tract_geoid", how="outer")
    .merge(tract_elig, on="tract_geoid", how="outer")
)

tract_income_feats.head(3)




Unnamed: 0,tract_geoid,median_household_income,poverty_level_ratio,pct_eligible_subsidy
0,24027601103,185705,2.195647,0.03996
1,24027601104,206143,2.202122,0.048098
2,24027601105,149806,2.141169,0.079795


In [53]:
# --- Apportion income/poverty to block groups and assemble final schema ---
bg_w = bg.copy()
bg_w["tract_geoid"] = bg_w.apply(lambda r: make_tract_geoid(r["state"], r["county"], r["tract"]), axis=1)
tract_tot = bg_w.groupby("tract_geoid", as_index=False)["children_under5"].sum().rename(columns={"children_under5":"tract_children_under5"})
bg_w = bg_w.merge(tract_tot, on="tract_geoid", how="left")
bg_w["w"] = (bg_w["children_under5"] / bg_w["tract_children_under5"].replace({0: pd.NA})).fillna(0.0)

# Bring previously built tract features (pct_* from earlier section)
tract_feats = tract_feats.merge(tract_income_feats, on="tract_geoid", how="left")

# Apportion to BG
bg_app = bg_w[["geo_id","tract_geoid","w","NAME","children_under5"]].merge(tract_feats, on="tract_geoid", how="left")
for col in ["pct_all_parents_work","pct_remote_work","pct_non_traditional_hours","pct_non_english_home",
            "poverty_level_ratio","pct_eligible_subsidy","median_household_income"]:
    bg_app[col] = bg_app[col] * bg_app["w"] if col != "median_household_income" else bg_app[col] * bg_app["w"]  # income also averaged

bg_final = bg_app.groupby(["geo_id"], as_index=False).agg({
    "NAME":"first",
    "children_under5":"sum",
    "pct_all_parents_work":"sum",
    "pct_remote_work":"sum",
    "pct_non_traditional_hours":"sum",
    "pct_non_english_home":"sum",
    "pct_eligible_subsidy":"sum",
    "median_household_income":"sum",
    "poverty_level_ratio":"sum",
})

# County name parsed from NAME string (e.g., "Block Group 1, Census Tract 1234.56, Howard County, Maryland")
def parse_county(name):
    try:
        parts = [p.strip() for p in str(name).split(",")]
        # County is usually the penultimate piece before state
        for i in range(len(parts)-1, -1, -1):
            if "County" in parts[i]:
                return parts[i]
        return None
    except Exception:
        return None

bg_final["county_name"] = bg_final["NAME"].apply(parse_county).fillna("")

# Children splits via PUMS shares
bg_final["children_total_under_5"] = bg_final["children_under5"].astype(int)
bg_final["children_under_2"] = (bg_final["children_total_under_5"] * share_0_2).round().astype(int)
bg_final["children_2_to_5"] = (bg_final["children_total_under_5"] - bg_final["children_under_2"]).astype(int)

# Rename and order columns to match schema
demographics_block_schema = bg_final.rename(columns={
    "pct_non_traditional_hours": "pct_parents_nontraditional_hours",
    "children_under5": "children_total_under_5"
})[
    ["geo_id","county_name",
     "children_under_2","children_2_to_5","children_total_under_5",
     "pct_non_english_home","pct_all_parents_work","pct_parents_nontraditional_hours","pct_remote_work",
     "pct_eligible_subsidy","median_household_income","poverty_level_ratio"]
].copy()

demographics_block_schema["data_year"] = int(ACS_YEAR)

out_csv = f"{SAVE_DIR}/demographics_block_SCHEMA_{COUNTY_FIPS}_{ACS_YEAR}.csv"
demographics_block_schema.to_csv(out_csv, index=False)
print("Wrote:", out_csv)
demographics_block_schema.head()


Wrote: outputs/demographics_block_SCHEMA_027_2023.csv


Unnamed: 0,geo_id,county_name,children_under_2,children_2_to_5,children_total_under_5,children_total_under_5.1,pct_non_english_home,pct_all_parents_work,pct_parents_nontraditional_hours,pct_remote_work,pct_eligible_subsidy,median_household_income,poverty_level_ratio,data_year
0,240276011031,Block Group 1; Census Tract 6011.03; Howard Co...,152,112,264,264,0.0,0.137198,0.094656,0.127268,0.020056,93205.551331,1.101998,2023
1,240276011032,Block Group 2; Census Tract 6011.03; Howard Co...,48,36,84,84,0.0,0.043654,0.030118,0.040494,0.006381,29656.311787,0.350636,2023
2,240276011033,Block Group 3; Census Tract 6011.03; Howard Co...,103,75,178,178,0.0,0.092505,0.063821,0.08581,0.013523,62843.136882,0.743014,2023
3,240276011041,Block Group 1; Census Tract 6011.04; Howard Co...,128,93,221,221,0.0,0.170239,0.110996,0.224606,0.039812,170627.726592,1.82273,2023
4,240276011042,Block Group 2; Census Tract 6011.04; Howard Co...,27,19,46,46,0.0,0.035434,0.023103,0.046751,0.008287,35515.273408,0.379392,2023
