
# Howard County Childcare Demand: Data Collection (2022)

This notebook collects **demand-side inputs** for a childcare supply–demand analysis focused on **Howard County, Maryland**:

- **Step 1 (ACS, 2022)**: Children counts for **0–2**, **3–5**, and **Under-5 total** using **ACS B09001**.
- **Step 2 (LODES8 OD, 2022)**: Commuter flows using **`md_od_aux_JT00_2022.csv.gz`** (all jobs), summarized into **inflow / outflow / internal / net** counts and county↔county OD totals.


In [4]:

# --- Imports and Configuration ---
import os, io, gzip, requests, pandas as pd

# ---- Year pins (set both to 2022) ----
ACS_YEAR   = 2022   # ACS 5-year vintage (2018–2022)
LODES_YEAR = 2022   # LODES8 OD year

# ---- Output directory ----
OUT_DIR    = "./outputs_2022"    # where to save CSV outputs
os.makedirs(OUT_DIR, exist_ok=True)

# ---- Toggle for Step 2 if you want to skip LODES (e.g., offline) ----
SKIP_LODES = False

# ---- Optional: Census API key (speeds up ACS requests) ----
CENSUS_API_KEY = os.getenv("CENSUS_API_KEY")  # set in your environment if available

# ---- FIPS codes ----
STATE_FIPS  = "24"        # Maryland
HOWARD_FIPS = "027"       # Howard County
HOWARD_FP5  = STATE_FIPS + HOWARD_FIPS  # '24027'

# ---- ACS (B09001) API base ----
ACS_BASE = f"https://api.census.gov/data/{ACS_YEAR}/acs/acs5"
# B09001: Population Under 18 Years in Households by Age
ACS_VARS = {
    "under3": "B09001_003E",   # ages 0-2
    "3and4":  "B09001_004E",   # ages 3-4
    "age5":   "B09001_005E"    # age 5
}

# ---- LODES OD (LODES8, Maryland, aux JT00) ----
# Remote URL (direct download). If offline, set LODES_LOCAL_FILE to a local path instead.
LODES_URL = f"https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT00_{LODES_YEAR}.csv.gz"
LODES_LOCAL_FILE = None  # e.g., r"C:\path\to\md_od_aux_JT00_2022.csv.gz" or a .csv extracted from it

# LODES column layout (no header in file)
LODES_COLS = [
    "h_geocode","w_geocode","S000",
    "SA01","SA02","SA03",
    "SE01","SE02","SE03",
    "SI01","SI02","SI03"
]


In [5]:

# --- Helper Functions ---

def _safe_int(x):
    """Convert Census API values to int safely.
    Handles strings, floats, None, and 'null' by returning 0 for invalid values.
    """
    try:
        # Common case: API returns numeric strings like '1234'
        return int(float(x))
    except (TypeError, ValueError):
        # Try pandas to_numeric for robustness
        v = pd.to_numeric(x, errors="coerce")
        if pd.isna(v):
            return 0
        try:
            return int(v)
        except Exception:
            return 0

def fetch_acs_under5():
    """Fetches ACS B09001 for Howard County (2022) and returns a one-row DataFrame with:
       children_0_2, children_3_5, children_under_5.
       Writes CSV to OUT_DIR.
    """
    params = {
        "get": ",".join(["NAME"] + list(ACS_VARS.values())),
        "for": f"county:{HOWARD_FIPS}",
        "in":  f"state:{STATE_FIPS}"
    }
    if CENSUS_API_KEY:
        params["key"] = CENSUS_API_KEY

    resp = requests.get(ACS_BASE, params=params, timeout=60)
    resp.raise_for_status()
    data = resp.json()
    hdr, row = data[0], data[1]
    rec = dict(zip(hdr, row))

    u3  = _safe_int(rec[ACS_VARS["under3"]])
    a34 = _safe_int(rec[ACS_VARS["3and4"]])
    a5  = _safe_int(rec[ACS_VARS["age5"]])

    out = pd.DataFrame([{
        "acs_year": ACS_YEAR,
        "county_name": rec["NAME"],
        "children_0_2": u3,
        "children_3_5": a34 + a5,
        "children_under_5": u3 + a34 + a5
    }])
    out_path = f"{OUT_DIR}/acs_b09001_howard_{ACS_YEAR}.csv"
    out.to_csv(out_path, index=False)
    print(f"[ACS] Wrote: {out_path}")
    return out

def read_lodes_df_or_chunks(chunksize: int | None = None):
    """Reads LODES OD (aux JT00, 2022) into a DataFrame or an iterator of chunks.
       If LODES_LOCAL_FILE is set, reads from the local file instead of URL.
    """
    # Prefer local file if provided
    if LODES_LOCAL_FILE:
        src = LODES_LOCAL_FILE
        print(f"[LODES] Reading from local file: {src}")
        if str(src).endswith('.gz'):
            with gzip.open(src, "rb") as gz:
                if chunksize is None:
                    return pd.read_csv(gz, header=None, names=LODES_COLS,
                                       dtype={"h_geocode": str, "w_geocode": str})
                else:
                    return pd.read_csv(gz, header=None, names=LODES_COLS,
                                       dtype={"h_geocode": str, "w_geocode": str},
                                       chunksize=chunksize)
        else:
            # plain CSV
            if chunksize is None:
                return pd.read_csv(src, header=None, names=LODES_COLS,
                                   dtype={"h_geocode": str, "w_geocode": str})
            else:
                return pd.read_csv(src, header=None, names=LODES_COLS,
                                   dtype={"h_geocode": str, "w_geocode": str},
                                   chunksize=chunksize)

    # Otherwise, download from URL
    print(f"[LODES] Downloading: {LODES_URL}")
    resp = requests.get(LODES_URL, stream=True, timeout=600)
    resp.raise_for_status()
    raw = io.BytesIO(resp.content)
    if chunksize is None:
        with gzip.open(raw, "rb") as gz:
            return pd.read_csv(gz, header=None, names=LODES_COLS,
                               dtype={"h_geocode": str, "w_geocode": str})
    else:
        gz = gzip.open(raw, "rb")
        return pd.read_csv(gz, header=None, names=LODES_COLS,
                           dtype={"h_geocode": str, "w_geocode": str},
                           chunksize=chunksize)

def _coerce_numeric(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)
    return df

def summarize_commute_full(df: pd.DataFrame):
    df = df.copy()
    df = _coerce_numeric(df, LODES_COLS[2:])  # S000 & breakdowns

    in_h = df["h_geocode"].str.startswith(HOWARD_FP5)
    in_w = df["w_geocode"].str.startswith(HOWARD_FP5)

    internal = int(df.loc[in_h & in_w, "S000"].sum())
    outflow  = int(df.loc[in_h & ~in_w, "S000"].sum())
    inflow   = int(df.loc[~in_h & in_w, "S000"].sum())
    net      = inflow - outflow

    summary = pd.DataFrame([{
        "internal_jobs": internal,
        "outflow_jobs": outflow,
        "inflow_jobs": inflow,
        "net_inflow_jobs": net
    }])

    # County↔County OD
    df["home_county"] = df["h_geocode"].str.slice(0,5)
    df["work_county"] = df["w_geocode"].str.slice(0,5)
    od_all = (df.groupby(["home_county","work_county"], as_index=False)["S000"]
                .sum().sort_values("S000", ascending=False))

    # Convenience subsets
    od_from_howard = od_all[od_all["home_county"] == HOWARD_FP5].reset_index(drop=True)
    od_to_howard   = od_all[od_all["work_county"] == HOWARD_FP5].reset_index(drop=True)

    return summary, od_all, od_from_howard, od_to_howard

def summarize_commute_chunked(chunks_iter):
    internal = outflow = inflow = 0
    parts = []

    for ch in chunks_iter:
        ch = _coerce_numeric(ch, LODES_COLS[2:])
        in_h = ch["h_geocode"].str.startswith(HOWARD_FP5)
        in_w = ch["w_geocode"].str.startswith(HOWARD_FP5)

        internal += int(ch.loc[in_h & in_w, "S000"].sum())
        outflow  += int(ch.loc[in_h & ~in_w, "S000"].sum())
        inflow   += int(ch.loc[~in_h & in_w, "S000"].sum())

        tmp = ch[["h_geocode","w_geocode","S000"]].copy()
        tmp["home_county"] = tmp["h_geocode"].str.slice(0,5)
        tmp["work_county"] = tmp["w_geocode"].str.slice(0,5)
        parts.append(tmp[["home_county","work_county","S000"]])

    od_all = (pd.concat(parts, ignore_index=True)
                .groupby(["home_county","work_county"], as_index=False)["S000"].sum()
                .sort_values("S000", ascending=False))

    summary = pd.DataFrame([{
        "internal_jobs": internal,
        "outflow_jobs": outflow,
        "inflow_jobs": inflow,
        "net_inflow_jobs": inflow - outflow
    }])

    od_from_howard = od_all[od_all["home_county"] == HOWARD_FP5].reset_index(drop=True)
    od_to_howard   = od_all[od_all["work_county"] == HOWARD_FP5].reset_index(drop=True)

    return summary, od_all, od_from_howard, od_to_howard


## Step 1 — ACS (B09001, 2022): Children 0–2, 3–5, Under-5

In [6]:

acs_df = fetch_acs_under5()
print("ACS Under-5 (Howard, 2022):")
display(acs_df)


[ACS] Wrote: ./outputs_2022/acs_b09001_howard_2022.csv
ACS Under-5 (Howard, 2022):


Unnamed: 0,acs_year,county_name,children_0_2,children_3_5,children_under_5
0,2022,"Howard County, Maryland",10536,12009,22545



## Step 2 — LODES8 OD (aux JT00, 2022): Inflow / Outflow / Internal / Net

This step reads **`md_od_aux_JT00_2022.csv.gz`** (all jobs) and computes:
- `internal_jobs` (live & work in Howard),
- `outflow_jobs` (live in Howard, work elsewhere),
- `inflow_jobs` (live elsewhere, work in Howard),
- `net_inflow_jobs` (inflow − outflow), and
- county↔county OD totals.

If you encounter memory constraints, you can process the file in chunks by setting `CHUNKSIZE` to a value like `1_000_000` (1 million rows per chunk).


In [7]:

# Set CHUNKSIZE to None (read all at once) or to an integer for chunked processing.
CHUNKSIZE = None   # e.g., 1_000_000 for low-RAM machines

if not SKIP_LODES:
    data = read_lodes_df_or_chunks(chunksize=CHUNKSIZE)
    if isinstance(data, pd.DataFrame):
        summary, od_all, od_from_howard, od_to_howard = summarize_commute_full(data)
    else:
        summary, od_all, od_from_howard, od_to_howard = summarize_commute_chunked(data)

    # Save outputs
    summary.to_csv(f"{OUT_DIR}/lodes_commute_summary_{LODES_YEAR}.csv", index=False)
    od_all.to_csv(f"{OUT_DIR}/lodes_od_county_all_{LODES_YEAR}.csv", index=False)
    od_from_howard.to_csv(f"{OUT_DIR}/lodes_od_from_howard_{LODES_YEAR}.csv", index=False)
    od_to_howard.to_csv(f"{OUT_DIR}/lodes_od_to_howard_{LODES_YEAR}.csv", index=False)

    print("LODES commute summary (S000 jobs):")
    display(summary)

    print("Top OD flows FROM Howard (by county):")
    display(od_from_howard.head(10))

    print("Top OD flows TO Howard (by county):")
    display(od_to_howard.head(10))
else:
    print("[LODES] Skipped. Set SKIP_LODES = False to run.")


[LODES] Downloading: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT00_2022.csv.gz


  return pd.read_csv(gz, header=None, names=LODES_COLS,


LODES commute summary (S000 jobs):


Unnamed: 0,internal_jobs,outflow_jobs,inflow_jobs,net_inflow_jobs
0,0,0,0,0


Top OD flows FROM Howard (by county):


Unnamed: 0,home_county,work_county,S000


Top OD flows TO Howard (by county):


Unnamed: 0,home_county,work_county,S000
