In [1]:
import os
import io
import gzip
import requests
import pandas as pd

STATE_ABBR = "md"
STATE_FIPS = "24"
COUNTY_FIPS = "027"               # Howard County
COUNTY_KEY = STATE_FIPS + COUNTY_FIPS  # "24027"

YEAR = "2022"
PARTS = ["main", "aux"]           # OD variants
JOB_TYPES = ["JT00","JT01","JT02","JT03","JT04","JT05"]  # All jobs + sectors
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Official LODES8 base (note: you used LODES8; keeping that)
BASE_URL = f"https://lehd.ces.census.gov/data/lodes/LODES8/{STATE_ABBR}/od"

# Raw OD columns (no header in source files)
OD_COLS = [
    "h_geocode","w_geocode","S000",
    "SA01","SA02","SA03",
    "SE01","SE02","SE03",
    "SI01","SI02","SI03"
]

def download_od(part: str, job_type: str, year: str) -> pd.DataFrame:
    """
    Download one OD file, read with headers, and return a DataFrame.
    Skips if 404 / missing.
    """
    url = f"{BASE_URL}/{STATE_ABBR}_od_{part}_{job_type}_{year}.csv.gz"
    print(f"→ Fetching: {url}")
    r = requests.get(url, timeout=180)
    if r.status_code == 404:
        print(f"   (missing) 404 Not Found — skipping")
        return pd.DataFrame(columns=OD_COLS)
    r.raise_for_status()

    # Decompress gz in-memory
    buf = io.BytesIO(r.content)
    with gzip.GzipFile(fileobj=buf, mode="rb") as gz:
        df = pd.read_csv(gz, header=None, names=OD_COLS, dtype=str)

    # Cast numeric columns we need
    df["S000"] = pd.to_numeric(df["S000"], errors="coerce").fillna(0).astype(int)

    # Annotate metadata
    df["year"] = int(year)
    df["part"] = part
    df["job_type"] = job_type
    return df

def filter_to_howard(df: pd.DataFrame, mode: str = "either") -> pd.DataFrame:
    """
    Keep rows linked to Howard County, MD (FIPS 24027).
    mode:
      - "either": keep if home OR work county == 24027
      - "home_only": keep if home county == 24027
      - "work_only": keep if work county == 24027
    """
    if df.empty:
        return df
    h_county = df["h_geocode"].str.slice(0, 5)
    w_county = df["w_geocode"].str.slice(0, 5)

    if mode == "home_only":
        m = (h_county == COUNTY_KEY)
    elif mode == "work_only":
        m = (w_county == COUNTY_KEY)
    else:
        m = (h_county == COUNTY_KEY) | (w_county == COUNTY_KEY)

    out = df.loc[m, ["h_geocode","w_geocode","S000","year"]].copy()
    # Helpful block-group IDs (12-digit GEOIDs)
    out["home_bg"] = out["h_geocode"].str.slice(0, 12)
    out["work_bg"] = out["w_geocode"].str.slice(0, 12)
    out["home_county"] = out["h_geocode"].str.slice(0, 5)
    out["work_county"] = out["w_geocode"].str.slice(0, 5)
    return out

def build_employment_commute(df_filtered: pd.DataFrame) -> pd.DataFrame:
    """
    Build the employment_commute table with required columns:
    home_geo_id, work_geo_id, commuter_count, net_commute_flow, data_year, source
    - Aggregates to block-group pairs
    - Computes net_commute_flow for Howard County block groups:
      inflow_to_bg - outflow_from_bg
    """
    if df_filtered.empty:
        return pd.DataFrame(columns=[
            "home_geo_id","work_geo_id","commuter_count",
            "net_commute_flow","data_year","source"
        ])

    # 1) Aggregate to block-group pair (home_bg, work_bg)
    pair = (
        df_filtered
        .groupby(["home_bg","work_bg","year"], as_index=False)["S000"]
        .sum()
        .rename(columns={"home_bg":"home_geo_id","work_bg":"work_geo_id","S000":"commuter_count"})
    )

    # 2) Compute inflow and outflow totals for EACH Howard County block group
    # Inflow to a BG = sum of S000 where work_bg == that BG
    inflow = (
        df_filtered
        .groupby("work_bg", as_index=False)["S000"]
        .sum()
        .rename(columns={"work_bg":"bg","S000":"inflow"})
    )
    # Outflow from a BG = sum of S000 where home_bg == that BG
    outflow = (
        df_filtered
        .groupby("home_bg", as_index=False)["S000"]
        .sum()
        .rename(columns={"home_bg":"bg","S000":"outflow"})
    )
    net = inflow.merge(outflow, on="bg", how="outer").fillna(0)
    net["net_commute_flow_bg"] = net["inflow"] - net["outflow"]

    # Keep net only for Howard County BGs (first 5 digits == 24027)
    net_howard = net[net["bg"].str.slice(0,5) == COUNTY_KEY][["bg","net_commute_flow_bg"]].copy()

    # 3) Attach a per-row net_commute_flow:
    # - If work_geo_id is in Howard, use net for work_geo_id
    # - Else if home_geo_id is in Howard, use net for home_geo_id
    pair = pair.merge(net_howard.rename(columns={"bg":"work_geo_id","net_commute_flow_bg":"_net_work"}),
                      on="work_geo_id", how="left")
    pair = pair.merge(net_howard.rename(columns={"bg":"home_geo_id","net_commute_flow_bg":"_net_home"}),
                      on="home_geo_id", how="left")

    def choose_net(row):
        if isinstance(row["_net_work"], (int, float)) and pd.notna(row["_net_work"]):
            return int(row["_net_work"])
        if isinstance(row["_net_home"], (int, float)) and pd.notna(row["_net_home"]):
            return int(row["_net_home"])
        return 0  # neither side is a Howard BG

    pair["net_commute_flow"] = pair.apply(choose_net, axis=1)
    pair = pair.drop(columns=["_net_work","_net_home"])

    # 4) Add metadata columns
    pair["data_year"] = pair["year"].astype(int)
    pair["source"] = "LEHD LODES8 OD"
    pair = pair.drop(columns=["year"])

    # Order columns exactly as requested
    pair = pair[[
        "home_geo_id","work_geo_id","commuter_count",
        "net_commute_flow","data_year","source"
    ]]

    return pair

def main():
    combined = []
    for part in PARTS:
        for jt in JOB_TYPES:
            try:
                df = download_od(part, jt, YEAR)
                if df.empty:
                    continue
                dff = filter_to_howard(df, mode="either")
                if not dff.empty:
                    combined.append(dff)
                print(f"   kept rows: {len(dff):,}")
            except Exception as e:
                print(f"   ERROR on {part} {jt}: {e}")

    if not combined:
        print("No data collected. Check availability/year.")
        return

    df_filtered = pd.concat(combined, ignore_index=True)

    # Build schema-aligned employment_commute table
    commute = build_employment_commute(df_filtered)

    # Save CSV
    csv_path = os.path.join(OUTPUT_DIR, f"employment_commute_howard_{YEAR}.csv")
    commute.to_csv(csv_path, index=False)
    print(f"\n✅ Wrote CSV: {csv_path}  (rows: {len(commute):,})")

    # Save Excel (single sheet: employment_commute)
    xlsx_path = os.path.join(OUTPUT_DIR, f"employment_commute_howard_{YEAR}.xlsx")
    with pd.ExcelWriter(xlsx_path, engine="xlsxwriter") as writer:
        commute.to_excel(writer, index=False, sheet_name="employment_commute")
    print(f"✅ Wrote Excel: {xlsx_path}")

if __name__ == "__main__":
    main()


→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT00_2022.csv.gz
   kept rows: 117,759
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT01_2022.csv.gz
   kept rows: 110,043
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT02_2022.csv.gz
   kept rows: 101,214
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT03_2022.csv.gz
   kept rows: 93,902
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT04_2022.csv.gz
   kept rows: 4,172
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT05_2022.csv.gz
   kept rows: 4,119
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT00_2022.csv.gz
   kept rows: 0
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT01_2022.csv.gz
   kept rows: 0
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT02_2022.csv.gz
   kept rows: 0
