In [6]:
import os
import io
import gzip
import requests
import pandas as pd


In [7]:
STATE_ABBR = "md"
STATE_FIPS = "24"
COUNTY_FIPS = "027"              # Howard County
COUNTY_KEY = STATE_FIPS + COUNTY_FIPS  # "24027"

YEAR = "2022"
PARTS = ["main", "aux"]          # OD variants
JOB_TYPES = ["JT00","JT01","JT02","JT03","JT04","JT05"]  # All jobs + sectors
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Official LODES7 base
BASE_URL = f"https://lehd.ces.census.gov/data/lodes/LODES8/{STATE_ABBR}/od"

# OD column schema (no header in raw files)
OD_COLS = [
    "h_geocode","w_geocode","S000",
    "SA01","SA02","SA03",
    "SE01","SE02","SE03",
    "SI01","SI02","SI03"
]

In [8]:
def download_od(part: str, job_type: str, year: str) -> pd.DataFrame:
    """
    Download one OD file, read with headers, and return a DataFrame.
    Skips if 404 / missing.
    """
    url = f"{BASE_URL}/{STATE_ABBR}_od_{part}_{job_type}_{year}.csv.gz"
    print(f"→ Fetching: {url}")
    r = requests.get(url, timeout=180)
    if r.status_code == 404:
        print(f"   (missing) 404 Not Found — skipping")
        return pd.DataFrame(columns=OD_COLS)
    r.raise_for_status()

    # Decompress gz in-memory
    buf = io.BytesIO(r.content)
    with gzip.GzipFile(fileobj=buf, mode="rb") as gz:
        df = pd.read_csv(gz, header=None, names=OD_COLS, dtype=str)

    # Cast numeric columns
    num_cols = ["S000","SA01","SA02","SA03","SE01","SE02","SE03","SI01","SI02","SI03"]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

    # Annotate metadata
    df["year"] = int(year)
    df["part"] = part
    df["job_type"] = job_type
    return df

In [9]:
def filter_to_howard(df: pd.DataFrame, mode: str = "either") -> pd.DataFrame:
    """
    Keep rows linked to Howard County, MD (FIPS 24027).
    mode:
      - "either": keep if home OR work county == 24027
      - "home_only": keep if home county == 24027
      - "work_only": keep if work county == 24027
    """
    if df.empty:
        return df
    # County code = first 5 digits of the 15-digit block geocode
    h_county = df["h_geocode"].str.slice(0, 5)
    w_county = df["w_geocode"].str.slice(0, 5)

    if mode == "home_only":
        m = (h_county == COUNTY_KEY)
    elif mode == "work_only":
        m = (w_county == COUNTY_KEY)
    else:
        m = (h_county == COUNTY_KEY) | (w_county == COUNTY_KEY)

    out = df.loc[m].copy()
    # Helpful BG IDs (12-digit block group GEOIDs)
    out["home_bg"] = out["h_geocode"].str.slice(0, 12)
    out["work_bg"] = out["w_geocode"].str.slice(0, 12)
    return out

In [10]:
def main():
    combined = []
    for part in PARTS:
        for jt in JOB_TYPES:
            try:
                df = download_od(part, jt, YEAR)
                if df.empty:
                    continue
                # Filter to Howard County (either home or work in county)
                dff = filter_to_howard(df, mode="either")
                if not dff.empty:
                    combined.append(dff)
                print(f"   kept rows: {len(dff):,}")
            except Exception as e:
                print(f"   ERROR on {part} {jt}: {e}")

    if not combined:
        print("No data collected. Check availability/year.")
        return

    full = pd.concat(combined, ignore_index=True)

    # Reorder columns for clarity
    cols = [
        "year","part","job_type",
        "h_geocode","w_geocode","home_bg","work_bg",
        "S000","SA01","SA02","SA03","SE01","SE02","SE03","SI01","SI02","SI03"
    ]
    full = full[cols]

    # Save CSV
    csv_path = os.path.join(OUTPUT_DIR, f"lodes_od_{STATE_ABBR}_howard_{YEAR}_combined.csv")
    full.to_csv(csv_path, index=False)
    print(f"\n✅ Wrote CSV: {csv_path}  (rows: {len(full):,})")

    # Save Excel
    xlsx_path = os.path.join(OUTPUT_DIR, f"lodes_od_{STATE_ABBR}_howard_{YEAR}_combined.xlsx")
    with pd.ExcelWriter(xlsx_path, engine="xlsxwriter") as writer:
        full.to_excel(writer, index=False, sheet_name="OD_Combined")
    print(f"✅ Wrote Excel: {xlsx_path}")

    # Optional: quick county-level aggregates
    # Inflow to Howard (work county == 24027), Outflow from Howard (home county == 24027)
    full["home_county"] = full["h_geocode"].str.slice(0,5)
    full["work_county"] = full["w_geocode"].str.slice(0,5)
    inflow = full.loc[full["work_county"] == COUNTY_KEY, "S000"].sum()
    outflow = full.loc[full["home_county"] == COUNTY_KEY, "S000"].sum()
    internal = full.loc[(full["home_county"] == COUNTY_KEY) & (full["work_county"] == COUNTY_KEY), "S000"].sum()
    print(f"\nSummary — {YEAR} Howard County (JT00..JT05, {PARTS}):")
    print(f"  Internal jobs (home&work in county): {internal:,}")
    print(f"  Inflow jobs (others → Howard):       {inflow:,}")
    print(f"  Outflow jobs (Howard → others):      {outflow:,}")
    print(f"  Net inflow:                          {inflow - outflow:,}")


In [11]:
if __name__ == "__main__":
    main()

→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT00_2022.csv.gz
   kept rows: 117,759
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT01_2022.csv.gz
   kept rows: 110,043
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT02_2022.csv.gz
   kept rows: 101,214
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT03_2022.csv.gz
   kept rows: 93,902
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT04_2022.csv.gz
   kept rows: 4,172
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_main_JT05_2022.csv.gz
   kept rows: 4,119
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT00_2022.csv.gz
   kept rows: 0
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT01_2022.csv.gz
   kept rows: 0
→ Fetching: https://lehd.ces.census.gov/data/lodes/LODES8/md/od/md_od_aux_JT02_2022.csv.gz
   kept rows: 0


ModuleNotFoundError: No module named 'xlsxwriter'