# 05 — Link SDOH (DEMO) + OCQ/HOQ/HIQ/FSQ with robust I/O (1999–2023)

**Patches included**
- Robust XPT reading with encoding fallbacks.
- Use `pd.Int64Dtype()` for nullable integers instead of `'Int64'` strings.
- Race/ethnicity combined as an aligned `Series`.
- Adult filter uses safe numeric casting for `RIDAGEYR`.


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT / "data"
OUT  = ROOT / "output"
MOD  = DATA / "nhanes_by_module"
OUT.mkdir(parents=True, exist_ok=True)
print("ROOT:", ROOT)

ROOT: /Users/dengshuyue/Desktop/SDOH/analysis


In [2]:
def read_any(p: Path) -> pd.DataFrame:
    s = p.suffix.lower()
    if s == ".xpt":
        import pyreadstat
        # Try default first, then latin-1 fallback (handles DEMO_L.xpt issue)
        try:
            df, _ = pyreadstat.read_xport(str(p))
        except Exception:
            df, _ = pyreadstat.read_xport(str(p), encoding="latin1")
    elif s == ".sas7bdat":
        df = pd.read_sas(str(p), format="sas7bdat", encoding="latin1")
    elif s == ".csv":
        # Try utf-8, then latin-1
        try:
            df = pd.read_csv(p)
        except UnicodeDecodeError:
            df = pd.read_csv(p, encoding="latin1")
    elif s == ".parquet":
        df = pd.read_parquet(p)
    else:
        raise ValueError(f"Unsupported file: {p}")
    df.columns = df.columns.str.upper()
    return df

def ensure_seqn(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d.columns = d.columns.str.upper()
    d["SEQN"] = pd.to_numeric(d["SEQN"], errors="coerce").astype(pd.Int64Dtype())
    return d.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

In [3]:
BASE = OUT / "cov_core_mort_dep_all_1999_2023.parquet"
if not BASE.exists():
    for cand in [OUT/"cov_core_mort_dep_1999_2023.parquet", OUT/"demo_cov_dep_1999_2023.parquet"]:
        if cand.exists():
            BASE = cand
            break
print("Base file:", BASE)
base = pd.read_parquet(BASE)
base.columns = base.columns.str.upper()
base["SEQN"] = pd.to_numeric(base["SEQN"], errors="coerce").astype(pd.Int64Dtype())
print("Base shape:", base.shape)

Base file: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_all_1999_2023.parquet
Base shape: (128809, 56)


## DEMO across cycles → SDOH + Adult filter

In [4]:
DEMO_DIR = MOD / "demo"
keep_cols = [
    "SEQN","SDDSRVYR","RIDAGEYR","RIAGENDR","WTMEC2YR",
    "INDFMPIR","INDFMINC","DMDEDUC2","DMDEDUC3","DMDMARTL",
    "RIDRETH1","RIDRETH2","RIDRETH3",
]

cand = []
if DEMO_DIR.exists():
    for pat in ["*DEMO*.xpt","*DEMO*.XPT","*demo*.sas7bdat","*demo*.csv","*demo*.parquet"]:
        cand.extend(sorted(DEMO_DIR.glob(pat)))

parts = []
for p in cand:
    try:
        d0 = read_any(p)
        cols = [c for c in keep_cols if c in d0.columns]
        if cols:
            parts.append(d0[cols].copy())
    except Exception as e:
        print("Skip", p.name, "—", e)

demo = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=keep_cols)
demo = ensure_seqn(demo)
print("DEMO shape:", demo.shape)

# Adult filter table (≥20)
ridage = pd.to_numeric(demo.get("RIDAGEYR"), errors="coerce")
age20 = demo.loc[ridage >= 20, ["SEQN","RIDAGEYR","SDDSRVYR"]].copy()

# Derive SDOH harmonized
d = demo.copy()
d["PIR"] = pd.to_numeric(d.get("INDFMPIR", np.nan), errors="coerce")
d["PIR_CAT"] = pd.cut(d["PIR"], [-np.inf,1.3,2.0,4.0,np.inf], labels=["<1.30","1.30–<2.00","2.00–<4.00","≥4.00"]) 
d["EDU"] = np.nan
if "DMDEDUC2" in d: d.loc[d["DMDEDUC2"].notna(), "EDU"] = d.loc[d["DMDEDUC2"].notna(), "DMDEDUC2"]
if "DMDEDUC3" in d: d.loc[d["EDU"].isna() & d["DMDEDUC3"].notna(), "EDU"] = d.loc[d["EDU"].isna(), "DMDEDUC3"]
d["EDU"] = pd.to_numeric(d["EDU"], errors="coerce").astype(pd.Int64Dtype())

edu2_map = {1:"<9th grade", 2:"9–11th (incl. 12th, no diploma)", 3:"High school/GED", 4:"Some college or AA", 5:"College graduate or above"}
edu3_map = {1:"<9th grade", 2:"9–11th", 3:"High school/GED", 4:"Some college or AA", 5:"College graduate or above"}
def label_edu(row):
    v2, v3 = row.get("DMDEDUC2", np.nan), row.get("DMDEDUC3", np.nan)
    if pd.notna(v2): return edu2_map.get(int(v2), np.nan)
    if pd.notna(v3): return edu3_map.get(int(v3), np.nan)
    return np.nan
d["EDU_CAT"] = d.apply(label_edu, axis=1)

# Race/ethnicity as an aligned Series
r3 = d.get("RIDRETH3") if "RIDRETH3" in d else pd.Series(index=d.index, dtype=float)
r2 = d.get("RIDRETH2") if "RIDRETH2" in d else pd.Series(index=d.index, dtype=float)
r1 = d.get("RIDRETH1") if "RIDRETH1" in d else pd.Series(index=d.index, dtype=float)
race_series = r3.where(r3.notna(), r2.where(r2.notna(), r1))
race_series = pd.to_numeric(race_series, errors="coerce").astype(pd.Int64Dtype())
d["RACE_ETH_CODE"] = race_series

race_map_r3 = {1:"Mexican American",2:"Other Hispanic",3:"NH White",4:"NH Black",6:"NH Asian",7:"NH Other/Multi"}
race_map_r2 = {1:"Mexican American",2:"Other Hispanic",3:"NH White",4:"NH Black",5:"Other (incl. Multi)"}
race_map_r1 = {1:"Mexican American",2:"Other Hispanic",3:"NH White",4:"NH Black",5:"Other (incl. Multi)"}
def label_race(row):
    if pd.notna(row.get("RIDRETH3", np.nan)): return race_map_r3.get(int(row["RIDRETH3"]), np.nan)
    if pd.notna(row.get("RIDRETH2", np.nan)): return race_map_r2.get(int(row["RIDRETH2"]), np.nan)
    if pd.notna(row.get("RIDRETH1", np.nan)): return race_map_r1.get(int(row["RIDRETH1"]), np.nan)
    return np.nan
d["RACE_ETH"] = d.apply(label_race, axis=1)

d["MARITAL"] = pd.to_numeric(d.get("DMDMARTL", np.nan), errors="coerce").astype(pd.Int64Dtype())
mar_map = {1:"Married",2:"Widowed",3:"Divorced",4:"Separated",5:"Never married",6:"Living with partner"}
d["MARITAL_CAT"] = d["MARITAL"].map(mar_map)

sdoh = d[[c for c in [
    "SEQN","SDDSRVYR","INDFMPIR","INDFMINC","DMDEDUC2","DMDEDUC3","DMDMARTL",
    "RIDRETH1","RIDRETH2","RIDRETH3","PIR","PIR_CAT","EDU","EDU_CAT","RACE_ETH","MARITAL","MARITAL_CAT"
] if c in d.columns]].drop_duplicates(subset=["SEQN"]).copy()
sdoh = ensure_seqn(sdoh)
print("SDOH tidy shape:", sdoh.shape)

DEMO shape: (128809, 13)
SDOH tidy shape: (128809, 17)


## Continue with OCQ/HOQ/HIQ/FSQ merge as in prior notebook

In [5]:
# For brevity here, you can paste the OCQ/HOQ/HIQ/FSQ sections from the previous notebook.
# They will now benefit from the robust helpers above (encoding fallbacks, Int64 dtype, adult filter).

#### OCQ — Employment status (99–18 main; 17-20, 21-23 added; adult filter)

In [67]:
# --- NHANES OCQ: download all cycles (with corrected folders) ----------------
from pathlib import Path
import requests, time, re
import numpy as np
import pandas as pd

# Assumes MOD, read_any, age20, ensure_seqn already exist
OCQ_DIR = (MOD / "ocq"); OCQ_DIR.mkdir(parents=True, exist_ok=True)

BASE = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{folder}/DataFiles/{fname}"

# All cycles + multiple folder candidates where needed
OCQ_SPECS = [
    {"label":"1999-2000", "sddsrvyr":1,  "fname":"OCQ.xpt",
     "folders":["1999"]},

    {"label":"2001-2002", "sddsrvyr":2,  "fname":"OCQ_B.xpt",
     "folders":["2001"]},
    {"label":"2003-2004", "sddsrvyr":3,  "fname":"OCQ_C.xpt",
     "folders":["2003"]},
    {"label":"2005-2006", "sddsrvyr":4,  "fname":"OCQ_D.xpt",
     "folders":["2005"]},
    {"label":"2007-2008", "sddsrvyr":5,  "fname":"OCQ_E.xpt",
     "folders":["2007"]},
    {"label":"2009-2010", "sddsrvyr":6,  "fname":"OCQ_F.xpt",
     "folders":["2009"]},
    {"label":"2011-2012", "sddsrvyr":7,  "fname":"OCQ_G.xpt",
     "folders":["2011"]},
    {"label":"2013-2014", "sddsrvyr":8,  "fname":"OCQ_H.xpt",
     "folders":["2013"]},
    {"label":"2015-2016", "sddsrvyr":9,  "fname":"OCQ_I.xpt",
     "folders":["2015"]},
    {"label":"2017-2018", "sddsrvyr":10, "fname":"OCQ_J.xpt",
     "folders":["2017", "2017-2018"]},

    # Pre-pandemic combined release (your corrected path lives under 2017/)
    {"label":"2017–Mar 2020 (pre-pandemic)", "sddsrvyr":11, "fname":"P_OCQ.xpt",
     "folders":["2017", "2019", "2019-2020", "2017-2018"]},

    # 2021–2023 combined (your corrected path lives under 2021/)
    {"label":"Aug 2021–Aug 2023", "sddsrvyr":12, "fname":"OCQ_L.xpt",
     "folders":["2021", "2021-2022", "2021-2023"]},
    # If/when CDC posts 2023–2024 (M):
    # {"label":"2023-2024", "sddsrvyr":13, "fname":"OCQ_M.xpt", "folders":["2023","2023-2024"]},
]

def _try_download(url: str, dest: Path, retries=3, backoff=1.6) -> bool:
    for i in range(retries):
        try:
            with requests.get(url, stream=True, timeout=30) as r:
                if r.status_code == 200:
                    with open(dest, "wb") as f:
                        for chunk in r.iter_content(1<<15):
                            if chunk: f.write(chunk)
                    return True
                if r.status_code == 404:
                    return False
        except requests.RequestException:
            pass
        time.sleep(backoff**i)
    return False

def download_ocq_all(specs=OCQ_SPECS) -> list[tuple[Path,int]]:
    paths = []
    for s in specs:
        fname_candidates = [s["fname"]]
        # Case variants (.xpt/.XPT)
        if s["fname"].endswith(".xpt"):
            fname_candidates.append(s["fname"][:-4] + ".XPT")
        else:
            fname_candidates.append(s["fname"][:-4] + ".xpt")

        got = False
        for folder in s["folders"]:
            for fname in fname_candidates:
                url = BASE.format(folder=folder, fname=fname)
                local = OCQ_DIR / f"{folder}_{fname}"
                if local.exists() and local.stat().st_size > 0:
                #    print(f"✓ Exists: {s['label']} @ {folder}/{fname}")
                    paths.append((local, s["sddsrvyr"]))
                    got = True
                    break
                print(f"→ Fetch {s['label']}: {url}")
                ok = _try_download(url, local)
                if ok:
                #    print(f"  ✓ Saved {local.name}")
                    paths.append((local, s["sddsrvyr"]))
                    got = True
                    break
            if got: break
        if not got:
            print(f"⚠ Missing: {s['label']} (tried {s['folders']})")
    return paths

# Download
ocq_files = download_ocq_all()

# Build
def _infer_cycle_if_missing(df: pd.DataFrame, sddsrvyr: int) -> pd.DataFrame:
    df = df.copy()
    if "SDDSRVYR" not in df.columns or df["SDDSRVYR"].isna().all():
        df["SDDSRVYR"] = sddsrvyr
    else:
        df["SDDSRVYR"] = pd.to_numeric(df["SDDSRVYR"], errors="coerce")
        df["SDDSRVYR"] = df["SDDSRVYR"].fillna(sddsrvyr)
    return df

def _recode_employment(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["EMPLOY"] = np.nan
    if "OCD150" in df.columns:
        df.loc[df["OCD150"] == 1, "EMPLOY"] = 1  # working
        df.loc[df["OCD150"] == 3, "EMPLOY"] = 2  # not working
    if "OCQ380" in df.columns:
        df.loc[df["OCQ380"] == 5, "EMPLOY"] = 2  # unemployed/looking
        df.loc[df["OCQ380"] == 3, "EMPLOY"] = 3  # retired
        df.loc[df["OCQ380"].isin([4, 6]), "EMPLOY"] = 4  # disabled/other
        df.loc[df["OCQ380"].isin([1, 2, 7]), "EMPLOY"] = 5  # working/unknown
    df["UNEMPLOYMENT"] = (df["EMPLOY"] == 2).astype(pd.Int64Dtype())
    keep = [c for c in ["SEQN", "SDDSRVYR", "EMPLOY", "UNEMPLOYMENT"] if c in df.columns]
    return df[keep]

ocq_parts = []
for p, cyc in ocq_files:
    try:
        df = read_any(p)
        df = _infer_cycle_if_missing(df, cyc)
        df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")  # adults (≥20)
        ocq_parts.append(_recode_employment(df))
    except Exception as e:
        print("OCQ skip", p.name, "—", e)

ocq = (pd.concat(ocq_parts, ignore_index=True)
        if ocq_parts else
        pd.DataFrame(columns=["SEQN","SDDSRVYR","EMPLOY","UNEMPLOYMENT"]))
ocq = ensure_seqn(ocq)

print("OCQ shape:", ocq.shape)

# Optional: avoid 2017–2018 duplication if both J (10) and P (11) are present
s10 = set(ocq.loc[ocq["SDDSRVYR"]==10, "SEQN"])
s11 = set(ocq.loc[ocq["SDDSRVYR"]==11, "SEQN"])
overlap = s10 & s11
if overlap:
    print("De-overlapping 2017–2018 vs pre-pandemic:", len(overlap))
    ocq = ocq[~((ocq["SEQN"].isin(overlap)) & (ocq["SDDSRVYR"]==10))].copy()

# Audit
vc = ocq["SDDSRVYR"].value_counts(dropna=False).sort_index()
print("\nSDDSRVYR counts:")
for k, n in vc.items():
    print(f"{int(k):>2}  {int(n):>7} rows")


OCQ shape: (72122, 4)

SDDSRVYR counts:
 1     4880 rows
 2     5411 rows
 3     5041 rows
 4     4979 rows
 5     5935 rows
 6     6218 rows
 7     5560 rows
 8     5769 rows
 9     5719 rows
10     5569 rows
11     9232 rows
12     7809 rows


In [68]:
s10 = set(ocq.loc[ocq["SDDSRVYR"]==10, "SEQN"])
s11 = set(ocq.loc[ocq["SDDSRVYR"]==11, "SEQN"])
overlap = s10 & s11
print("Overlap SEQNs (10 vs 11):", len(overlap))

# Prefer pre-pandemic records (11); drop cycle 10 for overlapping SEQNs
if overlap:
    ocq = ocq[~((ocq["SEQN"].isin(overlap)) & (ocq["SDDSRVYR"]==10))].copy()

# Recheck counts
# print(ocq["SDDSRVYR"].value_counts().sort_index())


Overlap SEQNs (10 vs 11): 0


In [69]:
# Sanity: no overlap expected (and you saw 0)
s10 = set(ocq.loc[ocq["SDDSRVYR"]==10, "SEQN"])
s11 = set(ocq.loc[ocq["SDDSRVYR"]==11, "SEQN"])
assert len(s10 & s11) == 0, "Found overlap between 10 and 11!"

# Tidy dtypes & save
ocq["SDDSRVYR"] = ocq["SDDSRVYR"].astype("Int8")
ocq["UNEMPLOYMENT"] = ocq["UNEMPLOYMENT"].astype("Int8")
out_ocq = OUT / "ocq_1999_2023.parquet"
ocq.to_parquet(out_ocq, index=False)
print("Wrote:", out_ocq)


Wrote: /Users/dengshuyue/Desktop/SDOH/analysis/output/ocq_1999_2023.parquet


#### HOQ — Housing (subset) (99–18 main; 17-20, 21-23 added; adult filter)

In [70]:
# --- HOQ (Housing): fetch 1999–2023 from CDC, keep HOD050(+HOD051) + HOQ065 (7/9 -> NA) ----
from pathlib import Path
import io, sys, time
import numpy as np
import pandas as pd
import requests

BASE = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public"
HOQ_DIR = (MOD / "hoq")
HOQ_DIR.mkdir(parents=True, exist_ok=True)

# Cycle specs (folder + filename on CDC + SDDSRVYR code)
_HOQ_SPECS = [
    # classic 2-year cycles
    (1,  "1999-2000", "1999", "HOQ.xpt"),
    (2,  "2001-2002", "2001", "HOQ_B.xpt"),
    (3,  "2003-2004", "2003", "HOQ_C.xpt"),
    (4,  "2005-2006", "2005", "HOQ_D.xpt"),
    (5,  "2007-2008", "2007", "HOQ_E.xpt"),
    (6,  "2009-2010", "2009", "HOQ_F.xpt"),
    (7,  "2011-2012", "2011", "HOQ_G.xpt"),
    (8,  "2013-2014", "2013", "HOQ_H.xpt"),
    (9,  "2015-2016", "2015", "HOQ_I.xpt"),
    (10, "2017-2018", "2017", "HOQ_J.xpt"),
    # special combined (pre-pandemic) + post-pandemic combined
    (11, "2017–Mar 2020 (pre-pandemic)", "2017", "P_HOQ.xpt"),
    (12, "Aug 2021–Aug 2023", "2021", "HOQ_L.xpt"),
]

def _fetch_hoq_file(sddsrvyr: int, label: str, folder: str, fname: str) -> Path | None:
    """
    Download an HOQ .xpt file to HOQ_DIR, return local path or None if failed.
    """
    url = f"{BASE}/{folder}/DataFiles/{fname}"
    # nice local name (keeps year folder + filename)
    out = HOQ_DIR / f"{folder}_{fname}"
    print(f"→ Fetch {label}: {url}")
    try:
        if not out.exists() or out.stat().st_size == 0:
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            out.write_bytes(r.content)
            print(f"  ✓ Saved {out.name}")
        else:
            print(f"  • Exists {out.name}")
        return out
    except Exception as e:
        print(f"  ✗ Skip {label} — {e}")
        return None

# Download all HOQ cycles
_downloaded = []
for cyc, label, folder, fname in _HOQ_SPECS:
    p = _fetch_hoq_file(cyc, label, folder, fname)
    if p is not None:
        _downloaded.append((cyc, p))

# Collect candidate files (downloaded + any local extras you already have)
hoq_files: list[Path] = []
if HOQ_DIR.exists():
    for pat in ["HOQ*.xpt", "hoq*.sas7bdat", "hoq*.parquet", "hoq*.csv", "*HOQ*.xpt", "*_HOQ*.xpt", "P_HOQ*.xpt"]:
        hoq_files.extend(sorted(set(HOQ_DIR.glob(pat))))
# ensure downloaded ones are present (order by SDDSRVYR)
dl_set = {p for _, p in _downloaded}
hoq_files = [p for _, p in sorted(_downloaded, key=lambda t: t[0])] + [p for p in hoq_files if p not in dl_set]

# Helper: add SDDSRVYR if missing
def _add_cycle_if_missing(df: pd.DataFrame, cyc: int) -> pd.DataFrame:
    if "SDDSRVYR" not in df.columns:
        df = df.copy()
        df["SDDSRVYR"] = cyc
    return df

def _recode_hoq(df: pd.DataFrame) -> pd.DataFrame:
    """
    - Harmonize HOD050/HOD051 to HOD050
    - Recode HOQ065: 7/9 -> NA
    - Keep SEQN, SDDSRVYR, HOD050, HOQ065
    """
    df = df.copy()
    # rooms var changed name to HOD051 in L cycle; prefer a unified HOD050
    if "HOD050" not in df.columns and "HOD051" in df.columns:
        df = df.rename(columns={"HOD051": "HOD050"})
    # set invalids to NA for tenure
    if "HOQ065" in df.columns:
        df.loc[df["HOQ065"].isin([7, 9]), "HOQ065"] = np.nan
    keep = [c for c in ["SEQN", "SDDSRVYR", "HOD050", "HOQ065"] if c in df.columns]
    return df[keep] if keep else pd.DataFrame(columns=["SEQN","SDDSRVYR","HOD050","HOQ065"])

# Build parts
hoq_parts = []
for p in hoq_files:
    try:
        # infer SDDSRVYR from our spec if possible
        cyc_guess = None
        for cyc, label, folder, fname in _HOQ_SPECS:
            if p.name.endswith(fname) and p.name.startswith(folder + "_"):
                cyc_guess = cyc
                break

        df = read_any(p)  # your helper that reads XPT/SAS/Parquet/CSV
        if cyc_guess is not None:
            df = _add_cycle_if_missing(df, cyc_guess)
        else:
            df = _infer_cycle_if_missing(df, p)  # your existing suffix-based fallback

        # adult filter (≥20) using DEMO-based age20 (expects a frame with SEQN)
        df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        part = _recode_hoq(df)
        if not part.empty:
            hoq_parts.append(part)
    except Exception as e:
        print("HOQ skip", p.name, "—", e)

hoq_all = pd.concat(hoq_parts, ignore_index=True) if hoq_parts else pd.DataFrame(columns=["SEQN","SDDSRVYR","HOD050","HOQ065"])
hoq_all = ensure_seqn(hoq_all)  # your helper to clean/ensure SEQN integrity

print("HOQ shape:", hoq_all.shape)
print("\nSDDSRVYR counts:")
print(hoq_all["SDDSRVYR"].value_counts().sort_index())

# Optional: sanity check for possible overlap between 2017–2018 (10) and 2017–Mar 2020 (11)
if {"SDDSRVYR"}.issubset(hoq_all.columns):
    s10 = set(hoq_all.loc[hoq_all["SDDSRVYR"]==10, "SEQN"])
    s11 = set(hoq_all.loc[hoq_all["SDDSRVYR"]==11, "SEQN"])
    print("\nOverlap SEQNs (10 vs 11):", len(s10 & s11))

# Optional: tidy dtypes & save
hoq_all["SDDSRVYR"] = hoq_all["SDDSRVYR"].astype("Int8", errors="ignore")
if "HOQ065" in hoq_all.columns:
    hoq_all["HOQ065"] = hoq_all["HOQ065"].astype("Int8", errors="ignore")
out_path = OUT / "hoq_1999_2023.parquet"
hoq_all.to_parquet(out_path, index=False)
print("Wrote:", out_path)


→ Fetch 1999-2000: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/HOQ.xpt
  ✓ Saved 1999_HOQ.xpt
→ Fetch 2001-2002: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/HOQ_B.xpt
  ✓ Saved 2001_HOQ_B.xpt
→ Fetch 2003-2004: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/HOQ_C.xpt
  ✓ Saved 2003_HOQ_C.xpt
→ Fetch 2005-2006: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/HOQ_D.xpt
  ✓ Saved 2005_HOQ_D.xpt
→ Fetch 2007-2008: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/HOQ_E.xpt
  ✓ Saved 2007_HOQ_E.xpt
→ Fetch 2009-2010: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/HOQ_F.xpt
  ✓ Saved 2009_HOQ_F.xpt
→ Fetch 2011-2012: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/HOQ_G.xpt
  ✓ Saved 2011_HOQ_G.xpt
→ Fetch 2013-2014: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/HOQ_H.xpt
  ✓ Saved 2013_HOQ_H.xpt
→ Fetch 2015-2016: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/HOQ_I.xpt
  ✓

#### HIQ/HIQS 99–18 main; 17-20, 21-23 added; — Health Insurance → INS (adult filter)

In [73]:
# --- HIQ / HIQS (Insurance): fetch 1999–2023, build INS ----------------------
from pathlib import Path
import requests, time, re
import numpy as np
import pandas as pd

HIQ_DIR = (MOD / "hiq")
HIQ_DIR.mkdir(parents=True, exist_ok=True)

BASE = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{folder}/DataFiles/{fname}"

# Cycles + folders + candidate filenames (HIQ & HIQS)
HIQ_SPECS = [
    # SDDSRVYR, label, folder, filename candidates (in priority order)
    (1,  "1999-2000", "1999", ["HIQ.xpt", "HIQS.xpt"]),
    (2,  "2001-2002", "2001", ["HIQ_B.xpt", "HIQS_B.xpt"]),
    (3,  "2003-2004", "2003", ["HIQ_C.xpt", "HIQS_C.xpt"]),
    (4,  "2005-2006", "2005", ["HIQ_D.xpt", "HIQS_D.xpt"]),
    (5,  "2007-2008", "2007", ["HIQ_E.xpt", "HIQS_E.xpt"]),
    (6,  "2009-2010", "2009", ["HIQ_F.xpt", "HIQS_F.xpt"]),
    (7,  "2011-2012", "2011", ["HIQ_G.xpt", "HIQS_G.xpt"]),
    (8,  "2013-2014", "2013", ["HIQ_H.xpt", "HIQS_H.xpt"]),
    (9,  "2015-2016", "2015", ["HIQ_I.xpt", "HIQS_I.xpt"]),
    (10, "2017-2018", "2017", ["HIQ_J.xpt", "HIQS_J.xpt"]),
    # Pre-pandemic combined (under 2017 folder)
    (11, "2017–Mar 2020 (pre-pandemic)", "2017", ["P_HIQ.xpt", "P_HIQS.xpt"]),
    # 2021–2023 combined (under 2021 folder)
    (12, "Aug 2021–Aug 2023", "2021", ["HIQ_L.xpt", "HIQS_L.xpt"]),
]

def _try_download(url: str, dest: Path, retries=3, backoff=1.6) -> bool:
    for i in range(retries):
        try:
            with requests.get(url, stream=True, timeout=45) as r:
                if r.status_code == 200:
                    with open(dest, "wb") as f:
                        for chunk in r.iter_content(1<<15):
                            if chunk: f.write(chunk)
                    return True
                if r.status_code == 404:
                    return False
        except requests.RequestException:
            pass
        time.sleep(backoff**i)
    return False

def download_hiq_all(specs=HIQ_SPECS):
    """Return list of (local_path, sddsrvyr). Tries .xpt and .XPT, HIQ then HIQS."""
    paths = []
    for cyc, label, folder, fnames in specs:
        got = False
        for fname in fnames:
            # try both .xpt and .XPT
            cand = [fname]
            cand += [fname[:-4] + ".XPT"] if fname.lower().endswith(".xpt") else [fname[:-4] + ".xpt"]
            for f in cand:
                url = BASE.format(folder=folder, fname=f)
                local = HIQ_DIR / f"{folder}_{f}"
                if local.exists() and local.stat().st_size > 0:
                    print(f"✓ Exists: {label} @ {folder}/{f}")
                    paths.append((local, cyc))
                    got = True
                    break
                print(f"→ Fetch {label}: {url}")
                ok = _try_download(url, local)
                if ok:
                    print(f"  ✓ Saved {local.name}")
                    paths.append((local, cyc))
                    got = True
                    break
            if got:
                break
        if not got:
            print(f"⚠ Missing: {label} (tried {fnames})")
    return paths

# 1) Download
hiq_paths = download_hiq_all()

# 2) Read + tag cycle + adult filter
hiq_parts = []
for p, cyc in hiq_paths:
    try:
        df = read_any(p)
        df.columns = df.columns.str.upper()
        if "SDDSRVYR" not in df.columns or df["SDDSRVYR"].isna().all():
            df["SDDSRVYR"] = cyc
        else:
            df["SDDSRVYR"] = pd.to_numeric(df["SDDSRVYR"], errors="coerce").fillna(cyc)
        # adults (≥20)
        df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        # track source so we can dedupe HIQ vs HIQS within a cycle
        df["_SRC"] = p.name.upper()
        hiq_parts.append(df)
    except Exception as e:
        print("HIQ skip", p.name, "—", e)

hiq_all = (pd.concat(hiq_parts, ignore_index=True)
           if hiq_parts else pd.DataFrame())
hiq_all.columns = hiq_all.columns.str.upper()

# 3) Deduplicate if both HIQ & HIQS exist for same SEQN+cycle
if not hiq_all.empty:
    relevant = [c for c in [
        "HIQ031A","HIQ031B","HIQ031C","HIQ031D","HIQ031E","HIQ031F","HIQ031G","HIQ031H","HIQ031I","HIQ011",
        "HID030A","HID030B","HID030C","HID030D","HID010"
    ] if c in hiq_all.columns]
    hiq_all["_NN"] = hiq_all[relevant].notna().sum(axis=1) if relevant else 0
    hiq_all["_PREF"] = hiq_all["_SRC"].str.contains(r"\bHIQ", case=False, regex=True).astype(int)  # prefer HIQ over HIQS
    hiq_all = (hiq_all
               .sort_values(["SEQN","SDDSRVYR","_PREF","_NN"], ascending=[True,True,False,False])
               .drop_duplicates(subset=["SEQN","SDDSRVYR"], keep="first")
               .drop(columns=["_SRC","_NN","_PREF"], errors="ignore"))

# 4) Build INS using your existing logic (safe column accessor)
def s(df: pd.DataFrame, col: str, default_val=np.nan):
    return df[col] if col in df.columns else pd.Series(default_val, index=df.index)

ins = pd.DataFrame({"SEQN": s(hiq_all, "SEQN")})
if "SDDSRVYR" in hiq_all:
    ins["SDDSRVYR"] = hiq_all["SDDSRVYR"]
ins["INS"] = np.nan

# Private
cond_private = s(hiq_all,"HIQ031A").eq(14) | s(hiq_all,"HID030A").eq(1)
ins.loc[cond_private, "INS"] = 1

# Medicare
cond_med = (s(hiq_all,"HIQ031B").eq(15) & ~s(hiq_all,"HIQ031D").eq(17) & ~s(hiq_all,"HIQ031E").eq(18)) | \
           (s(hiq_all,"HID030B").eq(1) & ~s(hiq_all,"HID030C").eq(1))
ins.loc[cond_med, "INS"] = 2

# Medicaid only or both → 3
cond_mcaid_only = ((s(hiq_all,"HIQ031D").eq(17) | s(hiq_all,"HIQ031E").eq(18)) & ~s(hiq_all,"HIQ031B").eq(15)) | \
                  (~s(hiq_all,"HID030B").eq(1) & s(hiq_all,"HID030C").eq(1))
cond_both = (s(hiq_all,"HIQ031B").eq(15) & s(hiq_all,"HIQ031D").eq(17)) | \
            (s(hiq_all,"HID030B").eq(1) & s(hiq_all,"HID030C").eq(1))
ins.loc[cond_mcaid_only | cond_both, "INS"] = 3

# Other
other_cols = [c for c in ["HIQ031C","HIQ031F","HIQ031G","HIQ031H","HIQ031I","HID030D"] if c in hiq_all.columns]
cond_other = hiq_all[other_cols].eq(1).any(axis=1) if other_cols else pd.Series(False, index=hiq_all.index)
ins.loc[cond_other, "INS"] = 5

# None
none_conds = []
if "HIQ011" in hiq_all: none_conds.append(hiq_all["HIQ011"].eq(2))
if "HID010" in hiq_all: none_conds.append(hiq_all["HID010"].eq(2))
if none_conds:
    from functools import reduce
    ins.loc[reduce(lambda a,b: a|b, none_conds), "INS"] = 0

# Finalize
ins = ensure_seqn(ins)
ins["INS"] = pd.to_numeric(ins["INS"], errors="coerce").astype(pd.Int64Dtype())

print("INS shape:", ins.shape)
print("\nSDDSRVYR counts (in HIQ/HIQS source):")
print(ins["SDDSRVYR"].value_counts(dropna=False).sort_index())


✓ Exists: 1999-2000 @ 1999/HIQ.xpt
✓ Exists: 2001-2002 @ 2001/HIQ_B.xpt
✓ Exists: 2003-2004 @ 2003/HIQ_C.xpt
✓ Exists: 2005-2006 @ 2005/HIQ_D.xpt
✓ Exists: 2007-2008 @ 2007/HIQ_E.xpt
✓ Exists: 2009-2010 @ 2009/HIQ_F.xpt
✓ Exists: 2011-2012 @ 2011/HIQ_G.xpt
✓ Exists: 2013-2014 @ 2013/HIQ_H.xpt
✓ Exists: 2015-2016 @ 2015/HIQ_I.xpt
✓ Exists: 2017-2018 @ 2017/HIQ_J.xpt
✓ Exists: 2017–Mar 2020 (pre-pandemic) @ 2017/P_HIQ.xpt
✓ Exists: Aug 2021–Aug 2023 @ 2021/HIQ_L.xpt
INS shape: (72122, 3)

SDDSRVYR counts (in HIQ/HIQS source):
SDDSRVYR
1     4880
2     5411
3     5041
4     4979
5     5935
6     6218
7     5560
8     5769
9     5719
10    5569
11    9232
12    7809
Name: count, dtype: int64


#### FSQ/FSQS — SNAP & FS; 99–18 main; 17-20, 21-23 added (adult filter)

In [117]:
# --- FSQ/FSQS (SNAP & Food Security): fetch & build with HH + Adult (1999–2020) ---
from pathlib import Path
import requests, time
import numpy as np
import pandas as pd

FSQ_DIR = (MOD / "fsq")
FSQ_DIR.mkdir(parents=True, exist_ok=True)

BASE = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{folder}/DataFiles/{fname}"

# Cycles: 1..10 (1999–2018), 11 = 2017–Mar 2020 (pre-pandemic).
# FYI: FSQ_L (2021–2023) not posted publicly → will be skipped.
FSQ_SPECS = [
    (1,  "1999-2000", "1999", ["FSQ.xpt",   "FSQS.xpt"]),
    (2,  "2001-2002", "2001", ["FSQ_B.xpt", "FSQS_B.xpt"]),
    (3,  "2003-2004", "2003", ["FSQ_C.xpt", "FSQS_C.xpt"]),
    (4,  "2005-2006", "2005", ["FSQ_D.xpt", "FSQS_D.xpt"]),
    (5,  "2007-2008", "2007", ["FSQ_E.xpt", "FSQS_E.xpt"]),
    (6,  "2009-2010", "2009", ["FSQ_F.xpt", "FSQS_F.xpt"]),
    (7,  "2011-2012", "2011", ["FSQ_G.xpt", "FSQS_G.xpt"]),
    (8,  "2013-2014", "2013", ["FSQ_H.xpt", "FSQS_H.xpt"]),
    (9,  "2015-2016", "2015", ["FSQ_I.xpt", "FSQS_I.xpt"]),
    (10, "2017-2018", "2017", ["FSQ_J.xpt", "FSQS_J.xpt"]),
    (11, "2017–Mar 2020 (pre-pandemic)", "2017", ["P_FSQ.xpt", "P_FSQS.xpt"]),
    # (12, "Aug 2021–Aug 2023", "2021", ["FSQ_L.xpt","FSQS_L.xpt"]),  # not published
]

def _try_download(url: str, dest: Path, retries=3, backoff=1.6) -> bool:
    for i in range(retries):
        try:
            with requests.get(url, stream=True, timeout=45) as r:
                if r.status_code == 200:
                    with open(dest, "wb") as f:
                        for chunk in r.iter_content(1<<15):
                            if chunk: f.write(chunk)
                    return True
                if r.status_code == 404:
                    return False
        except requests.RequestException:
            pass
        time.sleep(backoff**i)
    return False

def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = pd.Index([
        (c.decode("utf-8","ignore") if isinstance(c,(bytes,bytearray)) else str(c)).upper()
        for c in df.columns
    ])
    return df

def _attach_cycle(df: pd.DataFrame, cyc: int) -> pd.DataFrame:
    df = df.copy()
    if "SDDSRVYR" in df.columns:
        ser = pd.to_numeric(df["SDDSRVYR"], errors="coerce")
        df["SDDSRVYR"] = ser.fillna(cyc)
    else:
        df["SDDSRVYR"] = cyc
    return df

def download_fsq_all(specs=FSQ_SPECS):
    out = []
    for cyc, label, folder, fnames in specs:
        got = False
        for fname in fnames:
            cand = [fname, fname[:-4] + (".XPT" if fname.lower().endswith(".xpt") else ".xpt")]
            for f in cand:
                url = BASE.format(folder=folder, fname=f)
                local = FSQ_DIR / f"{folder}_{f}"
                if local.exists() and local.stat().st_size > 0:
                    print(f"✓ Exists: {label} @ {folder}/{f}")
                    out.append((local, cyc)); got = True; break
                print(f"→ Fetch {label}: {url}")
                if _try_download(url, local):
                    print(f"  ✓ Saved {local.name}")
                    out.append((local, cyc)); got = True; break
            if got: break
        if not got:
            print(f"ℹ Skipped: {label} — no public FSQ/FSQS at expected URLs")
    return out

# 1) Download & read + adult filter
fsq_paths = download_fsq_all()

fsq_parts = []
for p, cyc in fsq_paths:
    try:
        df = read_any(p)
        df = _normalize_columns(df)
        df = _attach_cycle(df, cyc)
        if "SEQN" not in df.columns:
            print("FSQ skip", p.name, "— no SEQN"); continue
        if 'age20' in globals() and isinstance(age20, pd.DataFrame) and "SEQN" in age20.columns:
            df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        df["_SRC"] = p.name.upper()  # to prefer FSQ over FSQS later
        fsq_parts.append(df)
    except Exception as e:
        print("FSQ skip", p.name, "—", e)

fsq_all = pd.concat(fsq_parts, ignore_index=True) if fsq_parts else pd.DataFrame()
fsq_all = _normalize_columns(fsq_all)

if fsq_all.empty or not {"SEQN","SDDSRVYR"}.issubset(fsq_all.columns):
    raise RuntimeError("No FSQ data loaded or missing SEQN/SDDSRVYR. Check downloads.")

# 2) Dedupe FSQ vs FSQS within SEQN+cycle (prefer FSQ, then richer row)
relevant = [c for c in [
    "FSDHH","HHFDSEC","ADFDSEC","FSQ012","FSQ165","FSQ170","FSQ171",
    "FSD170N","FSD180","FSD190","FSD200"
] if c in fsq_all.columns]
fsq_all["_NN"] = fsq_all[relevant].notna().sum(axis=1) if relevant else 0
fsq_all["_PREF"] = fsq_all["_SRC"].str.contains(r"\bFSQ", case=False, regex=True).astype(int)
fsq_all = (fsq_all.sort_values(["SEQN","SDDSRVYR","_PREF","_NN"], ascending=[True,True,False,False])
                   .drop_duplicates(subset=["SEQN","SDDSRVYR"], keep="first")
                   .drop(columns=["_SRC","_NN"], errors="ignore"))

# 3) Build BOTH household & adult FS + SNAP
def col_any(df: pd.DataFrame, names: list[str]):
    for n in names:
        if n in df.columns:
            return df[n]
    return pd.Series(np.nan, index=df.index)

def num_any(df: pd.DataFrame, names: list[str]):
    return pd.to_numeric(col_any(df, names), errors="coerce")

snap = fsq_all[["SEQN","SDDSRVYR"]].copy()

# Raw 4-level categories
snap["FSDHH"]   = pd.to_numeric(fsq_all.get("FSDHH"), errors="coerce")  # household (2003+)
snap["HHFDSEC"] = num_any(fsq_all, ["HHFDSEC","HHFDSEC ","HHfdsec"])
snap["ADFDSEC"] = num_any(fsq_all, ["ADFDSEC","ADFDSEC ","ADfdsec"])

# Household 4-level (harmonized):
# - Prefer FSDHH when present
# - In cycles 1–2, if FSDHH is missing but HHFDSEC exists, use HHFDSEC
snap["FS_HH4"] = snap["FSDHH"]
mask_early = snap["SDDSRVYR"].isin([1,2]) & snap["FS_HH4"].isna() & snap["HHFDSEC"].notna()
snap.loc[mask_early, "FS_HH4"] = snap.loc[mask_early, "HHFDSEC"]

# Adult 4-level
snap["FS_ADULT4"] = snap["ADFDSEC"]

# Binary mappings (1,2 → 0; 3,4 → 1)
map_bin = {1:0, 2:0, 3:1, 4:1}
snap["FS_HH"]    = snap["FS_HH4"].map(map_bin).astype("Int64")
snap["FS_ADULT"] = snap["FS_ADULT4"].map(map_bin).astype("Int64")

# Final FS: prefer household; else adult
snap["FS_FINAL"] = snap["FS_HH"].where(snap["FS_HH"].notna(), snap["FS_ADULT"]).astype("Int64")

# Provenance flags
snap["FS_SOURCE_HH"] = pd.Series(pd.NA, index=snap.index, dtype="string")
snap.loc[snap["FSDHH"].notna(), "FS_SOURCE_HH"] = "FSDHH"
snap.loc[snap["FSDHH"].isna() & snap["HHFDSEC"].notna() & snap["SDDSRVYR"].isin([1,2]), "FS_SOURCE_HH"] = "HHFDSEC"

snap["FS_SOURCE_FINAL"] = pd.Series(pd.NA, index=snap.index, dtype="string")
snap.loc[snap["FS_FINAL"].notna() & snap["FS_HH"].notna(), "FS_SOURCE_FINAL"] = "household"
snap.loc[snap["FS_FINAL"].notna() & snap["FS_HH"].isna() & snap["FS_ADULT"].notna(), "FS_SOURCE_FINAL"] = "adult"

# --- SNAP (ENHANCED: adds 1999–2002 variables FSD180/FSD190 alongside existing ones) ---
snap["SNAP"] = pd.Series(pd.NA, index=snap.index, dtype="Int64")

# Pull numeric helpers (some files have trailing spaces; we guard for that)
n_FSD170N = num_any(fsq_all, ["FSD170N","FSD170N "])     # count authorized persons → ≥1 implies SNAP-related authorization
n_FSD190  = num_any(fsq_all, ["FSD190","FSD190 "])       # months authorized in last 12
cur_FSD200 = (col_any(fsq_all, ["FSD200"]) == 1)         # currently authorized

# YES evidence (set SNAP=1)
yes_mask = (
    (col_any(fsq_all, ["FSQ012"]) == 1) |
    (col_any(fsq_all, ["FSQ171"]) == 1) |
    (col_any(fsq_all, ["FSQ170"]) == 1) |
    (n_FSD170N.ge(1)) |
    (n_FSD190.ge(1)) |
    (col_any(fsq_all, ["FSD180"]) == 1) |   # 1999–2002: authorized in last 12 months
    (cur_FSD200)                             # currently authorized
)
snap.loc[yes_mask, "SNAP"] = 1

# NO evidence (set SNAP=0 only where not already 1)
# Be conservative: require explicit "no" PLUS no months PLUS not current PLUS no authorized persons.
no_mask = (
    (col_any(fsq_all, ["FSQ012"]) == 2) |
    (col_any(fsq_all, ["FSQ171"]) == 2) |
    ((col_any(fsq_all, ["FSQ170"]) == 2) & n_FSD170N.fillna(0).lt(1)) |
    ((col_any(fsq_all, ["FSD180"]) == 2) & n_FSD190.fillna(0).lt(1) & (~cur_FSD200) & n_FSD170N.fillna(0).lt(1))
)
snap.loc[snap["SNAP"].isna() & no_mask, "SNAP"] = 0

snap["SNAP"] = pd.to_numeric(snap["SNAP"], errors="coerce").astype("Int64")

# Types, keys, dedupe
snap = ensure_seqn(snap)  # ensures SEQN Int64 and sane ordering
for c in ["FSDHH","HHFDSEC","ADFDSEC","FS_HH4","FS_ADULT4"]:
    if c in snap.columns:
        snap[c] = pd.to_numeric(snap[c], errors="coerce").astype("Int64")
snap = snap.drop_duplicates(["SEQN","SDDSRVYR"])

# --- Diagnostics
print("SNAP/FS shape:", snap.shape)
print("By cycle (rows):")
print(snap["SDDSRVYR"].value_counts(dropna=False).sort_index())
print("\nCoverage (share non-missing):")
with pd.option_context("display.float_format", "{:.3f}".format):
    print((snap.groupby("SDDSRVYR")[["FS_HH","FS_ADULT","FS_FINAL","SNAP"]]
               .mean().rename(columns=lambda c: f"{c}_pct")))
print("\nSources used for household FS by cycle:")
print(snap.groupby(["SDDSRVYR","FS_SOURCE_HH"])["SEQN"].size().unstack(fill_value=0))


✓ Exists: 1999-2000 @ 1999/FSQ.xpt
✓ Exists: 2001-2002 @ 2001/FSQ_B.xpt
✓ Exists: 2003-2004 @ 2003/FSQ_C.xpt
✓ Exists: 2005-2006 @ 2005/FSQ_D.xpt
✓ Exists: 2007-2008 @ 2007/FSQ_E.xpt
✓ Exists: 2009-2010 @ 2009/FSQ_F.xpt
✓ Exists: 2011-2012 @ 2011/FSQ_G.xpt
✓ Exists: 2013-2014 @ 2013/FSQ_H.xpt
✓ Exists: 2015-2016 @ 2015/FSQ_I.xpt
✓ Exists: 2017-2018 @ 2017/FSQ_J.xpt
✓ Exists: 2017–Mar 2020 (pre-pandemic) @ 2017/P_FSQ.xpt
SNAP/FS shape: (64313, 13)
By cycle (rows):
SDDSRVYR
1     4880
2     5411
3     5041
4     4979
5     5935
6     6218
7     5560
8     5769
9     5719
10    5569
11    9232
Name: count, dtype: int64

Coverage (share non-missing):
          FS_HH_pct  FS_ADULT_pct  FS_FINAL_pct  SNAP_pct
SDDSRVYR                                                 
1             0.125         0.115         0.125     1.000
2             0.132         0.124         0.132     1.000
3             0.133          <NA>         0.133     0.127
4             0.139          <NA>         0.139     0.1

In [118]:
# --- Concise diagnostics: coverage vs prevalence ------------------------------
print("SNAP/FS shape:", snap.shape)
print("Rows by cycle:\n", snap["SDDSRVYR"].value_counts().sort_index(), "\n")

# Coverage (non-missing share) and Prevalence (share==1) for final outputs
final_cols = ["FS_HH","FS_ADULT","FS_FINAL","SNAP"]
cov = snap.groupby("SDDSRVYR")[final_cols].agg(lambda s: s.notna().mean())
prev = snap.groupby("SDDSRVYR")[final_cols].agg(lambda s: (s==1).mean())
with pd.option_context("display.float_format","{:.3f}".format):
    print("Final COVERAGE (non-missing share):\n", cov.rename(columns=lambda c: c+"_cov"), "\n")
    print("Final PREVALENCE (share==1):\n", prev.rename(columns=lambda c: c+"_prev"), "\n")

# Household FS source usage
print("Household FS source by cycle:")
print(snap.groupby(["SDDSRVYR","FS_SOURCE_HH"])["SEQN"].size().unstack(fill_value=0), "\n")

# SNAP source columns — coverage + positive shares
src = ["FSQ012","FSQ170","FSQ171","FSQ165","FSD170N","FSD180","FSD190","FSD200"]
g = fsq_all.groupby("SDDSRVYR")
cov_src = pd.concat(
    [g[c].apply(lambda s: s.notna().mean()).rename(c+"_cov") for c in src if c in fsq_all.columns],
    axis=1
)
pos_src = pd.concat([
    (g["FSQ012"].apply(lambda s: (s==1).mean()) if "FSQ012" in fsq_all.columns else pd.Series(dtype=float)).rename("FSQ012_yes"),
    (g["FSQ170"].apply(lambda s: (s==1).mean()) if "FSQ170" in fsq_all.columns else pd.Series(dtype=float)).rename("FSQ170_yes"),
    (g["FSQ171"].apply(lambda s: (s==1).mean()) if "FSQ171" in fsq_all.columns else pd.Series(dtype=float)).rename("FSQ171_yes"),
    (g["FSD180"].apply(lambda s: (s==1).mean()) if "FSD180" in fsq_all.columns else pd.Series(dtype=float)).rename("FSD180_yes"),
    (g["FSD200"].apply(lambda s: (s==1).mean()) if "FSD200" in fsq_all.columns else pd.Series(dtype=float)).rename("FSD200_yes"),
    (g["FSD170N"].apply(lambda s: pd.to_numeric(s, errors="coerce").ge(1).mean()) if "FSD170N" in fsq_all.columns else pd.Series(dtype=float)).rename("FSD170N_ge1"),
    (g["FSD190"].apply(lambda s: pd.to_numeric(s, errors="coerce").ge(1).mean()) if "FSD190" in fsq_all.columns else pd.Series(dtype=float)).rename("FSD190_ge1"),
    (g["FSQ165"].apply(lambda s: (s==2).mean()) if "FSQ165" in fsq_all.columns else pd.Series(dtype=float)).rename("FSQ165_no"),
], axis=1)
with pd.option_context("display.float_format","{:.3f}".format):
    print("SNAP sources — coverage by cycle:\n", cov_src, "\n")
    print("SNAP sources — positive/evidence shares by cycle:\n", pos_src)


SNAP/FS shape: (64313, 13)
Rows by cycle:
 SDDSRVYR
1     4880
2     5411
3     5041
4     4979
5     5935
6     6218
7     5560
8     5769
9     5719
10    5569
11    9232
Name: count, dtype: int64 

Final COVERAGE (non-missing share):
           FS_HH_cov  FS_ADULT_cov  FS_FINAL_cov  SNAP_cov
SDDSRVYR                                                 
1             0.972         0.972         0.972     0.098
2             0.934         0.934         0.934     0.085
3             0.956         0.000         0.956     0.981
4             0.989         0.000         0.989     0.988
5             0.991         0.000         0.991     0.232
6             0.988         0.000         0.988     0.249
7             0.995         0.000         0.995     0.297
8             0.987         0.000         0.987     0.296
9             0.965         0.000         0.965     0.408
10            0.942         0.000         0.942     0.382
11            0.925         0.000         0.925     0.396 

Final 

In [144]:
# --- FSQ/FSQS (SNAP & Food Security): fetch & build with HH + Adult (1999–2020) ---
from pathlib import Path
import requests, time
import numpy as np
import pandas as pd

FSQ_DIR = (MOD / "fsq"); FSQ_DIR.mkdir(parents=True, exist_ok=True)
BASE = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{folder}/DataFiles/{fname}"

FSQ_SPECS = [
    (1,  "1999-2000", "1999", ["FSQ.xpt",   "FSQS.xpt"]),
    (2,  "2001-2002", "2001", ["FSQ_B.xpt", "FSQS_B.xpt"]),
    (3,  "2003-2004", "2003", ["FSQ_C.xpt", "FSQS_C.xpt"]),
    (4,  "2005-2006", "2005", ["FSQ_D.xpt", "FSQS_D.xpt"]),
    (5,  "2007-2008", "2007", ["FSQ_E.xpt", "FSQS_E.xpt"]),
    (6,  "2009-2010", "2009", ["FSQ_F.xpt", "FSQS_F.xpt"]),
    (7,  "2011-2012", "2011", ["FSQ_G.xpt", "FSQS_G.xpt"]),
    (8,  "2013-2014", "2013", ["FSQ_H.xpt", "FSQS_H.xpt"]),
    (9,  "2015-2016", "2015", ["FSQ_I.xpt", "FSQS_I.xpt"]),
    (10, "2017-2018", "2017", ["FSQ_J.xpt", "FSQS_J.xpt"]),
    (11, "2017–Mar 2020 (pre-pandemic)", "2017", ["P_FSQ.xpt", "P_FSQS.xpt"]),
    # (12, "Aug 2021–Aug 2023", "2021", ["FSQ_L.xpt","FSQS_L.xpt"]),  # not public
]

def _try_download(url: str, dest: Path, retries=3, backoff=1.6) -> bool:
    for i in range(retries):
        try:
            with requests.get(url, stream=True, timeout=45) as r:
                if r.status_code == 200:
                    with open(dest, "wb") as f:
                        for chunk in r.iter_content(1<<15):
                            if chunk: f.write(chunk)
                    return True
                if r.status_code == 404:
                    return False
        except requests.RequestException:
            pass
        time.sleep(backoff**i)
    return False

def _normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = pd.Index([
        (c.decode("utf-8","ignore") if isinstance(c,(bytes,bytearray)) else str(c)).upper()
        for c in df.columns
    ])
    return df

def _attach_cycle(df: pd.DataFrame, cyc: int) -> pd.DataFrame:
    df = df.copy()
    if "SDDSRVYR" in df.columns:
        ser = pd.to_numeric(df["SDDSRVYR"], errors="coerce")
        df["SDDSRVYR"] = ser.fillna(cyc)
    else:
        df["SDDSRVYR"] = cyc
    return df

def download_fsq_all(specs=FSQ_SPECS):
    out = []
    for cyc, label, folder, fnames in specs:
        got = False
        for fname in fnames:
            cand = [fname, fname[:-4] + (".XPT" if fname.lower().endswith(".xpt") else ".xpt")]
            for f in cand:
                url = BASE.format(folder=folder, fname=f)
                local = FSQ_DIR / f"{folder}_{f}"
                if local.exists() and local.stat().st_size > 0:
                    print(f"✓ Exists: {label} @ {folder}/{f}")
                    out.append((local, cyc)); got = True; break
                print(f"→ Fetch {label}: {url}")
                if _try_download(url, local):
                    print(f"  ✓ Saved {local.name}")
                    out.append((local, cyc)); got = True; break
            if got: break
        if not got:
            print(f"ℹ Skipped: {label} — no public FSQ/FSQS at expected URLs")
    return out

# 1) Download & read + adult filter
fsq_paths = download_fsq_all()

fsq_parts = []
for p, cyc in fsq_paths:
    try:
        df = read_any(p)
        df = _normalize_columns(df)
        df = _attach_cycle(df, cyc)
        if "SEQN" not in df.columns:
            print("FSQ skip", p.name, "— no SEQN"); continue
        if 'age20' in globals() and isinstance(age20, pd.DataFrame) and "SEQN" in age20.columns:
            df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        df["_SRC"] = p.name.upper()
        fsq_parts.append(df)
    except Exception as e:
        print("FSQ skip", p.name, "—", e)

fsq_all = pd.concat(fsq_parts, ignore_index=True) if fsq_parts else pd.DataFrame()
fsq_all = _normalize_columns(fsq_all)
if fsq_all.empty or not {"SEQN","SDDSRVYR"}.issubset(fsq_all.columns):
    raise RuntimeError("No FSQ data loaded or missing SEQN/SDDSRVYR.")

# 2) Dedupe FSQ vs FSQS by SEQN×cycle (prefer FSQ, then more-filled)
relevant = [c for c in [
    "FSDHH","HHFDSEC","ADFDSEC","FSQ012","FSQ165","FSQ170","FSQ171",
    "FSD170N","FSD180","FSD190","FSD200"
] if c in fsq_all.columns]
fsq_all["_NN"]   = fsq_all[relevant].notna().sum(axis=1) if relevant else 0
fsq_all["_PREF"] = fsq_all["_SRC"].str.contains(r"\bFSQ", case=False, regex=True).astype(int)
fsq_all = (fsq_all.sort_values(["SEQN","SDDSRVYR","_PREF","_NN"], ascending=[True,True,False,False])
                   .drop_duplicates(["SEQN","SDDSRVYR"], keep="first")
                   .drop(columns=["_SRC","_NN"], errors="ignore"))

# 3) Build BOTH household & adult FS (4-level + binary + provenance) and SNAP + SNAP_SOURCE
def col_any(df: pd.DataFrame, names: list[str]):
    for n in names:
        if n in df.columns:
            return df[n]
    return pd.Series(np.nan, index=df.index)

def num_any(df: pd.DataFrame, names: list[str]):
    return pd.to_numeric(col_any(df, names), errors="coerce")

snap = fsq_all[["SEQN","SDDSRVYR"]].copy()

# Raw 4-level categories (keep as extras)
snap["FSDHH"]   = pd.to_numeric(fsq_all.get("FSDHH"), errors="coerce")      # household (2003+)
snap["HHFDSEC"] = num_any(fsq_all, ["HHFDSEC","HHFDSEC ","HHfdsec"])        # early cycles
snap["ADFDSEC"] = num_any(fsq_all, ["ADFDSEC","ADFDSEC ","ADfdsec"])        # early cycles

# Harmonized 4-level
snap["FS_HH4"] = snap["FSDHH"]
mask_early = snap["SDDSRVYR"].isin([1,2]) & snap["FS_HH4"].isna() & snap["HHFDSEC"].notna()
snap.loc[mask_early, "FS_HH4"] = snap.loc[mask_early, "HHFDSEC"]
snap["FS_ADULT4"] = snap["ADFDSEC"]

# Binary mappings
map_bin = {1:0, 2:0, 3:1, 4:1}
snap["FS_HH"]    = snap["FS_HH4"].map(map_bin).astype("Int64")
snap["FS_ADULT"] = snap["FS_ADULT4"].map(map_bin).astype("Int64")
snap["FS_FINAL"] = snap["FS_HH"].where(snap["FS_HH"].notna(), snap["FS_ADULT"]).astype("Int64")

# Provenance
snap["FS_SOURCE_HH"] = pd.Series(pd.NA, index=snap.index, dtype="string")
snap.loc[snap["FSDHH"].notna(), "FS_SOURCE_HH"] = "FSDHH"
snap.loc[snap["FSDHH"].isna() & snap["HHFDSEC"].notna() & snap["SDDSRVYR"].isin([1,2]), "FS_SOURCE_HH"] = "HHFDSEC"

snap["FS_SOURCE_FINAL"] = pd.Series(pd.NA, index=snap.index, dtype="string")
snap.loc[snap["FS_FINAL"].notna() & snap["FS_HH"].notna(), "FS_SOURCE_FINAL"] = "household"
snap.loc[snap["FS_FINAL"].notna() & snap["FS_HH"].isna() & snap["FS_ADULT"].notna(), "FS_SOURCE_FINAL"] = "adult"

# --- SNAP with priority FSD180 for 1999–2002 + SNAP_SOURCE --------------------
snap["SNAP"] = pd.Series(pd.NA, index=snap.index, dtype="Int64")
snap["SNAP_SOURCE"] = pd.Series(pd.NA, index=snap.index, dtype="string")

q012 = col_any(fsq_all, ["FSQ012"])
q171 = col_any(fsq_all, ["FSQ171"])
q170 = col_any(fsq_all, ["FSQ170"])
q165 = col_any(fsq_all, ["FSQ165"])

n170_raw = col_any(fsq_all, ["FSD170N","FSD170N "])
n190_raw = col_any(fsq_all, ["FSD190","FSD190 "])
s180     = col_any(fsq_all, ["FSD180"])
s200     = col_any(fsq_all, ["FSD200"])

# treat 77/99 as missing for counts
n170 = pd.to_numeric(n170_raw, errors="coerce").where(lambda x: x.between(1,30), np.nan)
n190 = pd.to_numeric(n190_raw, errors="coerce").where(lambda x: x.between(1,12), np.nan)

early = snap["SDDSRVYR"].isin([1,2])
late  = ~early

# Early cycles: prioritize FSD180
m = early & (s180 == 1)
snap.loc[m, "SNAP"] = 1; snap.loc[m, "SNAP_SOURCE"] = "FSD180"
m = early & (s180 == 2)
snap.loc[m & snap["SNAP"].isna(), "SNAP"] = 0; snap.loc[m & snap["SNAP"].isna(), "SNAP_SOURCE"] = "FSD180"

m = early & snap["SNAP"].isna() & ((s200 == 1) | n190.ge(1) | n170.ge(1))
snap.loc[m, "SNAP"] = 1; snap.loc[m, "SNAP_SOURCE"] = "FSD200/FSD190/FSD170N"

m = early & snap["SNAP"].isna() & ((s200 == 2) & n190.fillna(0).lt(1) & n170.fillna(0).lt(1))
snap.loc[m, "SNAP"] = 0; snap.loc[m, "SNAP_SOURCE"] = "FSD200(no)+no months/people"

# Later cycles: FSQ items, with fallbacks
m = late & (q012 == 1)
snap.loc[m, "SNAP"] = 1; snap.loc[m, "SNAP_SOURCE"] = "FSQ012"
m = late & (q012 == 2)
snap.loc[m & snap["SNAP"].isna(), "SNAP"] = 0; snap.loc[m & snap["SNAP"].isna(), "SNAP_SOURCE"] = "FSQ012"

m = late & snap["SNAP"].isna() & (q171 == 1)
snap.loc[m, "SNAP"] = 1; snap.loc[m, "SNAP_SOURCE"] = "FSQ171"
m = late & snap["SNAP"].isna() & (q171 == 2)
snap.loc[m, "SNAP"] = 0; snap.loc[m, "SNAP_SOURCE"] = "FSQ171"

m = late & snap["SNAP"].isna() & (q170 == 1)
snap.loc[m, "SNAP"] = 1; snap.loc[m, "SNAP_SOURCE"] = "FSQ170"
m = late & snap["SNAP"].isna() & (q170 == 2) & n170.fillna(0).lt(1)
snap.loc[m, "SNAP"] = 0; snap.loc[m, "SNAP_SOURCE"] = "FSQ170(no)+FSD170N<1"

m = late & snap["SNAP"].isna() & ((n170.ge(1)) | (n190.ge(1)) | (s200 == 1))
snap.loc[m, "SNAP"] = 1; snap.loc[m, "SNAP_SOURCE"] = "FSD170N/190/200"

m = late & snap["SNAP"].isna() & (q165 == 2)
snap.loc[m, "SNAP"] = 0; snap.loc[m, "SNAP_SOURCE"] = "FSQ165(no)"

snap["SNAP"] = pd.to_numeric(snap["SNAP"], errors="coerce").astype("Int64")

# Types & dedupe
snap = ensure_seqn(snap)
for c in ["FSDHH","HHFDSEC","ADFDSEC","FS_HH4","FS_ADULT4"]:
    if c in snap.columns:
        snap[c] = pd.to_numeric(snap[c], errors="coerce").astype("Int64")
snap = snap.drop_duplicates(["SEQN","SDDSRVYR"])

# --- Diagnostics (minimal)
print("SNAP/FS shape:", snap.shape)
print("By cycle (rows):")
print(snap["SDDSRVYR"].value_counts(dropna=False).sort_index())
print("\nFS/SNAP coverage (share non-missing) by cycle:")
with pd.option_context("display.float_format", "{:.3f}".format):
    print((snap.groupby("SDDSRVYR")[["FS_HH","FS_ADULT","FS_FINAL","SNAP"]]
             .agg(lambda s: s.notna().mean())).rename(columns=lambda c: c+"_cov"))
print("\nFS source by cycle:")
print(snap.groupby(["SDDSRVYR","FS_SOURCE_HH"])["SEQN"].size().unstack(fill_value=0))


✓ Exists: 1999-2000 @ 1999/FSQ.xpt
✓ Exists: 2001-2002 @ 2001/FSQ_B.xpt
✓ Exists: 2003-2004 @ 2003/FSQ_C.xpt
✓ Exists: 2005-2006 @ 2005/FSQ_D.xpt
✓ Exists: 2007-2008 @ 2007/FSQ_E.xpt
✓ Exists: 2009-2010 @ 2009/FSQ_F.xpt
✓ Exists: 2011-2012 @ 2011/FSQ_G.xpt
✓ Exists: 2013-2014 @ 2013/FSQ_H.xpt
✓ Exists: 2015-2016 @ 2015/FSQ_I.xpt
✓ Exists: 2017-2018 @ 2017/FSQ_J.xpt
✓ Exists: 2017–Mar 2020 (pre-pandemic) @ 2017/P_FSQ.xpt
SNAP/FS shape: (64313, 14)
By cycle (rows):
SDDSRVYR
1     4880
2     5411
3     5041
4     4979
5     5935
6     6218
7     5560
8     5769
9     5719
10    5569
11    9232
Name: count, dtype: int64

FS/SNAP coverage (share non-missing) by cycle:
          FS_HH_cov  FS_ADULT_cov  FS_FINAL_cov  SNAP_cov
SDDSRVYR                                                 
1             0.972         0.972         0.972     0.097
2             0.934         0.934         0.934     0.085
3             0.956         0.000         0.956     0.981
4             0.989         0.000     

In [145]:
snap.head(100)

Unnamed: 0,SEQN,SDDSRVYR,FSDHH,HHFDSEC,ADFDSEC,FS_HH4,FS_ADULT4,FS_HH,FS_ADULT,FS_FINAL,FS_SOURCE_HH,FS_SOURCE_FINAL,SNAP,SNAP_SOURCE
0,2,1,,1,1,1,1,0,0,0,HHFDSEC,household,,
1,5,1,,1,1,1,1,0,0,0,HHFDSEC,household,,
2,7,1,,,,,,,,,,,,
3,10,1,,,,,,,,,,,,
4,12,1,,1,1,1,1,0,0,0,HHFDSEC,household,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,192,1,,2,2,2,2,0,0,0,HHFDSEC,household,,
96,193,1,,1,1,1,1,0,0,0,HHFDSEC,household,,
97,194,1,,1,1,1,1,0,0,0,HHFDSEC,household,,
98,195,1,,1,1,1,1,0,0,0,HHFDSEC,household,,


##### 
- check snap missingness 

In [146]:
# SNAP missingness by cycle (counts + %), and prevalence
import numpy as np
import pandas as pd

snap_missing = (
    snap.groupby("SDDSRVYR")
        .agg(
            n=("SEQN","size"),
            nonmiss=("SNAP", lambda s: s.notna().sum()),
            missing=("SNAP", lambda s: s.isna().sum()),
            nonmiss_pct=("SNAP", lambda s: s.notna().mean()),
            prev_all=("SNAP", lambda s: s.eq(1).mean()),  # prevalence over all rows (incl. missing)
            prev_nonmiss=("SNAP", lambda s: s[s.notna()].eq(1).mean() if s.notna().any() else np.nan)
        )
)

with pd.option_context("display.float_format","{:.3f}".format):
    print(snap_missing)


             n  nonmiss  missing  nonmiss_pct  prev_all  prev_nonmiss
SDDSRVYR                                                             
1         4880      475     4405        0.097     0.589         0.589
2         5411      460     4951        0.085     0.663         0.663
3         5041     4946       95        0.981     0.127         0.127
4         4979     4918       61        0.988     0.111         0.111
5         5935     5880       55        0.991     0.157         0.157
6         6218     6144       74        0.988     0.198         0.198
7         5560     5514       46        0.992     0.236         0.236
8         5769     5685       84        0.985     0.224         0.224
9         5719     5487      232        0.959     0.259         0.259
10        5569     5201      368        0.934     0.237         0.237
11        9232     8483      749        0.919     0.245         0.245


In [147]:
#### Quick audit — cycles seen & coverage if you want a pulse check

In [148]:
# === Audit helpers ============================================================
def _as_int64(s): 
    return pd.to_numeric(s, errors="coerce").astype("Int64")

def _cycles(df: pd.DataFrame, name: str):
    if df is None or df.empty: print(f"{name} cycles: (empty)"); return
    if "SDDSRVYR" in df.columns and df["SDDSRVYR"].notna().any():
        cyc = pd.to_numeric(df["SDDSRVYR"], errors="coerce").dropna().astype(int).unique()
        print(f"{name} cycles:", sorted(cyc))
    else:
        print(f"{name} cycles: (none)")

def _coverage(base: pd.DataFrame, addon: pd.DataFrame, name: str):
    if addon is None or addon.empty: print(f"{name:>4} | (addon empty)"); return
    b = base.copy(); a = addon.copy()
    if "SEQN" in b: b["SEQN"] = _as_int64(b["SEQN"])
    if "SEQN" in a: a["SEQN"] = _as_int64(a["SEQN"])
    base_u  = b["SEQN"].dropna().nunique()
    add_u   = a["SEQN"].dropna().nunique()
    inner_u = b.merge(a[["SEQN"]].drop_duplicates(), on="SEQN", how="inner")["SEQN"].nunique()
    print(f"{name:>4} | base={base_u:,} | addon(seqn)={add_u:,} | inner keeps={inner_u:,}")

# pick base to compare to
_base_for_audit = base if "base" in globals() else (master if "master" in globals() else sdoh)

for (n, df_) in [("SDOH", sdoh), ("OCQ", ocq), ("HOQ", hoq_all), ("HIQ", ins), ("FSQ", snap)]:
    _cycles(df_, n)
    _coverage(_base_for_audit, df_, n)


SDOH cycles: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(12), np.int64(66)]
SDOH | base=128,809 | addon(seqn)=128,809 | inner keeps=128,809
OCQ cycles: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)]
 OCQ | base=128,809 | addon(seqn)=72,122 | inner keeps=72,122
HOQ cycles: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(12)]
 HOQ | base=128,809 | addon(seqn)=62,890 | inner keeps=62,890
HIQ cycles: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12)]
 HIQ | base=128,809 | addon(seqn)=72,122 | inner keeps=72,122
FSQ cycles: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5

## Merge these modules onto your base

In [175]:
# === Merge SDOH + modules (left-join with upsert) =============================
master = ensure_seqn(base)
if master["SEQN"].dtype != "Int64":
    master["SEQN"] = pd.to_numeric(master["SEQN"], errors="coerce").astype("Int64")

# SDOH first
sdoh_cols = ["SEQN","PIR","PIR_CAT","INDFMINC","EDU","EDU_CAT","RACE_ETH","MARITAL","MARITAL_CAT"]
sdoh_piece = sdoh[[c for c in sdoh_cols if c in sdoh.columns]].drop_duplicates("SEQN")

def _left_upsert(m: pd.DataFrame, piece: pd.DataFrame, tag: str):
    if piece is None or piece.empty:
        print(f"ℹ️ {tag}: empty — skipped"); 
        return m
    p = ensure_seqn(piece)
    if p["SEQN"].duplicated().any():
        dups = p.loc[p["SEQN"].duplicated(), "SEQN"].unique()[:5]
        raise RuntimeError(f"{tag}: duplicate SEQN detected (e.g., {dups.tolist()} …).")
    overlap = (set(p.columns) & set(m.columns)) - {"SEQN"}
    tmp = m.merge(p, on="SEQN", how="left", validate="one_to_one", suffixes=("", f"_{tag.lower()}"))
    for c in sorted(overlap):
        tmp[c] = tmp[c].fillna(tmp[f"{c}_{tag.lower()}"])
        tmp.drop(columns=[f"{c}_{tag.lower()}"], inplace=True)
    return tmp

if not sdoh_piece.empty:
    master = _left_upsert(master, sdoh_piece, "SDOH")
else:
    print("ℹ️ SDOH: empty — skipped")

# Modules (including FSQ built in Cell A)
#pieces = {
#    "OCQ": ocq[[c for c in ["SEQN","EMPLOY","UNEMPLOYMENT"] if c in ocq.columns]],
#    "HOQ": hoq_all[[c for c in ["SEQN","HOD050","HOQ065"] if c in hoq_all.columns]],
#    "HIQ": ins[[c for c in ["SEQN","INS"] if c in ins.columns]],
#    "FSQ": snap[[c for c in ["SEQN","SNAP","FSDHH","FS", "FS_HH", "FS_ADULT", "FS_FINAL"] if c in snap.columns]],
#}

# --- Modules (incl. FSQ with extras; alias FS := FS_FINAL if FS missing) -----
# Make a safe copy so we can add the FS alias without side effects
snap_use = snap.copy()
if "FS" not in snap_use.columns and "FS_FINAL" in snap_use.columns:
    snap_use["FS"] = snap_use["FS_FINAL"]

fs_cols_base  = ["SEQN", "SNAP", "FSDHH", "FS", "FS_HH", "FS_ADULT", "FS_FINAL"]
fs_cols_extra = ["HHFDSEC", "ADFDSEC", "FS_HH4", "FS_ADULT4",
                 "FS_SOURCE_HH", "FS_SOURCE_FINAL", "SNAP_SOURCE"]
fs_keep = [c for c in (fs_cols_base + fs_cols_extra) if c in snap_use.columns]

pieces = {
    "OCQ": ocq[[c for c in ["SEQN","EMPLOY","UNEMPLOYMENT"] if c in ocq.columns]],
    "HOQ": hoq_all[[c for c in ["SEQN","HOD050","HOQ065"] if c in hoq_all.columns]],
    "HIQ": ins[[c for c in ["SEQN","INS"] if c in ins.columns]],
    "FSQ": snap_use[fs_keep],
}



for name, piece in pieces.items():
    master = _left_upsert(master, piece, name)

print("Merged shape:", master.shape)


Merged shape: (128809, 82)


In [176]:
# rename 
demo_mt_cov_dp_sdoh = master
del master  # optional: drop old handle

#### save final merged table 

In [177]:
# Save final merged table
(out := OUT).mkdir(parents=True, exist_ok=True)

demo_mt_cov_dp_sdoh.to_parquet(out / "demo_mt_cov_dp_sdoh.parquet", index=False)
# demo_mt_cv_dp_sdoh.to_csv(out / "demo_mt_cov_dp_sdoh.csv.gz", index=False, compression="gzip")

print("Saved to:", out)

Saved to: /Users/dengshuyue/Desktop/SDOH/analysis/output


#### check FS_HH and FS_FINAL  

In [169]:
# quick check 
# demo_mt_cov_dp_sdoh["FS_HH"].head(100)

In [170]:
# 3) Coverage by cycle (which cycles actually have FS data?)
# demo_mt_cov_dp_sdoh.groupby("SDDSRVYR")["FS_FINAL"].apply(lambda s: s.notna().mean()).round(3)


#### check: list all column name 

In [165]:
# demo_mt_cov_dp_sdoh.columns.tolist()
# print("\n".join(demo_mt_cov_dp_sdoh.columns))

In [162]:
# --- Quick checks for demo_mt_cov_dp_sdoh (tailored to your columns) ----------
import numpy as np
import pandas as pd

df = demo_mt_cov_dp_sdoh

# 1) Unique key?
dups = df.duplicated(["SEQN","SDDSRVYR"]).sum()
print("Duplicate SEQN×SDDSRVYR rows:", dups)

# 2) Required vs optional SDOH fields (based on what you said you have)
required_core = [
    # OCQ
    "EMPLOY","UNEMPLOYMENT",
    # HOQ
    "HOD050","HOQ065",
    # HIQ
    "INS",
    # FSQ (core you said you have)
    "FSDHH","FS_HH","FS_ADULT","FS_FINAL","SNAP",
]
optional_extras = [
    # FS extras (only if you merged them earlier)
    "HHFDSEC","ADFDSEC","FS_HH4","FS_ADULT4","FS_SOURCE_HH","FS_SOURCE_FINAL","SNAP_SOURCE",
]

missing_core   = [c for c in required_core   if c not in df.columns]
missing_extra  = [c for c in optional_extras if c not in df.columns]
print("Missing CORE cols:", missing_core)
print("Missing OPTIONAL cols (ok to be missing):", missing_extra)

# 3) Coverage + prevalence snapshot for FS and SNAP
cov_prev = (
    df.groupby("SDDSRVYR")
      .agg(
          FS_HH_cov    = ("FS_HH",    lambda s: s.notna().mean()),
          FS_FINAL_cov = ("FS_FINAL", lambda s: s.notna().mean()),
          SNAP_cov     = ("SNAP",     lambda s: s.notna().mean()),
          # prevalence among non-missing
          FS_HH_prev_nonmiss    = ("FS_HH",    lambda s: s[s.notna()].eq(1).mean() if s.notna().any() else np.nan),
          FS_FINAL_prev_nonmiss = ("FS_FINAL", lambda s: s[s.notna()].eq(1).mean() if s.notna().any() else np.nan),
          SNAP_prev_nonmiss     = ("SNAP",     lambda s: s[s.notna()].eq(1).mean() if s.notna().any() else np.nan),
          # overall prevalence (treat missing as 0)
          FS_HH_prev_all    = ("FS_HH",    lambda s: s.fillna(0).eq(1).mean()),
          FS_FINAL_prev_all = ("FS_FINAL", lambda s: s.fillna(0).eq(1).mean()),
          SNAP_prev_all     = ("SNAP",     lambda s: s.fillna(0).eq(1).mean()),
      )
      .round(3)
)
print("\nCoverage & prevalence by cycle:\n", cov_prev)

# 4) FS_FINAL logic check (FS_FINAL should equal FS_HH if present, else FS_ADULT)
expected_fs = df["FS_HH"].where(df["FS_HH"].notna(), df["FS_ADULT"])
logic_ok = (df["FS_FINAL"].fillna(-1) == expected_fs.fillna(-1)).all()
mismatches = (df["FS_FINAL"].fillna(-1) != expected_fs.fillna(-1))
print("\nFS_FINAL logic OK:", logic_ok, " | Mismatch rows:", int(mismatches.sum()))
if mismatches.any():
    print(df.loc[mismatches, ["SEQN","SDDSRVYR","FS_HH","FS_ADULT","FS_FINAL"]].head())

# 5) One row per participant-cycle?
print("\nRows:", len(df), " | Unique keys:", df[["SEQN","SDDSRVYR"]].drop_duplicates().shape[0])


Duplicate SEQN×SDDSRVYR rows: 0
Missing CORE cols: []
Missing OPTIONAL cols (ok to be missing): []

Coverage & prevalence by cycle:
           FS_HH_cov  FS_FINAL_cov  SNAP_cov  FS_HH_prev_nonmiss  \
SDDSRVYR                                                          
1.0           0.476         0.476     0.048               0.125   
2.0           0.458         0.458     0.042               0.132   
3.0           0.476         0.476     0.489               0.133   
4.0           0.476         0.476     0.475               0.139   
5.0           0.579         0.579     0.579               0.160   
6.0           0.583         0.583     0.583               0.203   
7.0           0.567         0.567     0.565               0.202   
8.0           0.559         0.559     0.559               0.193   
9.0           0.553         0.553     0.550               0.248   
10.0          0.567         0.567     0.562               0.234   
12.0          0.000         0.000     0.000                 NaN

In [163]:
added = [c for c in ["HHFDSEC","ADFDSEC","FS_HH4","FS_ADULT4","FS_SOURCE_HH","FS_SOURCE_FINAL","SNAP_SOURCE"]
         if c in demo_mt_cv_dp_sdoh.columns]
print("Added cols:", added)
print("Rows unchanged:", len(demo_mt_cv_dp_sdoh))

# Optional: coverage of the added fields
print(demo_mt_cov_dp_sdoh[added].notna().mean().round(3))


Added cols: ['HHFDSEC', 'ADFDSEC', 'FS_HH4', 'FS_ADULT4', 'FS_SOURCE_HH', 'FS_SOURCE_FINAL', 'SNAP_SOURCE']
Rows unchanged: 128809
HHFDSEC            0.076
ADFDSEC            0.076
FS_HH4             0.482
FS_ADULT4          0.076
FS_SOURCE_HH       0.482
FS_SOURCE_FINAL    0.482
SNAP_SOURCE        0.380
dtype: float64


#### check my column name with previous coworker's name 

In [None]:
# demo_mt_cov_dp_sdoh.columns.tolist()

In [141]:
import pandas as pd

# 85-column reference (from your CSV, after dropping DIABE, ins2, unemployment2,
# and the lowercase duplicates: death_heart, death_cancer, death_resp, death_cerev, death_diabe)
REF85 = [
 'SEQN','marriage','SDDSRVYR','HOD050','HOQ065','employ','unemployment','ELIGSTAT','MORTSTAT',
 'UCOD_LEADING','DIABETES','HYPERTEN','PERMTH_INT','PERMTH_EXM','Death_heart','Death_cancer',
 'Death_resp','Death_cerev','Death_diabe','Death_other','death_cvd','death_cmd','SNAP','FSDHH','FS',
 'ins','RIDAGEYR','SEX','RACE','EDU','INDFMPIR','SMK_AVG','SMK','ALCG2','HEI2015_TOTAL_SCORE',
 'WTDRD1','WTDR2D','DR12DRST','i_FCS','i_Optup','i_HSR','i_nutri','sdmvpsu','sdmvstra','met_hr',
 'perE_alco','dm_self','tchol','hdl','ldl','tg','bmi','CVD','dm_rx','chol_rx','angina_rx','lung_disease',
 'angina','hba1c','sbp','dbp','cancer','wt10','wt','i_FCS_sd','i_Optup_sd','i_nutri_sd','i_HSR_sd',
 'hei2015_sd','Death_inj','Death_alz','Death_infl','Death_kid','death_other1','Death_oth2','death_cmdk',
 'death_cmdkh','death_multi','agesq','py','agestart','ageend','pir','bmic','include'
]

df = demo_mt_cov_dp_sdoh.copy()  # your DataFrame

# Case-insensitive helpers
ref_upper = {c.upper(): c for c in REF85}
df_upper  = {c.upper(): c for c in df.columns}

# Exact, missing, extras (case-sensitive)
exact = sorted(set(REF85) & set(df.columns))
missing = sorted(set(REF85) - set(df.columns))
extras = sorted(set(df.columns) - set(REF85))

# Build an OLD->NEW rename map:
# 1) case-only differences: if df has SBP but ref expects 'sbp', suggest SBP->sbp
rename_map = {}
for up, df_name in df_upper.items():
    if up in ref_upper:
        ref_name = ref_upper[up]
        if df_name != ref_name:
            rename_map[df_name] = ref_name

# 2) common synonym fixes (ONLY if present in df and missing in ref)
synonyms = {
    # examples: tweak as you find more
    'HTN': 'HYPERTEN',
    'TOTAL_CHOL' : 'tchol',
    'HDL_CHOL'   : 'hdl',
    'LDL_CHOL'   : 'ldl',
    'TRIG'       : 'tg',
    'BMI'        : 'bmi',
    'SBP'        : 'sbp',
    'DBP'        : 'dbp',
    'INS': 'ins',
    'EMPLOYMENT_STATUS': 'employ',
    'UNEMPLOYED': 'unemployment',
    'WT': 'wt',
    'WT10': 'wt10',
    'MET_HR': 'met_hr',
}
for old, new in synonyms.items():
    if old in df.columns and new in REF85 and new not in df.columns:
        rename_map[old] = new

# Preview post-rename set
preview_cols = [rename_map.get(c, c) for c in df.columns]
missing_after = sorted(set(REF85) - set(preview_cols))

print("=== Demo vs Ref85 (column names) ===")
print(f"Ref85 count: {len(REF85)}")
print(f"Your count:  {len(df.columns)}")
print(f"Exact matches (case-sensitive): {len(exact)}")
print("\nMissing in your df:")
print(missing)
print("\nExtras in your df:")
print(extras)
print("\nProposed renames (old → new):")
for k, v in rename_map.items():
    print(f"  {k} → {v}")
print("\nStill missing after proposed renames:")
print(missing_after)

# When ready to apply:
# df = df.rename(columns=rename_map)


=== Demo vs Ref85 (column names) ===
Ref85 count: 85
Your count:  74
Exact matches (case-sensitive): 15

Missing in your df:
['ALCG2', 'DR12DRST', 'Death_alz', 'Death_cancer', 'Death_cerev', 'Death_diabe', 'Death_heart', 'Death_infl', 'Death_inj', 'Death_kid', 'Death_oth2', 'Death_other', 'Death_resp', 'FS', 'HEI2015_TOTAL_SCORE', 'HYPERTEN', 'INDFMPIR', 'RACE', 'RIDAGEYR', 'SMK', 'SMK_AVG', 'WTDR2D', 'WTDRD1', 'ageend', 'agesq', 'agestart', 'angina', 'angina_rx', 'bmi', 'bmic', 'cancer', 'chol_rx', 'dbp', 'death_cmd', 'death_cmdk', 'death_cmdkh', 'death_cvd', 'death_multi', 'death_other1', 'dm_rx', 'dm_self', 'employ', 'hba1c', 'hdl', 'hei2015_sd', 'i_FCS', 'i_FCS_sd', 'i_HSR', 'i_HSR_sd', 'i_Optup', 'i_Optup_sd', 'i_nutri', 'i_nutri_sd', 'include', 'ins', 'ldl', 'lung_disease', 'marriage', 'met_hr', 'perE_alco', 'pir', 'py', 'sbp', 'sdmvpsu', 'sdmvstra', 'tchol', 'tg', 'unemployment', 'wt', 'wt10']

Extras in your df:
['AGE_YR', 'ALCOHOL_CAT', 'BMI', 'BMI_CLAS', 'BMXHT', 'BMXWT', 'CA