# 05 — Link SDOH (DEMO) + OCQ/HOQ/HIQ/FSQ with robust I/O (1999–2023)

**Patches included**
- Robust XPT reading with encoding fallbacks.
- Use `pd.Int64Dtype()` for nullable integers instead of `'Int64'` strings.
- Race/ethnicity combined as an aligned `Series`.
- Adult filter uses safe numeric casting for `RIDAGEYR`.


In [1]:
from pathlib import Path
import os
import pandas as pd
import numpy as np

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT / "data"
OUT  = ROOT / "output"
MOD  = DATA / "nhanes_by_module"
OUT.mkdir(parents=True, exist_ok=True)
print("ROOT:", ROOT)

ROOT: /Users/dengshuyue/Desktop/SDOH/analysis


In [2]:
def read_any(p: Path) -> pd.DataFrame:
    """Load a tabular file into a DataFrame whose column names are upper-cased.

    The reader is chosen from the (case-insensitive) file suffix:
    ``.xpt``, ``.sas7bdat``, ``.csv`` or ``.parquet``.

    Raises:
        ValueError: if the suffix is none of the supported ones.
    """
    ext = p.suffix.lower()
    if ext not in (".xpt", ".sas7bdat", ".csv", ".parquet"):
        raise ValueError(f"Unsupported file: {p}")

    if ext == ".xpt":
        import pyreadstat
        # First attempt uses pyreadstat's default; any failure triggers a
        # latin-1 retry (handles DEMO_L.xpt issue).
        try:
            frame, _meta = pyreadstat.read_xport(str(p))
        except Exception:
            frame, _meta = pyreadstat.read_xport(str(p), encoding="latin1")
    elif ext == ".sas7bdat":
        frame = pd.read_sas(str(p), format="sas7bdat", encoding="latin1")
    elif ext == ".csv":
        # Default (utf-8) decode first; retry as latin-1 only on a decode error.
        try:
            frame = pd.read_csv(p)
        except UnicodeDecodeError:
            frame = pd.read_csv(p, encoding="latin1")
    else:  # ".parquet"
        frame = pd.read_parquet(p)

    frame.columns = frame.columns.str.upper()
    return frame

def ensure_seqn(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with a clean, unique, nullable-integer ``SEQN`` key.

    Column names are upper-cased, ``SEQN`` is coerced to numeric (unparseable
    values become NA) and cast to ``Int64``. Rows with a missing ``SEQN`` are
    dropped, and only the first row of each repeated ``SEQN`` is kept.
    The input frame is not modified.
    """
    normalized = df.copy()
    normalized.columns = normalized.columns.str.upper()
    key = pd.to_numeric(normalized["SEQN"], errors="coerce").astype(pd.Int64Dtype())
    normalized["SEQN"] = key
    # Keep rows whose key is present and is the first occurrence of its value.
    keep_row = key.notna() & ~key.duplicated(keep="first")
    return normalized.loc[keep_row]

In [3]:
BASE = OUT / "cov_core_mort_dep_all_1999_2023.parquet"
if not BASE.exists():
    for cand in [OUT/"cov_core_mort_dep_1999_2023.parquet", OUT/"demo_cov_dep_1999_2023.parquet"]:
        if cand.exists():
            BASE = cand
            break
print("Base file:", BASE)
base = pd.read_parquet(BASE)
base.columns = base.columns.str.upper()
base["SEQN"] = pd.to_numeric(base["SEQN"], errors="coerce").astype(pd.Int64Dtype())
print("Base shape:", base.shape)

Base file: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_all_1999_2023.parquet
Base shape: (128809, 56)


## DEMO across cycles → SDOH + Adult filter

In [4]:
DEMO_DIR = MOD / "demo"
keep_cols = [
    "SEQN","SDDSRVYR","RIDAGEYR","RIAGENDR","WTMEC2YR",
    "INDFMPIR","INDFMINC","DMDEDUC2","DMDEDUC3","DMDMARTL",
    "RIDRETH1","RIDRETH2","RIDRETH3",
]

cand = []
if DEMO_DIR.exists():
    for pat in ["*DEMO*.xpt","*DEMO*.XPT","*demo*.sas7bdat","*demo*.csv","*demo*.parquet"]:
        cand.extend(sorted(DEMO_DIR.glob(pat)))

parts = []
for p in cand:
    try:
        d0 = read_any(p)
        cols = [c for c in keep_cols if c in d0.columns]
        if cols:
            parts.append(d0[cols].copy())
    except Exception as e:
        print("Skip", p.name, "—", e)

demo = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=keep_cols)
demo = ensure_seqn(demo)
print("DEMO shape:", demo.shape)

# Adult filter table (≥20)
ridage = pd.to_numeric(demo.get("RIDAGEYR"), errors="coerce")
age20 = demo.loc[ridage >= 20, ["SEQN","RIDAGEYR","SDDSRVYR"]].copy()

# Derive SDOH harmonized
d = demo.copy()

# Poverty-income ratio as a float; falls back to NaN when INDFMPIR is absent.
d["PIR"] = pd.to_numeric(d.get("INDFMPIR", np.nan), errors="coerce")

# The labels describe left-closed intervals ("1.30–<2.00" includes 1.30), so the
# bins must be left-closed too. pd.cut defaults to right-closed bins, which would
# put PIR values of exactly 1.30 / 2.00 / 4.00 into the category below.
d["PIR_CAT"] = pd.cut(
    d["PIR"],
    [-np.inf, 1.3, 2.0, 4.0, np.inf],
    labels=["<1.30", "1.30–<2.00", "2.00–<4.00", "≥4.00"],
    right=False,
)

# EDU: take DMDEDUC2 where present, otherwise fall back to DMDEDUC3.
# NOTE(review): this stores the raw codes of both variables in one column;
# confirm that the two variables share a coding scheme before analysing EDU
# for rows that come from DMDEDUC3.
d["EDU"] = np.nan
if "DMDEDUC2" in d:
    d.loc[d["DMDEDUC2"].notna(), "EDU"] = d.loc[d["DMDEDUC2"].notna(), "DMDEDUC2"]
if "DMDEDUC3" in d:
    # The right-hand side is index-aligned, so only the rows selected on the left are filled.
    d.loc[d["EDU"].isna() & d["DMDEDUC3"].notna(), "EDU"] = d.loc[d["EDU"].isna(), "DMDEDUC3"]
d["EDU"] = pd.to_numeric(d["EDU"], errors="coerce").astype(pd.Int64Dtype())

# DMDEDUC2 is a 5-level attainment code; codes outside 1–5 map to NaN.
edu2_map = {1:"<9th grade", 2:"9–11th (incl. 12th, no diploma)", 3:"High school/GED", 4:"Some college or AA", 5:"College graduate or above"}

# DMDEDUC3 is NOT a 1–5 scale: per the NHANES DEMO codebook it records the highest
# grade completed (0–12), 13 = high school graduate, 14 = GED, 15 = more than
# high school, 55 = less than 5th grade, 66 = less than 9th grade. Collapse those
# grade codes onto the same category labels this map used before; any other code
# (e.g. refused / don't know) maps to NaN.
edu3_map = {
    **dict.fromkeys([*range(0, 9), 55, 66], "<9th grade"),
    **dict.fromkeys(range(9, 13), "9–11th"),  # 12 = 12th grade, no diploma
    13: "High school/GED",
    14: "High school/GED",
    15: "Some college or AA",  # "more than high school"
}


def label_edu(row):
    """Return the education category label for one DEMO row.

    DMDEDUC2 takes precedence whenever it is non-missing (even if its code has
    no label, in which case NaN is returned); DMDEDUC3 is consulted only when
    DMDEDUC2 is missing. Returns NaN when neither yields a label.
    """
    v2, v3 = row.get("DMDEDUC2", np.nan), row.get("DMDEDUC3", np.nan)
    if pd.notna(v2):
        return edu2_map.get(int(v2), np.nan)
    if pd.notna(v3):
        return edu3_map.get(int(v3), np.nan)
    return np.nan
d["EDU_CAT"] = d.apply(label_edu, axis=1)

# Race/ethnicity as an aligned Series
r3 = d.get("RIDRETH3") if "RIDRETH3" in d else pd.Series(index=d.index, dtype=float)
r2 = d.get("RIDRETH2") if "RIDRETH2" in d else pd.Series(index=d.index, dtype=float)
r1 = d.get("RIDRETH1") if "RIDRETH1" in d else pd.Series(index=d.index, dtype=float)
race_series = r3.where(r3.notna(), r2.where(r2.notna(), r1))
race_series = pd.to_numeric(race_series, errors="coerce").astype(pd.Int64Dtype())
d["RACE_ETH_CODE"] = race_series

race_map_r3 = {1:"Mexican American",2:"Other Hispanic",3:"NH White",4:"NH Black",6:"NH Asian",7:"NH Other/Multi"}
# RIDRETH2 does not share RIDRETH1's code order. Per the NHANES DEMO codebook it
# is coded 1 = NH White, 2 = NH Black, 3 = Mexican American, 4 = Other race incl.
# multi-racial, 5 = Other Hispanic. Reusing the RIDRETH1 map here mislabelled
# every row that was labelled from RIDRETH2.
race_map_r2 = {1:"NH White",2:"NH Black",3:"Mexican American",4:"Other (incl. Multi)",5:"Other Hispanic"}
race_map_r1 = {1:"Mexican American",2:"Other Hispanic",3:"NH White",4:"NH Black",5:"Other (incl. Multi)"}


def label_race(row):
    """Return the race/ethnicity label for one DEMO row.

    The first non-missing variable among RIDRETH3, RIDRETH2, RIDRETH1 (in that
    order) decides the label, using that variable's own code map. Returns NaN
    when all three are missing or the code has no label.
    """
    if pd.notna(row.get("RIDRETH3", np.nan)):
        return race_map_r3.get(int(row["RIDRETH3"]), np.nan)
    if pd.notna(row.get("RIDRETH2", np.nan)):
        return race_map_r2.get(int(row["RIDRETH2"]), np.nan)
    if pd.notna(row.get("RIDRETH1", np.nan)):
        return race_map_r1.get(int(row["RIDRETH1"]), np.nan)
    return np.nan
d["RACE_ETH"] = d.apply(label_race, axis=1)

d["MARITAL"] = pd.to_numeric(d.get("DMDMARTL", np.nan), errors="coerce").astype(pd.Int64Dtype())
mar_map = {1:"Married",2:"Widowed",3:"Divorced",4:"Separated",5:"Never married",6:"Living with partner"}
d["MARITAL_CAT"] = d["MARITAL"].map(mar_map)

sdoh = d[[c for c in [
    "SEQN","SDDSRVYR","INDFMPIR","INDFMINC","DMDEDUC2","DMDEDUC3","DMDMARTL",
    "RIDRETH1","RIDRETH2","RIDRETH3","PIR","PIR_CAT","EDU","EDU_CAT","RACE_ETH","MARITAL","MARITAL_CAT"
] if c in d.columns]].drop_duplicates(subset=["SEQN"]).copy()
sdoh = ensure_seqn(sdoh)
print("SDOH tidy shape:", sdoh.shape)

DEMO shape: (128809, 13)
SDOH tidy shape: (128809, 17)


## Continue with OCQ/HOQ/HIQ/FSQ merge as in prior notebook

In [5]:
# For brevity here, you can paste the OCQ/HOQ/HIQ/FSQ sections from the previous notebook.
# They will now benefit from the robust helpers above (encoding fallbacks, Int64 dtype, adult filter).

## TODO: extend the module sections below to cover 1999–2023, and check that column names and contents match Lu's

#### OCQ — Employment status (1999–2002 early + 2003–2018 main; adult filter)

In [6]:
# --- OCQ (Employment): build EMPLOY + UNEMPLOYMENT --------------------------------
from pathlib import Path
import numpy as np
import pandas as pd

OCQ_DIR = MOD / "ocq"

# Collect candidate files
ocq_files = []
if OCQ_DIR.exists():
    for pat in ["OCQ*.xpt", "ocq*.sas7bdat", "ocq*.parquet", "ocq*.csv"]:
        ocq_files.extend(sorted(OCQ_DIR.glob(pat)))

# Simple cycle inference when SDDSRVYR is missing (suffix A=1 ... J=10)
_CYCLE_HINT = {"A":1,"B":2,"C":3,"D":4,"E":5,"F":6,"G":7,"H":8,"I":9,"J":10,"K":11,"L":12}

def _infer_cycle_if_missing(df: pd.DataFrame, path: Path) -> pd.DataFrame:
    if "SDDSRVYR" not in df.columns:
        name = path.stem.upper()
        cyc = None
        for k, v in _CYCLE_HINT.items():
            if name.endswith(f"_{k}"):
                cyc = v
                break
        if cyc is not None:
            df = df.copy()
            df["SDDSRVYR"] = cyc
    return df

def _recode_employment(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["EMPLOY"] = np.nan
    if "OCD150" in df.columns:
        df.loc[df["OCD150"] == 1, "EMPLOY"] = 1  # working
        df.loc[df["OCD150"] == 3, "EMPLOY"] = 2  # not working
    if "OCQ380" in df.columns:
        df.loc[df["OCQ380"] == 5, "EMPLOY"] = 2  # looking for work
        df.loc[df["OCQ380"] == 3, "EMPLOY"] = 3  # retired
        df.loc[df["OCQ380"].isin([4, 6]), "EMPLOY"] = 4  # disabled/other
        df.loc[df["OCQ380"].isin([1, 2, 7]), "EMPLOY"] = 5  # working or unknown
    df["UNEMPLOYMENT"] = (df["EMPLOY"] == 2).astype(pd.Int64Dtype())
    keep = [c for c in ["SEQN", "SDDSRVYR", "EMPLOY", "UNEMPLOYMENT"] if c in df.columns]
    return df[keep]

ocq_parts = []
for p in ocq_files:
    try:
        df = read_any(p)
        df = _infer_cycle_if_missing(df, p)
        # adult filter (≥20) using DEMO-based age20
        df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        ocq_parts.append(_recode_employment(df))
    except Exception as e:
        print("OCQ skip", p.name, "—", e)

ocq = pd.concat(ocq_parts, ignore_index=True) if ocq_parts else pd.DataFrame(columns=["SEQN","SDDSRVYR","EMPLOY","UNEMPLOYMENT"])
ocq = ensure_seqn(ocq)
print("OCQ shape:", ocq.shape)


OCQ shape: (55081, 4)


#### HOQ — Housing (subset) (1999–2002 early + 2003–2018 main; adult filter)

In [7]:
# --- HOQ (Housing): keep HOD050 + HOQ065 (7/9 -> NA) ------------------------------
from pathlib import Path
import numpy as np
import pandas as pd

HOQ_DIR = MOD / "hoq"

hoq_files = []
if HOQ_DIR.exists():
    for pat in ["HOQ*.xpt", "hoq*.sas7bdat", "hoq*.parquet", "hoq*.csv"]:
        hoq_files.extend(sorted(HOQ_DIR.glob(pat)))

hoq_parts = []
for p in hoq_files:
    try:
        df = read_any(p)
        df = _infer_cycle_if_missing(df, p)
        df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        if "HOQ065" in df.columns:
            df.loc[df["HOQ065"].isin([7, 9]), "HOQ065"] = np.nan
        keep = [c for c in ["SEQN", "SDDSRVYR", "HOD050", "HOQ065"] if c in df.columns]
        if keep:
            hoq_parts.append(df[keep])
    except Exception as e:
        print("HOQ skip", p.name, "—", e)

hoq_all = pd.concat(hoq_parts, ignore_index=True) if hoq_parts else pd.DataFrame(columns=["SEQN","SDDSRVYR","HOD050","HOQ065"])
hoq_all = ensure_seqn(hoq_all)
print("HOQ shape:", hoq_all.shape)


HOQ shape: (55081, 4)


#### HIQ/HIQS — Health Insurance → INS recode (adult filter)

In [8]:
# --- HIQ/HIQS (Insurance): build INS ---------------------------------------------
from pathlib import Path
import numpy as np
import pandas as pd

HIQ_DIR = MOD / "hiq"

hiq_files = []
if HIQ_DIR.exists():
    for pat in ["HIQ*.xpt", "hiq*.sas7bdat", "hiq*.parquet", "hiq*.csv", "HIQS*.sas7bdat"]:
        hiq_files.extend(sorted(HIQ_DIR.glob(pat)))

hiq_parts = []
for p in hiq_files:
    try:
        df = read_any(p)
        df = _infer_cycle_if_missing(df, p)
        df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        hiq_parts.append(df)
    except Exception as e:
        print("HIQ skip", p.name, "—", e)

# Stack all per-file HIQ frames; with no readable file this is a column-less frame.
# NOTE(review): confirm `.columns.str.upper()` works on that empty column index
# in the pandas version in use.
hiq_all = pd.concat(hiq_parts, ignore_index=True) if hiq_parts else pd.DataFrame()
hiq_all.columns = hiq_all.columns.str.upper()

# helper to safely access a column as a Series aligned to index
def s(df: pd.DataFrame, col: str, default_val=np.nan):
    """Return ``df[col]`` if present, else a constant Series on ``df.index``."""
    return df[col] if col in df.columns else pd.Series(default_val, index=df.index)

# `ins` shares hiq_all's row index, so the boolean masks built from hiq_all
# below can be used directly in ins.loc[...].
ins = pd.DataFrame({"SEQN": s(hiq_all, "SEQN")})
if "SDDSRVYR" in hiq_all:
    ins["SDDSRVYR"] = hiq_all["SDDSRVYR"]
ins["INS"] = np.nan  # will cast at end

# The rules below run in sequence and each one overwrites INS, so a row matching
# several conditions keeps the value of the LAST matching rule
# (None=0 beats Other=5 beats Medicaid=3 beats Medicare=2 beats Private=1).

# Private
cond_private = s(hiq_all,"HIQ031A").eq(14) | s(hiq_all,"HID030A").eq(1)
ins.loc[cond_private, "INS"] = 1

# Medicare
# (HIQ031B without HIQ031D/HIQ031E, or HID030B without HID030C)
cond_med = (s(hiq_all,"HIQ031B").eq(15) & ~s(hiq_all,"HIQ031D").eq(17) & ~s(hiq_all,"HIQ031E").eq(18)) | \
           (s(hiq_all,"HID030B").eq(1) & ~s(hiq_all,"HID030C").eq(1))
ins.loc[cond_med, "INS"] = 2

# Medicaid only or both → 3
cond_mcaid_only = ((s(hiq_all,"HIQ031D").eq(17) | s(hiq_all,"HIQ031E").eq(18)) & ~s(hiq_all,"HIQ031B").eq(15)) | \
                  (~s(hiq_all,"HID030B").eq(1) & s(hiq_all,"HID030C").eq(1))
cond_both = (s(hiq_all,"HIQ031B").eq(15) & s(hiq_all,"HIQ031D").eq(17)) | \
            (s(hiq_all,"HID030B").eq(1) & s(hiq_all,"HID030C").eq(1))
ins.loc[cond_mcaid_only | cond_both, "INS"] = 3

# Other
# NOTE(review): HIQ031A/B/D/E above are matched against 14/15/17/18, but
# HIQ031C/F/G/H/I are matched against 1 here. Confirm against the HIQ codebook
# that 1 is the "yes" code for these items; if they follow the same numbering
# as A/B/D/E, this condition only ever fires through HID030D.
other_cols = [c for c in ["HIQ031C","HIQ031F","HIQ031G","HIQ031H","HIQ031I","HID030D"] if c in hiq_all.columns]
cond_other = hiq_all[other_cols].eq(1).any(axis=1) if other_cols else pd.Series(False, index=hiq_all.index)
ins.loc[cond_other, "INS"] = 5

# None
# (HIQ011 == 2 or HID010 == 2, whichever of the two columns exist)
none_conds = []
if "HIQ011" in hiq_all: none_conds.append(hiq_all["HIQ011"].eq(2))
if "HID010" in hiq_all: none_conds.append(hiq_all["HID010"].eq(2))
if none_conds:
    from functools import reduce
    ins.loc[reduce(lambda a,b: a|b, none_conds), "INS"] = 0

# ensure_seqn casts SEQN to Int64, drops rows without SEQN and keeps the first
# row per SEQN; INS is then cast to nullable Int64 (rows matching no rule stay NA).
ins = ensure_seqn(ins)
ins["INS"] = pd.to_numeric(ins["INS"], errors="coerce").astype(pd.Int64Dtype())
print("INS shape:", ins.shape)


INS shape: (55081, 3)


#### FSQ/FSQS — SNAP & Food Security (adult filter)

In [37]:
# --- FSQ/FSQS (SNAP & Food Security): build `snap` with SEQN, SDDSRVYR, FSDHH, SNAP, FS ----
from pathlib import Path
import numpy as np
import pandas as pd

FSQ_DIR = MOD / "fsq"  # e.g., Path("/Users/.../modules/fsq")

# 1) Collect files
fsq_files = []
if FSQ_DIR.exists():
    for pat in ["FSQ*.xpt", "fsq*.sas7bdat", "fsq*.parquet", "fsq*.csv", "FSQS*.sas7bdat"]:
        fsq_files.extend(sorted(FSQ_DIR.glob(pat)))

# 2) Read & harmonize per-file, restrict to your analysis universe (age20)
fsq_parts = []
for p in fsq_files:
    try:
        df = read_any(p)                             # your generic reader
        df = _infer_cycle_if_missing(df, p)          # should add SDDSRVYR if missing
        if "SEQN" not in df.columns:
            print("FSQ skip", p.name, "— no SEQN")
            continue
        # keep only participants you care about (optional but keeps things tidy)
        if 'age20' in globals() and isinstance(age20, pd.DataFrame) and "SEQN" in age20.columns:
            df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        fsq_parts.append(df)
    except Exception as e:
        print("FSQ skip", p.name, "—", e)

fsq_all = pd.concat(fsq_parts, ignore_index=True) if fsq_parts else pd.DataFrame()
fsq_all.columns = fsq_all.columns.str.upper()

# 3) Build the harmonized `snap` piece
def col_or_nan(df, col):
    return df[col] if col in df.columns else pd.Series(np.nan, index=df.index)

snap = pd.DataFrame({"SEQN": col_or_nan(fsq_all, "SEQN")})
if "SDDSRVYR" in fsq_all.columns:
    snap["SDDSRVYR"] = fsq_all["SDDSRVYR"]

# FSDHH (household food security: 1=High, 2=Marginal, 3=Low, 4=Very low)
if "FSDHH" in fsq_all.columns:
    snap["FSDHH"] = pd.to_numeric(fsq_all["FSDHH"], errors="coerce")
else:
    snap["FSDHH"] = np.nan

# SNAP (program participation; multiple instrument variants across years)
snap["SNAP"] = np.nan

# map common variants to 0/1; unknown/missing stay NaN
if "FSQ165" in fsq_all.columns:  # sometimes a negative wording (e.g., not received)
    snap.loc[fsq_all["FSQ165"].eq(2), "SNAP"] = 0
if "FSQ012" in fsq_all.columns:
    snap.loc[fsq_all["FSQ012"].eq(1), "SNAP"] = 1
    snap.loc[fsq_all["FSQ012"].eq(2), "SNAP"] = 0
if "FSQ171" in fsq_all.columns:
    snap.loc[fsq_all["FSQ171"].eq(1), "SNAP"] = 1
    snap.loc[fsq_all["FSQ171"].eq(2), "SNAP"] = 0
if "FSD170N" in fsq_all.columns:
    # number of months on SNAP; >=1 implies participated
    snap.loc[pd.to_numeric(fsq_all["FSD170N"], errors="coerce").ge(1), "SNAP"] = 1
if "FSQ170" in fsq_all.columns:
    snap.loc[fsq_all["FSQ170"].eq(1), "SNAP"] = 1
    snap.loc[(fsq_all["FSQ170"].eq(2)) & (pd.to_numeric(col_or_nan(fsq_all, "FSD170N"), errors="coerce").fillna(0).lt(1)), "SNAP"] = 0
if "FSD200" in fsq_all.columns:
    snap.loc[fsq_all["FSD200"].eq(1), "SNAP"] = 1

# FS (binary food insecurity) — CORRECT mapping:
# 1,2 = food secure (0); 3,4 = food insecure (1)
snap["FS"] = snap["FSDHH"].map({1:0, 2:0, 3:1, 4:1})

# 4) Types, dedupe, key hygiene
# ensure_seqn upper-cases column names, casts SEQN to nullable Int64, drops rows
# with a missing SEQN and keeps the first row per SEQN. It does not sort or
# re-index the frame.
snap = ensure_seqn(snap)  # your helper; ensures Int64 SEQN, non-null and unique
for c in ["FSDHH", "SNAP", "FS"]:
    if c in snap.columns:
        # nullable Int64 so that missing values stay NA instead of forcing float
        snap[c] = pd.to_numeric(snap[c], errors="coerce").astype("Int64")
# SEQN is already unique after ensure_seqn; this is only a safeguard.
snap = snap.drop_duplicates("SEQN")

print("SNAP/FS shape:", snap.shape)
print("Non-missing rates:", {c: snap[c].notna().mean() if c in snap else None for c in ["FSDHH","SNAP","FS"]})
print("FS=1 (insecure) share:", (snap["FS"]==1).mean() if "FS" in snap else None)


SNAP/FS shape: (55081, 5)
Non-missing rates: {'FSDHH': np.float64(0.7943755559993464), 'SNAP': np.float64(0.8117681233093081), 'FS': np.float64(0.7943755559993464)}
FS=1 (insecure) share: 0.1901268426465547


In [43]:
# === FSQ (SNAP & Food Security) ===============================================
from pathlib import Path
import numpy as np
import pandas as pd

FSQ_DIR = MOD / "fsq"

# 1) gather files
fsq_files = []
if FSQ_DIR.exists():
    for pat in ["FSQ*.xpt","fsq*.sas7bdat","fsq*.parquet","fsq*.csv","FSQS*.sas7bdat"]:
        fsq_files.extend(sorted(FSQ_DIR.glob(pat)))

# 2) read, add cycle, restrict to analysis universe
fsq_parts = []
for p in fsq_files:
    try:
        df = read_any(p)
        df = _infer_cycle_if_missing(df, p)
        if "SEQN" not in df.columns:
            print("FSQ skip", p.name, "— no SEQN"); 
            continue
        if 'age20' in globals() and isinstance(age20, pd.DataFrame) and "SEQN" in age20.columns:
            df = df.merge(age20[["SEQN"]], on="SEQN", how="inner")
        fsq_parts.append(df)
    except Exception as e:
        print("FSQ skip", p.name, "—", e)

fsq_all = pd.concat(fsq_parts, ignore_index=True) if fsq_parts else pd.DataFrame()
fsq_all.columns = fsq_all.columns.str.upper()

# 3) harmonize -> snap (SEQN, SDDSRVYR, FSDHH, SNAP, FS)
def col_or_nan(df, col):
    return df[col] if col in df.columns else pd.Series(np.nan, index=df.index)

snap = pd.DataFrame({"SEQN": col_or_nan(fsq_all, "SEQN")})
if "SDDSRVYR" in fsq_all.columns:
    snap["SDDSRVYR"] = fsq_all["SDDSRVYR"]

# FSDHH (1=High, 2=Marginal, 3=Low, 4=Very low)
snap["FSDHH"] = pd.to_numeric(col_or_nan(fsq_all, "FSDHH"), errors="coerce")

# SNAP (map variants to 0/1; leave others NaN)
snap["SNAP"] = np.nan
if "FSQ165" in fsq_all.columns: snap.loc[fsq_all["FSQ165"].eq(2), "SNAP"] = 0
if "FSQ012" in fsq_all.columns:
    snap.loc[fsq_all["FSQ012"].eq(1), "SNAP"] = 1
    snap.loc[fsq_all["FSQ012"].eq(2), "SNAP"] = 0
if "FSQ171" in fsq_all.columns:
    snap.loc[fsq_all["FSQ171"].eq(1), "SNAP"] = 1
    snap.loc[fsq_all["FSQ171"].eq(2), "SNAP"] = 0
if "FSD170N" in fsq_all.columns:
    snap.loc[pd.to_numeric(fsq_all["FSD170N"], errors="coerce").ge(1), "SNAP"] = 1
if "FSQ170" in fsq_all.columns:
    snap.loc[fsq_all["FSQ170"].eq(1), "SNAP"] = 1
    snap.loc[(fsq_all["FSQ170"].eq(2)) & (pd.to_numeric(col_or_nan(fsq_all,"FSD170N"),errors="coerce").fillna(0).lt(1)), "SNAP"] = 0
if "FSD200" in fsq_all.columns: snap.loc[fsq_all["FSD200"].eq(1), "SNAP"] = 1

# FS binary (1 = food insecure for FSDHH 3/4; 0 for 1/2)
snap["FS"] = snap["FSDHH"].map({1:0, 2:0, 3:1, 4:1})

# types + dedupe
snap = ensure_seqn(snap)
for c in ["FSDHH","SNAP","FS"]:
    if c in snap.columns:
        snap[c] = pd.to_numeric(snap[c], errors="coerce").astype("Int64")
snap = snap.drop_duplicates("SEQN")

print("SNAP/FS shape:", snap.shape)


SNAP/FS shape: (55081, 5)


In [38]:
#### Quick audit — cycles seen & coverage if you want a pulse check

In [44]:
# === Audit helpers ============================================================
def _as_int64(s): 
    return pd.to_numeric(s, errors="coerce").astype("Int64")

def _cycles(df: pd.DataFrame, name: str):
    if df is None or df.empty: print(f"{name} cycles: (empty)"); return
    if "SDDSRVYR" in df.columns and df["SDDSRVYR"].notna().any():
        cyc = pd.to_numeric(df["SDDSRVYR"], errors="coerce").dropna().astype(int).unique()
        print(f"{name} cycles:", sorted(cyc))
    else:
        print(f"{name} cycles: (none)")

def _coverage(base: pd.DataFrame, addon: pd.DataFrame, name: str):
    if addon is None or addon.empty: print(f"{name:>4} | (addon empty)"); return
    b = base.copy(); a = addon.copy()
    if "SEQN" in b: b["SEQN"] = _as_int64(b["SEQN"])
    if "SEQN" in a: a["SEQN"] = _as_int64(a["SEQN"])
    base_u  = b["SEQN"].dropna().nunique()
    add_u   = a["SEQN"].dropna().nunique()
    inner_u = b.merge(a[["SEQN"]].drop_duplicates(), on="SEQN", how="inner")["SEQN"].nunique()
    print(f"{name:>4} | base={base_u:,} | addon(seqn)={add_u:,} | inner keeps={inner_u:,}")

# pick base to compare to
_base_for_audit = base if "base" in globals() else (master if "master" in globals() else sdoh)

for (n, df_) in [("SDOH", sdoh), ("OCQ", ocq), ("HOQ", hoq_all), ("HIQ", ins), ("FSQ", snap)]:
    _cycles(df_, n)
    _coverage(_base_for_audit, df_, n)


SDOH cycles: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(12), np.int64(66)]
SDOH | base=128,809 | addon(seqn)=128,809 | inner keeps=128,809
OCQ cycles: [np.int64(2), np.int64(3)]
 OCQ | base=128,809 | addon(seqn)=55,081 | inner keeps=55,081
HOQ cycles: [np.int64(2)]
 HOQ | base=128,809 | addon(seqn)=55,081 | inner keeps=55,081
HIQ cycles: [np.int64(2)]
 HIQ | base=128,809 | addon(seqn)=55,081 | inner keeps=55,081
FSQ cycles: [np.int64(2)]
 FSQ | base=128,809 | addon(seqn)=55,081 | inner keeps=55,081


## Merge these modules onto your base

In [49]:
# === Merge SDOH + modules (left-join with upsert) =============================
master = ensure_seqn(base)
if master["SEQN"].dtype != "Int64":
    master["SEQN"] = pd.to_numeric(master["SEQN"], errors="coerce").astype("Int64")

# SDOH first
sdoh_cols = ["SEQN","PIR","PIR_CAT","INDFMINC","EDU","EDU_CAT","RACE_ETH","MARITAL","MARITAL_CAT"]
sdoh_piece = sdoh[[c for c in sdoh_cols if c in sdoh.columns]].drop_duplicates("SEQN")

def _left_upsert(m: pd.DataFrame, piece: pd.DataFrame, tag: str):
    if piece is None or piece.empty:
        print(f"ℹ️ {tag}: empty — skipped"); 
        return m
    p = ensure_seqn(piece)
    if p["SEQN"].duplicated().any():
        dups = p.loc[p["SEQN"].duplicated(), "SEQN"].unique()[:5]
        raise RuntimeError(f"{tag}: duplicate SEQN detected (e.g., {dups.tolist()} …).")
    overlap = (set(p.columns) & set(m.columns)) - {"SEQN"}
    tmp = m.merge(p, on="SEQN", how="left", validate="one_to_one", suffixes=("", f"_{tag.lower()}"))
    for c in sorted(overlap):
        tmp[c] = tmp[c].fillna(tmp[f"{c}_{tag.lower()}"])
        tmp.drop(columns=[f"{c}_{tag.lower()}"], inplace=True)
    return tmp

if not sdoh_piece.empty:
    master = _left_upsert(master, sdoh_piece, "SDOH")
else:
    print("ℹ️ SDOH: empty — skipped")

# Modules (including `snap`, the FSQ piece built in the FSQ cell above).
# Each piece is cut down to SEQN plus whichever of its derived columns exist.
pieces = {
    "OCQ": ocq[[c for c in ["SEQN","EMPLOY","UNEMPLOYMENT"] if c in ocq.columns]],
    "HOQ": hoq_all[[c for c in ["SEQN","HOD050","HOQ065"] if c in hoq_all.columns]],
    "HIQ": ins[[c for c in ["SEQN","INS"] if c in ins.columns]],
    "FSQ": snap[[c for c in ["SEQN","SNAP","FSDHH","FS"] if c in snap.columns]],
}

# Left-join the pieces onto master one at a time, in dict order.
for name, piece in pieces.items():
    master = _left_upsert(master, piece, name)

print("Merged shape:", master.shape)


Merged shape: (128809, 72)


In [52]:
# FS sanity
if all(c in master.columns for c in ["SNAP","FSDHH","FS"]):
    print("FS NA rates:", master[["SNAP","FSDHH","FS"]].isna().mean().to_dict())

# finalize & save
demo_mt_cv_dp_sdoh = master.copy()

from pathlib import Path
from datetime import datetime
# NOTE(review): this writes to ".../analysis/out", whereas the OUT constant
# defined at the top of the notebook is ROOT / "output" — confirm that a
# separate directory is intended.
out_dir = Path("/Users/dengshuyue/Desktop/SDOH/analysis/out"); out_dir.mkdir(parents=True, exist_ok=True)
# The same frame is written twice: a canonical file that is overwritten on each
# run, and a copy stamped with the local time (YYYYMMDD_HHMM).
canon = out_dir / "demo_mt_cv_dp_sdoh.parquet"
ver   = out_dir / f"demo_mt_cv_dp_sdoh_{datetime.now().strftime('%Y%m%d_%H%M')}.parquet"

demo_mt_cv_dp_sdoh.to_parquet(canon, index=False)
demo_mt_cv_dp_sdoh.to_parquet(ver, index=False)
print("Saved:\n ", canon, "\n ", ver)


FS NA rates: {'SNAP': 0.6528736346062776, 'FSDHH': 0.6603110031131365, 'FS': 0.6603110031131365}
Saved:
  /Users/dengshuyue/Desktop/SDOH/analysis/out/demo_mt_cv_dp_sdoh.parquet 
  /Users/dengshuyue/Desktop/SDOH/analysis/out/demo_mt_cv_dp_sdoh_20250916_2033.parquet


In [53]:
# rename 
demo_mt_cv_dp_sdoh = master
del master  # optional: drop old handle

In [55]:
demo_mt_cv_dp_sdoh["FS"].head(100)

0     <NA>
1     <NA>
2     <NA>
3     <NA>
4     <NA>
      ... 
95    <NA>
96    <NA>
97    <NA>
98    <NA>
99    <NA>
Name: FS, Length: 100, dtype: Int64

In [56]:
# 3) Coverage by cycle (which cycles actually have FS data?)
demo_mt_cv_dp_sdoh.groupby("SDDSRVYR")["FS"].apply(lambda s: s.notna().mean()).round(3)


SDDSRVYR
1.0     0.000
2.0     0.000
3.0     0.476
4.0     0.476
5.0     0.579
6.0     0.583
7.0     0.567
8.0     0.559
9.0     0.553
10.0    0.567
12.0    0.000
66.0    0.000
Name: FS, dtype: float64

In [58]:
demo_mt_cv_dp_sdoh.columns.tolist()
print("\n".join(demo_mt_cv_dp_sdoh.columns))

SEQN
SDDSRVYR
SDMVPSU
SDMVSTRA
WTMEC2YR
AGE_YR
RIAGENDR
SEX
FEMALE
SMK_STATUS
CIGS_PER_DAY
PACK_YEARS
FORMER_SMOKER
DRINKS_PER_DAY
ALCOHOL_CAT
LTPA
METSCORE
IMP
BMXWT
BMXHT
BMI
BMI_CLAS
DIABETES
HTN
HIGH_CHOL
CVD
CANCER
SBP
DBP
TCHOL
HDL
LDL
TG
DMDHHSIZ
ELIGSTAT
MORTSTAT
PERMTH_EXM
PERMTH_INT
UCOD_LEADING
IS_POST2018
IS_ADULT
MORTALITY_COVERED
EVENT
CENSORED
FU_YRS_EXM
FU_YRS_INT
UCOD_LABEL
PHQ9
PHQ9_GE10
DPQ_CAT
DEP_IMP
CIDI_SCORE_RAW
CIDI_12M_MDE
WTSCI2YR
DEP_HARMONIZED
DEP_SOURCE
PIR
PIR_CAT
INDFMINC
EDU
EDU_CAT
RACE_ETH
MARITAL
MARITAL_CAT
EMPLOY
UNEMPLOYMENT
HOD050
HOQ065
INS
SNAP
FSDHH
FS


#### Check my column names against the previous coworker's column names

In [None]:
# demo_mt_cv_dp_sdoh.columns.tolist()

In [24]:
import pandas as pd

# 85-column reference (from your CSV, after dropping DIABE, ins2, unemployment2,
# and the lowercase duplicates: death_heart, death_cancer, death_resp, death_cerev, death_diabe)
REF85 = [
 'SEQN','marriage','SDDSRVYR','HOD050','HOQ065','employ','unemployment','ELIGSTAT','MORTSTAT',
 'UCOD_LEADING','DIABETES','HYPERTEN','PERMTH_INT','PERMTH_EXM','Death_heart','Death_cancer',
 'Death_resp','Death_cerev','Death_diabe','Death_other','death_cvd','death_cmd','SNAP','FSDHH','FS',
 'ins','RIDAGEYR','SEX','RACE','EDU','INDFMPIR','SMK_AVG','SMK','ALCG2','HEI2015_TOTAL_SCORE',
 'WTDRD1','WTDR2D','DR12DRST','i_FCS','i_Optup','i_HSR','i_nutri','sdmvpsu','sdmvstra','met_hr',
 'perE_alco','dm_self','tchol','hdl','ldl','tg','bmi','CVD','dm_rx','chol_rx','angina_rx','lung_disease',
 'angina','hba1c','sbp','dbp','cancer','wt10','wt','i_FCS_sd','i_Optup_sd','i_nutri_sd','i_HSR_sd',
 'hei2015_sd','Death_inj','Death_alz','Death_infl','Death_kid','death_other1','Death_oth2','death_cmdk',
 'death_cmdkh','death_multi','agesq','py','agestart','ageend','pir','bmic','include'
]

df = demo_mt_cv_dp_sdoh.copy()  # your DataFrame

# Case-insensitive helpers
ref_upper = {c.upper(): c for c in REF85}
df_upper  = {c.upper(): c for c in df.columns}

# Exact, missing, extras (case-sensitive)
exact = sorted(set(REF85) & set(df.columns))
missing = sorted(set(REF85) - set(df.columns))
extras = sorted(set(df.columns) - set(REF85))

# Build an OLD->NEW rename map:
# 1) case-only differences: if df has SBP but ref expects 'sbp', suggest SBP->sbp
rename_map = {}
for up, df_name in df_upper.items():
    if up in ref_upper:
        ref_name = ref_upper[up]
        if df_name != ref_name:
            rename_map[df_name] = ref_name

# 2) common synonym fixes. A synonym is applied only when the old name is a
#    column of df, the new name is listed in REF85, and df does not already have
#    a column spelled exactly like the new name. An entry added here replaces any
#    case-only suggestion recorded in rename_map for the same old column.
synonyms = {
    # examples: tweak as you find more
    'HTN': 'HYPERTEN',
    'TOTAL_CHOL' : 'tchol',
    'HDL_CHOL'   : 'hdl',
    'LDL_CHOL'   : 'ldl',
    'TRIG'       : 'tg',
    'BMI'        : 'bmi',
    'SBP'        : 'sbp',
    'DBP'        : 'dbp',
    'INS': 'ins',
    'EMPLOYMENT_STATUS': 'employ',
    'UNEMPLOYED': 'unemployment',
    'WT': 'wt',
    'WT10': 'wt10',
    'MET_HR': 'met_hr',
}
for old, new in synonyms.items():
    if old in df.columns and new in REF85 and new not in df.columns:
        rename_map[old] = new

# Preview post-rename set
preview_cols = [rename_map.get(c, c) for c in df.columns]
missing_after = sorted(set(REF85) - set(preview_cols))

print("=== Demo vs Ref85 (column names) ===")
print(f"Ref85 count: {len(REF85)}")
print(f"Your count:  {len(df.columns)}")
print(f"Exact matches (case-sensitive): {len(exact)}")
print("\nMissing in your df:")
print(missing)
print("\nExtras in your df:")
print(extras)
print("\nProposed renames (old → new):")
for k, v in rename_map.items():
    print(f"  {k} → {v}")
print("\nStill missing after proposed renames:")
print(missing_after)

# When ready to apply:
# df = df.rename(columns=rename_map)


=== Demo vs Ref85 (column names) ===
Ref85 count: 85
Your count:  72
Exact matches (case-sensitive): 16

Missing in your df:
['ALCG2', 'DR12DRST', 'Death_alz', 'Death_cancer', 'Death_cerev', 'Death_diabe', 'Death_heart', 'Death_infl', 'Death_inj', 'Death_kid', 'Death_oth2', 'Death_other', 'Death_resp', 'HEI2015_TOTAL_SCORE', 'HYPERTEN', 'INDFMPIR', 'RACE', 'RIDAGEYR', 'SMK', 'SMK_AVG', 'WTDR2D', 'WTDRD1', 'ageend', 'agesq', 'agestart', 'angina', 'angina_rx', 'bmi', 'bmic', 'cancer', 'chol_rx', 'dbp', 'death_cmd', 'death_cmdk', 'death_cmdkh', 'death_cvd', 'death_multi', 'death_other1', 'dm_rx', 'dm_self', 'employ', 'hba1c', 'hdl', 'hei2015_sd', 'i_FCS', 'i_FCS_sd', 'i_HSR', 'i_HSR_sd', 'i_Optup', 'i_Optup_sd', 'i_nutri', 'i_nutri_sd', 'include', 'ins', 'ldl', 'lung_disease', 'marriage', 'met_hr', 'perE_alco', 'pir', 'py', 'sbp', 'sdmvpsu', 'sdmvstra', 'tchol', 'tg', 'unemployment', 'wt', 'wt10']

Extras in your df:
['AGE_YR', 'ALCOHOL_CAT', 'BMI', 'BMI_CLAS', 'BMXHT', 'BMXWT', 'CANCER',