# 02 — Build Covariates

In [9]:
from pathlib import Path
import pandas as pd

# Prefer in-memory, don’t crash if not present
try:
    df = nhanes_mort_demo_sdoh
    print("ℹ️ Using in-memory nhanes_mort_demo_sdoh.")
except NameError:
    OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
    candidates = [
        OUT / "nhanes_mort_demo_sdoh_1999_2018.parquet",
        OUT / "nhanes_mort_demo_sdoh_1999_2018.csv",
        OUT / "nhanes_mort_demo_soc_1999_2018.parquet",  # legacy
        OUT / "nhanes_mort_demo_soc_1999_2018.csv",      # legacy
    ]
    df = None
    for p in candidates:
        if p.exists():
            df = pd.read_parquet(p) if p.suffix == ".parquet" else pd.read_csv(p)
            print("Loaded from disk:", p)
            break
    if df is None:
        print("⚠️ Note: could not find the saved table among expected filenames, "
              "but earlier cells *did* save successfully. Skipping the sanity preview instead of raising.")
        # Do NOT raise here.

# Optional preview if df is available
if df is not None:
    print(f"Rows: {df.shape[0]:,}")
    if "RIDAGEYR" in df.columns:
        print("Age range:", int(df["RIDAGEYR"].min()), "to", int(df["RIDAGEYR"].max()))


ℹ️ Using in-memory nhanes_mort_demo_sdoh.
Rows: 56,253
Age range: 18 to 85


In [8]:
# In 02 — Build Covariates.ipynb, first cell:
%run -i 00_demo_mort_sdoh.ipynb   # use -i so variables persist in this kernel

# Quick sanity:
# print("ROOT =", ROOT)
# print("OUT  =", OUT)

Bootstrap loaded.
ROOT: /Users/dengshuyue/Desktop/SDOH/analysis
Data dir exists: True
Output dir exists: True
Mortality cycles: ['1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008', '2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018']
Non-mortality cycles: ['2017-March 2020 (pre-pandemic)', 'August 2021–August 2023']
Candidates: [PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/data/household_size/DEMO_I.xpt'), PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_deit/DEMO_I.xpt')]


Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,DMQMILIZ,DMQADFC,DMDBORN4,DMDCITZN,DMDYRSUS,DMDEDUC3,DMDEDUC2,DMDMARTL,RIDEXPRG,SIALANG,SIAPROXY,SIAINTRP,FIALANG,FIAPROXY,FIAINTRP,MIALANG,MIAPROXY,MIAINTRP,AIALANGA,DMDHHSIZ,DMDFMSIZ,DMDHHSZA,DMDHHSZB,DMDHHSZE,DMDHRGND,DMDHRAGE,DMDHRBR4,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,83732.0,9.0,2.0,1.0,62.0,,3.0,3.0,1.0,,2.0,,1.0,1.0,,,5.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,5.397605e-79,5.397605e-79,1.0,1.0,62.0,1.0,5.0,1.0,3.0,134671.370419,135629.507405,1.0,125.0,10.0,10.0,4.39
1,83733.0,9.0,2.0,1.0,53.0,,3.0,3.0,1.0,,2.0,,2.0,2.0,7.0,,3.0,3.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,5.397605e-79,1.0,53.0,2.0,3.0,3.0,,24328.560239,25282.425927,1.0,125.0,4.0,4.0,1.32
2,83734.0,9.0,2.0,1.0,78.0,,3.0,3.0,2.0,,1.0,2.0,1.0,1.0,,,3.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,,2.0,2.0,5.397605e-79,5.397605e-79,2.0,2.0,79.0,1.0,3.0,1.0,3.0,12400.008522,12575.838818,1.0,131.0,5.0,5.0,1.51
3,83735.0,9.0,2.0,2.0,56.0,,3.0,3.0,2.0,,2.0,,1.0,1.0,,,5.0,6.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,5.397605e-79,2.0,56.0,1.0,5.0,6.0,,102717.995647,102078.634508,1.0,131.0,10.0,10.0,5.0
4,83736.0,9.0,2.0,2.0,42.0,,4.0,4.0,2.0,,2.0,,1.0,1.0,,,4.0,3.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,5.397605e-79,2.0,5.397605e-79,2.0,42.0,1.0,4.0,3.0,,17627.674984,18234.736219,2.0,126.0,7.0,7.0,1.23


['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN', 'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMQADFC', 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY', 'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND', 'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR']
['SEQN', 'DAYS', 'DR12IFDC', 'WTDRD1', 'DR12DRST', 'SDDSRVYR', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'DMDEDUC3', 'DMDEDUC2', 'INDFMPIR', 'DMDHREDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'DR12IKC2', 'CYCLE', 'WTDR2D', 'DR12FS', 'DMDHREDZ', 'age', 'race', 'edu', 'pedu', 'Incm', 'incm2', 'include', 'Weight16a', 'cycles', 'sex', 'age1', 'age2', 'age3', 'race2', 'race3', 'race4', 'weekend', '_NAME_', '_LABEL_', 'DRDAY1', 'DRDAY2', 'tkal1

<h2> Step 1. try build streamlined code for cov and merge </h2>

In [129]:
"""
NHANES 1999–2023 Covariates Builder
-----------------------------------
Goal: produce tidy, reproducible covariate parquet files and one merged core table.

Deliverables written to /Users/dengshuyue/Desktop/SDOH/analysis/output/ (create if missing):
  - cov_smk_1999_2023.parquet
  - cov_alc_1999_2023.parquet
  - cov_pa_1999_2023.parquet
  - cov_bmx_1999_2023.parquet
  - cov_clinical_1999_2023.parquet
  - cov_household_1999_2023.parquet
  - cov_core_1999_2023.parquet

Usage (in a Jupyter cell or as a script):
    # Edit CONFIG paths below once. Then run:
    from pathlib import Path
    import cov_builder_9923 as cb
    cb.run_all()

Notes:
- This file is designed to be readable and modular. All NHANES quirks are isolated in one place.
- Where possible, we support a "fast path" (append 2019–2023 to your 1999–2018 files) and a "full rebuild" path.
- You may fill in cycle-specific variable maps in VARS if you choose the full rebuild route.
"""

from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np
import pandas as pd



# --- Put this near the top, before the dataclass ---
BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")

# ----------------------------
# Configuration (EDIT THESE)
# ----------------------------
@dataclass
class Config:
    # Root folders (point to your local project)
    raw_dir: Path = BASE / "data"          # where per-cycle raw files live
    interim_dir: Path = BASE / "data" / "cov"   # where any intermediate stacks live
    out_dir: Path = BASE / "output"        # outputs

    # Fast-path inputs you already maintain (optional)
    smk_9918: Optional[Path] = None
    pa_9918_imputed: Optional[Path] = None
    clinical_9918: Optional[Path] = None

    # New 2019–2023 stacks if you have them (optional)
    smk_1923: Optional[Path] = None
    pa_1923: Optional[Path] = None
    clinical_1923: Optional[Path] = None

    # DEMO 1999–2023 (required)
    demo_9923: Path = BASE / "data" / "cov" / "demo9923.parquet"
    demo_9918: Optional[Path] = None

    # BMX 1999–2023 (required if not reconstructed from raw)
    bmx_9923: Optional[Path] = None

    # Output file names
    cov_smk: str = "cov_smk_1999_2023.parquet"
    cov_alc: str = "cov_alc_1999_2023.parquet"
    cov_pa: str = "cov_pa_1999_2023.parquet"
    cov_bmx: str = "cov_bmx_1999_2023.parquet"
    cov_clinical: str = "cov_clinical_1999_2023.parquet"
    cov_household: str = "cov_household_1999_2023.parquet"
    cov_core: str = "cov_core_1999_2023.parquet"

CONFIG = Config()


# ---------------------------------
# Helpers: IO, cleaning, harmonize
# ---------------------------------

NHANES_MISS = {7, 9, 77, 99, 777, 999, 7777, 9999, 77777, 99999}


def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)


def nhanes_na(series: pd.Series) -> pd.Series:
    """Map NHANES special missing codes to NaN (keeps genuine zeros)."""
    return series.mask(series.isin(NHANES_MISS)).astype(float)


def upper_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.upper() for c in df.columns]
    return df


def read_parquet_safe(path: Optional[Path]) -> Optional[pd.DataFrame]:
    if path and Path(path).exists():
        return pd.read_parquet(path)
    return None


def pick_first_existing(*candidates: Optional[Path]) -> Optional[Path]:
    """Return the first existing path among candidates (or None)."""
    for c in candidates:
        if c is not None and Path(c).exists():
            return Path(c)
    return None

# --------------
# Smoking (SMQ)
# --------------

# Minimal variable assumptions for fast-path append:
# Expect 1999–2018 stack with columns: SEQN, SMK_STATUS, CIGS_PER_DAY, PACK_YEARS, FORMER_SMOKER
# For 2019–2023, we derive using stable SMQ fields when available.

SMK_STATUS_CATS = pd.CategoricalDtype(categories=["NEVER", "FORMER", "CURRENT"], ordered=True)


def derive_smoking_from_smq(df: pd.DataFrame) -> pd.DataFrame:
    """Derive compact smoking features from raw/stacked SMQ.
    Expected fields (if present):
      - SMQ020: Smoked at least 100 cigarettes in life? (1=Yes, 2=No)
      - SMQ040: Do you now smoke cigarettes? (1=Every day, 2=Some days, 3=Not at all)
      - SMD030 / SMQ050Q/U pairs for intensity and duration in years may vary by cycle.
    We defensively compute:
      SMK_STATUS: NEVER/FORMER/CURRENT
      CIGS_PER_DAY: numeric (NaN if unknown)
      PACK_YEARS: (cigs/day / 20) * years
      FORMER_SMOKER: 1/0
    """
    d = upper_df(df)

    # Life 100 cigs (ever smoker)
    ever = d.get("SMQ020")
    if ever is not None:
        ever = ever.replace({2: 0}).replace({1: 1})
    else:
        ever = pd.Series(np.nan, index=d.index)

    # Current smoking status
    smq040 = d.get("SMQ040")
    # Default NEVER if explicitly said no to ever smoking
    smk_status = pd.Series("NEVER", index=d.index)
    smk_status[ever == 1] = "FORMER"
    if smq040 is not None:
        smk_status.loc[smq040.isin([1, 2])] = "CURRENT"
        smk_status.loc[smq040 == 3] = "FORMER"
    smk_status = smk_status.astype(SMK_STATUS_CATS)

    # Intensity (cigarettes per day) — prefer SMQ050Q (quantity) with unit SMQ050U (1=per day, 2=per week, etc.).
    cigs_q = nhanes_na(d.get("SMQ050Q", pd.Series(np.nan, index=d.index)))
    cigs_u = d.get("SMQ050U")  # 1=day, 2=week, 3=month
    cigs_per_day = cigs_q.copy()
    if cigs_u is not None:
        cigs_per_day = (
            cigs_q.where(cigs_u == 1)
            .fillna((cigs_q / 7).where(cigs_u == 2))
            .fillna((cigs_q / 30).where(cigs_u == 3))
        )

    # Years smoked (approx). Prefer SMD030 (years smoked) or computed from start/stop when available.
    years_smoked = nhanes_na(d.get("SMD030", pd.Series(np.nan, index=d.index)))

    pack_years = (cigs_per_day / 20.0) * years_smoked

    out = pd.DataFrame({
        "SEQN": d["SEQN"],
        "SMK_STATUS": smk_status.astype("string").str.upper(),
        "CIGS_PER_DAY": cigs_per_day,
        "PACK_YEARS": pack_years,
    })
    out["FORMER_SMOKER"] = (out["SMK_STATUS"] == "FORMER").astype("int8")

    return out


def build_smk(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    # Auto-pick sources: prefer a combined 99–23 stack if present; else 99–18 (+ optional 19–23 append)
    def _read_any(p: Path) -> pd.DataFrame:
        return pd.read_parquet(p) if str(p).endswith(".parquet") else pd.read_csv(p)

    # Look for combined stack first
    smk_9923 = pick_first_existing(
        cfg.interim_dir / "smk_9923.parquet",
        cfg.interim_dir / "smk_9923.csv",
    )
    if smk_9923:
        smk = upper_df(_read_any(smk_9923))
    else:
        # Then look for separate 99–18 and 19–23 pieces
        smk_9918 = pick_first_existing(
            cfg.smk_9918,
            cfg.interim_dir / "smk_9918.parquet",
            cfg.interim_dir / "smk_9918.csv",
        )
        smk_1923 = pick_first_existing(
            cfg.smk_1923,
            cfg.interim_dir / "smk_1923.parquet",
            cfg.interim_dir / "smk_1923.csv",
        )
        if smk_9918 is None:
            raise FileNotFoundError("Provide smk_9923 or smk_9918 under ./interim/ (csv/parquet).")
        df_9918 = upper_df(_read_any(smk_9918))
        if smk_1923:
            df_1923 = upper_df(_read_any(smk_1923))
            smk = pd.concat([df_9918, df_1923], ignore_index=True)
        else:
            smk = df_9918

    # Enforce schema and types
    keep = ["SEQN", "SMK_STATUS", "CIGS_PER_DAY", "PACK_YEARS", "FORMER_SMOKER"]
    smk = smk[keep].copy()
    smk["SMK_STATUS"] = smk["SMK_STATUS"].str.upper()
    smk["FORMER_SMOKER"] = smk["FORMER_SMOKER"].astype("int8")

    smk.to_parquet(cfg.out_dir / cfg.cov_smk, index=False)
    return smk

# -----------------
# Alcohol (ALQ)
# -----------------

# CDC categories (derived):
#  - NONE: 0 drinks or lifetime < 12 drinks
#  - MODERATE: Women 0<–1 per day; Men 0<–2 per day
#  - HEAVY: Women >1 per day; Men >2 per day


def _drinks_per_day_from_alq(df: pd.DataFrame) -> pd.Series:
    d = upper_df(df)
    # Typical quantity/frequency fields (vary by cycle). We'll try the common ALQ120Q/U (past 12 months):
    q = nhanes_na(d.get("ALQ120Q", pd.Series(np.nan, index=d.index)))  # number of drinks per occasion
    u = d.get("ALQ120U")  # 1=day, 2=week, 3=month, 4=year
    f = nhanes_na(d.get("ALQ121U", pd.Series(np.nan, index=d.index)))  # frequency unit count (e.g., days/week)
    fu = d.get("ALQ121Q")  # NOTE: some cycles flip Q/U naming; we guard below

    # Fallback alternative naming (older cycles use ALQ120U as unit, ALQ120Q as number of days, etc.).
    # We attempt to normalize as: drinks_per_day = (drinks_per_occasion * occasions_per_unit) / days_in_unit.
    drinks_per_occasion = q

    # occasions per unit (approx). Try ALQ101/ALQ120 combos when available.
    occ_q = nhanes_na(d.get("ALQ101", pd.Series(np.nan, index=d.index)))  # days drank in past year
    if occ_q.notna().any():
        occasions_per_year = occ_q
    else:
        # try to build from ALQ120U (unit) and ALQ120Q (count) if that pair encodes frequency
        count = nhanes_na(d.get("ALQ120Q", pd.Series(np.nan, index=d.index)))
        unit = d.get("ALQ120U")
        occasions_per_year = pd.Series(np.nan, index=d.index)
        if unit is not None:
            occasions_per_year = (
                count.where(unit == 4)  # per year
                .fillna((count * 12).where(unit == 3))  # per month
                .fillna((count * 52).where(unit == 2))  # per week
                .fillna((count * 365).where(unit == 1))  # per day
            )

    drinks_per_day = (drinks_per_occasion * occasions_per_year) / 365.0
    return drinks_per_day


def categorize_alcohol(drinks_per_day: pd.Series, sex: pd.Series, lifetime_lt12: Optional[pd.Series] = None) -> pd.Series:
    sex = sex.replace({1: "M", 2: "F"}).astype("string").str.upper()
    cat = pd.Series("NONE", index=drinks_per_day.index, dtype="string")
    if lifetime_lt12 is not None:
        cat = cat.where(~(lifetime_lt12 == 1), "NONE")

    # Set MODERATE vs HEAVY by sex
    mod_mask = (
        ((sex == "F") & (drinks_per_day > 0) & (drinks_per_day <= 1)) |
        ((sex == "M") & (drinks_per_day > 0) & (drinks_per_day <= 2))
    )
    heavy_mask = (
        ((sex == "F") & (drinks_per_day > 1)) |
        ((sex == "M") & (drinks_per_day > 2))
    )
    cat = cat.where(~mod_mask, "MODERATE")
    cat = cat.where(~heavy_mask, "HEAVY")
    return cat.str.upper()


def build_alc(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    # ✅ B) Prefer the alcohol file you already built in /output
    out_path = cfg.out_dir / cfg.cov_alc
    if out_path.exists():
        return pd.read_parquet(out_path)

    # Fallback: try to build from interim stacks (legacy flow)
    demo = pd.read_parquet(cfg.demo_9923)[["SEQN", "RIAGENDR"]].rename(columns={"RIAGENDR": "SEX"})

    # Expect a stacked ALQ file at interim_dir/alq_9923.parquet; fall back to 1999–2018
    alq_9923 = cfg.interim_dir / "alq_9923.parquet"
    alq_9918 = cfg.interim_dir / "alq_9918.parquet"
    if alq_9923.exists():
        alq = pd.read_parquet(alq_9923)
    elif alq_9918.exists():
        alq = pd.read_parquet(alq_9918)
    else:
        raise FileNotFoundError(
            "Alcohol file not prebuilt in /output and no interim ALQ stack found.\n"
            "Either run the ALQ fetcher that writes output/cov_alc_1999_2023.parquet, "
            "or provide data/cov/alq_9923.parquet (or alq_9918.parquet)."
        )
    alq = upper_df(alq)

    drinks_per_day = _drinks_per_day_from_alq(alq)

    lifetime_lt12 = None
    if "ALQ110" in alq.columns:
        lifetime_lt12 = (alq["ALQ110"] == 2).astype("int8")  # 2=no -> <12 drinks lifetime

    sex_series = demo.set_index("SEQN").loc[alq["SEQN"], "SEX"].reset_index(drop=True)
    alc_cat = categorize_alcohol(drinks_per_day, sex=sex_series, lifetime_lt12=lifetime_lt12)

    out = pd.DataFrame({
        "SEQN": alq["SEQN"],
        "DRINKS_PER_DAY": drinks_per_day,
        "ALCOHOL_CAT": alc_cat,
    })
    out.to_parquet(out_path, index=False)
    return out


# -----------------------------
# Physical activity (PAQ/PAD)
# -----------------------------

def build_pa(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    def _read_any(p: Path) -> pd.DataFrame:
        return pd.read_parquet(p) if str(p).endswith(".parquet") else pd.read_csv(p)

    # Prefer combined 99–23 if present
    pa_9923 = pick_first_existing(
        cfg.interim_dir / "totalpa_9923_imputed.parquet",
        cfg.interim_dir / "totalpa_9923_imputed.csv",
    )
    if pa_9923:
        pa = upper_df(_read_any(pa_9923))
    else:
        # Fallback: 99–18 (+ optional 19–23 append)
        pa_9918 = pick_first_existing(
            cfg.pa_9918_imputed,
            cfg.interim_dir / "totalpa_9918_imputed.parquet",
            cfg.interim_dir / "totalpa_9918_imputed.csv",
        )
        if pa_9918 is None:
            raise FileNotFoundError("Provide totalpa_9923_imputed or totalpa_9918_imputed under ./interim/.")
        pa = upper_df(_read_any(pa_9918))
        pa_1923 = pick_first_existing(
            cfg.pa_1923,
            cfg.interim_dir / "totalpa_1923_imputed.parquet",
            cfg.interim_dir / "totalpa_1923_imputed.csv",
        )
        if pa_1923:
            pa = pd.concat([pa, upper_df(_read_any(pa_1923))], ignore_index=True)

    # Enforce names
    needed = ["SEQN", "LTPA_MET_HR_WK", "LTPA_IMPUTED_FLAG"]
    for col in needed:
        if col not in pa.columns:
            raise ValueError(f"PA table missing column: {col}")

    pa["LTPA_IMPUTED_FLAG"] = pa["LTPA_IMPUTED_FLAG"].astype("int8")
    pa.to_parquet(cfg.out_dir / cfg.cov_pa, index=False)
    return pa

# -----------------------------
# Anthropometrics (BMX)
# -----------------------------

def build_bmx(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    if cfg.bmx_9923 is None:
        # Expect stacked BMX at interim
        # prefer 2019–2023 stack; fall back to 1999–2018 if that's all you have for now
        bmx_9923 = cfg.interim_dir / "bmx_9923.parquet"
        bmx_9918 = cfg.interim_dir / "bmx_9918.parquet"
        if bmx_9923.exists():
            bmx = pd.read_parquet(bmx_9923)
        elif bmx_9918.exists():
            bmx = pd.read_parquet(bmx_9918)
        else:
            raise FileNotFoundError("Provide ./interim/bmx_9923.parquet (preferred) or ./interim/bmx_9918.parquet, or set cfg.bmx_9923.")
    else:
        bmx = pd.read_parquet(cfg.bmx_9923)

    bmx = upper_df(bmx)

    # Compute BMI if missing
    for col in ["BMXHT", "BMXWT"]:
        if col not in bmx.columns:
            raise ValueError(f"BMX table missing {col}")

    bmx["BMI"] = bmx.get("BMXBMI", np.nan)
    missing_bmi = bmx["BMI"].isna()
    bmx.loc[missing_bmi, "BMI"] = bmx.loc[missing_bmi, "BMXWT"] / (bmx.loc[missing_bmi, "BMXHT"] / 100.0) ** 2

    out = bmx[["SEQN", "BMXWT", "BMXHT", "BMI"]].copy()
    out.to_parquet(cfg.out_dir / cfg.cov_bmx, index=False)
    return out

# ------------------------------------
# Clinical & conditions + vitals/labs
# ------------------------------------

@dataclass
class ClinicalThresholds:
    # Parameterize thresholds to stay aligned with your protocol
    htn_sbp: float = 140.0
    htn_dbp: float = 90.0
    a1c_diabetes: float = 6.5
    fpg_diabetes: float = 126.0  # mg/dL


THR = ClinicalThresholds()


def build_clinical(cfg: Config = CONFIG, thr: ClinicalThresholds = THR) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    def _read_any(p: Path) -> pd.DataFrame:
        return pd.read_parquet(p) if str(p).endswith(".parquet") else pd.read_csv(p)

    # Prefer combined 99–23 if present
    clin_9923 = pick_first_existing(
        cfg.interim_dir / "clinical_9923.parquet",
        cfg.interim_dir / "clinical_9923.csv",
    )
    if clin_9923:
        clin = upper_df(_read_any(clin_9923))
    else:
        # Fallback: 99–18 (+ optional 19–23 append)
        p9918 = pick_first_existing(
            cfg.clinical_9918,
            cfg.interim_dir / "clinical_9918.parquet",
            cfg.interim_dir / "clinical_9918.csv",
            cfg.interim_dir / "nhanes_primary_anal_full_singleimputation_v2.parquet",
            cfg.interim_dir / "nhanes_primary_anal_full_singleimputation_v2.csv",
        )
        p1923 = pick_first_existing(
            cfg.clinical_1923,
            cfg.interim_dir / "clinical_1923.parquet",
            cfg.interim_dir / "clinical_1923.csv",
        )
        if p9918 is None:
            raise FileNotFoundError("Provide clinical_9923 or clinical_9918 under ./interim/.")
        df_9918 = upper_df(_read_any(p9918))
        if p1923:
            df_1923 = upper_df(_read_any(p1923))
            clin = pd.concat([df_9918, df_1923], ignore_index=True)
        else:
            clin = df_9918

    # Expect these columns already present per your v2 schema
    keep = [
        "SEQN", "BMI_CLAS", "DIABETES", "HTN", "HIGH_CHOL", "CVD", "CANCER",
        "SBP", "DBP", "TCHOL", "HDL", "LDL", "TG"
    ]
    missing = [c for c in keep if c not in clin.columns]
    if missing:
        raise ValueError(f"clinical table missing columns: {missing}")
    out = clin[keep].copy()

    # Types
    for b in ["DIABETES", "HTN", "HIGH_CHOL", "CVD", "CANCER"]:
        out[b] = out[b].astype("Int8")
    out.to_parquet(cfg.out_dir / cfg.cov_clinical, index=False)
    return out

# ----------------
# Household size
# ----------------

def build_household(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)
    demo = pd.read_parquet(cfg.demo_9923)
    demo = upper_df(demo)
    if "DMDHHSIZ" not in demo.columns:
        raise ValueError("DMDHHSIZ not found in DEMO 1999–2023 stack.")
    out = demo[["SEQN", "DMDHHSIZ"]].copy()
    out.to_parquet(cfg.out_dir / cfg.cov_household, index=False)
    return out

# ----------------------
# Survey design fields
# ----------------------

SURVEY_KEEP = ["SEQN", "SDDSRVYR", "SDMVPSU", "SDMVSTRA", "WTMEC2YR"]


def get_survey_core(cfg: Config = CONFIG) -> pd.DataFrame:
    """Auto-pick DEMO table: prefer 1999–2023; fallback to 1999–2018.
    Expects: SEQN, SDDSRVYR, SDMVPSU, SDMVSTRA, WTMEC2YR in the chosen table.
    """
    # Choose explicit path if it exists, else look in interim_dir for standard names
    demo_path = None
    if cfg.demo_9923 and Path(cfg.demo_9923).exists():
        demo_path = Path(cfg.demo_9923)
    elif cfg.demo_9918 and Path(cfg.demo_9918).exists():
        demo_path = Path(cfg.demo_9918)
    else:
        demo_path = pick_first_existing(
            cfg.interim_dir / "demo_9923.parquet",
            cfg.interim_dir / "demo_9918.parquet",
        )
    if demo_path is None:
        raise FileNotFoundError("Could not find DEMO table (expected demo_9923.parquet or demo_9918.parquet).")

    demo = upper_df(pd.read_parquet(demo_path))
    miss = [c for c in SURVEY_KEEP if c not in demo.columns]
    if miss:
        raise ValueError(f"Missing survey fields in DEMO: {miss}")
    return demo[SURVEY_KEEP].copy()

# --------------------------
# Merge to cov_core (LEFT)
# --------------------------


def build_core(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    # Load components (each function writes its own parquet already)
    smk = pd.read_parquet(cfg.out_dir / cfg.cov_smk)
    alc = pd.read_parquet(cfg.out_dir / cfg.cov_alc)
    pa = pd.read_parquet(cfg.out_dir / cfg.cov_pa)
    bmx = pd.read_parquet(cfg.out_dir / cfg.cov_bmx)
    clin = pd.read_parquet(cfg.out_dir / cfg.cov_clinical)
    hh = pd.read_parquet(cfg.out_dir / cfg.cov_household)
    survey = get_survey_core(cfg)

    # LEFT-join all on SEQN
    core = survey
    for part in [smk, alc, pa, bmx, clin, hh]:
        core = core.merge(part, on="SEQN", how="left")

    # Uppercase column names
    core.columns = [c.upper() for c in core.columns]

    core.to_parquet(cfg.out_dir / cfg.cov_core, index=False)
    return core

# --------------
# Orchestrator
# --------------

def run_all(cfg: Config = CONFIG) -> Dict[str, pd.DataFrame]:
    ensure_dir(cfg.out_dir)
    out = {}
    out["smk"] = build_smk(cfg)
    out["alc"] = build_alc(cfg)
    out["pa"] = build_pa(cfg)
    out["bmx"] = build_bmx(cfg)
    out["clinical"] = build_clinical(cfg)
    out["household"] = build_household(cfg)
    out["core"] = build_core(cfg)
    return out

# ---------------------------
# Minimal sanity validations
# ---------------------------

def quick_checks(cfg: Config = CONFIG) -> pd.Series:
    core = pd.read_parquet(cfg.out_dir / cfg.cov_core)
    checks = {
        "n_rows": len(core),
        "n_unique_seqn": core["SEQN"].nunique(),
        "missing_bmi_pct": core["BMI"].isna().mean(),
        "missing_alcohol_cat_pct": core["ALCOHOL_CAT"].isna().mean(),
        "missing_smk_status_pct": core["SMK_STATUS"].isna().mean(),
        "has_weights": int("WTMEC2YR" in core.columns),
    }
    return pd.Series(checks)

if __name__ == "__main__":
    # Keep outputs in /Users/dengshuyue/Desktop/SDOH/analysis/output
    ensure_dir(CONFIG.out_dir)
    # run_all(CONFIG)  # uncomment to execute end-to-end
    pass


In [130]:
from pathlib import Path

BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
out = BASE / "output"

need = {
    "smk": out / "cov_smk_1999_2023.parquet",
    "alc": out / "cov_alc_1999_2023.parquet",
    "pa":  out / "cov_pa_1999_2023.parquet",
    "bmx": out / "cov_bmx_1999_2023.parquet",
    "clin":out / "cov_clinical_1999_2023.parquet",
    "hh":  out / "cov_household_1999_2023.parquet",
}
for k,p in need.items():
    print(f"{k:4} →", "OK" if p.exists() else "MISSING", p)


smk  → OK /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_smk_1999_2023.parquet
alc  → OK /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_alc_1999_2023.parquet
pa   → OK /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_pa_1999_2023.parquet
bmx  → OK /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_bmx_1999_2023.parquet
clin → OK /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_clinical_1999_2023.parquet
hh   → OK /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_household_1999_2023.parquet


In [128]:

# delete later 
# --- Build a minimal DEMO with survey design fields, then rebuild core ---

from pathlib import Path
import pandas as pd

ROOT    = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
INTERIM = ROOT / "interim"
INTERIM.mkdir(parents=True, exist_ok=True)

NEEDED = ["SEQN","SDDSRVYR","SDMVPSU","SDMVSTRA","WTMEC2YR"]

def _cols_ok(df, need=NEEDED):
    cols = [c.upper() for c in df.columns]
    return all(c in cols for c in need)

# 1) Try existing candidates first
candidates = [
    CONFIG.demo_9923,
    ROOT/"output"/"nhanes_mort_demo_sdoh_1999_2018.parquet",
    ROOT/"data"/"demo9923.csv",
]

chosen = None
why = None
for p in candidates:
    if p is None: 
        continue
    p = Path(p)
    if not p.exists():
        continue
    try:
        df = pd.read_parquet(p) if p.suffix==".parquet" else pd.read_csv(p, low_memory=False)
        df.columns = [c.upper() for c in df.columns]
        if _cols_ok(df):
            # minimal clean
            df = df[NEEDED].copy()
            df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce")
            df = df.dropna(subset=["SEQN"]).drop_duplicates("SEQN")
            outp = INTERIM/"survey_design_9923.parquet"
            df.to_parquet(outp, index=False)
            chosen = outp
            why = f"Using existing file → {p}"
            break
    except Exception:
        pass

# 2) If not found, build from raw DEMO XPTs
if chosen is None:
    import glob
    xpt_dirs = [
        ROOT/"data"/"nhanes_by_module"/"DEMO",
        ROOT/"data"/"nhanes_deit",
        ROOT/"data"/"household_size",
    ]
    frames = []
    for d in xpt_dirs:
        for xpt in glob.glob(str(d/"*.xpt")):
            try:
                df = pd.read_sas(xpt, format="xport")
                df.columns = [c.upper() for c in df.columns]
                if "SEQN" not in df.columns:
                    continue
                # keep only available of the needed ones
                keep = [c for c in NEEDED if c in df.columns]
                if not {"SDMVPSU","SDMVSTRA","WTMEC2YR"}.issubset(set(keep)):
                    # skip if this file doesn't have the design trio
                    continue
                sub = df[keep].copy()
                frames.append(sub)
            except Exception:
                continue

    if not frames:
        raise RuntimeError("Couldn't build DEMO survey table from raw XPTs; no suitable files with SDMVPSU/SDMVSTRA/WTMEC2YR were found.")

    demo = pd.concat(frames, ignore_index=True)
    demo = demo.drop_duplicates(subset=["SEQN"], keep="first")
    # Normalize types
    demo["SEQN"]     = pd.to_numeric(demo["SEQN"], errors="coerce")
    demo["SDDSRVYR"] = pd.to_numeric(demo["SDDSRVYR"], errors="coerce")
    demo["SDMVPSU"]  = pd.to_numeric(demo["SDMVPSU"], errors="coerce")
    demo["SDMVSTRA"] = pd.to_numeric(demo["SDMVSTRA"], errors="coerce")
    demo["WTMEC2YR"] = pd.to_numeric(demo["WTMEC2YR"], errors="coerce")

    demo = demo.dropna(subset=["SEQN"]).drop_duplicates("SEQN")
    outp = INTERIM/"survey_design_9923.parquet"
    demo.to_parquet(outp, index=False)
    chosen = outp
    why = f"Built from raw DEMO XPTs under {ROOT}/data"

print("Survey DEMO source:", why)
print("→", chosen)

# Point CONFIG to the newly created survey table and rebuild core
CONFIG.demo_9923 = Path(chosen)

# Rebuild just the core (smk/pa already produced)
core = build_core(CONFIG)
print("✓ cov_core written to:", CONFIG.out_dir / CONFIG.cov_core)

# Quick sanity
print(quick_checks(CONFIG))

# Peek a few rows
display(core.sort_values("SEQN").head(10))


Survey DEMO source: Using existing file → /Users/dengshuyue/Desktop/SDOH/analysis/data/cov/demo9923.parquet
→ /Users/dengshuyue/Desktop/SDOH/analysis/interim/survey_design_9923.parquet
✓ cov_core written to: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_1999_2023.parquet
n_rows                     128809.000000
n_unique_seqn              128809.000000
missing_bmi_pct                 0.318378
missing_alcohol_cat_pct         0.465402
missing_smk_status_pct          0.572903
has_weights                     1.000000
dtype: float64


Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,SMK_STATUS,CIGS_PER_DAY,PACK_YEARS,FORMER_SMOKER,DRINKS_PER_DAY,LIFETIME_LT12,ALCOHOL_CAT,LTPA_MET_HR_WK,LTPA_IMPUTED_FLAG,BMXWT,BMXHT,BMI,BMI_CLAS,DIABETES,HTN,HIGH_CHOL,CVD,CANCER,SBP,DBP,TCHOL,HDL,LDL,TG,DMDHHSIZ
0,1.0,1.0,1.0,5.0,10982.898896,,,,,,,,,,12.5,91.6,14.897695,UNDER,0,0,0,0,0,91.333333,56.0,131.0,59.0,54.0,99.0,3.0
1,2.0,1.0,3.0,1.0,28325.384898,NEVER,,,0.0,0.789041,0.0,moderate,0.0,1.0,75.4,174.0,24.904215,NORMAL,0,0,0,0,1,100.666667,56.666667,215.0,54.0,136.0,128.0,1.0
2,3.0,1.0,2.0,7.0,46192.256945,,,,,,,,,,32.9,136.6,17.631713,UNDER,0,0,0,0,0,108.666667,62.0,129.0,30.0,58.0,202.0,4.0
3,4.0,1.0,1.0,2.0,10251.26002,,,,,,,,,,13.3,,,,0,0,1,0,0,95.333333,61.333333,211.0,43.0,161.0,37.0,7.0
4,5.0,1.0,2.0,8.0,99445.065735,FORMER,,,1.0,12.0,0.0,moderate,41.066667,1.0,92.5,178.3,29.096386,OVER,0,1,1,0,0,122.0,82.666667,279.0,42.0,168.0,347.0,3.0
5,6.0,1.0,2.0,2.0,39656.600444,,,,,,,,,,59.2,162.0,22.557537,NORMAL,0,0,0,0,0,114.666667,68.0,153.0,61.0,59.0,181.0,2.0
6,7.0,1.0,2.0,4.0,25525.423409,FORMER,,8030.0,1.0,,1.0,none,3.033333,1.0,78.0,162.9,29.393577,OVER,0,0,1,0,0,125.333333,80.0,245.0,105.0,127.0,62.0,1.0
7,8.0,1.0,1.0,6.0,31510.587866,,,,,,,,,,40.7,162.0,15.508307,UNDER,0,0,0,0,0,100.666667,49.333333,162.0,67.0,88.0,33.0,7.0
8,9.0,1.0,2.0,9.0,7575.870247,,,,,,,,,,45.5,156.9,18.482704,UNDER,0,0,0,0,0,109.333333,53.333333,148.0,58.0,79.0,56.0,4.0
9,10.0,1.0,1.0,7.0,22445.808572,CURRENT,1.0,,0.0,0.19726,0.0,moderate,0.0,1.0,111.8,190.1,30.936955,OBESE,0,1,0,0,0,145.333333,96.0,140.0,51.0,80.0,45.0,1.0


In [127]:
import pandas as pd
from pathlib import Path

core = pd.read_parquet("/Users/dengshuyue/Desktop/SDOH/analysis/out/cov_core_1999_2023.parquet")

era = (core["SDDSRVYR"] <= 10).map({True:"1999–2018", False:"2019–2023"})
def miss(col): return core[col].isna().groupby(era).mean().mul(100).round(1)

checks = pd.DataFrame({
    "SMK_STATUS %NA": miss("SMK_STATUS"),
    "LTPA_MET_HR_WK %NA": miss("LTPA_MET_HR_WK"),
    "BMI %NA": miss("BMI"),
    "WTMEC2YR %NA": miss("WTMEC2YR"),
})
checks


Unnamed: 0_level_0,SMK_STATUS %NA,LTPA_MET_HR_WK %NA,BMI %NA,WTMEC2YR %NA
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999–2018,45.7,45.6,13.3,0.0
2019–2023,100.0,100.0,100.0,0.0


<h2> Step 2. add Alcohol by fetch </h2>

In [125]:
# %% NHANES ALQ → DRINKS_PER_DAY + ALCOHOL_CAT (proposal thresholds)
from pathlib import Path
import pandas as pd
import numpy as np
import requests

# --------- YOUR fixed locations ---------
BASE        = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUTPUT_DIR  = BASE / "output"                # final parquet location
DATA_COV    = BASE / "data" / "cov"          # keep inputs here
ALC_STORE   = DATA_COV / "alcohol"           # ONLY new subfolder we may create
DEMO_PKL    = BASE / "data" / "demo9923.pkl"           # your existing demo
DEMO_PARQ   = DATA_COV / "demo9923.parquet"            # canonical fallback we may build

ALC_STORE.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
DATA_COV.mkdir(parents=True, exist_ok=True)

# --------- URL map (two mirrors each) ---------
ALQ_URLS = {
    "1999-2000": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/ALQ.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/ALQ.XPT",
    ],
    "2001-2002": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/ALQ_B.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/ALQ_B.XPT",
    ],
    "2003-2004": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/ALQ_C.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/ALQ_C.XPT",
    ],
    "2005-2006": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/ALQ_D.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/ALQ_D.XPT",
    ],
    "2007-2008": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/ALQ_E.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/ALQ_E.XPT",
    ],
    "2009-2010": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/ALQ_F.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/ALQ_F.XPT",
    ],
    "2011-2012": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/ALQ_G.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/ALQ_G.XPT",
    ],
    "2013-2014": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/ALQ_H.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/ALQ_H.XPT",
    ],
    "2015-2016": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/ALQ_I.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/ALQ_I.XPT",
    ],
    "2017-2018": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/ALQ_J.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/ALQ_J.XPT",
    ],
    "2017-March 2020 (pre-pandemic)": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_ALQ.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_ALQ.XPT",
    ],
    "August 2021–August 2023": [
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/ALQ_L.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/ALQ_L.XPT",
        "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/ALQ_Q.xpt",
        "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/ALQ_Q.XPT",
    ],
}
CYCLES = list(ALQ_URLS.keys())

# --------- helpers ---------
def fetch(url: str, dest: Path, timeout=90):
    headers={"User-Agent":"nhanes-fetch/1.0"}
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        tmp = dest.with_suffix(dest.suffix + ".downloading")
        with open(tmp, "wb") as f:
            for chunk in r.iter_content(1<<15):
                if chunk: f.write(chunk)
        tmp.rename(dest)
    return dest

def ensure_xpt(cycle: str) -> Path:
    """Return local ALQ .xpt for cycle stored under data/cov/alcohol/."""
    for url in ALQ_URLS[cycle]:
        name = Path(url).name
        local = ALC_STORE / name
        if local.exists():
            return local
        try:
            print(f"⬇️  {cycle} → {name}")
            return fetch(url, local)
        except Exception as e:
            print(f"   ⚠️ {e}")
    raise FileNotFoundError(f"ALQ not available for {cycle}")

def read_xpt(p: Path) -> pd.DataFrame:
    try:
        import pyreadstat
        df, _ = pyreadstat.read_xport(p)
    except Exception:
        df = pd.read_sas(p, format="xport")
    df.columns = [c.upper() for c in df.columns]
    return df

NH_MISS = {7, 9, 77, 99, 777, 999, 7777, 9999, 77777, 99999}
def clean_num(s: pd.Series) -> pd.Series:
    s = pd.to_numeric(s, errors="coerce")
    return s.mask(s.isin(NH_MISS))

# --------- load ALQ across cycles ---------
parts = []
for cycle in CYCLES:
    try:
        fp = ensure_xpt(cycle)
    except FileNotFoundError as e:
        print(f"⚠️ {e}"); continue
    df = read_xpt(fp)
    df["CYCLE"] = cycle
    keep = [c for c in ["SEQN","CYCLE","ALQ110","ALQ101","ALQ151","ALQ120Q","ALQ120U","ALQ130"] if c in df.columns]
    parts.append(df[keep])
if not parts:
    raise RuntimeError("No ALQ data available.")
alq = pd.concat(parts, ignore_index=True)

# --------- derive DRINKS_PER_DAY (per proposal) ---------
count = clean_num(alq.get("ALQ120Q", pd.Series(np.nan, index=alq.index)))
unit  = alq.get("ALQ120U", pd.Series(np.nan, index=alq.index))
per_year = pd.Series(np.nan, index=alq.index, dtype="float")
per_year = per_year.where(~(unit == 1), 365.0)
per_year = per_year.where(~(unit == 2), 52.142)
per_year = per_year.where(~(unit == 3), 12.0)
per_year = per_year.where(~(unit == 4), 1.0)
occasions_per_year = count * per_year

drinks_per_occasion = clean_num(alq.get("ALQ130", pd.Series(np.nan, index=alq.index)))
dpd = (occasions_per_year * drinks_per_occasion) / 365.0
dpd = dpd.where(occasions_per_year.notna() & drinks_per_occasion.notna())

# lifetime <12 indicator (prefer ALQ110; fall back to ALQ151; else NA)
life_lt12 = None
if "ALQ110" in alq.columns:
    life_lt12 = (alq["ALQ110"] == 2).astype("Int8")
elif "ALQ151" in alq.columns:
    life_lt12 = (alq["ALQ151"] == 2).astype("Int8")

alc_min = alq[["SEQN"]].copy()
alc_min["DRINKS_PER_DAY"] = dpd
if life_lt12 is not None:
    alc_min["LIFETIME_LT12"] = life_lt12

# --------- load DEMO sex (prefer your .pkl; else cov/demo9923.parquet; else build minimal) ---------
if DEMO_PKL.exists():
    demo = pd.read_pickle(DEMO_PKL)
elif DEMO_PARQ.exists():
    demo = pd.read_parquet(DEMO_PARQ)
else:
    # Minimal build (SEQN, RIAGENDR) only, stored exactly at data/cov/demo9923.parquet
    print("ℹ️  DEMO not found — building minimal RIAGENDR stack into", DEMO_PARQ)
    DEMO_URLS = {
        "1999-2000": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/DEMO.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO.XPT",
        ],
        "2001-2002": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/DEMO_B.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/DEMO_B.XPT",
        ],
        "2003-2004": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/DEMO_C.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/DEMO_C.XPT",
        ],
        "2005-2006": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DEMO_D.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/DEMO_D.XPT",
        ],
        "2007-2008": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DEMO_E.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/DEMO_E.XPT",
        ],
        "2009-2010": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/DEMO_F.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/DEMO_F.XPT",
        ],
        "2011-2012": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/DEMO_G.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DEMO_G.XPT",
        ],
        "2013-2014": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DEMO_H.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DEMO_H.XPT",
        ],
        "2015-2016": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DEMO_I.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DEMO_I.XPT",
        ],
        "2017-2018": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DEMO_J.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
        ],
        "2017-March 2020 (pre-pandemic)": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_DEMO.XPT",
        ],
        "August 2021–August 2023": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/DEMO_L.XPT",
        ],
    }
    def read_xpt_min(p: Path) -> pd.DataFrame:
        try:
            import pyreadstat
            df, _ = pyreadstat.read_xport(p)
        except Exception:
            df = pd.read_sas(p, format="xport")
        df.columns = [c.upper() for c in df.columns]
        return df

    demo_parts = []
    for cyc, urls in DEMO_URLS.items():
        got = None
        for u in urls:
            try:
                name = Path(u).name
                dest = DATA_COV / name  # store right under data/cov
                if not dest.exists():
                    print(f"⬇️  DEMO {cyc} → {name}")
                    fetch(u, dest)
                got = dest; break
            except Exception as e:
                print("   ⚠️", e)
        if got is None:
            continue
        demo_df = read_xpt_min(got)
        keep = [c for c in ["SEQN","RIAGENDR"] if c in demo_df.columns]
        if keep:
            demo_parts.append(demo_df[keep])
    if not demo_parts:
        raise RuntimeError("Could not build minimal DEMO (RIAGENDR).")
    demo = pd.concat(demo_parts, ignore_index=True).drop_duplicates("SEQN")
    demo.to_parquet(DEMO_PARQ, index=False)

# standardize columns
demo.columns = [c.upper() for c in demo.columns]
demo = demo.drop_duplicates("SEQN")

# --------- categorize per proposal (safe dtype handling; no np.select) ---------
sex  = pd.to_numeric(demo.set_index("SEQN")["RIAGENDR"].reindex(alc_min["SEQN"]), errors="coerce")  # 1=male, 2=female
dpd  = pd.to_numeric(alc_min["DRINKS_PER_DAY"], errors="coerce")
life = pd.to_numeric(alc_min.get("LIFETIME_LT12", pd.Series(pd.NA, index=alc_min.index)), errors="coerce")

none_mask     = dpd.isna() | (dpd < 0.03) | (life == 1)                    # <12 lifetime OR <0.03/day
heavy_mask    = ((sex == 2) & (dpd >= 1.0)) | ((sex == 1) & (dpd >= 2.0))  # women ≥1/day; men ≥2/day
moderate_mask = (~none_mask) & (~heavy_mask)

cat = pd.Series("none", index=alc_min.index, dtype="object")
cat.loc[moderate_mask] = "moderate"
cat.loc[heavy_mask]    = "heavy"

alc_min["ALCOHOL_CAT"] = pd.Categorical(cat, categories=["none","moderate","heavy"], ordered=True)

# --------- write to YOUR output folder ---------
out_path = OUTPUT_DIR / "cov_alc_1999_2023.parquet"
alc_min.to_parquet(out_path, index=False)
print("✅ wrote", out_path)
print(alc_min["ALCOHOL_CAT"].value_counts(dropna=False).rename_axis("ALCOHOL_CAT").to_frame("n").head(10))


✅ wrote /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_alc_1999_2023.parquet
                 n
ALCOHOL_CAT       
none         42792
moderate     20674
heavy         5395


<h2> Step2.1 add Alcohol back to core </h2>

In [126]:
from pathlib import Path
import pandas as pd

BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
core_path = BASE / "output" / "cov_core_1999_2023.parquet"
alc_path  = BASE / "output" / "cov_alc_1999_2023.parquet"

core = pd.read_parquet(core_path)
alc  = pd.read_parquet(alc_path)[["SEQN","DRINKS_PER_DAY","ALCOHOL_CAT"]]

core_v2 = core.merge(alc, on="SEQN", how="left")
out_core_v2 = BASE / "output" / "cov_core_1999_2023_v2.parquet"
core_v2.to_parquet(out_core_v2, index=False)

print("✅ wrote", out_core_v2)
print(core_v2["ALCOHOL_CAT"].value_counts(dropna=False).rename_axis("ALCOHOL_CAT").to_frame("n").head(10))


✅ wrote /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_1999_2023_v2.parquet
                 n
ALCOHOL_CAT       
NaN          59948
none         42792
moderate     20674
heavy         5395


<h2> Point config to your actual files (1999–2018)</h2>

In [73]:
import numpy as np
import pandas as pd
from pathlib import Path

def _normalize_sex(series: pd.Series) -> pd.Series:
    """Map common encodings to 'M'/'F'. Unknown -> <NA>."""
    s = series.copy()
    # unify case
    if s.dtype == "O" or pd.api.types.is_string_dtype(s):
        s = s.astype("string").str.strip().str.upper()
        s = s.replace(
            {
                "MALE": "M", "M": "M", "1": "M", "01": "M",
                "FEMALE": "F", "F": "F", "2": "F", "02": "F",
            }
        )
    else:
        # numeric codes like 1/2
        s = pd.to_numeric(s, errors="coerce")
        s = s.map({1: "M", 2: "F"}).astype("string")
    # anything else becomes <NA>
    s = s.where(s.isin(["M", "F"]), pd.NA)
    return s

def _get_sex_from_demo(demo_df: pd.DataFrame) -> pd.DataFrame:
    d = demo_df.copy()
    d.columns = [c.upper() for c in d.columns]
    # try common candidates
    cand = [c for c in d.columns if c.upper() in
            {"RIAGENDR","SEX","GENDER","RIAGENDR_X","RIAGENDR_Y","SEX_CODE"}]
    if not cand:
        # fuzzy search
        cand = [c for c in d.columns if "GEND" in c.upper() or c.upper().startswith("SEX")]
    if cand:
        sex = _normalize_sex(d[cand[0]])
        return pd.DataFrame({"SEQN": d["SEQN"], "SEX": sex})
    else:
        # no sex column; return NA-sex
        return pd.DataFrame({"SEQN": d["SEQN"], "SEX": pd.Series(pd.NA, index=d.index, dtype="string")})

def categorize_alcohol(drinks_per_day: pd.Series,
                       sex: pd.Series,
                       lifetime_lt12: pd.Series | None = None) -> pd.Series:
    """CDC cats; if sex unknown, only label NONE for zero/lifetime<12; else NA."""
    dpd = pd.to_numeric(drinks_per_day, errors="coerce")
    sex = sex.astype("string").str.upper()

    # Start as NA; fill to NONE for zero intake
    cat = pd.Series(pd.NA, index=dpd.index, dtype="string")

    # Lifetime <12 drinks -> NONE
    if lifetime_lt12 is not None:
        cat = cat.where(~(lifetime_lt12 == 1), "NONE")

    # 0 drinks/day -> NONE regardless of sex
    cat = cat.where(~(dpd.fillna(0) == 0), "NONE")

    # Apply sex-specific thresholds where sex is known and dpd>0
    maskF = (sex == "F") & (dpd > 0)
    maskM = (sex == "M") & (dpd > 0)

    # Moderate
    cat = cat.mask(maskF & (dpd <= 1), "MODERATE")
    cat = cat.mask(maskM & (dpd <= 2), "MODERATE")

    # Heavy (overwrites moderate where applicable)
    cat = cat.mask(maskF & (dpd > 1), "HEAVY")
    cat = cat.mask(maskM & (dpd > 2), "HEAVY")

    return cat.str.upper()

def build_alc(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    # Load DEMO (any of your configured/demo interim paths)
    demo_p = None
    if cfg.demo_9923 and Path(cfg.demo_9923).exists():
        demo_p = Path(cfg.demo_9923)
    elif cfg.demo_9918 and Path(cfg.demo_9918).exists():
        demo_p = Path(cfg.demo_9918)
    else:
        demo_p = pick_first_existing(cfg.interim_dir / "demo_9923.parquet",
                                     cfg.interim_dir / "demo_9918.parquet")
    if demo_p is None:
        raise FileNotFoundError("Need DEMO table for ALQ (demo_9923/demo_9918).")

    demo = pd.read_parquet(demo_p) if str(demo_p).endswith(".parquet") else pd.read_csv(demo_p, low_memory=False)
    demo = demo.rename(columns={c: c.upper() for c in demo.columns})
    demo_sex = _get_sex_from_demo(demo)  # -> SEQN, SEX ('M'/'F'/NA)

    # Load ALQ stack (prefer 99–23, else 99–18)
    alq_9923 = cfg.interim_dir / "alq_9923.parquet"
    alq_9918 = cfg.interim_dir / "alq_9918.parquet"
    if alq_9923.exists():
        alq = pd.read_parquet(alq_9923)
    elif alq_9918.exists():
        alq = pd.read_parquet(alq_9918)
    else:
        raise FileNotFoundError("Provide ./interim/alq_9923.parquet (preferred) or ./interim/alq_9918.parquet.")
    alq = alq.rename(columns={c: c.upper() for c in alq.columns})

    # Drinks/day + lifetime <12 flag
    drinks_per_day = _drinks_per_day_from_alq(alq)
    lifetime_lt12 = None
    if "ALQ110" in alq.columns:
        # ALQ110: “Had at least 12 drinks in lifetime?” 1=Yes, 2=No -> lifetime_lt12 = 1 if 'No'
        lifetime_lt12 = (alq["ALQ110"] == 2).astype("Int8")

    # Bring in SEX where available
    alq_sex = alq[["SEQN"]].merge(demo_sex, on="SEQN", how="left")["SEX"]

    alc_cat = categorize_alcohol(drinks_per_day, sex=alq_sex, lifetime_lt12=lifetime_lt12)

    out = pd.DataFrame({
        "SEQN": alq["SEQN"],
        "DRINKS_PER_DAY": drinks_per_day,
        "ALCOHOL_CAT": alc_cat,
    })
    out.to_parquet(cfg.out_dir / cfg.cov_alc, index=False)
    return out


In [74]:
# --- PATCH: robust Physical Activity loader ---
import pandas as pd
import numpy as np
from pathlib import Path

def _read_any(p: Path) -> pd.DataFrame:
    return pd.read_parquet(p) if str(p).endswith(".parquet") else pd.read_csv(p, low_memory=False)

def _find_first(dcols, candidates):
    for c in candidates:
        if c in dcols: 
            return c
    return None

def _find_by_partials(dcols, must_have=(), any_of=()):
    for c in dcols:
        u = c.upper()
        if all(m in u for m in must_have) and (not any_of or any(a in u for a in any_of)):
            return c
    return None

def build_pa(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    # Load combined or 99–18 (+ optional 19–23)
    pa_9923 = pick_first_existing(
        cfg.interim_dir / "totalpa_9923_imputed.parquet",
        cfg.interim_dir / "totalpa_9923_imputed.csv",
    )
    if pa_9923:
        pa = _read_any(pa_9923)
    else:
        pa_9918 = pick_first_existing(
            cfg.pa_9918_imputed,
            cfg.interim_dir / "totalpa_9918_imputed.parquet",
            cfg.interim_dir / "totalpa_9918_imputed.csv",
        )
        if pa_9918 is None:
            raise FileNotFoundError("Provide totalpa_9923_imputed or totalpa_9918_imputed under ./interim/.")
        pa = _read_any(pa_9918)
        pa_1923 = pick_first_existing(
            cfg.pa_1923,
            cfg.interim_dir / "totalpa_1923_imputed.parquet",
            cfg.interim_dir / "totalpa_1923_imputed.csv",
        )
        if pa_1923:
            pa = pd.concat([pa, _read_any(pa_1923)], ignore_index=True)

    # Normalize columns
    pa = pa.copy()
    pa.columns = [c.upper() for c in pa.columns]
    if "SEQN" not in pa.columns:
        raise ValueError("PA table missing SEQN.")

    # 1) Find MET hours/week (or convert from minutes)
    cols = pa.columns
    met_hr_candidates_exact = [
        "LTPA_MET_HR_WK", "TOTALPA_MET_HR_WK", "TOTPA_MET_HR_WK",
        "PA_MET_HR_WK", "MET_HR_WK", "TOTAL_PA_MET_HR_WK"
    ]
    met_min_candidates_exact = [
        "LTPA_MET_MIN_WK", "TOTALPA_MET_MIN_WK", "TOTPA_MET_MIN_WK",
        "PA_MET_MIN_WK", "MET_MIN_WK", "TOTAL_PA_MET_MIN_WK"
    ]
    met_hr_col  = _find_first(cols, met_hr_candidates_exact)
    met_min_col = _find_first(cols, met_min_candidates_exact)

    # Fuzzy fallbacks
    if met_hr_col is None:
        met_hr_col = _find_by_partials(cols, must_have=("MET","HR"), any_of=("WK","WEEK"))
    if met_min_col is None:
        met_min_col = _find_by_partials(cols, must_have=("MET","MIN"), any_of=("WK","WEEK"))

    if met_hr_col is not None:
        met_hr = pd.to_numeric(pa[met_hr_col], errors="coerce")
    elif met_min_col is not None:
        met_hr = pd.to_numeric(pa[met_min_col], errors="coerce") / 60.0
    else:
        # last resort: if there is a total minutes col like "TOTAL_MIN_WK", convert; else create NA
        tot_min_col = _find_by_partials(cols, must_have=("MIN",), any_of=("WK","WEEK"))
        if tot_min_col is not None:
            met_hr = pd.to_numeric(pa[tot_min_col], errors="coerce") / 60.0
        else:
            print("Warning: could not find MET hours or minutes; setting LTPA_MET_HR_WK to NaN.")
            met_hr = pd.Series(np.nan, index=pa.index, dtype="float")

    # 2) Find imputation flag (default 0)
    flag_candidates_exact = [
        "LTPA_IMPUTED_FLAG", "PA_IMPUTED_FLAG", "TOTALPA_IMPUTED_FLAG",
        "IMPUTED_FLAG", "IMPUTED"
    ]
    flag_col = _find_first(cols, flag_candidates_exact)
    if flag_col is None:
        flag_col = _find_by_partials(cols, must_have=("IMPUT",), any_of=("FLAG",""))

    if flag_col is not None:
        imputed = pd.to_numeric(pa[flag_col], errors="coerce").fillna(0).astype("int8")
    else:
        imputed = pd.Series(0, index=pa.index, dtype="int8")

    out = pd.DataFrame({
        "SEQN": pa["SEQN"],
        "LTPA_MET_HR_WK": met_hr,
        "LTPA_IMPUTED_FLAG": imputed,
    })
    out.to_parquet(cfg.out_dir / cfg.cov_pa, index=False)
    return out
# --- end patch ---


In [75]:
# --- PATCH: robust clinical builder (derives BMI_CLAS, HTN, HIGH_CHOL if missing) ---
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass

def build_clinical(cfg: Config = CONFIG, thr: ClinicalThresholds = THR) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)

    def _read_any(p: Path) -> pd.DataFrame:
        return pd.read_parquet(p) if str(p).endswith(".parquet") else pd.read_csv(p, low_memory=False)

    # Prefer combined 99–23 if present, else 99–18 (+ optional 19–23)
    clin_9923 = pick_first_existing(
        cfg.interim_dir / "clinical_9923.parquet",
        cfg.interim_dir / "clinical_9923.csv",
    )
    if clin_9923:
        clin = _read_any(clin_9923)
    else:
        p9918 = pick_first_existing(
            cfg.clinical_9918,
            cfg.interim_dir / "clinical_9918.parquet",
            cfg.interim_dir / "clinical_9918.csv",
            cfg.interim_dir / "nhanes_primary_anal_full_singleimputation_v2.parquet",
            cfg.interim_dir / "nhanes_primary_anal_full_singleimputation_v2.csv",
        )
        p1923 = pick_first_existing(
            cfg.clinical_1923,
            cfg.interim_dir / "clinical_1923.parquet",
            cfg.interim_dir / "clinical_1923.csv",
        )
        if p9918 is None:
            raise FileNotFoundError("Provide clinical_9923 or clinical_9918 under ./interim/.")
        df_9918 = _read_any(p9918)
        if p1923:
            df_1923 = _read_any(p1923)
            clin = pd.concat([df_9918, df_1923], ignore_index=True)
        else:
            clin = df_9918

    clin = clin.copy()
    clin.columns = [c.upper() for c in clin.columns]

    # -------------------------
    # BMI_CLAS (if missing)
    # -------------------------
    if "BMI_CLAS" not in clin.columns:
        # Try BMI from BMX output; else any BMI in this table; else NA
        bmi_series = None
        try:
            bmx_path = cfg.out_dir / cfg.cov_bmx
            if bmx_path.exists():
                bmx = pd.read_parquet(bmx_path)
                bmx.columns = [c.upper() for c in bmx.columns]
                if {"SEQN","BMI"}.issubset(bmx.columns) and "SEQN" in clin.columns:
                    bmi_map = bmx.set_index("SEQN")["BMI"]
                    bmi_series = clin["SEQN"].map(bmi_map)
        except Exception:
            pass
        if bmi_series is None:
            bmi_series = pd.to_numeric(clin.get("BMI", np.nan), errors="coerce")

        def _bmi_class(x):
            if pd.isna(x): return pd.NA
            if x < 18.5:  return "UNDER"
            if x < 25:    return "NORMAL"
            if x < 30:    return "OVER"
            return "OBESE"

        clin["BMI_CLAS"] = pd.Series([_bmi_class(v) for v in bmi_series], dtype="string")

    # -------------------------
    # HTN (if missing)
    # -------------------------
    if "HTN" not in clin.columns:
        # Look for common diagnosis/med flags
        diag_col = next((c for c in clin.columns if (("HTN" in c or "HYPERT" in c) and "MED" not in c and c != "HTN")), None)
        med_col  = next((c for c in clin.columns if ("MED" in c and ("BP" in c or "HYPER" in c))), None)
        sbp = pd.to_numeric(clin.get("SBP", np.nan), errors="coerce")
        dbp = pd.to_numeric(clin.get("DBP", np.nan), errors="coerce")

        htn = pd.Series(0, index=clin.index, dtype="Int8")
        if diag_col:
            diag = pd.to_numeric(clin[diag_col], errors="coerce")
            htn = ((diag == 1) | (diag > 0)).astype("Int8")
        if med_col:
            med = pd.to_numeric(clin[med_col], errors="coerce")
            htn = ((htn == 1) | (med == 1) | (med > 0)).astype("Int8")
        if ("SBP" in clin.columns) or ("DBP" in clin.columns):
            htn = ((htn == 1) | (sbp >= thr.htn_sbp) | (dbp >= thr.htn_dbp)).astype("Int8")

        clin["HTN"] = htn

    # -------------------------
    # HIGH_CHOL (if missing)
    # -------------------------
    if "HIGH_CHOL" not in clin.columns:
        chol_med_col = next((c for c in clin.columns if ("CHOL" in c and "MED" in c)), None)
        tch = pd.to_numeric(clin.get("TCHOL", np.nan), errors="coerce")
        ldl = pd.to_numeric(clin.get("LDL", np.nan), errors="coerce")

        high_chol = ((tch >= 240) | (ldl >= 160)).astype("Int8")
        if chol_med_col:
            med = pd.to_numeric(clin[chol_med_col], errors="coerce")
            high_chol = ((high_chol == 1) | (med == 1) | (med > 0)).astype("Int8")

        clin["HIGH_CHOL"] = high_chol

    # Ensure remaining expected columns exist (fill as NA if absent)
    keep = ["SEQN", "BMI_CLAS", "DIABETES", "HTN", "HIGH_CHOL", "CVD", "CANCER", "SBP", "DBP", "TCHOL", "HDL", "LDL", "TG"]
    for col in keep:
        if col not in clin.columns:
            clin[col] = pd.Series(np.nan, index=clin.index)

    out = clin[keep].copy()
    for b in ["DIABETES", "HTN", "HIGH_CHOL", "CVD", "CANCER"]:
        out[b] = pd.to_numeric(out[b], errors="coerce").astype("Int8")

    out.to_parquet(cfg.out_dir / cfg.cov_clinical, index=False)
    return out
# --- end patch ---


In [76]:
# --- PATCH: robust household builder ---
import pandas as pd, numpy as np
from pathlib import Path

def _read_any_demo(p: Path) -> pd.DataFrame:
    ext = p.suffix.lower()
    if ext == ".parquet": return pd.read_parquet(p)
    if ext == ".csv":     return pd.read_csv(p, low_memory=False)
    # XPORT or sas7bdat
    return pd.read_sas(p, format="xport" if ext==".xpt" else None)

def _stack_household_from_raw(search_root: Path) -> pd.DataFrame | None:
    pats = ["*DEMO*.xpt","*DEMO*.sas7bdat","*DEMO*.csv","*DEMO*.parquet"]
    files = []
    for pat in pats:
        files += list(search_root.rglob(pat))
    frames = []
    seen = set()
    for f in files:
        if f in seen: 
            continue
        seen.add(f)
        try:
            df = _read_any_demo(f)
            df.columns = [c.upper() for c in df.columns]
            if {"SEQN","DMDHHSIZ"}.issubset(df.columns):
                frames.append(df[["SEQN","DMDHHSIZ"]])
        except Exception:
            pass
    if not frames:
        return None
    out = pd.concat(frames, ignore_index=True)
    out = out.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"], keep="last")
    return out

def build_household(cfg: Config = CONFIG) -> pd.DataFrame:
    ensure_dir(cfg.out_dir)
    # Pick the DEMO table you configured earlier (parquet or csv)
    demo_p = None
    if cfg.demo_9923 and Path(cfg.demo_9923).exists():
        demo_p = Path(cfg.demo_9923)
    elif cfg.demo_9918 and Path(cfg.demo_9918).exists():
        demo_p = Path(cfg.demo_9918)
    else:
        demo_p = pick_first_existing(cfg.interim_dir / "demo_9923.parquet",
                                     cfg.interim_dir / "demo_9918.parquet")
    if demo_p is None:
        raise FileNotFoundError("Need DEMO (demo_9923/demo_9918) to build household.")

    demo = pd.read_parquet(demo_p) if str(demo_p).endswith(".parquet") else pd.read_csv(demo_p, low_memory=False)
    demo.columns = [c.upper() for c in demo.columns]
    if "SEQN" not in demo.columns:
        raise ValueError("DEMO missing SEQN.")

    # Case 1: DMDHHSIZ already present
    if "DMDHHSIZ" in demo.columns:
        out = demo[["SEQN","DMDHHSIZ"]].copy()
        out.to_parquet(cfg.out_dir / cfg.cov_household, index=False)
        return out

    # Case 2: use DMDFMSIZ (family size) as proxy if available
    if "DMDFMSIZ" in demo.columns:
        out = demo[["SEQN","DMDFMSIZ"]].rename(columns={"DMDFMSIZ":"DMDHHSIZ"})
        out.to_parquet(cfg.out_dir / cfg.cov_household, index=False)
        print("Note: DMDHHSIZ not found; used DMDFMSIZ as proxy.")
        return out

    # Case 3: scan raw DEMO files anywhere in your project for DMDHHSIZ and merge by SEQN
    # Search roots: cfg.raw_dir (if exists) and the project root inferred from interim_dir
    roots = []
    if cfg.raw_dir and Path(cfg.raw_dir).exists(): 
        roots.append(Path(cfg.raw_dir))
    # project root = parent of interim folder
    try:
        roots.append(Path(cfg.interim_dir).resolve().parent)
    except Exception:
        pass

    hh_lookup = None
    for r in roots:
        hh_lookup = _stack_household_from_raw(r)
        if hh_lookup is not None:
            break

    if hh_lookup is not None:
        merged = demo[["SEQN"]].merge(hh_lookup, on="SEQN", how="left")
        merged.to_parquet(cfg.out_dir / cfg.cov_household, index=False)
        print(f"Household size recovered from raw files under {r}.")
        return merged

    # Case 4: last resort — write NA column so pipeline can proceed
    print("Warning: Could not locate DMDHHSIZ or DMDFMSIZ; writing NA household size.")
    out = pd.DataFrame({"SEQN": demo["SEQN"], "DMDHHSIZ": pd.Series(np.nan, index=demo.index, dtype="float")})
    out.to_parquet(cfg.out_dir / cfg.cov_household, index=False)
    return out
# --- end patch ---


In [77]:
# --- PATCH: robust survey design merger ---
import pandas as pd
import numpy as np
from pathlib import Path

SURVEY_KEEP = ["SEQN", "SDDSRVYR", "SDMVPSU", "SDMVSTRA", "WTMEC2YR"]

def _read_any(p: Path) -> pd.DataFrame:
    return pd.read_parquet(p) if str(p).endswith(".parquet") else pd.read_csv(p, low_memory=False)

def _normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d.columns = [c.upper() for c in d.columns]
    return d

def _pick_weight(cols: list[str]) -> str | None:
    up = [c.upper() for c in cols]
    # Best to ok: WTMEC2YR
    if "WTMEC2YR" in up: return cols[up.index("WTMEC2YR")]
    # Good fallbacks seen in NHANES pooled files
    for cand in ["WTINT2YR","WTMEC4YR","WTMECYR","WTMEC_2YR","WT_ME_C2YR","WTSAF2YR"]:
        if cand in up: return cols[up.index(cand)]
    # Sometimes duplicates like WTMEC2YR_X
    for c in up:
        if "WTMEC" in c and "2YR" in c:
            return cols[up.index(c)]
    return None

def _has_all(d: pd.DataFrame, need=SURVEY_KEEP) -> bool:
    return all(c in d.columns for c in need)

def get_survey_core(cfg: Config = CONFIG) -> pd.DataFrame:
    # Candidate DEMO tables (prefer 99–23, then 99–18, then interim)
    candidates = []
    for p in [cfg.demo_9923, cfg.demo_9918,
              cfg.interim_dir / "demo_9923.parquet",
              cfg.interim_dir / "demo_9918.parquet"]:
        if p and Path(p).exists():
            candidates.append(Path(p))

    if not candidates:
        raise FileNotFoundError("No DEMO table found for survey fields.")

    # Load base (prefer the first candidate)
    base = _normalize_cols(_read_any(candidates[0]))
    if "SEQN" not in base.columns:
        raise ValueError("DEMO missing SEQN.")

    # Ensure SDDSRVYR exists if possible (some custom stacks omit it)
    if "SDDSRVYR" not in base.columns:
        # Try to take it from any backup candidate
        for p in candidates[1:]:
            d2 = _normalize_cols(_read_any(p))
            if {"SEQN","SDDSRVYR"}.issubset(d2.columns):
                base = base.merge(d2[["SEQN","SDDSRVYR"]], on="SEQN", how="left")
                break
        if "SDDSRVYR" not in base.columns:
            base["SDDSRVYR"] = pd.Series(np.nan, index=base.index)

    # Get/derive PSU & STRATA
    for field, fuzzy in [("SDMVPSU","PSU"), ("SDMVSTRA","STRA")]:
        if field not in base.columns:
            # Try from backups
            pulled = False
            for p in candidates[1:]:
                d2 = _normalize_cols(_read_any(p))
                if {"SEQN",field}.issubset(d2.columns):
                    base = base.merge(d2[["SEQN",field]], on="SEQN", how="left")
                    pulled = True
                    break
            if not pulled:
                # Fuzzy fallback from same table (e.g., SDMVPSU_X)
                alt = next((c for c in base.columns if fuzzy in c and c != field), None)
                if alt:
                    base[field] = base[alt]
                else:
                    base[field] = pd.Series(np.nan, index=base.index)

    # Get/derive WTMEC2YR (weight)
    if "WTMEC2YR" not in base.columns:
        # Try to copy from backups under any known alias
        weight_name = _pick_weight(base.columns)
        if weight_name and weight_name.upper() != "WTMEC2YR":
            base["WTMEC2YR"] = pd.to_numeric(base[weight_name], errors="coerce")
        else:
            pulled = False
            for p in candidates[1:]:
                d2 = _normalize_cols(_read_any(p))
                wn = _pick_weight(d2.columns)
                if {"SEQN"}.issubset(d2.columns) and wn:
                    base = base.merge(d2[["SEQN", wn]].rename(columns={wn:"WTMEC2YR"}), on="SEQN", how="left")
                    pulled = True
                    break
            if not pulled:
                base["WTMEC2YR"] = pd.Series(np.nan, index=base.index)

    # Final subset
    survey = base[[c for c in SURVEY_KEEP if c in base.columns]].copy()

    # Assert presence (allow NaN values; we just need the columns)
    missing_cols = [c for c in SURVEY_KEEP if c not in survey.columns]
    if missing_cols:
        # Create any still-missing as NaN so downstream can proceed
        for c in missing_cols:
            survey[c] = pd.Series(np.nan, index=survey.index)
        print(f"Note: created placeholder columns for: {missing_cols}")

    return survey
# --- end patch ---


In [78]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
INTERIM = ROOT / "interim"
OUT = ROOT / "out"
INTERIM.mkdir(parents=True, exist_ok=True)
OUT.mkdir(parents=True, exist_ok=True)

# ------------------
# 1) Smoking mapping
# ------------------
smk_csv = ROOT / "data" / "cov" / "smk_9918.csv"
smk = pd.read_csv(smk_csv, low_memory=False)
smk.columns = [c.upper() for c in smk.columns]  # ['UNNAMED: 0','SEQN','SMK','SMK_YR','PACK','PACK_YR','SMK_AVG']

out_smk = pd.DataFrame({"SEQN": smk["SEQN"]})

# CIGS_PER_DAY from SMK_AVG (cigs/day)
out_smk["CIGS_PER_DAY"] = pd.to_numeric(smk.get("SMK_AVG"), errors="coerce")

# PACK_YEARS from PACK_YR
out_smk["PACK_YEARS"] = pd.to_numeric(smk.get("PACK_YR"), errors="coerce")

# SMK_STATUS from SMK (numeric coding: 1=NEVER, 2=FORMER, 3=CURRENT)
map_num = {1: "NEVER", 2: "FORMER", 3: "CURRENT"}
s = pd.to_numeric(smk.get("SMK"), errors="coerce").map(map_num)
out_smk["SMK_STATUS"] = s.astype("string")

# FORMER_SMOKER (handle NA → False)
out_smk["FORMER_SMOKER"] = out_smk["SMK_STATUS"].eq("FORMER").fillna(False).astype("int8")

# Save standardized smoking parquet for the builder
(out_smk[["SEQN","SMK_STATUS","CIGS_PER_DAY","PACK_YEARS","FORMER_SMOKER"]]
 .to_parquet(INTERIM / "smk_9918.parquet", index=False))
print("✓ Standardized Smoking →", INTERIM / "smk_9918.parquet")

# Quick sanity
print("SMK (raw) value counts:")
print(smk["SMK"].value_counts(dropna=False).head(10))
print("\nSMK_STATUS (mapped) value counts:")
print(out_smk["SMK_STATUS"].value_counts(dropna=False).head(10))


# -----------------------------
# 2) Physical activity mapping
# -----------------------------
# Your PA columns: ['SEQN','sddsrvyr','ltpa', ... 'imp']
pa_csv = ROOT / "data" / "cov" / "totalpa_9918_imputed.csv"
pa = pd.read_csv(pa_csv, low_memory=False)
pa.columns = [c.upper() for c in pa.columns]  # includes 'LTPA' and 'IMP'

out_pa = pd.DataFrame({"SEQN": pa["SEQN"]})
out_pa["LTPA_MET_HR_WK"] = pd.to_numeric(pa.get("LTPA"), errors="coerce")
out_pa["LTPA_IMPUTED_FLAG"] = pd.to_numeric(pa.get("IMP"), errors="coerce").fillna(0).astype("int8")

out_pa.to_parquet(INTERIM / "totalpa_9918_imputed.parquet", index=False)
print("✓ Standardized PA →", INTERIM / "totalpa_9918_imputed.parquet")


# -----------------------------------------
# 3) Point CONFIG to these standardized files
# -----------------------------------------
CONFIG.smk_9918 = INTERIM / "smk_9918.parquet"
CONFIG.pa_9918_imputed = INTERIM / "totalpa_9918_imputed.parquet"

# --------------------------------
# 4) Rebuild SMK, PA, and the core
# --------------------------------
smk_res = build_smk(CONFIG)
pa_res  = build_pa(CONFIG)
core    = build_core(CONFIG)

print("\nDone. Non-missing now (% non-NA):")
def nonmiss(s): return float(1 - s.isna().mean())
print({
    "SMK_STATUS":     nonmiss(core["SMK_STATUS"]),
    "CIGS_PER_DAY":   nonmiss(core["CIGS_PER_DAY"]),
    "PACK_YEARS":     nonmiss(core["PACK_YEARS"]),
    "LTPA_MET_HR_WK": nonmiss(core["LTPA_MET_HR_WK"]),
})

# (Optional) Peek a few rows
try:
    display(core[["SEQN","SMK_STATUS","CIGS_PER_DAY","PACK_YEARS","LTPA_MET_HR_WK","LTPA_IMPUTED_FLAG"]].head())
except NameError:
    print(core[["SEQN","SMK_STATUS","CIGS_PER_DAY","PACK_YEARS","LTPA_MET_HR_WK","LTPA_IMPUTED_FLAG"]].head())


✓ Standardized Smoking → /Users/dengshuyue/Desktop/SDOH/analysis/interim/smk_9918.parquet
SMK (raw) value counts:
SMK
1.0    29985
2.0    13598
3.0    11431
NaN       67
Name: count, dtype: int64

SMK_STATUS (mapped) value counts:
SMK_STATUS
NEVER      29985
FORMER     13598
CURRENT    11431
<NA>          67
Name: count, dtype: Int64
✓ Standardized PA → /Users/dengshuyue/Desktop/SDOH/analysis/interim/totalpa_9918_imputed.parquet

Done. Non-missing now (% non-NA):
{'SMK_STATUS': 0.4857791238774736, 'CIGS_PER_DAY': 0.10000971311004958, 'PACK_YEARS': 0.10717092424657171, 'LTPA_MET_HR_WK': 0.4863707405804908}


Unnamed: 0,SEQN,SMK_STATUS,CIGS_PER_DAY,PACK_YEARS,LTPA_MET_HR_WK,LTPA_IMPUTED_FLAG
0,130378.0,,,,,
1,130379.0,,,,,
2,130380.0,,,,,
3,130381.0,,,,,
4,130382.0,,,,,


In [79]:
res = run_all(CONFIG)
print(quick_checks(CONFIG))

vars(CONFIG) 

Household size recovered from raw files under /Users/dengshuyue/Desktop/SDOH/analysis.
n_rows                     113249.000000
n_unique_seqn              113249.000000
missing_bmi_pct                 0.224726
missing_alcohol_cat_pct         0.000000
missing_smk_status_pct          0.514221
has_weights                     1.000000
dtype: float64


{'raw_dir': PosixPath('/Users/you/nhanes/raw'),
 'interim_dir': PosixPath('interim'),
 'out_dir': PosixPath('out'),
 'smk_9918': PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/interim/smk_9918.parquet'),
 'pa_9918_imputed': PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/interim/totalpa_9918_imputed.parquet'),
 'clinical_9918': None,
 'smk_1923': None,
 'pa_1923': None,
 'clinical_1923': None,
 'demo_9923': PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/interim/survey_design_9923.parquet'),
 'demo_9918': None,
 'bmx_9923': None,
 'cov_smk': 'cov_smk_1999_2023.parquet',
 'cov_alc': 'cov_alc_1999_2023.parquet',
 'cov_pa': 'cov_pa_1999_2023.parquet',
 'cov_bmx': 'cov_bmx_1999_2023.parquet',
 'cov_clinical': 'cov_clinical_1999_2023.parquet',
 'cov_household': 'cov_household_1999_2023.parquet',
 'cov_core': 'cov_core_1999_2023.parquet'}

In [80]:
from pathlib import Path
core_path = Path("/Users/dengshuyue/Desktop/SDOH/analysis/out/cov_core_1999_2023.parquet")
core_path.exists(), core_path


(True,
 PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/out/cov_core_1999_2023.parquet'))

In [81]:
# === Merge cov_core into your existing master (LEFT join) ===
from pathlib import Path
import pandas as pd

def _load_any(p: Path) -> pd.DataFrame:
    return pd.read_parquet(p) if p.suffix.lower()==".parquet" else pd.read_csv(p, low_memory=False)

def merge_core_into_master(
    master_path: Path,
    core_path: Path,
    out_path: Path,
    keep_master_dups: bool = True,
    prefer_core_for: list[str] | None = None,
) -> pd.DataFrame:
    """
    LEFT-join cov_core (1999–2023) onto your master (1999–2018).
    - keep_master_dups=True: if a column exists in both, keep the master's version.
      (Core duplicates get suffix "_CORE" temporarily and are then dropped.)
    - prefer_core_for=['BMI', 'SBP', ...]: for listed columns, take values from core
      (if present) and overwrite master.
    """
    if not master_path.exists():
        raise FileNotFoundError(f"Master not found: {master_path}")
    if not core_path.exists():
        raise FileNotFoundError(f"Core not found:   {core_path}\n"
                                "Run the core builder first to create cov_core_1999_2023.parquet.")

    master = _load_any(master_path)
    core   = _load_any(core_path)

    # Normalize keys/columns
    master.columns = master.columns.str.upper()
    core.columns   = core.columns.str.upper()

    master["SEQN"] = pd.to_numeric(master["SEQN"], errors="coerce").astype("Int64")
    core["SEQN"]   = pd.to_numeric(core["SEQN"],   errors="coerce").astype("Int64")

    # Merge
    merged = master.merge(core, on="SEQN", how="left", suffixes=("", "_CORE"), validate="one_to_one")

    # Optionally overwrite some columns with core values (only where core non-null)
    if prefer_core_for:
        for col in prefer_core_for:
            c_core = f"{col}_CORE"
            if c_core in merged.columns:
                if col not in merged.columns:
                    # If master didn't have the column, just rename core's
                    merged.rename(columns={c_core: col}, inplace=True)
                else:
                    # Overwrite master col with non-missing core values
                    merged[col] = merged[col].where(merged[c_core].isna(), merged[c_core])
                    merged.drop(columns=[c_core], inplace=True)

    # If keeping master versions, drop remaining *_CORE duplicates
    if keep_master_dups:
        dup_cols = [c for c in merged.columns if c.endswith("_CORE")]
        merged.drop(columns=dup_cols, inplace=True)

    # Save
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if out_path.suffix.lower()==".parquet":
        merged.to_parquet(out_path, index=False)
    else:
        merged.to_csv(out_path, index=False)

    # Log
    n_rows = len(merged)
    n_matched = int(merged["SEQN"].isin(core["SEQN"]).sum())
    print(f"Merged shape: {merged.shape} | Matched SEQN: {n_matched} / {n_rows}")
    return merged

# ---- Configure paths for your project ----
ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT  = ROOT / "out"

master_path = OUT / "nhanes_mort_demo_sdoh_1999_2018.parquet"     # your existing master
core_path   = OUT / "cov_core_1999_2023.parquet"                   # written by the core builder
out_path    = OUT / "nhanes_mort_demo_sdoh_core_1999_2023.parquet" # new merged output

# If you want certain variables to come from core, list them here; else leave [].
prefer_core = []   # e.g., ['BMI','SBP','DBP']

merged = merge_core_into_master(
    master_path=master_path,
    core_path=core_path,
    out_path=out_path,
    keep_master_dups=True,
    prefer_core_for=prefer_core
)

# Quick NA audit on key covariates (only those present will be shown)
audit_cols = ["SMK_STATUS","CIGS_PER_DAY","PACK_YEARS",
              "ALCOHOL_CAT","LTPA_MET_HR_WK","BMI",
              "DIABETES","HTN","HIGH_CHOL","CVD","CANCER",
              "SBP","DBP","TCHOL","HDL","LDL","TG"]
present = [c for c in audit_cols if c in merged.columns]
na_pct = {c: round(float(merged[c].isna().mean())*100, 1) for c in present}
print("\nMissingness (% NA) in merged:")
for k in present:
    print(f"  {k:16s} {na_pct[k]:5.1f}%")

# Peek a few columns
show_cols = ["SEQN","SMK_STATUS","CIGS_PER_DAY","PACK_YEARS",
             "ALCOHOL_CAT","LTPA_MET_HR_WK","BMI","SBP","DBP"]
print("\nPreview:")
print(merged[[c for c in show_cols if c in merged.columns]].head(10))


Merged shape: (56253, 46) | Matched SEQN: 56253 / 56253

Missingness (% NA) in merged:
  SMK_STATUS         7.1%
  CIGS_PER_DAY      80.8%
  PACK_YEARS        79.5%
  ALCOHOL_CAT        0.0%
  LTPA_MET_HR_WK     7.1%
  BMI                2.1%
  DIABETES          85.1%
  HTN                0.0%
  HIGH_CHOL          0.0%
  CVD                0.0%
  CANCER             0.0%
  SBP                0.0%
  DBP                0.0%
  TCHOL              0.0%
  HDL                0.0%
  LDL                0.0%
  TG                 0.0%

Preview:
   SEQN SMK_STATUS  CIGS_PER_DAY  PACK_YEARS ALCOHOL_CAT  LTPA_MET_HR_WK  \
0     2      NEVER           NaN         NaN        NONE        0.000000   
1     5     FORMER           NaN         NaN        NONE       41.066667   
2     6       <NA>           NaN         NaN        NONE             NaN   
3     7     FORMER           NaN      8030.0        NONE        3.033333   
4    10    CURRENT           1.0         NaN        NONE        0.000000   
5    

<h2>peek at the merged table HERE!!!!! understand the core table</h2>

In [87]:
import pandas as pd

core_path = "/Users/dengshuyue/Desktop/SDOH/analysis/out/cov_core_1999_2023.parquet"
core = pd.read_parquet(core_path)

# Quick stats
print("shape:", core.shape)
print("unique SEQN:", core["SEQN"].nunique())
print("SEQN range:", int(core["SEQN"].min()), "→", int(core["SEQN"].max()))
print("first 40 columns:", core.columns.tolist()[:40])

# Optional: widen display for many columns
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 200)

# Sort once, then peek head/tail (ALL columns)
sorted_core = core.sort_values("SEQN", kind="mergesort")

# Try to use rich display if available (Jupyter); else print to stdout
try:
    from IPython.display import display  # only exists in notebooks
    display(sorted_core.head(10))
    display(sorted_core.tail(10))
except Exception:
    print("\nHead (sorted by SEQN):")
    print(sorted_core.head(10).to_string(index=False))
    print("\nTail (sorted by SEQN):")
    print(sorted_core.tail(10).to_string(index=False))


shape: (113249, 29)
unique SEQN: 113249
SEQN range: 1 → 142310
first 40 columns: ['SEQN', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER', 'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'LTPA_MET_HR_WK', 'LTPA_IMPUTED_FLAG', 'BMXWT', 'BMXHT', 'BMI', 'BMI_CLAS', 'DIABETES', 'HTN', 'HIGH_CHOL', 'CVD', 'CANCER', 'SBP', 'DBP', 'TCHOL', 'HDL', 'LDL', 'TG', 'DMDHHSIZ']


Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,SMK_STATUS,CIGS_PER_DAY,PACK_YEARS,FORMER_SMOKER,DRINKS_PER_DAY,ALCOHOL_CAT,LTPA_MET_HR_WK,LTPA_IMPUTED_FLAG,BMXWT,BMXHT,BMI,BMI_CLAS,DIABETES,HTN,HIGH_CHOL,CVD,CANCER,SBP,DBP,TCHOL,HDL,LDL,TG,DMDHHSIZ
52723,1.0,1.0,1.0,5.0,10982.898896,,,,,,NONE,,,12.5,91.6,14.897695,UNDER,0,0,0,0,0,91.333333,56.0,131.0,59.0,54.0,99.0,3.0
52724,2.0,1.0,3.0,1.0,28325.384898,NEVER,,,0.0,,NONE,0.0,1.0,75.4,174.0,24.904215,NORMAL,0,0,0,0,1,100.666667,56.666667,215.0,54.0,136.0,128.0,1.0
52725,3.0,1.0,2.0,7.0,46192.256945,,,,,,NONE,,,32.9,136.6,17.631713,UNDER,0,0,0,0,0,108.666667,62.0,129.0,30.0,58.0,202.0,4.0
52726,4.0,1.0,1.0,2.0,10251.26002,,,,,,NONE,,,13.3,,,,0,0,1,0,0,95.333333,61.333333,211.0,43.0,161.0,37.0,7.0
52727,5.0,1.0,2.0,8.0,99445.065735,FORMER,,,1.0,,NONE,41.066667,1.0,92.5,178.3,29.096386,OVER,0,1,1,0,0,122.0,82.666667,279.0,42.0,168.0,347.0,3.0
52728,6.0,1.0,2.0,2.0,39656.600444,,,,,,NONE,,,59.2,162.0,22.557537,NORMAL,0,0,0,0,0,114.666667,68.0,153.0,61.0,59.0,181.0,2.0
52729,7.0,1.0,2.0,4.0,25525.423409,FORMER,,8030.0,1.0,,NONE,3.033333,1.0,78.0,162.9,29.393577,OVER,0,0,1,0,0,125.333333,80.0,245.0,105.0,127.0,62.0,1.0
52730,8.0,1.0,1.0,6.0,31510.587866,,,,,,NONE,,,40.7,162.0,15.508307,UNDER,0,0,0,0,0,100.666667,49.333333,162.0,67.0,88.0,33.0,7.0
52731,9.0,1.0,2.0,9.0,7575.870247,,,,,,NONE,,,45.5,156.9,18.482704,UNDER,0,0,0,0,0,109.333333,53.333333,148.0,58.0,79.0,56.0,4.0
52732,10.0,1.0,1.0,7.0,22445.808572,CURRENT,1.0,,0.0,,NONE,0.0,1.0,111.8,190.1,30.936955,OBESE,0,1,0,0,0,145.333333,96.0,140.0,51.0,80.0,45.0,1.0


Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,SMK_STATUS,CIGS_PER_DAY,PACK_YEARS,FORMER_SMOKER,DRINKS_PER_DAY,ALCOHOL_CAT,LTPA_MET_HR_WK,LTPA_IMPUTED_FLAG,BMXWT,BMXHT,BMI,BMI_CLAS,DIABETES,HTN,HIGH_CHOL,CVD,CANCER,SBP,DBP,TCHOL,HDL,LDL,TG,DMDHHSIZ
11923,142301.0,12.0,1.0,179.0,17561.693314,,,,,,NONE,,,,,,,,,,,,,,,,,,1.0
11924,142302.0,12.0,1.0,179.0,21064.211105,,,,,,NONE,,,,,,,,,,,,,,,,,,2.0
11925,142303.0,12.0,1.0,181.0,47087.576098,,,,,,NONE,,,,,,,,,,,,,,,,,,2.0
11926,142304.0,12.0,1.0,176.0,22000.421342,,,,,,NONE,,,,,,,,,,,,,,,,,,5.0
11927,142305.0,12.0,2.0,180.0,43483.407534,,,,,,NONE,,,,,,,,,,,,,,,,,,4.0
11928,142306.0,12.0,1.0,176.0,13459.129019,,,,,,NONE,,,,,,,,,,,,,,,,,,2.0
11929,142307.0,12.0,1.0,181.0,64962.328962,,,,,,NONE,,,,,,,,,,,,,,,,,,5.0
11930,142308.0,12.0,2.0,183.0,44367.534132,,,,,,NONE,,,,,,,,,,,,,,,,,,3.0
11931,142309.0,12.0,1.0,176.0,46249.361849,,,,,,NONE,,,,,,,,,,,,,,,,,,5.0
11932,142310.0,12.0,2.0,187.0,49647.225467,,,,,,NONE,,,,,,,,,,,,,,,,,,2.0


<h2>check why smk is NA</h2>

In [71]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
p = ROOT / "interim" / "smk_9918.parquet"

print("File exists:", p.exists(), p)
smk = pd.read_parquet(p)

# SEQN range & basics
seqn = smk["SEQN"]
print("\nSEQN info:")
print({
    "dtype": str(seqn.dtype),
    "rows": len(seqn),
    "n_unique": int(seqn.nunique()),
    "min": float(seqn.min()) if pd.api.types.is_numeric_dtype(seqn) else None,
    "max": float(seqn.max()) if pd.api.types.is_numeric_dtype(seqn) else None,
    "looks_float_with_decimals": bool(pd.api.types.is_float_dtype(seqn) and ((seqn % 1) != 0).any())
})

# SMK_STATUS NA percentage
na_pct = float(smk["SMK_STATUS"].isna().mean())
print("\nSMK_STATUS NA %:", na_pct)

print("\nSMK_STATUS value counts (top):")
print(smk["SMK_STATUS"].value_counts(dropna=False).head(10))

# Peek a few rows sorted by SEQN
cols = [c for c in ["SEQN","SMK_STATUS","CIGS_PER_DAY","PACK_YEARS","FORMER_SMOKER"] if c in smk.columns]

try:
    # Jupyter-friendly
    display(smk.sort_values("SEQN", kind="mergesort")[cols].head(10))
    # display(smk.sort_values("SEQN", kind="mergesort")[cols].tail(10))
except NameError:
    # Fallback if not in a notebook
    print("\nHead (sorted by SEQN):")
    print(smk.sort_values("SEQN", kind="mergesort")[cols].head(10).to_string(index=False))
    # print("\nTail (sorted by SEQN):")
    # print(smk.sort_values("SEQN", kind="mergesort")[cols].tail(10).to_string(index=False))



File exists: True /Users/dengshuyue/Desktop/SDOH/analysis/interim/smk_9918.parquet

SEQN info:
{'dtype': 'float64', 'rows': 55081, 'n_unique': 55081, 'min': 2.0, 'max': 102956.0, 'looks_float_with_decimals': False}

SMK_STATUS NA %: 0.0012163904068553586

SMK_STATUS value counts (top):
SMK_STATUS
NEVER      29985
FORMER     13598
CURRENT    11431
<NA>          67
Name: count, dtype: Int64


Unnamed: 0,SEQN,SMK_STATUS,CIGS_PER_DAY,PACK_YEARS,FORMER_SMOKER
0,2.0,NEVER,,,0
1,5.0,FORMER,,,1
2,7.0,FORMER,7300.0,8030.0,1
3,10.0,CURRENT,1.0,388.85,0
4,12.0,NEVER,,,0
5,13.0,FORMER,1095.0,1095.0,1
6,14.0,CURRENT,1.0,388.85,0
7,15.0,CURRENT,2.0,777.7,0
8,16.0,NEVER,,,0
9,20.0,FORMER,,,1


<h2>check demo </h2>

In [None]:
# Step 1: Read dietary score data
scores = pd.read_excel(os.path.join(folder_path, "i.scores.xlsx"), engine="openpyxl")

# Rename columns to match desired output
scores = scores.rename(columns={
    "seqn": "SEQN",
    "i.FCS": "i_FCS",
    "i.optup": "i_optup",  # keep lowercase here first
    "i.HSR": "i_HSR",
    "i.nutri": "i_nutri"
})

# Then copy and rename for output
scores2 = scores[["SEQN", "i_FCS", "i_optup", "i_HSR", "i_nutri"]].copy()
scores2 = scores2.rename(columns={"i_optup": "i_Optup"})
scores2 = scores2.sort_values("SEQN")


# Step 2: Read covariates from Lu paper
covar = pd.read_sas(os.path.join(folder_path, "covar.sas7bdat"), format="sas7bdat")
covar = covar.rename(columns=str.upper)  # make all column names uppercase to match SAS style
# filter available variables only
covar_vars = ["SEQN", "RIDAGEYR", "SEX", "RACE", "EDU", "INDFMPIR", "SMK_AVG", "SMK_PAST",
              "SMK", "ALCG2", "HEI2015_TOTAL_SCORE", "DIABE"]
covar = covar[[col for col in covar_vars if col in covar.columns]].copy()
covar = covar.sort_values("SEQN")

# Step 3: Read covariates from Meghan paper
covariates1_raw = pd.read_csv(os.path.join(folder_path, "covariates.csv"))
covariates1 = covariates1_raw.rename(columns={"seqn": "SEQN"})
covariates_vars = ["SEQN", "sdmvpsu", "sdmvstra", "met_hr", "perE_alco", "dm_self",
                   "tchol", "hdl", "ldl", "tg", "bmi", "CVD", "dm_rx", "chol_rx",
                   "angina_rx", "lung_disease", "angina", "hba1c", "sbp", "dbp", "cancer"]
covariates1 = covariates1[[col for col in covariates_vars if col in covariates1.columns]].copy()
covariates1 = covariates1.sort_values("SEQN")

# Step 4: Read dietary weight data (filter DAYS == 1)
dietwt = pd.read_sas(os.path.join(folder_path, "gg.sas7bdat"), format="sas7bdat")

# Check for expected columns
required_cols = ["SEQN", "DAYS", "WTDRD1", "WTDR2D", "DR12DRST"]
missing = [col for col in required_cols if col not in dietwt.columns]
if missing:
    print(f"Warning: Missing columns from gg.sas7bdat: {missing}")

# Filter and select
dietwt = dietwt[dietwt["DAYS"] == 1][["SEQN", "WTDRD1", "WTDR2D", "DR12DRST"]].copy()
dietwt = dietwt.sort_values("SEQN")


# Step 5: Read mortality data
mort = pd.read_sas(os.path.join(folder_path, "mortality9918.sas7bdat"), format="sas7bdat")
mort = mort.sort_values("SEQN")

def summarize_df(name, df):
    print(f"{name}:")
    print(f"  Rows: {df.shape[0]}")
    print(f"  Unique SEQN: {df['SEQN'].nunique()}")
    print("-" * 40)

summarize_df("scores2", scores2)
summarize_df("covar", covar)
summarize_df("covariates1", covariates1)
summarize_df("dietwt", dietwt)
summarize_df("mort", mort)

In [None]:
# 🔥🔥🔥🔥🔥🔥 NOW WORK AT HERE!!!!!!
# try to extend covariates.csv to 99-18 currently is 03-18



In [None]:
# Step 2: Variable transformations
score_mort["wt10"] = score_mort["WTDRD1"] / 10
score_mort["wt"] = score_mort["WTDR2D"] / 8
score_mort["i_FCS_sd"] = score_mort["i_FCS"] / 10.89
score_mort["i_Optup_sd"] = score_mort["i_Optup"] / 8.17
score_mort["i_nutri_sd"] = -score_mort["i_nutri"] / 3.17
score_mort["i_HSR_sd"] = score_mort["i_HSR"] / 1.01
score_mort["hei2015_sd"] = score_mort["HEI2015_TOTAL_SCORE"] / 13

# Step 3: Recode death indicators
for cause, code in {
    "death_heart": "001", "death_cancer": "002", "death_resp": "003", "Death_inj": "004",
    "death_cerev": "005", "Death_alz": "006", "death_diabe": "007",
    "Death_infl": "008", "Death_kid": "009", "death_other1": "010"
}.items():
    score_mort[cause] = (score_mort["UCOD_LEADING"] == code).astype(int)

# Step 4: Composite categories
score_mort["Death_other"] = score_mort[["death_resp", "Death_inj", "Death_alz", "Death_infl"]].sum(axis=1).clip(upper=1)
score_mort["Death_oth2"] = score_mort[["death_resp", "Death_inj", "Death_alz", "Death_infl", "death_other1"]].sum(axis=1).clip(upper=1)
score_mort["death_cvd"] = score_mort[["death_heart", "death_cerev"]].sum(axis=1).clip(upper=1)
score_mort["death_cmd"] = score_mort[["death_heart", "death_cerev", "death_diabe"]].sum(axis=1).clip(upper=1)
score_mort["death_cmdk"] = score_mort[["death_heart", "death_cerev", "death_diabe", "Death_kid"]].sum(axis=1).clip(upper=1)
score_mort["death_cmdkh"] = score_mort["death_cmdk"]
score_mort.loc[score_mort["DIABETES"] == 1, "death_cmdkh"] = 1
score_mort.loc[score_mort["HYPERTEN"] == 1, "death_cmdkh"] = 1
score_mort["death_cmd"] = score_mort["death_cmd"].fillna(0)
score_mort.loc[score_mort["death_cmd"] == 1, ["Death_other", "Death_oth2"]] = 0

# Step 5: Multiple cause mortality
score_mort["death_multi"] = score_mort["MORTSTAT"]
score_mort.loc[score_mort["death_cmd"] == 1, "death_multi"] = 1
score_mort.loc[score_mort["death_cancer"] == 1, "death_multi"] = 2
score_mort.loc[score_mort["Death_oth2"] == 1, "death_multi"] = 3

# Step 6: Age & time vars
score_mort["agesq"] = score_mort["RIDAGEYR"] ** 2
score_mort["py"] = score_mort["PERMTH_EXM"] / 12
score_mort["agestart"] = score_mort["RIDAGEYR"]
score_mort["ageend"] = score_mort["RIDAGEYR"] + score_mort["py"]

# Step 7: Poverty
score_mort["pir"] = 5
score_mort.loc[(score_mort["INDFMPIR"] < 1.3) & (score_mort["INDFMPIR"].notna()), "pir"] = 1
score_mort.loc[(score_mort["INDFMPIR"] >= 1.3), "pir"] = 2
score_mort.loc[(score_mort["INDFMPIR"] >= 3), "pir"] = 3

# Step 8: Recode SNAP
score_mort.loc[(score_mort["INDFMPIR"].between(0, 1.3)) & (score_mort["SNAP"] != 1), "SNAP"] = 2

# Step 9: BMI categories
score_mort["bmic"] = pd.NA
score_mort.loc[(score_mort["bmi"] > 0) & (score_mort["bmi"] < 18.5), "bmic"] = 0
score_mort.loc[(score_mort["bmi"] >= 18.5) & (score_mort["bmi"] < 25), "bmic"] = 1
score_mort.loc[(score_mort["bmi"] >= 25), "bmic"] = 2
score_mort.loc[(score_mort["bmi"] >= 30), "bmic"] = 3

In [None]:
with open('/Users/dengshuyue/Desktop/SDOH/analysis/code/Ref/2.1_Prepare data_covariates.sas', 'r', encoding='latin1') as f:
    sas_code = f.read()

print(sas_code[:20000])  # Preview the first X,000 characters

In [None]:
# Diabetes Identification Note:
# The final diabetes indicator variable used in the analysis is 'diabe2'.
# This composite variable identifies diabetes based on the following criteria:
# - Self-reported physician diagnosis (DIQ010), current insulin use (DIQ050), or oral medication use (DIQ070)
# - Prescription drug data indicating diabetes treatment (dm_rx2)
# - Fasting glucose ≥ 126 mg/dL (glu_dm)
# - OGTT ≥ 200 mg/dL (ogtt_dm)
# - HbA1c ≥ 6.5% (hb_dm)
# Any one of these criteria being met will set 'diabe2' = 1, otherwise 0.
