
# 00 — DEMO + Mortality + (optional) SDOH Core

This notebook:
1. Builds/loads the **DEMO 1999–2023** stack (local-first; optional CDC download).
2. Loads the **mortality-linked** file (1999–2018) from your `data/` folder and merges it with DEMO.
3. Optionally merges the result with your **covariate core** (`output/cov_core_1999_2023.parquet`) if present.
4. Writes tidy outputs to `output/`.

> **Tip:** Change `BASE` below if your project root is different.


In [1]:

# %% Setup
from pathlib import Path
import os, warnings, sys
import numpy as np
import pandas as pd

# Project root (EDIT if needed)
BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")

# Input and output folders. The mkdir calls below run every time this cell
# executes, so these directories exist before any file is read or written.
DATA = BASE / "data"
OUT  = BASE / "output"
(DATA / "nhanes_by_module" / "DEMO").mkdir(parents=True, exist_ok=True)
(DATA / "cov").mkdir(parents=True, exist_ok=True)
OUT.mkdir(parents=True, exist_ok=True)

# Toggle: allow downloading DEMO XPTs from CDC if not found locally
ALLOW_DOWNLOAD = True

# Display prefs
pd.options.display.max_rows = 60
pd.options.display.max_columns = 120
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Echo the resolved locations so a wrong BASE is visible immediately.
print("ROOT:", BASE)
print("Data dir exists:", DATA.exists())
print("Output dir exists:", OUT.exists())


ROOT: /Users/dengshuyue/Desktop/SDOH/analysis
Data dir exists: True
Output dir exists: True


In [2]:

# %% Helpers
import requests

def nhanes_demo_urls():
    """Return the candidate download URLs for each NHANES DEMO cycle.

    Returns:
        dict: cycle label -> list of URL strings for that cycle's DEMO
        transport file. Each cycle lists the same file under two URL
        layouts; the last cycle lists two file names (DEMO_L and DEMO_Q).
    """
    return {
        "1999-2000": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/DEMO.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO.XPT",
        ],
        "2001-2002": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/DEMO_B.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/DEMO_B.XPT",
        ],
        "2003-2004": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/DEMO_C.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/DEMO_C.XPT",
        ],
        "2005-2006": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DEMO_D.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/DEMO_D.XPT",
        ],
        "2007-2008": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DEMO_E.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/DEMO_E.XPT",
        ],
        "2009-2010": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/DEMO_F.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/DEMO_F.XPT",
        ],
        "2011-2012": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/DEMO_G.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DEMO_G.XPT",
        ],
        "2013-2014": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DEMO_H.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DEMO_H.XPT",
        ],
        "2015-2016": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DEMO_I.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DEMO_I.XPT",
        ],
        "2017-2018": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DEMO_J.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
        ],
        "2017-March 2020 (pre-pandemic)": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_DEMO.XPT",
        ],
        # This cycle is listed under two file names (DEMO_L and DEMO_Q).
        "August 2021–August 2023": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/DEMO_L.XPT",
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_Q.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/DEMO_Q.XPT",
        ],
    }

def candidates_from_urls(urls):
    """List the local file names worth searching for, given download URLs.

    For each URL the final path component is taken and offered in four
    spellings: as written, upper-case, lower-case, and capitalized.
    Duplicates are dropped while keeping first-seen order.
    """
    spellings = (
        variant
        for url in urls
        for name in (Path(url).name,)
        for variant in (name, name.upper(), name.lower(), name.capitalize())
    )
    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication in one step.
    return list(dict.fromkeys(spellings))

# Cycle label -> candidate download URLs, built once for the helpers below.
NHANES_URLS = nhanes_demo_urls()
# Cycle label -> local file names to look for (case variants of each URL's file name).
LOCAL_CANDIDATES = {c: candidates_from_urls(urls) for c, urls in NHANES_URLS.items()}

def find_first_under(base: Path, patterns):
    """Return the first path under *base* that matches any of *patterns*.

    Patterns are tried in the order given, each as a recursive glob below
    *base*; the first hit of the first pattern that matches anything wins.
    Returns None when no pattern matches.
    """
    hits = (hit for pattern in patterns for hit in base.rglob(pattern))
    return next(hits, None)

def download_to(path: Path, url: str, timeout=90):
    """Stream *url* into *path* and return *path*.

    The body is written to a sibling ``<name>.downloading`` file and moved
    into place only after the whole response has been written, so *path*
    never holds a partial download.

    Args:
        path: Destination file; its parent directory is created if needed.
        url: Address to fetch.
        timeout: Passed to ``requests.get``.

    Raises:
        Exception: whatever the second attempt raised, when both attempts
            fail (HTTP error status, connection problem, disk error, ...).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    headers = {"User-Agent": "nhanes-fetch/1.0"}
    tmp = path.with_suffix(path.suffix + ".downloading")
    last = None
    for _ in range(2):  # one initial attempt plus one retry
        try:
            with requests.get(url, headers=headers, timeout=timeout, stream=True) as r:
                r.raise_for_status()
                with open(tmp, "wb") as f:
                    for chunk in r.iter_content(1 << 15):
                        if chunk:
                            f.write(chunk)
            # replace() overwrites an existing target on every platform;
            # rename() raises on Windows when the target already exists.
            tmp.replace(path)
            return path
        except Exception as e:
            last = e
            # Do not leave a truncated temp file behind after a failed attempt.
            tmp.unlink(missing_ok=True)
    raise last

def read_demo_file(p: Path) -> pd.DataFrame:
    """Read a DEMO file (.xpt or .sas7bdat) into a DataFrame with upper-case columns.

    pyreadstat is tried first; if importing it or reading with it fails for
    any reason, pandas.read_sas is used instead.

    Raises:
        ValueError: if the file suffix is neither .xpt nor .sas7bdat.
    """
    ext = p.suffix.lower()
    if ext not in (".xpt", ".sas7bdat"):
        raise ValueError(f"Unsupported DEMO file type: {p.suffix}")

    is_xport = ext == ".xpt"
    try:
        import pyreadstat
        reader = pyreadstat.read_xport if is_xport else pyreadstat.read_sas7bdat
        frame, _ = reader(p)
    except Exception:
        # Best-effort fallback: covers a missing pyreadstat as well as a failed read.
        frame = pd.read_sas(p, format="xport" if is_xport else "sas7bdat")

    frame.columns = [name.upper() for name in frame.columns]
    return frame

def ensure_demo_file(cycle_label: str) -> Path:
    """Return a path to the DEMO file for *cycle_label*, downloading if allowed.

    A copy anywhere under DATA is preferred. Otherwise, when ALLOW_DOWNLOAD
    is true, each URL registered for the cycle is tried in order and the
    first successful download is returned.

    Raises:
        FileNotFoundError: no local copy exists and either downloads are
            disabled or every URL failed.
    """
    found = find_first_under(DATA, LOCAL_CANDIDATES[cycle_label])
    if found:
        return found

    if not ALLOW_DOWNLOAD:
        raise FileNotFoundError(f"DEMO file not found locally for {cycle_label} and downloads are disabled.")

    target_dir = DATA / "nhanes_by_module" / "DEMO"
    for url in NHANES_URLS[cycle_label]:
        target = target_dir / Path(url).name
        try:
            print(f"⬇️  Download {cycle_label}: {target.name}")
            return download_to(target, url)
        except Exception as err:
            # Report the failure and move on to the next candidate URL.
            print("   ⚠️", err)

    raise FileNotFoundError(f"No DEMO file available for {cycle_label}.")


In [3]:

# %% Recodes + normalization
def recode_L_to_4(s):
    """Collapse NHANES ``DMDMARTL`` marital-status codes into four groups.

    Source codes per the NHANES DEMO codebook: 1 married, 2 widowed,
    3 divorced, 4 separated, 5 never married, 6 living with partner,
    77 refused, 99 don't know.

    Args:
        s: Series of DMDMARTL values; non-numeric entries are coerced to missing.

    Returns:
        Int64 Series on the same index:
        1 = married / living with partner (1, 6),
        2 = widowed / divorced (2, 3),
        3 = never married (5),
        4 = separated (4),
        <NA> for refused, don't know, missing, or any other value.
    """
    s = pd.to_numeric(s, errors="coerce")
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    groups = (
        ((1, 6), 1),   # married / living with partner
        ((2, 3), 2),   # widowed / divorced
        ((5,), 3),     # never married
        ((4,), 4),     # separated
    )
    for codes, category in groups:
        out = out.mask(s.isin(codes), category)
    # 77 / 99 and anything unlisted were never assigned, so they stay <NA>.
    return out

def recode_L_to_3(s):
    """Collapse NHANES ``DMDMARTL`` marital-status codes into three groups.

    Source codes per the NHANES DEMO codebook: 1 married, 2 widowed,
    3 divorced, 4 separated, 5 never married, 6 living with partner,
    77 refused, 99 don't know. The three output groups follow the category
    layout of the later ``DMDMARTZ`` variable so both can share one column.

    Args:
        s: Series of DMDMARTL values; non-numeric entries are coerced to missing.

    Returns:
        Int64 Series on the same index:
        1 = married / living with partner (1, 6),
        2 = widowed / divorced / separated (2, 3, 4),
        3 = never married (5),
        <NA> for refused, don't know, missing, or any other value.
    """
    s = pd.to_numeric(s, errors="coerce")
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    groups = (
        ((1, 6), 1),      # married / living with partner
        ((2, 3, 4), 2),   # widowed / divorced / separated
        ((5,), 3),        # never married
    )
    for codes, category in groups:
        out = out.mask(s.isin(codes), category)
    # 77 / 99 and anything unlisted were never assigned, so they stay <NA>.
    return out

def recode_Z_to_3(s):
    """Pass ``DMDMARTZ`` codes 1, 2 and 3 through; everything else becomes <NA>.

    Input is coerced to numeric first. The result is an Int64 Series on the
    same index; 77, 99, missing, and any other value map to <NA>.
    """
    s = pd.to_numeric(s, errors="coerce")
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    for code in (1, 2, 3):
        out = out.mask(s == code, code)
    return out

def compute_marriage_cols(df_upper: pd.DataFrame):
    """Derive the 4-level and 3-level marital-status columns for one DEMO frame.

    Column names are expected in upper case. ``DMDMARTL`` is used when
    present (both levels); otherwise ``DMDMARTZ`` fills only the 3-level
    column. Whatever cannot be derived is an all-<NA> Int64 Series.

    Returns:
        tuple: (4-level Series, 3-level Series), both on df_upper's index.
    """
    def all_missing():
        return pd.Series(pd.NA, index=df_upper.index, dtype="Int64")

    columns = df_upper.columns
    if "DMDMARTL" in columns:
        source = df_upper["DMDMARTL"]
        return recode_L_to_4(source), recode_L_to_3(source)
    if "DMDMARTZ" in columns:
        return all_missing(), recode_Z_to_3(df_upper["DMDMARTZ"])
    return all_missing(), all_missing()

def normalize_survey_and_sex(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with harmonized sex, MEC-weight and design columns.

    Steps, all on a copy (the input frame is not modified):
      * column names are upper-cased;
      * ``RIAGENDR`` is set from the first sex column found among its aliases;
      * ``WTMEC2YR`` is set from the first MEC-weight column found among its
        aliases. ``WTMECPRP`` (the MEC weight name in the 2017-March 2020
        pre-pandemic DEMO file) is accepted last, so that cycle no longer
        ends up without a weight.
        NOTE(review): that weight is not a two-year weight, so rescale it
        before pooling cycles - confirm against the NHANES weighting guidance;
      * ``SDMVPSU``, ``SDMVSTRA``, ``SDDSRVYR`` and ``SEQN`` are coerced to
        numeric when present.

    Non-numeric values become NaN in every coerced column.
    """
    d = df.copy()
    d.columns = [c.upper() for c in d.columns]

    # Target column -> source names in priority order (first present wins).
    aliases = {
        "RIAGENDR": ("RIAGENDR", "DMDSEX", "SEX", "GENDER", "RIAGENDER"),
        "WTMEC2YR": ("WTMEC2YR", "WTMEC2YR_P", "WTMEC2YRA", "WTMECPRP"),
    }
    for target, names in aliases.items():
        source = next((name for name in names if name in d.columns), None)
        if source is not None:
            d[target] = pd.to_numeric(d[source], errors="coerce")

    for c in ("SDMVPSU", "SDMVSTRA", "SDDSRVYR", "SEQN"):
        if c in d.columns:
            d[c] = pd.to_numeric(d[c], errors="coerce")
    return d


In [4]:

# %% Build DEMO 1999–2023
# Cycle labels to stack, in order; each must be a key of NHANES_URLS.
# NOTE(review): the list holds both "2017-2018" and the 2017-March 2020
# pre-pandemic file. If the pre-pandemic file re-releases 2017-2018
# participants under different SEQN values, the drop_duplicates on SEQN
# below will not collapse them - confirm before pooling these two cycles.
DEMO_CYCLES = [
    "1999-2000","2001-2002","2003-2004","2005-2006","2007-2008",
    "2009-2010","2011-2012","2013-2014","2015-2016","2017-2018",
    "2017-March 2020 (pre-pandemic)","August 2021–August 2023",
]

# demo_parts collects one trimmed frame per cycle; missing records the
# cycles for which no file could be found or downloaded.
demo_parts, missing = [], []
for cyc in DEMO_CYCLES:
    try:
        p = ensure_demo_file(cyc)
    except FileNotFoundError as e:
        # A missing cycle is reported and skipped rather than aborting the build.
        print("⚠️", e); missing.append(cyc); continue
    df = read_demo_file(p)
    df["CYCLE"] = cyc
    # Marital status is derived before normalization; read_demo_file has
    # already upper-cased the column names the recodes look for.
    M4, M3 = compute_marriage_cols(df)
    df["MARRIAGE"]  = M4
    df["MARRIAGE3"] = M3
    df = normalize_survey_and_sex(df)
    # Keep only the harmonized columns that this cycle actually has; columns
    # absent from a cycle come out as missing after the concat below.
    keep = [c for c in [
        "SEQN","RIDAGEYR","SDDSRVYR","SDMVPSU","SDMVSTRA","WTMEC2YR",
        "CYCLE","MARRIAGE","MARRIAGE3","RIAGENDR","DMDHHSIZ"
    ] if c in df.columns]
    demo_parts.append(df[keep])

if missing:
    print("⚠️ Missing DEMO cycles (not found):", missing)
if not demo_parts:
    raise RuntimeError("No DEMO pieces available.")

# Stack all cycles; if a SEQN occurs more than once, the row from the
# earliest cycle in DEMO_CYCLES is the one kept.
demo9923 = pd.concat(demo_parts, ignore_index=True).drop_duplicates("SEQN", keep="first")

# Save for reuse
pkl_out  = DATA / "demo9923.pkl"
parq_out = DATA / "cov" / "demo9923.parquet"
demo9923.to_pickle(pkl_out)
demo9923.to_parquet(parq_out, index=False)

print("✅ Saved DEMO stack:")
print("  •", pkl_out)
print("  •", parq_out)
print("Rows:", len(demo9923), "| Unique SEQN:", demo9923["SEQN"].nunique())
print("Columns:", demo9923.columns.tolist())


✅ Saved DEMO stack:
  • /Users/dengshuyue/Desktop/SDOH/analysis/data/demo9923.pkl
  • /Users/dengshuyue/Desktop/SDOH/analysis/data/cov/demo9923.parquet
Rows: 128809 | Unique SEQN: 128809
Columns: ['SEQN', 'RIDAGEYR', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'CYCLE', 'MARRIAGE', 'MARRIAGE3', 'RIAGENDR', 'DMDHHSIZ']


In [5]:

# %% Mortality link (1999–2018)
def find_mortality_file(base: Path) -> Path | None:
    patterns = [
        "*mortality*.sas7bdat","*mort*.sas7bdat",
        "*mortality*.xpt","*mort*.xpt",
        "*mortality*.csv","*mort*.csv",
        "NHANES_1999_2019_LMF_public.csv"
    ]
    for pat in patterns:
        hits = list(base.rglob(pat))
        if hits:
            return hits[0]
    return None

# Locate and read the mortality-linkage file, derive follow-up time and the
# event indicator, attach age / survey cycle / sex from DEMO, then save.
mort_p = find_mortality_file(DATA)
if mort_p is None:
    raise FileNotFoundError(f"No mortality file found under {DATA}. Place e.g. 'mortality9918.sas7bdat' or the LMF csv.")

print("Using mortality file:", mort_p)
sfx = mort_p.suffix.lower()
if sfx == ".xpt":
    # pyreadstat first; pandas is the fallback if it is missing or fails.
    try:
        import pyreadstat
        mort, _ = pyreadstat.read_xport(mort_p)
    except Exception:
        mort = pd.read_sas(mort_p, format="xport")
elif sfx == ".sas7bdat":
    try:
        import pyreadstat
        mort, _ = pyreadstat.read_sas7bdat(mort_p)
    except Exception:
        mort = pd.read_sas(mort_p, format="sas7bdat")
elif sfx == ".csv":
    mort = pd.read_csv(mort_p)
else:
    raise ValueError(f"Unsupported mortality file type: {sfx}")

mort.columns = [c.upper() for c in mort.columns]
# Keep only the linkage columns this file actually has.
keep_cols = [c for c in [
    "SEQN","ELIGSTAT","MORTSTAT","PERMTH_EXM","PERMTH_INT",
    "UCOD_LEADING","DIABETES","HYPERTEN","DODQTR","DODYEAR"
] if c in mort.columns]
mort = mort[keep_cols].copy()

# Restrict to records flagged ELIGSTAT == 1 when that flag is available.
if "ELIGSTAT" in mort.columns:
    mort = mort[mort["ELIGSTAT"] == 1].copy()

# Follow-up months: the exam-based column is preferred, interview-based otherwise.
TIME_COL = "PERMTH_EXM" if "PERMTH_EXM" in mort.columns else ("PERMTH_INT" if "PERMTH_INT" in mort.columns else None)
if TIME_COL is None:
    raise ValueError("Neither PERMTH_EXM nor PERMTH_INT found in mortality file.")
# Fail with a clear message instead of a bare KeyError further down.
if "MORTSTAT" not in mort.columns:
    raise ValueError("MORTSTAT not found in mortality file.")

mort["TIME_Y"] = pd.to_numeric(mort[TIME_COL], errors="coerce") / 12.0
# EVENT is 1 when MORTSTAT == 1 and 0 for any other recorded value. A missing
# MORTSTAT stays <NA> instead of silently becoming 0 (censored).
mort_status = pd.to_numeric(mort["MORTSTAT"], errors="coerce")
mort["EVENT"]  = mort_status.eq(1).astype("Int64").where(mort_status.notna(), pd.NA)
# Drop rows without a usable (non-missing, non-negative) follow-up time.
mort = mort[(mort["TIME_Y"].notna()) & (mort["TIME_Y"] >= 0)].copy()

# Merge with DEMO (age + survey year)
d = demo9923.copy()
d["SDDSRVYR"] = pd.to_numeric(d["SDDSRVYR"], errors="coerce")
cols_merge = [c for c in ["SEQN","RIDAGEYR","SDDSRVYR","RIAGENDR"] if c in d.columns]
# Left join: every mortality row is kept even if DEMO has no match for it.
mort_demo = mort.merge(d[cols_merge], on="SEQN", how="left")

# Summary
print("\n📊 NHANES cycles in mortality-linked (by SDDSRVYR code):")
print(mort_demo["SDDSRVYR"].value_counts(dropna=False).sort_index())

print("\n⏱️ Survival summary:")
print("  N (unique SEQN):", mort_demo["SEQN"].nunique())
evt_rate = float(mort_demo["EVENT"].mean(skipna=True)) if mort_demo["EVENT"].notna().any() else np.nan
print("  Events (%):", None if np.isnan(evt_rate) else round(100*evt_rate, 2))
print("  TIME_Y (min/median/max):",
      np.nanmin(mort_demo["TIME_Y"]),
      np.nanmedian(mort_demo["TIME_Y"]),
      np.nanmax(mort_demo["TIME_Y"]))

# Save
mort_demo_out = OUT / "mort_demo_merged.parquet"
mort_demo.to_parquet(mort_demo_out, index=False)
print("\n✅ wrote", mort_demo_out)


Using mortality file: /Users/dengshuyue/Desktop/SDOH/analysis/data/less_important/mortality9918.sas7bdat

📊 NHANES cycles in mortality-linked (by SDDSRVYR code):
SDDSRVYR
1.0     4973
2.0     5586
3.0     5293
4.0     5332
5.0     5989
6.0     6346
7.0     5603
8.0     5913
9.0     5720
10.0    5498
Name: count, dtype: int64

⏱️ Survival summary:
  N (unique SEQN): 56253
  Events (%): 14.87
  TIME_Y (min/median/max): 0.0 9.416666666666666 20.75

✅ wrote /Users/dengshuyue/Desktop/SDOH/analysis/output/mort_demo_merged.parquet


In [19]:

# %% Optional: merge with covariate core (if present)
# Attach the covariate core to the mortality+DEMO table when the core file exists.
core_path = OUT / "cov_core_1999_2023.parquet"
if not core_path.exists():
    print("ℹ️  Core file not found at", core_path, "— skipping this optional merge.")
else:
    core = pd.read_parquet(core_path)
    core.columns = [name.upper() for name in core.columns]
    # Left join keeps every mortality row; core columns whose names clash
    # with existing ones get the _CORE suffix.
    merged = mort_demo.merge(core, how="left", on="SEQN", suffixes=("", "_CORE"))
    outp = OUT / "nhanes_mort_sdoh_core.parquet"
    merged.to_parquet(outp, index=False)
    print("✅ wrote", outp, "  (shape:", merged.shape, ")")


✅ wrote /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_sdoh_core.parquet   (shape: (56253, 41) )


<h2>Sanity Check </h2>

In [20]:

# %% Quick checks
# Report the in-memory DEMO stack, then confirm the saved merge can be read back.
print("DEMO shape:", demo9923.shape)
print("DEMO columns:", demo9923.columns.tolist()[:15], "...")
try:
    md = pd.read_parquet(OUT / "mort_demo_merged.parquet")
except Exception as e:
    print("Couldn't read mort_demo_merged:", e)
else:
    print("mort_demo_merged shape:", md.shape)


DEMO shape: (128809, 11)
DEMO columns: ['SEQN', 'RIDAGEYR', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'CYCLE', 'MARRIAGE', 'MARRIAGE3', 'RIAGENDR', 'DMDHHSIZ'] ...
mort_demo_merged shape: (56253, 13)


In [23]:
# Preview the first 10 rows of the merged table read back by the quick-check cell.
md.head(10)

Unnamed: 0,SEQN,ELIGSTAT,MORTSTAT,PERMTH_EXM,PERMTH_INT,UCOD_LEADING,DIABETES,HYPERTEN,TIME_Y,EVENT,RIDAGEYR,SDDSRVYR,RIAGENDR
0,2.0,1.0,1.0,177.0,177.0,6.0,0.0,0.0,14.75,1,77.0,1.0,1.0
1,5.0,1.0,0.0,244.0,244.0,,,,20.333333,0,49.0,1.0,1.0
2,6.0,1.0,0.0,245.0,246.0,,,,20.416667,0,19.0,1.0,2.0
3,7.0,1.0,0.0,236.0,237.0,,,,19.666667,0,59.0,1.0,2.0
4,10.0,1.0,1.0,231.0,231.0,1.0,0.0,0.0,19.25,1,43.0,1.0,1.0
5,12.0,1.0,0.0,236.0,236.0,,,,19.666667,0,37.0,1.0,1.0
6,13.0,1.0,1.0,16.0,16.0,1.0,0.0,0.0,1.333333,1,70.0,1.0,1.0
7,14.0,1.0,1.0,136.0,137.0,3.0,0.0,0.0,11.333333,1,81.0,1.0,1.0
8,15.0,1.0,0.0,231.0,231.0,,,,19.25,0,38.0,1.0,2.0
9,16.0,1.0,1.0,62.0,63.0,2.0,0.0,0.0,5.166667,1,85.0,1.0,2.0


In [12]:
# ==== Mortality row checks ====
import pandas as pd, numpy as np
from pathlib import Path

# --- 0) Load mort if not already loaded (searches under your project /data) ---
# NOTE: when the mortality-link cell has already run, `mort` is the table it
# left behind (column subset, ELIGSTAT == 1, non-negative follow-up), so every
# count below describes that filtered table rather than the raw file.
if "mort" not in globals():
    ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
    DATA = ROOT / "data"
    pats = ["*mortality*.sas7bdat","*mort*.sas7bdat","*mortality*.xpt","*mort*.xpt","*mortality*.csv","*mort*.csv"]
    mort_path = next((p for pat in pats for p in DATA.rglob(pat)), None)
    if mort_path is None:
        raise FileNotFoundError("No mortality file found under data/.")
    suf = mort_path.suffix.lower()
    if suf == ".xpt":
        try:
            import pyreadstat
            mort, _ = pyreadstat.read_xport(mort_path)
        except Exception:
            mort = pd.read_sas(mort_path, format="xport")
    elif suf == ".sas7bdat":
        try:
            import pyreadstat
            mort, _ = pyreadstat.read_sas7bdat(mort_path)
        except Exception:
            mort = pd.read_sas(mort_path, format="sas7bdat")
    elif suf == ".csv":
        mort = pd.read_csv(mort_path)
    mort.columns = mort.columns.str.upper()

print("Mortality file loaded.")
print("mort shape:", mort.shape)

# --- 1) Basic counts ---
tot_rows = len(mort)
uniq_seqn = mort["SEQN"].nunique() if "SEQN" in mort.columns else np.nan
print(f"Total rows: {tot_rows:,} | Unique SEQN: {uniq_seqn:,}")

# --- 2) Eligibility filter ---
if "ELIGSTAT" in mort.columns:
    m_elig = mort[mort["ELIGSTAT"] == 1].copy()
else:
    m_elig = mort.copy()
print(f"After ELIGSTAT==1 (or skip if missing): {len(m_elig):,}")

# --- 3) Time availability (EXAM / INTERVIEW) ---
for c in ["PERMTH_EXM","PERMTH_INT"]:
    if c in m_elig.columns:
        m_elig[c] = pd.to_numeric(m_elig[c], errors="coerce")

# An absent time column is represented by an all-missing series, so the
# counts and fallbacks below need no per-column special cases.
no_time = pd.Series(np.nan, index=m_elig.index, dtype="float64")
exm_months = m_elig["PERMTH_EXM"] if "PERMTH_EXM" in m_elig.columns else no_time
int_months = m_elig["PERMTH_INT"] if "PERMTH_INT" in m_elig.columns else no_time

has_exm = exm_months.notna().sum()
has_int = int_months.notna().sum()
has_either = (exm_months.notna() | int_months.notna()).sum()
has_both = (exm_months.notna() & int_months.notna()).sum()
has_neither = len(m_elig) - has_either

print(f"Time present: EXAM={has_exm:,} | INT={has_int:,} | either={has_either:,} | both={has_both:,} | neither={has_neither:,}")

# --- 4) Your current EXAM-only logic vs EXAM-or-INT fallback ---
# EXAM-only: rows with a non-missing, non-negative exam-based time. This is
# an empty frame (not a KeyError) when PERMTH_EXM is absent; ge(0) is False
# for missing values, so no separate notna() filter is needed.
m_exm_only = m_elig[exm_months.ge(0)]
print(f"Rows with EXAM months (>=0): {len(m_exm_only):,}")

# EXAM-or-INT (recommended): exam-based months, falling back row by row to
# interview-based months where the exam value is missing.
m_time = m_elig.copy()
m_time["TIME_M"] = exm_months.fillna(int_months)
m_time["TIME_M"] = pd.to_numeric(m_time["TIME_M"], errors="coerce")
m_time = m_time[m_time["TIME_M"].notna() & (m_time["TIME_M"] >= 0)]
print(f"Rows with EXAM-or-INT months (>=0): {len(m_time):,}")

# --- 5) How many merge to DEMO (if available) ---
if "demo9923" in globals():
    d = demo9923.copy()
    d.columns = d.columns.str.upper()
    # EXAM-only merge
    n_merge_exm = m_exm_only.merge(d[["SEQN"]], on="SEQN", how="inner")["SEQN"].nunique()
    # EXAM-or-INT merge
    n_merge_either = m_time.merge(d[["SEQN"]], on="SEQN", how="inner")["SEQN"].nunique()
    print(f"Unique SEQN after merge to DEMO → EXAM-only: {n_merge_exm:,} | EXAM-or-INT: {n_merge_either:,}")
else:
    print("demo9923 not in memory; skipping merge counts.")


Mortality file loaded.
mort shape: (56253, 10)
Total rows: 56,253 | Unique SEQN: 56,253
After ELIGSTAT==1 (or skip if missing): 56,253
Time present: EXAM=56,253 | INT=56,253 | either=56,253 | both=56,253 | neither=0
Rows with EXAM months (>=0): 56,253
Rows with EXAM-or-INT months (>=0): 56,253
Unique SEQN after merge to DEMO → EXAM-only: 56,253 | EXAM-or-INT: 56,253


In [18]:
# --- Coverage by cycle ---
# Split DEMO by whether each SEQN appears in the mortality table, then build
# an inner-join cohort (core_mort) and a left-join full table (core_full).
mort_ids = set(mort["SEQN"])
demo_ids = set(demo9923["SEQN"])  # not used below; kept for interactive checks

is_linked = demo9923["SEQN"].isin(mort_ids)
demo_in_mort = demo9923[is_linked]
demo_only    = demo9923[~is_linked]

print("In mortality (by CYCLE):")
print(demo_in_mort["CYCLE"].value_counts().sort_index(), "\n")

print("Not in mortality (by CYCLE):")
print(demo_only["CYCLE"].value_counts().sort_index())

# --- Build two cores ---
# 1) Mortality-linked cohort only (inner)
core_mort = demo9923.merge(mort, on="SEQN", how="inner")
print("\ncore_mort shape:", core_mort.shape)

# 2) Full DEMO with mortality columns when available (left)
mort_slim = mort.copy()
for c in ["PERMTH_EXM","PERMTH_INT"]:
    if c in mort_slim:
        mort_slim[c] = pd.to_numeric(mort_slim[c], errors="coerce")

# EXAM-or-INT time in years: start from the first time column that exists and
# fill its gaps from the next one. This works when only one of the two
# columns is present; the unguarded PERMTH_EXM lookup it replaces did not.
time_cols = [c for c in ("PERMTH_EXM", "PERMTH_INT") if c in mort_slim.columns]
if time_cols:
    time_m = mort_slim[time_cols[0]]
    for c in time_cols[1:]:
        time_m = time_m.fillna(mort_slim[c])
    mort_slim["TIME_M"] = time_m
    mort_slim["TIME_Y"] = mort_slim["TIME_M"] / 12.0

keep_cols = ["SEQN","MORTSTAT","ELIGSTAT","TIME_Y","PERMTH_EXM","PERMTH_INT","UCOD_LEADING","DIABETES","HYPERTEN","DODQTR","DODYEAR"]
keep_cols = [c for c in keep_cols if c in mort_slim.columns]
core_full = demo9923.merge(mort_slim[keep_cols], on="SEQN", how="left")
print("core_full shape:", core_full.shape)

# Optional: save
# core_mort.to_parquet("/Users/dengshuyue/Desktop/SDOH/analysis/output/core_mort.parquet", index=False)
# core_full.to_parquet("/Users/dengshuyue/Desktop/SDOH/analysis/output/core_full.parquet", index=False)
# print("\nSaved:\n - output/core_mort.parquet (mortality-linked only)\n - output/core_full.parquet (all DEMO; mortality NA where missing)")

# Summary of the run recorded below:
# core_mort has 56,253 mortality-linked participants from 1999–2018; core_full
# has all 128,809 DEMO participants from 1999–2023. The gap is unlinked
# 1999–2018 cases plus no public mortality linkage for 2017–Mar 2020 and
# 2021–2023.

In mortality (by CYCLE):
CYCLE
1999-2000    4973
2001-2002    5586
2003-2004    5293
2005-2006    5332
2007-2008    5989
2009-2010    6346
2011-2012    5603
2013-2014    5913
2015-2016    5720
2017-2018    5498
Name: count, dtype: int64 

Not in mortality (by CYCLE):
CYCLE
1999-2000                          4992
2001-2002                          5453
2003-2004                          4829
2005-2006                          5016
2007-2008                          4160
2009-2010                          4191
2011-2012                          4153
2013-2014                          4262
2015-2016                          4251
2017-2018                          3756
2017-March 2020 (pre-pandemic)    15560
August 2021–August 2023           11933
Name: count, dtype: int64

core_mort shape: (56253, 20)
core_full shape: (128809, 19)
