## 0 — BOOTSTRAP & UTILS

In [2]:
# =========================
# 0 — BOOTSTRAP & UTILS
# =========================
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Iterable
import pandas as pd, numpy as np

# Project base (adjust if your path differs)
BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")

@dataclass
class Config:
    raw_dir: Path = BASE / "data"
    interim_dir: Path = BASE / "data" / "cov"
    out_dir: Path = BASE / "output"

    # Optional prebuilt inputs
    demo_9923: Optional[Path] = BASE / "data" / "cov" / "demo9923.parquet"
    demo_9918: Optional[Path] = None

    # Depression I/O
    dpq_9923: Optional[Path] = None
    cov_dep: str = "cov_dep_1999_2023.parquet"

CONFIG = Config()

def ensure_dir(p: Path | str) -> None:
    Path(p).mkdir(parents=True, exist_ok=True)

def upper_df(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d.columns = [c.upper() for c in d.columns]
    return d

def pick_first_existing(*cands: Optional[Path]) -> Optional[Path]:
    for c in cands:
        if c and Path(c).exists():
            return Path(c)
    return None

def log(msg: str) -> None:
    print(msg, flush=True)

# --- file helpers ---
def _read_any(p: Path) -> pd.DataFrame:
    p = Path(p)
    return pd.read_parquet(p) if p.suffix.lower()==".parquet" else pd.read_csv(p, low_memory=False)

def _download(url: str, dest: Path, timeout=90):
    import requests
    dest.parent.mkdir(parents=True, exist_ok=True)
    headers = {"User-Agent": "nhanes-fetch/1.0"}
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        tmp = dest.with_suffix(dest.suffix + ".downloading")
        with open(tmp, "wb") as f:
            for chunk in r.iter_content(1<<15):
                if chunk: f.write(chunk)
        tmp.rename(dest)
    return dest

def _read_xpt(p: Path) -> pd.DataFrame:
    try:
        import pyreadstat
        df, _ = pyreadstat.read_xport(p)
    except Exception:
        df = pd.read_sas(p, format="xport")
    df.columns = [c.upper() for c in df.columns]
    return df


## 1 — DPQ / PHQ-9 (2005+)

In [3]:
# =========================
# 1 — DPQ / PHQ-9 (2005+)
# =========================

def _download_and_stack_dpq(cfg: Config) -> pd.DataFrame:
    DPQ_URLS = {
        "2005-2006": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DPQ_D.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/DPQ_D.XPT",
        ],
        "2007-2008": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DPQ_E.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/DPQ_E.XPT",
        ],
        "2009-2010": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/DPQ_F.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/DPQ_F.XPT",
        ],
        "2011-2012": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/DPQ_G.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DPQ_G.XPT",
        ],
        "2013-2014": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DPQ_H.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.XPT",
        ],
        "2015-2016": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DPQ_I.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DPQ_I.XPT",
        ],
        "2017-2018": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DPQ_J.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DPQ_J.XPT",
        ],
        "2017-March 2020 (pre-pandemic)": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DPQ.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_DPQ.XPT",
        ],
        "August 2021–August 2023": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DPQ_L.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/DPQ_L.XPT",
        ],
    }
    store = Path(cfg.interim_dir) / "dpq"
    parts = []
    for cyc, urls in DPQ_URLS.items():
        got = None
        for u in urls:
            try:
                dst = store / Path(u).name
                if not dst.exists():
                    print(f"⬇️ DPQ {cyc} → {dst.name}")
                    _download(u, dst)
                got = dst; break
            except Exception as e:
                print("  ⚠️", e)
        if got is None:
            continue
        df = _read_xpt(got)
        df["CYCLE"] = cyc
        parts.append(df)
    if not parts:
        raise RuntimeError("No DPQ data available (download failed).")
    dpq = pd.concat(parts, ignore_index=True)
    stacked = Path(cfg.interim_dir) / "dpq_9923.parquet"
    stacked.parent.mkdir(parents=True, exist_ok=True)
    dpq.to_parquet(stacked, index=False)
    return dpq

def _clean_dpq_score(dpq: pd.DataFrame) -> pd.DataFrame:
    d = upper_df(dpq)
    items = [c for c in [f"DPQ0{i:02d}" for i in range(10, 100, 10)] if c in d.columns]
    for c in items:
        d[c] = pd.to_numeric(d[c], errors="coerce")
        d.loc[d[c].isin([7, 9, 77, 99]), c] = np.nan

    answered = d[items].notna().sum(axis=1)
    phq9 = d[items].sum(axis=1, min_count=1)
    phq9 = phq9.where(answered >= 7)

    cat = pd.cut(
        phq9,
        bins=[-0.1, 4, 9, 14, 19, 27],
        labels=["NONE/MIN", "MILD", "MOD", "MOD-SEV", "SEV"],
        include_lowest=True,
        right=True
    ).astype("category")

    out = pd.DataFrame({
        "SEQN": pd.to_numeric(d["SEQN"], errors="coerce").astype("Int64"),
        "PHQ9": phq9.astype("float"),
        "PHQ9_GE10": (phq9 >= 10).astype("Int64"),
        "DPQ_CAT": cat
    })
    out["IMP"] = (answered < 7).astype("Int64")  # 1 if insufficient items
    return out.dropna(subset=["SEQN"]).drop_duplicates("SEQN")

def build_dep(cfg: Config = CONFIG, allow_fetch: bool = True) -> pd.DataFrame:
    """Build PHQ-9 covariates; writes cov_dep_1999_2023.parquet."""
    ensure_dir(cfg.out_dir)
    outp = Path(cfg.out_dir) / cfg.cov_dep
    if outp.exists():
        return pd.read_parquet(outp)

    dpq_path = pick_first_existing(
        cfg.dpq_9923,
        Path(cfg.interim_dir) / "dpq_9923.parquet",
        Path(cfg.interim_dir) / "dpq_9923.csv",
    )
    if dpq_path:
        dpq = _read_any(dpq_path)
    else:
        if not allow_fetch:
            raise FileNotFoundError("DPQ stack not found; set allow_fetch=True to download from CDC.")
        dpq = _download_and_stack_dpq(cfg)

    dep = _clean_dpq_score(dpq)
    dep.to_parquet(outp, index=False)
    log(f"✓ DEP → {outp}  (rows: {len(dep):,}, PHQ9≥10: {int(pd.to_numeric(dep['PHQ9_GE10'], errors='coerce').fillna(0).sum()):,})")
    return dep


## 2 — CIDI (1999–2004) from CIDDSCOR + WTSCI2YR

In [4]:
# =========================
# 2 — CIDI (1999–2004) from CIDDSCOR + WTSCI2YR
# =========================

def _get_first_working(urls: Iterable[str], store: Path) -> Optional[Path]:
    for u in urls:
        try:
            dst = store / Path(u).name
            if not dst.exists():
                print(f"⬇️  trying {Path(u).name}")
                _download(u, dst)
            return dst
        except Exception as e:
            print("  ⚠️", e)
    return None

def _stack_cidi_9904(cfg: Config) -> pd.DataFrame:
    """
    Stack the CIDI depression module files for 1999–2004.
    1999–2000 file is CIQMDEP.xpt (not CIQDEP_A).
    """
    store = Path(cfg.interim_dir) / "dep9904"
    store.mkdir(parents=True, exist_ok=True)

    CANDIDATES = {
        "1999-2000": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/CIQMDEP.xpt",   # official
            "https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/CIQDEP_A.XPT",                  # alt
        ],
        "2001-2002": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/CIQDEP_B.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/CIQDEP_B.XPT",
        ],
        "2003-2004": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/CIQDEP_C.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/CIQDEP_C.XPT",
        ],
    }

    parts = []
    for cyc, urls in CANDIDATES.items():
        got = _get_first_working(urls, store)
        if got is None:
            print(f"  ❌ skipped {cyc} (no URL worked)")
            continue
        df = _read_xpt(got)
        df["CYCLE"] = cyc
        parts.append(df)

    if not parts:
        raise RuntimeError("No CIQDEP 1999–2004 files available (all URLs failed).")
    return pd.concat(parts, ignore_index=True)

def build_cidi_9904(cfg: Config, allow_fetch=True) -> pd.DataFrame:
    """
    Returns: SEQN, CIDI_SCORE_RAW (CIDDSCOR), CIDI_12M_MDE (1/0/NA), WTSCI2YR (if present)
    Mapping per codebook: CIDDSCOR == 1 → positive diagnosis, 5 → negative.
    """
    pq = Path(cfg.interim_dir) / "ciqdep_9904.parquet"
    if pq.exists():
        raw = pd.read_parquet(pq)
    else:
        if not allow_fetch:
            raise FileNotFoundError("ciqdep_9904.parquet not found; set allow_fetch=True to download.")
        raw = _stack_cidi_9904(cfg)
        raw.to_parquet(pq, index=False)

    d = upper_df(raw)
    out = d[["SEQN"]].drop_duplicates().copy()

    # ---- pull score variable: prefer CIDDSCOR; fallback to any *SCOR* with CIDD/DEP in name
    score_col = None
    if "CIDDSCOR" in d.columns:
        score_col = "CIDDSCOR"
    else:
        cand = [c for c in d.columns if ("SCOR" in c) and (("CIDD" in c) or ("CIDI" in c) or ("DEP" in c))]
        if cand:
            score_col = cand[0]

    if score_col:
        score = pd.to_numeric(d.set_index("SEQN")[score_col], errors="coerce")
        out["CIDI_SCORE_RAW"] = score.reindex(out["SEQN"]).values

        # Map per codebook: 1=Positive Diagnosis, 5=Negative Diagnosis, else NA
        m = score.reindex(out["SEQN"])
        flag = m.map({1: 1, 5: 0})
        out["CIDI_12M_MDE"] = pd.array(flag.values, dtype="Int64")
    else:
        out["CIDI_SCORE_RAW"] = pd.NA
        out["CIDI_12M_MDE"] = pd.Series(pd.NA, dtype="Int64", index=out.index)

    # ---- CIQ half-sample weight (normalize to WTSCI2YR)
    import re
    w_cols = [c for c in d.columns if re.match(r"WTSC.*2YR", c, flags=re.I)]
    if w_cols:
        w = pd.to_numeric(d.set_index("SEQN")[w_cols[0]], errors="coerce").reindex(out["SEQN"])
        out["WTSCI2YR"] = w.values

    return out


## 3 - Merge into your core table & harmonize

In [5]:
# =========================
# 3 — MERGE + HARMONIZE
# =========================

OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
cm_path = OUT / "cov_core_mort_1999_2023_flags.parquet"
assert cm_path.exists(), "Expected cm flags file not found."

# Load
cm  = pd.read_parquet(cm_path)
dpq = build_dep(CONFIG, allow_fetch=True)[["SEQN","PHQ9","PHQ9_GE10","DPQ_CAT","IMP"]].rename(columns={"IMP":"DEP_IMP"})
cidi = build_cidi_9904(CONFIG, allow_fetch=True)  # SEQN, CIDI_SCORE_RAW, CIDI_12M_MDE, WTSCI2YR

# Merge
cm_dep = cm.merge(dpq,  on="SEQN", how="left")
cm_dep = cm_dep.merge(cidi, on="SEQN", how="left")

# Harmonize by era:
# SDDSRVYR: 1=1999–2000, 2=2001–2002, 3=2003–2004, 4+=2005+ cycles
sdd = pd.to_numeric(cm_dep["SDDSRVYR"], errors="coerce")
pre  = sdd <= 3
post = sdd >= 4

phq_flag  = pd.to_numeric(cm_dep.get("PHQ9_GE10"), errors="coerce")
cidi_flag = pd.to_numeric(cm_dep.get("CIDI_12M_MDE"), errors="coerce")

cm_dep["DEP_HARMONIZED"] = pd.Series(pd.NA, dtype="Int64", index=cm_dep.index)
cm_dep.loc[pre,  "DEP_HARMONIZED"] = cidi_flag.loc[pre].astype("Int64")
cm_dep.loc[post, "DEP_HARMONIZED"] = phq_flag.loc[post].astype("Int64")

cm_dep["DEP_SOURCE"] = pd.Series(pd.NA, dtype="string", index=cm_dep.index)
cm_dep.loc[pre  & cm_dep["CIDI_12M_MDE"].notna(), "DEP_SOURCE"] = "CIDI99-04"
cm_dep.loc[post & cm_dep["PHQ9_GE10"].notna(),    "DEP_SOURCE"] = "PHQ9_05plus"

# Save
out1 = OUT / "cov_core_mort_dep_1999_2023.parquet"      # basic merged
cm_dep.to_parquet(out1, index=False)
log(f"✓ Merged cm + DEP → {out1}")

out2 = OUT / "cov_core_mort_dep_all_1999_2023.parquet"  # same content (kept for your workflow)
cm_dep.to_parquet(out2, index=False)
log(f"✓ Wrote {out2}")


✓ Merged cm + DEP → /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_1999_2023.parquet
✓ Wrote /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_all_1999_2023.parquet


## 4 - Quick peeks (compare score vs flag; show both eras)

In [6]:
# =========================
# 4 — QUICK PEEKS
# =========================

show_cols = [
    "SEQN","SDDSRVYR","AGE_YR","SEX",
    "CIDI_SCORE_RAW","CIDI_12M_MDE",   # pre-2005 CIDI
    "PHQ9","PHQ9_GE10","DPQ_CAT","DEP_IMP",   # post-2005 PHQ-9
    "DEP_SOURCE","DEP_HARMONIZED",
    "WTSCI2YR","WTMEC2YR"
]
present = [c for c in show_cols if c in cm_dep.columns]

print("\nPre-2005 rows with CIDI (first 15):")
display(cm_dep.loc[(cm_dep["SDDSRVYR"]<=3) & cm_dep["CIDI_12M_MDE"].notna(), present].head(15))

print("\nPost-2005 rows with PHQ-9 (first 15):")
display(cm_dep.loc[(cm_dep["SDDSRVYR"]>=4) & cm_dep["PHQ9"].notna(), present].head(15))



Pre-2005 rows with CIDI (first 15):


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,SEX,CIDI_SCORE_RAW,CIDI_12M_MDE,PHQ9,PHQ9_GE10,DPQ_CAT,DEP_IMP,DEP_SOURCE,DEP_HARMONIZED,WTSCI2YR,WTMEC2YR
11,12,1.0,37.0,M,5.0,0,,,,,CIDI99-04,0,186078.314366,95494.214052
19,20,1.0,23.0,F,5.0,0,,,,,CIDI99-04,0,32784.75114,16736.882281
33,34,1.0,38.0,F,5.0,0,,,,,CIDI99-04,0,50386.248286,27063.495057
65,66,1.0,37.0,M,5.0,0,,,,,CIDI99-04,0,54383.039478,27489.67342
68,69,1.0,27.0,M,5.0,0,,,,,CIDI99-04,0,78866.089966,34823.862122
80,81,1.0,30.0,M,5.0,0,,,,,CIDI99-04,0,176135.722403,95022.657961
96,97,1.0,32.0,M,5.0,0,,,,,CIDI99-04,0,86334.258174,40766.798504
106,107,1.0,30.0,F,5.0,0,,,,,CIDI99-04,0,37883.834696,16803.503699
119,120,1.0,30.0,F,5.0,0,,,,,CIDI99-04,0,186788.235958,98327.632014
142,143,1.0,22.0,F,5.0,0,,,,,CIDI99-04,0,4969.041484,2544.098678



Post-2005 rows with PHQ-9 (first 15):


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,SEX,CIDI_SCORE_RAW,CIDI_12M_MDE,PHQ9,PHQ9_GE10,DPQ_CAT,DEP_IMP,DEP_SOURCE,DEP_HARMONIZED,WTSCI2YR,WTMEC2YR
31130,31131,4.0,44.0,F,,,0.0,0,NONE/MIN,0,PHQ9_05plus,0,,26770.584605
31131,31132,4.0,70.0,M,,,0.0,0,NONE/MIN,0,PHQ9_05plus,0,,35315.5389
31133,31134,4.0,73.0,M,,,0.0,0,NONE/MIN,0,PHQ9_05plus,0,,44231.167252
31138,31139,4.0,18.0,F,,,4.0,0,NONE/MIN,0,PHQ9_05plus,0,,5963.152246
31142,31143,4.0,19.0,M,,,6.0,0,MILD,0,PHQ9_05plus,0,,31197.601782
31143,31144,4.0,21.0,M,,,0.0,0,NONE/MIN,0,PHQ9_05plus,0,,49416.755687
31148,31149,4.0,85.0,F,,,0.0,0,NONE/MIN,0,PHQ9_05plus,0,,25998.279203
31149,31150,4.0,79.0,M,,,1.0,0,NONE/MIN,0,PHQ9_05plus,0,,15572.425031
31150,31151,4.0,59.0,F,,,3.0,0,NONE/MIN,0,PHQ9_05plus,0,,32058.654021
31151,31152,4.0,27.0,F,,,3.0,0,NONE/MIN,0,PHQ9_05plus,0,,22237.017339


## 5 — QA / SUMMARIES

In [7]:
# =========================
# 5 — QA / SUMMARIES
# =========================

def wmean(x, w):
    x = pd.to_numeric(x, errors="coerce")
    w = pd.to_numeric(w, errors="coerce")
    m = x.notna() & w.notna() & (w > 0)
    return (x[m] * w[m]).sum() / w[m].sum() if m.any() else np.nan

# Share of harmonized dep present by cycle (sanity: pre only where CIDI exists; post only where DPQ exists)
print("\nCoverage by cycle (share DEP_HARMONIZED present):")
print(cm_dep.groupby("SDDSRVYR")["DEP_HARMONIZED"].apply(lambda s: s.notna().mean()).round(3))

# Unweighted prevalence by source
print("\nPrevalence by source (UNweighted, among rows with that source present):")
print(cm_dep.groupby("DEP_SOURCE")["DEP_HARMONIZED"].mean())

# Weighted prevalence, era-specific:
#   - 1999–2004: use CIQ half-sample weight WTSCI2YR & restrict to target age 20–39 (per CIQ eligibility)
#   - 2005+:     use WTMEC2YR (standard), and (optional) restrict to adults if you prefer
pre_mask  = (cm_dep["SDDSRVYR"] <= 3)
post_mask = (cm_dep["SDDSRVYR"] >= 4)

# CIQ target was 20–39 years (and half-sample); restrict to that to be consistent with the module design
age2039 = (pd.to_numeric(cm_dep["AGE_YR"], errors="coerce").between(20, 39, inclusive="both"))

w_prev_cidi = wmean(cm_dep.loc[pre_mask & age2039, "DEP_HARMONIZED"], cm_dep.loc[pre_mask & age2039, "WTSCI2YR"])
w_prev_phq9 = wmean(cm_dep.loc[post_mask, "PHQ9_GE10"], cm_dep.loc[post_mask, "WTMEC2YR"])

print("\nWeighted prevalence 1999–2004 (CIDI, WTSCI2YR, age 20–39):", w_prev_cidi)
print("Weighted prevalence 2005+ (PHQ-9≥10, WTMEC2YR):           ", w_prev_phq9)

# A tiny cross-check table
era = pd.DataFrame({
    "n_pre2005_with_CIDI": [int((pre_mask & cm_dep["CIDI_12M_MDE"].notna()).sum())],
    "pre2005_w_prev_CIDI": [w_prev_cidi],
    "n_post2005_with_PHQ9": [int((post_mask & cm_dep["PHQ9"].notna()).sum())],
    "post2005_w_prev_PHQ9_GE10": [w_prev_phq9]
})
display(era)



Coverage by cycle (share DEP_HARMONIZED present):
SDDSRVYR
1.0     0.072
2.0     0.074
3.0     0.068
4.0     0.515
5.0     0.591
6.0     0.604
7.0     0.576
8.0     0.582
9.0     0.575
10.0    0.598
12.0    0.531
66.0    0.576
Name: DEP_HARMONIZED, dtype: float64

Prevalence by source (UNweighted, among rows with that source present):
DEP_SOURCE
CIDI99-04      0.066757
PHQ9_05plus    0.084268
Name: DEP_HARMONIZED, dtype: Float64

Weighted prevalence 1999–2004 (CIDI, WTSCI2YR, age 20–39): 0.07917131442230411
Weighted prevalence 2005+ (PHQ-9≥10, WTMEC2YR):            0.07589921775231724


Unnamed: 0,n_pre2005_with_CIDI,pre2005_w_prev_CIDI,n_post2005_with_PHQ9,post2005_w_prev_PHQ9_GE10
0,2217,0.079171,50220,0.075899


#### Add CIDI fields to main table (create CIDI_SCORE_RAW & CIDI_12M_MDE, align PHQ9_GE10, save back)

In [8]:
from pathlib import Path
import pandas as pd
import numpy as np

OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
p_all = OUT / "cov_core_mort_dep_all_1999_2023.parquet"   # or use your latest file
df = pd.read_parquet(p_all)

# 1) Make/standardize the raw CIDI score column
if "CIDI_SCORE_RAW" not in df.columns:
    if "CIDDSCOR" in df.columns:
        df["CIDI_SCORE_RAW"] = pd.to_numeric(df["CIDDSCOR"], errors="coerce")
    elif "CIDI_SCORE" in df.columns:
        df["CIDI_SCORE_RAW"] = pd.to_numeric(df["CIDI_SCORE"], errors="coerce")
    else:
        df["CIDI_SCORE_RAW"] = pd.NA

# 2) Make the clean 0/1 CIDI flag (1=positive, 5=negative → 1/0)
if "CIDI_12M_MDE" not in df.columns:
    if "CIDI_DEP_FLAG" in df.columns:
        df["CIDI_12M_MDE"] = pd.to_numeric(df["CIDI_DEP_FLAG"], errors="coerce").astype("Int64")
    else:
        s = pd.to_numeric(df["CIDI_SCORE_RAW"], errors="coerce")
        flag = pd.Series(pd.NA, index=df.index, dtype="Int64")
        flag.loc[s == 1] = 1
        flag.loc[s == 5] = 0
        df["CIDI_12M_MDE"] = flag

# (optional) keep PHQ-9 flag as Int64 for easy compare
if "PHQ9_GE10" in df.columns:
    df["PHQ9_GE10"] = pd.to_numeric(df["PHQ9_GE10"], errors="coerce").astype("Int64")

df.to_parquet(p_all, index=False)
print("✓ Updated:", p_all)
print(df[["SEQN","SDDSRVYR","CIDI_SCORE_RAW","CIDI_12M_MDE","PHQ9","PHQ9_GE10"]].head(12))
print("\nCounts — CIDI_12M_MDE:", df["CIDI_12M_MDE"].value_counts(dropna=False))


✓ Updated: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_all_1999_2023.parquet
    SEQN  SDDSRVYR  CIDI_SCORE_RAW  CIDI_12M_MDE  PHQ9  PHQ9_GE10
0      1       1.0             NaN          <NA>   NaN       <NA>
1      2       1.0             NaN          <NA>   NaN       <NA>
2      3       1.0             NaN          <NA>   NaN       <NA>
3      4       1.0             NaN          <NA>   NaN       <NA>
4      5       1.0             NaN          <NA>   NaN       <NA>
5      6       1.0             NaN          <NA>   NaN       <NA>
6      7       1.0             NaN          <NA>   NaN       <NA>
7      8       1.0             NaN          <NA>   NaN       <NA>
8      9       1.0             NaN          <NA>   NaN       <NA>
9     10       1.0             NaN          <NA>   NaN       <NA>
10    11       1.0             NaN          <NA>   NaN       <NA>
11    12       1.0             5.0             0   NaN       <NA>

Counts — CIDI_12M_MDE: CIDI_12M_MDE
<NA>   

## sanity check 

In [9]:
# 1) Pre-2005 sample actually used for CIDI
pre = (df["SDDSRVYR"] <= 3) & (df["AGE_YR"].between(20, 39))  # 1999–2004 & eligible age
print("CIDI non-missing (pre, 20–39):", df.loc[pre, "CIDI_12M_MDE"].notna().sum())  # expect ~2217

# 2) Unweighted CIDI prevalence within the eligible group only
cidi_unw = df.loc[pre, "CIDI_12M_MDE"].mean()
print("CIDI unweighted (pre, 20–39):", cidi_unw)  # should be ~0.067–0.08 depending on exact rows

# 3) PHQ-9 weighted (2005+) — you already have this
post = df["SDDSRVYR"] >= 4
phq_w = (df.loc[post, "PHQ9_GE10"] * df.loc[post, "WTMEC2YR"]).sum() / df.loc[post, "WTMEC2YR"].sum()
print("PHQ-9≥10 weighted (2005+):", phq_w)


CIDI non-missing (pre, 20–39): 2217
CIDI unweighted (pre, 20–39): 0.06675687866486242
PHQ-9≥10 weighted (2005+): 0.0576415502044662


In [10]:
pre_elig = (df["SDDSRVYR"] <= 3) & (df["AGE_YR"].between(20, 39)) & df["WTSCI2YR"].notna()
w_adj = df.loc[pre_elig, "WTSCI2YR"] / 3  # time-average across 3 cycles
p_cidi = (df.loc[pre_elig, "CIDI_12M_MDE"] * w_adj).sum() / w_adj.sum()
print("CIDI weighted (1999–2004 pooled, adj weight):", p_cidi)


CIDI weighted (1999–2004 pooled, adj weight): 0.07752866929727856


#### depression rate compare by cycle 

In [11]:
# Apples-to-apples PHQ-9 (2005+, age 20–39)
post = (df["SDDSRVYR"] >= 4) & df["AGE_YR"].between(20, 39)
phq_2039_w = (df.loc[post, "PHQ9_GE10"] * df.loc[post, "WTMEC2YR"]).sum() / df.loc[post, "WTMEC2YR"].sum()
print("PHQ-9≥10 weighted (2005+, age 20–39):", phq_2039_w)

# If you also want to mirror your “covered adults” restriction:
post_cov = post & (df["MORTALITY_COVERED"] == True)
phq_2039_cov_w = (df.loc[post_cov, "PHQ9_GE10"] * df.loc[post_cov, "WTMEC2YR"]).sum() / df.loc[post_cov, "WTMEC2YR"].sum()
print("PHQ-9≥10 weighted (2005+, age 20–39, covered):", phq_2039_cov_w)

# Optional: by cycle (sanity check trends in 20–39)
out = []
for cyc, g in df.loc[post, ["SDDSRVYR","PHQ9_GE10","WTMEC2YR"]].dropna(subset=["WTMEC2YR"]).groupby("SDDSRVYR"):
    p = (g["PHQ9_GE10"] * g["WTMEC2YR"]).sum() / g["WTMEC2YR"].sum()
    out.append((cyc, p))
print("PHQ-9≥10 weighted by cycle (20–39):", sorted(out))


PHQ-9≥10 weighted (2005+, age 20–39): 0.07717320746283271
PHQ-9≥10 weighted (2005+, age 20–39, covered): 0.06733801258650958
PHQ-9≥10 weighted by cycle (20–39): [(4.0, np.float64(0.043351055124361854)), (5.0, np.float64(0.07132746447484092)), (6.0, np.float64(0.06703749906287784)), (7.0, np.float64(0.06444612554265457)), (8.0, np.float64(0.07100743090976429)), (9.0, np.float64(0.06916572642652297)), (10.0, np.float64(0.08348482248275887)), (12.0, np.float64(0.14263772083081486))]


#### exclude pandamic years

In [12]:
# Pool 2005–2018 only (cycles 4..10), age 20–39
post_0518 = (df["SDDSRVYR"].between(4, 10)) & df["AGE_YR"].between(20, 39)
w = df.loc[post_0518, "WTMEC2YR"]
phq_0518_w = (df.loc[post_0518, "PHQ9_GE10"] * w).sum() / w.sum()
print("PHQ-9≥10 weighted (2005–2018, age 20–39):", phq_0518_w)

# Optional: also compute “covered” version
post_0518_cov = post_0518 & (df["MORTALITY_COVERED"] == True)
w_cov = df.loc[post_0518_cov, "WTMEC2YR"]
phq_0518_cov_w = (df.loc[post_0518_cov, "PHQ9_GE10"] * w_cov).sum() / w_cov.sum()
print("PHQ-9≥10 weighted (2005–2018, 20–39, covered):", phq_0518_cov_w)


PHQ-9≥10 weighted (2005–2018, age 20–39): 0.06733801258650958
PHQ-9≥10 weighted (2005–2018, 20–39, covered): 0.06733801258650958


#### check missingness post 2005

In [14]:
post_0518 = df["SDDSRVYR"].between(4, 10) & df["AGE_YR"].between(20, 39)

# How many are "covered" in that slice?
n_all   = post_0518.sum()
n_cov   = (post_0518 & (df["MORTALITY_COVERED"] == True)).sum()
print("2005–2018, 20–39 — rows:", n_all, " | covered:", n_cov, f"({n_cov/n_all:.3%})")

# PHQ-9 non-missing rates (all vs covered)
phq_all_nonmiss = df.loc[post_0518, "PHQ9"].notna().mean()
phq_cov_nonmiss = df.loc[post_0518 & (df["MORTALITY_COVERED"] == True), "PHQ9"].notna().mean()
print("PHQ-9 non-missing — all:", round(phq_all_nonmiss,3), "| covered:", round(phq_cov_nonmiss,3))

# Compare weights sum (all vs covered)
w_all = df.loc[post_0518, "WTMEC2YR"].sum()
w_cov = df.loc[post_0518 & (df["MORTALITY_COVERED"] == True), "WTMEC2YR"].sum()
print("Weight sum — all:", w_all, "| covered:", w_cov)

# Recompute weighted prevalence both ways (should match if coverage doesn’t drop any)
mask_cov = post_0518 & (df["MORTALITY_COVERED"] == True)
prev_all = (df.loc[post_0518, "PHQ9_GE10"] * df.loc[post_0518, "WTMEC2YR"]).sum() / df.loc[post_0518, "WTMEC2YR"].sum()
prev_cov = (df.loc[mask_cov,  "PHQ9_GE10"] * df.loc[mask_cov,  "WTMEC2YR"]).sum() / df.loc[mask_cov,  "WTMEC2YR"].sum()
print("PHQ-9≥10 weighted — all:", prev_all, "| covered:", prev_cov)


2005–2018, 20–39 — rows: 13467  | covered: 13467 (100.000%)
PHQ-9 non-missing — all: 0.852 | covered: 0.852
Weight sum — all: 577082510.263677 | covered: 577082510.263677
PHQ-9≥10 weighted — all: 0.06733801258650958 | covered: 0.06733801258650958


#### check missingness pre 2005

In [15]:
from pathlib import Path
import pandas as pd
import numpy as np

# Load your main table (or skip if df already in memory)
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
p_all = OUT / "cov_core_mort_dep_all_1999_2023.parquet"
df = pd.read_parquet(p_all)

# --- Filter: pre-2005 cycles (1–3) and CIDI-eligible ages (20–39) ---
pre = df["SDDSRVYR"].between(1, 3) & df["AGE_YR"].between(20, 39)

# Convenience handles (safe even if column absent)
cidi_flag = pd.to_numeric(df.get("CIDI_12M_MDE"), errors="coerce")
cidi_raw  = pd.to_numeric(df.get("CIDI_SCORE_RAW"), errors="coerce")
w_pre     = pd.to_numeric(df.get("WTSCI2YR"), errors="coerce")

sub = df.loc[pre].copy()
sub["CIDI_FLAG_nonmiss"] = cidi_flag.loc[pre].notna().astype(float)
sub["CIDI_RAW_nonmiss"]  = cidi_raw.loc[pre].notna().astype(float)
sub["WTSCI_nonmiss"]     = w_pre.loc[pre].notna().astype(float)

# --- Overall missingness / coverage ---
n_rows = len(sub)
share_flag_nonmiss = sub["CIDI_FLAG_nonmiss"].mean()
share_raw_nonmiss  = sub["CIDI_RAW_nonmiss"].mean()
share_w_nonmiss    = sub["WTSCI_nonmiss"].mean()

print(f"1999–2004, age 20–39 — rows: {n_rows}")
print(f"CIDI_12M_MDE non-missing: {share_flag_nonmiss:.3f}  (missing = {1 - share_flag_nonmiss:.3f})")
print(f"CIDI_SCORE_RAW non-missing: {share_raw_nonmiss:.3f} (missing = {1 - share_raw_nonmiss:.3f})")
print(f"WTSCI2YR non-missing: {share_w_nonmiss:.3f}")

# --- By cycle (1=1999–2000, 2=2001–2002, 3=2003–2004) ---
by = (sub
      .assign(cyc=sub["SDDSRVYR"])
      .groupby("cyc")[["CIDI_FLAG_nonmiss","CIDI_RAW_nonmiss","WTSCI_nonmiss"]]
      .mean()
      .round(3))
print("\nNon-missing shares by cycle (pre-2005, age 20–39):")
print(by.rename(columns={
    "CIDI_FLAG_nonmiss": "CIDI_12M_MDE_nonmiss",
    "CIDI_RAW_nonmiss":  "CIDI_SCORE_RAW_nonmiss",
    "WTSCI_nonmiss":     "WTSCI2YR_nonmiss"
}))

# --- (Optional) Weighted prevalence using WTSCI2YR among non-missing ---
def wmean(x, w):
    m = x.notna() & w.notna() & (w > 0)
    return float((x[m]*w[m]).sum() / w[m].sum()) if m.any() else np.nan

w_prev_cidi = wmean(cidi_flag.loc[pre], w_pre.loc[pre])
print(f"\nCIDI 12-month MDE weighted prevalence (1999–2004, 20–39): {w_prev_cidi:.6f}")



1999–2004, age 20–39 — rows: 5362
CIDI_12M_MDE non-missing: 0.413  (missing = 0.587)
CIDI_SCORE_RAW non-missing: 0.413 (missing = 0.587)
WTSCI2YR non-missing: 0.423

Non-missing shares by cycle (pre-2005, age 20–39):
     CIDI_12M_MDE_nonmiss  CIDI_SCORE_RAW_nonmiss  WTSCI2YR_nonmiss
cyc                                                                
1.0                 0.421                   0.421             0.429
2.0                 0.422                   0.422             0.430
3.0                 0.397                   0.397             0.409

CIDI 12-month MDE weighted prevalence (1999–2004, 20–39): 0.079171


## final preview merged data 

In [30]:
# Preview the merged table (first 20 rows)

# If needed, install a parquet engine (uncomment one of these):
# %pip install pyarrow --quiet
# %pip install fastparquet --quiet

from pathlib import Path
import pandas as pd

p_all = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_all_1999_2023.parquet")
df = pd.read_parquet(p_all)  # uses pyarrow or fastparquet if installed

print(f"File: {p_all.name}")
print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]:,} columns")

# Pick useful columns if present
cols = [
    "SEQN","SDDSRVYR","AGE_YR","SEX",
    "CIDI_SCORE_RAW","CIDI_12M_MDE",
    "PHQ9","PHQ9_GE10",
    "DEP_SOURCE","DEP_HARMONIZED",
    "WTSCI2YR","WTMEC2YR"
]
show = [c for c in cols if c in df.columns]

display(df.loc[:, show].head(20))


File: cov_core_mort_dep_all_1999_2023.parquet
Shape: 128,809 rows × 56 columns


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,SEX,CIDI_SCORE_RAW,CIDI_12M_MDE,PHQ9,PHQ9_GE10,DEP_SOURCE,DEP_HARMONIZED,WTSCI2YR,WTMEC2YR
0,1,1.0,2.0,F,,,,,,,,10982.898896
1,2,1.0,77.0,M,,,,,,,,28325.384898
2,3,1.0,10.0,F,,,,,,,,46192.256945
3,4,1.0,1.0,M,,,,,,,,10251.26002
4,5,1.0,49.0,M,,,,,,,,99445.065735
5,6,1.0,19.0,F,,,,,,,,39656.600444
6,7,1.0,59.0,F,,,,,,,,25525.423409
7,8,1.0,13.0,M,,,,,,,,31510.587866
8,9,1.0,11.0,F,,,,,,,,7575.870247
9,10,1.0,43.0,M,,,,,,,,22445.808572
