## BOOTSTRAP

In [2]:
# --- BOOTSTRAP (place at the top of 04_depression_merge.ipynb) ---
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import pandas as pd, numpy as np

# Project base (adjust if your path differs)
BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")

@dataclass
class Config:
    raw_dir: Path = BASE / "data"
    interim_dir: Path = BASE / "data" / "cov"
    out_dir: Path = BASE / "output"

    # Preferred inputs (only used if present)
    demo_9923: Optional[Path] = BASE / "data" / "cov" / "demo9923.parquet"
    demo_9918: Optional[Path] = None

    # Depression inputs/outputs (new)
    dpq_9923: Optional[Path] = None
    cov_dep: str = "cov_dep_1999_2023.parquet"

CONFIG = Config()

def ensure_dir(p: Path | str) -> None:
    Path(p).mkdir(parents=True, exist_ok=True)

def upper_df(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d.columns = [c.upper() for c in d.columns]
    return d

def pick_first_existing(*cands: Optional[Path]) -> Optional[Path]:
    for c in cands:
        if c and Path(c).exists():
            return Path(c)
    return None


In [4]:
def log(msg: str) -> None:
    print(msg, flush=True)

## Depression (DPQ/PHQ-9) → merge into cm

In [13]:
# =========================
# 04 — Depression (DPQ/PHQ-9) → merge into cm
# =========================

from pathlib import Path
import pandas as pd, numpy as np

# --- Make sure CONFIG has these (backwards-compatible add) ---
if not hasattr(CONFIG, "dpq_9923"):
    CONFIG.dpq_9923 = None
if not hasattr(CONFIG, "cov_dep"):
    CONFIG.cov_dep = "cov_dep_1999_2023.parquet"

def _read_any(p: Path) -> pd.DataFrame:
    p = Path(p)
    return pd.read_parquet(p) if p.suffix.lower()==".parquet" else pd.read_csv(p, low_memory=False)

def _download(url: str, dest: Path, timeout=90):
    import requests
    dest.parent.mkdir(parents=True, exist_ok=True)
    headers = {"User-Agent": "nhanes-fetch/1.0"}
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        tmp = dest.with_suffix(dest.suffix + ".downloading")
        with open(tmp, "wb") as f:
            for chunk in r.iter_content(1<<15):
                if chunk: f.write(chunk)
        tmp.rename(dest)
    return dest

def _read_xpt(p: Path) -> pd.DataFrame:
    try:
        import pyreadstat
        df, _ = pyreadstat.read_xport(p)
    except Exception:
        df = pd.read_sas(p, format="xport")
    df.columns = [c.upper() for c in df.columns]
    return df

def _download_and_stack_dpq(cfg: Config) -> pd.DataFrame:
    # DPQ is available starting 2005–2006; nothing for 1999–2004.
    DPQ_URLS = {
        "2005-2006": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DPQ_D.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/DPQ_D.XPT",
        ],
        "2007-2008": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DPQ_E.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/DPQ_E.XPT",
        ],
        "2009-2010": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/DPQ_F.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/DPQ_F.XPT",
        ],
        "2011-2012": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/DPQ_G.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DPQ_G.XPT",
        ],
        "2013-2014": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DPQ_H.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.XPT",
        ],
        "2015-2016": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DPQ_I.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DPQ_I.XPT",
        ],
        "2017-2018": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DPQ_J.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DPQ_J.XPT",
        ],
        "2017-March 2020 (pre-pandemic)": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DPQ.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_DPQ.XPT",
        ],
        "August 2021–August 2023": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DPQ_L.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/DPQ_L.XPT",
        ],
    }
    store = Path(cfg.interim_dir) / "dpq"
    parts = []
    for cyc, urls in DPQ_URLS.items():
        got = None
        for u in urls:
            try:
                dst = store / Path(u).name
                if not dst.exists():
                    print(f"⬇️ DPQ {cyc} → {dst.name}")
                    _download(u, dst)
                got = dst; break
            except Exception as e:
                print("  ⚠️", e)
        if got is None: 
            continue
        df = _read_xpt(got)
        df["CYCLE"] = cyc
        parts.append(df)
    if not parts:
        raise RuntimeError("No DPQ data available (download failed).")
    dpq = pd.concat(parts, ignore_index=True)
    stacked = Path(cfg.interim_dir) / "dpq_9923.parquet"
    stacked.parent.mkdir(parents=True, exist_ok=True)
    dpq.to_parquet(stacked, index=False)
    return dpq

def _clean_dpq_score(dpq: pd.DataFrame) -> pd.DataFrame:
    d = dpq.copy()
    d.columns = [c.upper() for c in d.columns]
    items = [f"DPQ0{i:02d}" for i in range(10, 100, 10)]  # DPQ010..DPQ090
    items = [c for c in items if c in d.columns]

    # numeric + mask NHANES missing (7/9 etc.)
    for c in items:
        d[c] = pd.to_numeric(d[c], errors="coerce")
        d.loc[d[c].isin([7, 9, 77, 99]), c] = np.nan

    # count answered items; require >=7 to score (common convention)
    answered = d[items].notna().sum(axis=1)
    phq9 = d[items].sum(axis=1, min_count=1)  # sum if at least 1 non-na
    phq9 = phq9.where(answered >= 7)          # else NA

    # categories (standard PHQ-9)
    # 0–4 none/minimal, 5–9 mild, 10–14 moderate, 15–19 mod-severe, 20–27 severe
    cat = pd.cut(
        phq9,
        bins=[-0.1, 4, 9, 14, 19, 27],
        labels=["NONE/MIN", "MILD", "MOD", "MOD-SEV", "SEV"],
        include_lowest=True,
        right=True
    ).astype("category")

    out = pd.DataFrame({
        "SEQN": pd.to_numeric(d["SEQN"], errors="coerce").astype("Int64"),
        "PHQ9": phq9.astype("float"),
        "PHQ9_GE10": (phq9 >= 10).astype("Int8"),
        "DPQ_CAT": cat
    })
    # IMP: 1 if <7 items answered (scored NA), else 0
    out["IMP"] = (answered < 7).astype("Int8")
    return out.dropna(subset=["SEQN"]).drop_duplicates("SEQN")

def build_dep(cfg: Config = CONFIG, allow_fetch: bool = True) -> pd.DataFrame:
    """Build depression covariates from DPQ; writes cov_dep_1999_2023.parquet."""
    ensure_dir(cfg.out_dir)
    outp = Path(cfg.out_dir) / cfg.cov_dep
    if outp.exists():
        return pd.read_parquet(outp)

    dpq_path = pick_first_existing(
        cfg.dpq_9923,
        Path(cfg.interim_dir) / "dpq_9923.parquet",
        Path(cfg.interim_dir) / "dpq_9923.csv",
    )
    if dpq_path:
        dpq = _read_any(dpq_path)
    else:
        if not allow_fetch:
            raise FileNotFoundError("DPQ stack not found; set allow_fetch=True to download from CDC.")
        dpq = _download_and_stack_dpq(cfg)

    dep = _clean_dpq_score(dpq)
    dep.to_parquet(outp, index=False)
    log(f"✓ DEP → {outp}  (rows: {len(dep):,}, PHQ9≥10: {int(dep['PHQ9_GE10'].sum()):,})")
    return dep

# ---- Build depression & merge to your current cm
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
cm_path = OUT / "cov_core_mort_1999_2023_flags.parquet"
assert cm_path.exists(), "Expected cm flags file not found."

cm = pd.read_parquet(cm_path)
dep = build_dep(CONFIG)  # may download DPQ if not stacked

# Keep a tidy set of dep columns
dep_small = dep[["SEQN","PHQ9","PHQ9_GE10","DPQ_CAT","IMP"]].rename(columns={
    "IMP": "DEP_IMP"
})

cm_dep = cm.merge(dep_small, on="SEQN", how="left")

# Save merged datasets
cm_dep_path = OUT / "cov_core_mort_dep_1999_2023.parquet"
cm_dep.to_parquet(cm_dep_path, index=False)
print("✓ Merged cm + DEP →", cm_dep_path)

# Quick peek / QA
show = ["SEQN","SDDSRVYR","AGE_YR","SEX","PHQ9","DPQ_CAT","PHQ9_GE10","DEP_IMP","MORTALITY_COVERED","EVENT", "WTMEC2YR"]
present = [c for c in show if c in cm_dep.columns]
display(cm_dep.tail(20)[present])

# Coverage stats (adults <=2018)
mask = cm_dep["MORTALITY_COVERED"]==True
print("PHQ9 missing (covered adults):", cm_dep.loc[mask,"PHQ9"].isna().mean().round(3))
print("PHQ9>=10 prevalence (covered adults):", cm_dep.loc[mask,"PHQ9_GE10"].mean().round(3))


✓ Merged cm + DEP → /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_1999_2023.parquet


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,SEX,PHQ9,DPQ_CAT,PHQ9_GE10,DEP_IMP,MORTALITY_COVERED,EVENT,WTMEC2YR
128789,142291,12.0,3.0,M,,,,,False,,5.397605e-79
128790,142292,12.0,10.0,M,,,,,False,,20847.19
128791,142293,12.0,44.0,M,,,0.0,1.0,False,,76811.76
128792,142294,12.0,80.0,F,,,,,False,,5.397605e-79
128793,142295,12.0,80.0,F,2.0,NONE/MIN,0.0,0.0,False,,42736.48
128794,142296,12.0,1.0,M,,,,,False,,5.397605e-79
128795,142297,12.0,76.0,M,,,,,False,,5.397605e-79
128796,142298,12.0,60.0,M,1.0,NONE/MIN,0.0,0.0,False,,47208.2
128797,142299,12.0,33.0,M,0.0,NONE/MIN,0.0,0.0,False,,162250.2
128798,142300,12.0,46.0,F,3.0,NONE/MIN,0.0,0.0,False,,30517.13


PHQ9 missing (covered adults): 0.385
PHQ9>=10 prevalence (covered adults): 0.079


## quick sanity 

In [14]:
# 1) PHQ9 completeness by cycle (you'll see 1999–2004 are all NA)
print(cm_dep.groupby("SDDSRVYR")["PHQ9"].apply(lambda s: 1 - s.isna().mean()).round(3))

# 2) Recompute completeness & prevalence only for cycles with DPQ (2005+)
mask_dpq_cycles = cm_dep["SDDSRVYR"] >= 4   # 2005–2006 is 4
mask_cov = cm_dep["MORTALITY_COVERED"] == True
sub = cm_dep.loc[mask_cov & mask_dpq_cycles]

print("PHQ9 missing (covered adults, 2005+):", sub["PHQ9"].isna().mean().round(3))
print("PHQ9>=10 prevalence (covered adults, 2005+):", sub["PHQ9_GE10"].mean().round(3))

# 3) Weighted prevalence (recommended)
w = sub["WTMEC2YR"]
print("Weighted PHQ9>=10 prevalence (2005+):",
      (sub["PHQ9_GE10"] * w).sum() / w.sum())


SDDSRVYR
1.0     0.000
2.0     0.000
3.0     0.000
4.0     0.467
5.0     0.536
6.0     0.528
7.0     0.507
8.0     0.530
9.0     0.517
10.0    0.550
12.0    0.461
66.0    0.533
Name: PHQ9, dtype: float64
PHQ9 missing (covered adults, 2005+): 0.136
PHQ9>=10 prevalence (covered adults, 2005+): 0.079
Weighted PHQ9>=10 prevalence (2005+): 0.07060832861475243


In [15]:
from pathlib import Path
import pandas as pd
import numpy as np

OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
p = OUT / "cov_core_mort_dep_1999_2023.parquet"

cm = pd.read_parquet(p)

# -- 1) Normalize weights: treat tiny float noise as zero, ensure nonnegative
if "WTMEC2YR" in cm.columns:
    cm["WTMEC2YR"] = pd.to_numeric(cm["WTMEC2YR"], errors="coerce")
    cm.loc[cm["WTMEC2YR"].abs() < 1e-20, "WTMEC2YR"] = 0.0
    cm.loc[cm["WTMEC2YR"] < 0, "WTMEC2YR"] = np.nan  # just in case
    print("WTMEC2YR > 0 rows:", int((cm["WTMEC2YR"] > 0).sum()))

# -- 2) Enforce RIAGENDR / SEX / FEMALE consistency
if "RIAGENDR" in cm.columns:
    cm["RIAGENDR"] = pd.to_numeric(cm["RIAGENDR"], errors="coerce").astype("Int8")
    cm["SEX"] = cm["RIAGENDR"].map({1: "M", 2: "F"}).astype("string")
    cm["FEMALE"] = (cm["RIAGENDR"] == 2).astype("Int8")

# -- 3) Quick sanity display in a fixed order to avoid wrap confusion
show_cols = [
    "SEQN","SDDSRVYR","AGE_YR","RIAGENDR","SEX",
    "PHQ9","DPQ_CAT","PHQ9_GE10","DEP_IMP",
    "MORTALITY_COVERED","EVENT","WTMEC2YR"
]
show_cols = [c for c in show_cols if c in cm.columns]
display(cm.loc[:20, show_cols])

# (Optional) Save back
# cm.to_parquet(p, index=False)
# print("✓ Cleaned and saved:", p)


WTMEC2YR > 0 rows: 105626


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,RIAGENDR,SEX,PHQ9,DPQ_CAT,PHQ9_GE10,DEP_IMP,MORTALITY_COVERED,EVENT,WTMEC2YR
0,1,1.0,2.0,2,F,,,,,False,,10982.898896
1,2,1.0,77.0,1,M,,,,,True,1.0,28325.384898
2,3,1.0,10.0,2,F,,,,,False,,46192.256945
3,4,1.0,1.0,1,M,,,,,False,,10251.26002
4,5,1.0,49.0,1,M,,,,,True,0.0,99445.065735
5,6,1.0,19.0,2,F,,,,,True,0.0,39656.600444
6,7,1.0,59.0,2,F,,,,,True,0.0,25525.423409
7,8,1.0,13.0,1,M,,,,,False,,31510.587866
8,9,1.0,11.0,2,F,,,,,False,,7575.870247
9,10,1.0,43.0,1,M,,,,,True,1.0,22445.808572


## add 99-04

In [20]:
# --- REPLACE your CIQDEP 99–04 fetch/stack with this ---

import re, pandas as pd, numpy as np
from pathlib import Path

def _read_xpt(p: Path) -> pd.DataFrame:
    p = Path(p)
    try:
        import pyreadstat
        df, _ = pyreadstat.read_xport(p)
    except Exception:
        df = pd.read_sas(p, format="xport")
    df.columns = [c.upper() for c in df.columns]
    return df

def _download(url: str, dest: Path, timeout=90):
    import requests, time
    dest.parent.mkdir(parents=True, exist_ok=True)
    headers = {"User-Agent": "nhanes-fetch/1.0"}
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        tmp = dest.with_suffix(dest.suffix + ".downloading")
        with open(tmp, "wb") as f:
            for chunk in r.iter_content(1<<15):
                if chunk: f.write(chunk)
        tmp.rename(dest)
    return dest

def _get_first_working(urls, store: Path) -> Path | None:
    for u in urls:
        try:
            dst = store / Path(u).name
            if not dst.exists():
                print(f"⬇️  trying {Path(u).name}")
                _download(u, dst)
            return dst
        except Exception as e:
            print("  ⚠️", e)
    return None

def _stack_ciqdep_9904(cfg) -> pd.DataFrame:
    """
    Stack CIDI depression for 1999–2004 using multiple fallback URLs.
    1999–2000 file is CIQMDEP.xpt (not CIQDEP_A).
    """
    store = Path(cfg.interim_dir) / "dep9904"
    store.mkdir(parents=True, exist_ok=True)

    CANDIDATES = {
        "1999-2000": [
            # your working link (preferred)
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/CIQMDEP.xpt",
            # alternate patterns
            "https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/CIQDEP_A.XPT",
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/CIQDEP_A.xpt",
        ],
        "2001-2002": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/CIQDEP_B.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/CIQDEP_B.XPT",
        ],
        "2003-2004": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/CIQDEP_C.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/CIQDEP_C.XPT",
        ],
    }

    parts = []
    for cyc, urls in CANDIDATES.items():
        got = _get_first_working(urls, store)
        if got is None:
            print(f"  ❌ skipped {cyc} (no URL worked)")
            continue
        df = _read_xpt(got)
        df["CYCLE"] = cyc
        parts.append(df)

    if not parts:
        raise RuntimeError("No CIQDEP 1999–2004 files available (all URLs failed).")
    return pd.concat(parts, ignore_index=True)

#### HERE IS problem 🔥🔥🔥🔥🔥!!!! 

def build_dep_9904(cfg, allow_fetch=True) -> pd.DataFrame:
    """
    Returns: SEQN, CIDI_SCORE, CIDI_MDE_12MO (if present), WTSCI2YR (if present)
    Tries to be robust to column name differences across cycles.
    """
    pq = Path(cfg.interim_dir) / "ciqdep_9904.parquet"
    if pq.exists():
        raw = pd.read_parquet(pq)
    else:
        if not allow_fetch:
            raise FileNotFoundError("ciqdep_9904.parquet not found; set allow_fetch=True to download.")
        raw = _stack_ciqdep_9904(cfg)
        raw.to_parquet(pq, index=False)

    d = raw.copy()
    d.columns = [c.upper() for c in d.columns]

    out = d[["SEQN"]].drop_duplicates().copy()

    # --- score: prefer CIDDSCOR; otherwise any "*SCOR*" with CIDI/CIDD/DEP in name
    score_col = None
    if "CIDDSCOR" in d.columns:
        score_col = "CIDDSCOR"
    else:
        cand = [c for c in d.columns if ("SCOR" in c) and (("CIDD" in c) or ("CIDI" in c) or ("DEP" in c))]
        if cand: score_col = cand[0]
    if score_col:
        out["CIDI_SCORE"] = pd.to_numeric(d.set_index("SEQN")[score_col], errors="coerce").reindex(out["SEQN"]).values

    # --- diagnosis flag: try to find something like CIDI/CIDD + MDE/DEP that is binary-ish
    diag_col = None
    pref = [c for c in d.columns if ("MDE" in c) and (("CIDD" in c) or ("CIDI" in c))]
    if pref:
        diag_col = pref[0]
    else:
        pref = [c for c in d.columns if (("DEP" in c) and (("CIDD" in c) or ("CIDI" in c))) and ("SCOR" not in c)]
        if pref:
            diag_col = pref[0]
    if diag_col:
        dc = pd.to_numeric(d.set_index("SEQN")[diag_col], errors="coerce").reindex(out["SEQN"])
        out["CIDI_MDE_12MO"] = np.where(dc.isna(), pd.NA, (dc == 1).astype("Int8"))

    # --- subsample weight: any WTSC*2YR gets normalized to WTSCI2YR
    w_cols = [c for c in d.columns if re.match(r"WTSC.*2YR", c, flags=re.I)]
    if w_cols:
        w = pd.to_numeric(d.set_index("SEQN")[w_cols[0]], errors="coerce").reindex(out["SEQN"])
        out["WTSCI2YR"] = w.values

    return out


In [35]:
# --- REPLACE your CIQDEP 99–04 fetch/stack with this ---

import re, pandas as pd, numpy as np
from pathlib import Path

def _read_xpt(p: Path) -> pd.DataFrame:
    p = Path(p)
    try:
        import pyreadstat
        df, _ = pyreadstat.read_xport(p)
    except Exception:
        df = pd.read_sas(p, format="xport")
    df.columns = [c.upper() for c in df.columns]
    return df

def _download(url: str, dest: Path, timeout=90):
    import requests, time
    dest.parent.mkdir(parents=True, exist_ok=True)
    headers = {"User-Agent": "nhanes-fetch/1.0"}
    with requests.get(url, headers=headers, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        tmp = dest.with_suffix(dest.suffix + ".downloading")
        with open(tmp, "wb") as f:
            for chunk in r.iter_content(1<<15):
                if chunk: f.write(chunk)
        tmp.rename(dest)
    return dest

def _get_first_working(urls, store: Path) -> Path | None:
    for u in urls:
        try:
            dst = store / Path(u).name
            if not dst.exists():
                print(f"⬇️  trying {Path(u).name}")
                _download(u, dst)
            return dst
        except Exception as e:
            print("  ⚠️", e)
    return None

def _stack_ciqdep_9904(cfg) -> pd.DataFrame:
    """
    Stack CIDI depression for 1999–2004 using multiple fallback URLs.
    1999–2000 file is CIQMDEP.xpt (not CIQDEP_A).
    """
    store = Path(cfg.interim_dir) / "dep9904"
    store.mkdir(parents=True, exist_ok=True)

    CANDIDATES = {
        "1999-2000": [
            # your working link (preferred)
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/CIQMDEP.xpt",
            # alternate patterns
            "https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/CIQDEP_A.XPT",
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/CIQDEP_A.xpt",
        ],
        "2001-2002": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/CIQDEP_B.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/CIQDEP_B.XPT",
        ],
        "2003-2004": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/CIQDEP_C.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/CIQDEP_C.XPT",
        ],
    }

    parts = []
    for cyc, urls in CANDIDATES.items():
        got = _get_first_working(urls, store)
        if got is None:
            print(f"  ❌ skipped {cyc} (no URL worked)")
            continue
        df = _read_xpt(got)
        df["CYCLE"] = cyc
        parts.append(df)

    if not parts:
        raise RuntimeError("No CIQDEP 1999–2004 files available (all URLs failed).")
    return pd.concat(parts, ignore_index=True)

def build_dep_9904(cfg, allow_fetch=True) -> pd.DataFrame:
    """
    Build CIDI depression for 1999–2004.

    Returns columns:
      SEQN, CIDI_SCORE (sum of CIQD001–CIQD009), CIDI_DEP_FLAG (>=cut), WTSCI2YR (if present)
    """
    import re

    pq = Path(cfg.interim_dir) / "ciqdep_9904.parquet"
    if pq.exists():
        d = pd.read_parquet(pq)
    else:
        if not allow_fetch:
            raise FileNotFoundError("ciqdep_9904.parquet not found; set allow_fetch=True to download.")
        d = _stack_ciqdep_9904(cfg)
        d.to_parquet(pq, index=False)

    d.columns = [c.upper() for c in d.columns]
    out = d[["SEQN"]].drop_duplicates().copy()

    # --- Prefer constructing a symptom score from CIQD001..CIQD009 ---
    ciqd_cols = sorted([c for c in d.columns if re.fullmatch(r"CIQD\d{3}", c)])
    if ciqd_cols:
        dd = d.copy()
        # coerce items to binary 0/1 (handle 1=yes, 2=no patterns)
        for c in ciqd_cols:
            x = pd.to_numeric(dd[c], errors="coerce")
            uniq = set(pd.Series(x.dropna().unique()))
            if uniq.issubset({0, 1}):
                dd[c] = x.astype("Int8")
            else:
                # typical CIDI coding: 1 = yes, 2 = no
                dd[c] = (x == 1).astype("Int8")
        d["CIDI_SCORE"] = dd[ciqd_cols].sum(axis=1, min_count=1)
    else:
        # --- Fallback: try any pre-computed score if CIQD* are missing ---
        score_col = None
        if "CIDDSCOR" in d.columns:
            score_col = "CIDDSCOR"
        else:
            cand = [c for c in d.columns
                    if ("SCOR" in c) and (("CIDD" in c) or ("CIDI" in c) or ("DEP" in c))]
            if cand:
                score_col = cand[0]
        d["CIDI_SCORE"] = pd.to_numeric(d.get(score_col), errors="coerce") if score_col else pd.NA

    # --- Binary flag with your chosen cutoff (default ≥5) ---
    cut = 5
    out["CIDI_SCORE"]   = pd.to_numeric(d.set_index("SEQN")["CIDI_SCORE"], errors="coerce").reindex(out["SEQN"]).values
    out["CIDI_DEP_FLAG"] = (out["CIDI_SCORE"] >= cut).astype("Int8")

    # --- Weight: any WTSC*2YR → WTSCI2YR ---
    w_cols = [c for c in d.columns if re.match(r"WTSC.*2YR", c, flags=re.I)]
    if w_cols:
        w = pd.to_numeric(d.set_index("SEQN")[w_cols[0]], errors="coerce").reindex(out["SEQN"])
        out["WTSCI2YR"] = w.values

    return out



In [36]:
# ---- Quick diagnostic for CIQDEP 1999–2004 build ----
from pathlib import Path
import pandas as pd

# 0) Make sure CONFIG is in scope
try:
    _ = CONFIG
except NameError:
    raise RuntimeError("CONFIG is not defined in this notebook. Re-run the cell that defines Config/CONFIG.")

# 1) See what's in the local store (helps if downloads were flaky)
store = Path(CONFIG.interim_dir) / "dep9904"
print("Store folder:", store)
print("Exists:", store.exists())
print("Files:", [p.name for p in store.glob("*.xpt")])

# 2) Build/Load the 99–04 depression stack
dep9904 = build_dep_9904(CONFIG, allow_fetch=True)

print("\nStack summary")
print("Rows:", len(dep9904))
print("Columns:", dep9904.columns.tolist())

# 3) Preview
display(dep9904.head(20))

# 4) Quick coverage checks
for col in ["CIDI_SCORE","CIDI_MDE_12MO","WTSCI2YR"]:
    if col in dep9904.columns:
        print(f"{col} non-missing:", dep9904[col].notna().sum())


Store folder: /Users/dengshuyue/Desktop/SDOH/analysis/data/cov/dep9904
Exists: True
Files: ['CIQDEP_B.xpt', 'CIQDEP_C.xpt', 'CIQMDEP.xpt']

Stack summary
Rows: 2556
Columns: ['SEQN', 'CIDI_SCORE', 'CIDI_DEP_FLAG', 'WTSCI2YR']


Unnamed: 0,SEQN,CIDI_SCORE,CIDI_DEP_FLAG,WTSCI2YR
0,12.0,0,0,186078.314366
1,20.0,0,0,32784.75114
2,34.0,19,1,50386.248286
3,66.0,0,0,54383.039478
4,69.0,0,0,78866.089966
5,81.0,0,0,176135.722403
6,97.0,0,0,86334.258174
7,107.0,0,0,37883.834696
8,120.0,0,0,186788.235958
9,143.0,1,0,4969.041484


CIDI_SCORE non-missing: 2556
WTSCI2YR non-missing: 2266


## add depression 99-04 to 05-18

In [40]:
from pathlib import Path
import numpy as np, pandas as pd

# --- Paths ---
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
cm_path = OUT / "cov_core_mort_dep_1999_2023.parquet"     # PHQ9 + mort (your existing file)
dep9904_path = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/cov/ciqdep_9904.parquet")

# --- Load ---
cm = pd.read_parquet(cm_path)
dep9904 = pd.read_parquet(dep9904_path).rename(columns=str.upper)
dep9904 = dep9904.replace({r'^\s*$': np.nan, r'^(<NA>|NA|NaN)$': np.nan}, regex=True)

# --- Weight column (1999–2004 CIQDEP subsample weight) ---
if "WTSCI2YR" in dep9904.columns:
    w_col = "WTSCI2YR"
else:
    w_col = next((c for c in dep9904.columns
                  if isinstance(c, str) and c.startswith("WTS") and c.endswith("2YR")), None)

# --- Helpers to pick score/flag columns robustly ---
EXCLUDE = {"CIAORDER", "CIASEQ", "SECTION", "COMMENT", "NOTE"}

def pick_binary_flag(df: pd.DataFrame) -> str | None:
    # Look for existing CIDI/CIDD/DEP flags that are (mostly) 0/1 and not obviously scores
    candidates = [c for c in df.columns if isinstance(c, str) and c not in EXCLUDE]
    # Prefer names with 'FLAG' or 'MDE' and avoid 'SCOR'
    prioritised = [c for c in candidates
                   if (("CIDI" in c or "CIDD" in c or "DEP" in c) and
                       ("FLAG" in c or "MDE" in c or "DIAG" in c) and
                       ("SCOR" not in c) and c not in {"SEQN", "SDDSRVYR", w_col})]
    # If none, allow simpler names like 'CIDI'/'DEP' (still avoiding 'SCOR')
    if not prioritised:
        prioritised = [c for c in candidates
                       if (("CIDI" in c or "CIDD" in c or "DEP" in c) and
                           ("SCOR" not in c) and c not in {"SEQN", "SDDSRVYR", w_col})]
    # Validate by checking values are essentially binary
    for c in prioritised:
        x = pd.to_numeric(df[c], errors="coerce")
        u = set(x.dropna().unique().tolist())
        if u.issubset({0, 1}) or u.issubset({0.0, 1.0}) or u.issubset({1}) or u.issubset({0}):
            return c
    return None

def pick_score_column(df: pd.DataFrame) -> str | None:
    # Prefer columns with SCORE/SCOR in name; else most informative numeric CI(D)I/DEP col
    candidates = [c for c in df.columns if isinstance(c, str) and c not in EXCLUDE]
    by_name = [c for c in candidates
               if (("SCORE" in c or "SCOR" in c or c == "CIDI_SCORE") and
                   c not in {"SEQN", "SDDSRVYR", w_col})]
    def info_tuple(col):
        x = pd.to_numeric(df[col], errors="coerce")
        return (x.notna().sum(), x.nunique(dropna=True))
    ranked = sorted(by_name, key=info_tuple, reverse=True)
    for c in ranked:
        if pd.to_numeric(df[c], errors="coerce").notna().any():
            return c
    # Fallback: most informative numeric CIDI/DEP column
    numericish = []
    for c in candidates:
        if c in {"SEQN"} or (w_col and c == w_col): 
            continue
        if not (("CIDI" in c or "CIDD" in c or "DEP" in c) or ("SCORE" in c or "SCOR" in c)):
            continue
        x = pd.to_numeric(df[c], errors="coerce")
        if x.notna().sum() > 0:
            numericish.append((c, x.notna().sum(), x.nunique(dropna=True)))
    numericish.sort(key=lambda t: (t[1], t[2]), reverse=True)
    return numericish[0][0] if numericish else None

# --- Normalize: prefer an existing binary flag; otherwise derive from score ---
bin_col = pick_binary_flag(dep9904)
score_col = pick_score_column(dep9904)

if score_col:
    dep9904["CIDI_SCORE"] = pd.to_numeric(dep9904[score_col], errors="coerce")
else:
    dep9904["CIDI_SCORE"] = pd.NA

if bin_col:
    dep9904["CIDI_DEP_FLAG"] = pd.to_numeric(dep9904[bin_col], errors="coerce").astype("Int8")
else:
    # Threshold: treat 0/1 as binary (thr=1). If counts/symptoms, use thr=5 (PHQ9-like).
    x = pd.to_numeric(dep9904["CIDI_SCORE"], errors="coerce")
    thr = 1 if (x.notna().any() and x.max() <= 1) else 5
    dep9904["CIDI_DEP_FLAG"] = pd.Series(pd.NA, dtype="Int8", index=dep9904.index)
    dep9904.loc[x.notna(), "CIDI_DEP_FLAG"] = (x.loc[x.notna()] >= thr).astype("Int8")

# --- Dedupe preferring rows with: (1) a flag, (2) a score, (3) a weight ---
dep9904["_has_flag"]  = dep9904["CIDI_DEP_FLAG"].notna().astype(int)
dep9904["_has_score"] = pd.to_numeric(dep9904["CIDI_SCORE"], errors="coerce").notna().astype(int)
dep9904["_has_wt"]    = (pd.to_numeric(dep9904[w_col], errors="coerce").notna().astype(int)
                         if w_col and w_col in dep9904.columns else 0)

dep9904_small = (
    dep9904.sort_values(["SEQN", "_has_flag", "_has_score", "_has_wt"],
                        ascending=[True, False, False, False])
           .drop_duplicates("SEQN")
           .drop(columns=["_has_flag", "_has_score", "_has_wt"])
)

# --- Merge into your core PHQ9+mort file and harmonize ---
keep_merge_cols = ["SEQN", "CIDI_SCORE", "CIDI_DEP_FLAG"] + ([w_col] if w_col else [])
cm2 = cm.merge(dep9904_small[[c for c in keep_merge_cols if c in dep9904_small.columns]],
               on="SEQN", how="left")

sdd = pd.to_numeric(cm2["SDDSRVYR"], errors="coerce")
mask_9904   = sdd <= 3    # 1999–2004 (CIDI)
mask_05plus = sdd >= 4    # 2005+     (PHQ-9)

phq = pd.to_numeric(cm2.get("PHQ9_GE10"), errors="coerce").astype("Int8")
has_phq = phq.notna()
has_cidi_any = cm2["CIDI_DEP_FLAG"].notna() | pd.to_numeric(cm2.get("CIDI_SCORE"), errors="coerce").notna()

cm2["DEP_HARMONIZED"] = pd.Series(pd.NA, dtype="Int8", index=cm2.index)
cm2.loc[mask_9904 & has_cidi_any, "DEP_HARMONIZED"] = cm2.loc[mask_9904 & has_cidi_any, "CIDI_DEP_FLAG"]
cm2.loc[mask_05plus & has_phq,    "DEP_HARMONIZED"] = phq[mask_05plus & has_phq]

cm2["DEP_SOURCE"] = pd.Series(pd.NA, dtype="string", index=cm2.index)
cm2.loc[mask_9904 & has_cidi_any, "DEP_SOURCE"] = "CIDI99-04"
phq_presence = cm2["PHQ9"].notna() if "PHQ9" in cm2.columns else cm2["PHQ9_GE10"].notna()
cm2.loc[mask_05plus & phq_presence, "DEP_SOURCE"] = "PHQ9_05plus"

# --- Save ---
out_all = OUT / "cov_core_mort_dep_all_1999_2023.parquet"
cm2.to_parquet(out_all, index=False)
print(f"✓ Wrote {out_all}  (rows: {len(cm2):,})")

# --- Diagnostics ---
def nn(dicols, label):
    return {c: cm2[c].notna().sum() for c in dicols if c in cm2.columns}

print("\nAuto-detected 1999–2004 columns:")
print("  Binary flag:", bin_col)
print("  Score col:  ", score_col)
print("Detected weight col:", w_col)

print("\nNon-missing counts after merge:",
      nn(["CIDI_SCORE","CIDI_DEP_FLAG", w_col if w_col else ""], "post-merge"))

print("\nCoverage by cycle (share DEP_HARMONIZED present):")
print(cm2.groupby("SDDSRVYR", dropna=False)["DEP_HARMONIZED"].apply(lambda s: s.notna().mean()).round(3))

print("\nPrevalence by source (unweighted):")
print(cm2.groupby("DEP_SOURCE", dropna=False)["DEP_HARMONIZED"].mean().round(3))

def wmean(x, w):
    x = pd.to_numeric(x, errors="coerce")
    w = pd.to_numeric(w, errors="coerce")
    m = x.notna() & w.notna() & (w > 0)
    return (x[m] * w[m]).sum() / w[m].sum() if m.any() else np.nan

w_prev_cidi = np.nan
if w_col and w_col in cm2.columns:
    w_prev_cidi = wmean(cm2.loc[mask_9904, "DEP_HARMONIZED"], cm2.loc[mask_9904, w_col])
w_prev_phq = wmean(cm2.loc[mask_05plus, "DEP_HARMONIZED"], cm2.loc[mask_05plus, "WTMEC2YR"])

print(f"\nWeighted prevalence 1999–2004 (CIDI, {w_col or 'no weight found'}): {w_prev_cidi:.3f}")
print(f"Weighted prevalence 2005+ (PHQ-9, WTMEC2YR):                         {w_prev_phq:.3f}")

# --- Peek a few rows to visually confirm both eras ---
cols_show = ["SEQN","SDDSRVYR","AGE_YR","SEX",
             "CIDI_SCORE","CIDI_DEP_FLAG","PHQ9","PHQ9_GE10",
             "DEP_SOURCE","DEP_HARMONIZED", w_col, "WTMEC2YR"]
cols_show = [c for c in cols_show if isinstance(c, str) and c in cm2.columns]
try:
    from IPython.display import display
    print("\nSample (1999–2004 rows with CIDI, then 2005+ with PHQ-9):")
    sample_9904 = cm2.loc[mask_9904 & has_cidi_any, cols_show].head(10)
    sample_05p  = cm2.loc[mask_05plus & phq_presence, cols_show].head(10)
    display(pd.concat([sample_9904, sample_05p], axis=0))
except Exception:
    print("\nHead:")
    print(cm2[cols_show].head(20))


✓ Wrote /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_core_mort_dep_all_1999_2023.parquet  (rows: 128,809)

Auto-detected 1999–2004 columns:
  Binary flag: None
  Score col:   CIDDSCOR
Detected weight col: WTSCI2YR

Non-missing counts after merge: {'CIDI_SCORE': np.int64(2217), 'CIDI_DEP_FLAG': np.int64(2217), 'WTSCI2YR': np.int64(2266)}

Coverage by cycle (share DEP_HARMONIZED present):
SDDSRVYR
1.0     0.072
2.0     0.074
3.0     0.068
4.0     0.515
5.0     0.591
6.0     0.604
7.0     0.576
8.0     0.582
9.0     0.575
10.0    0.598
12.0    0.531
66.0    0.576
Name: DEP_HARMONIZED, dtype: float64

Prevalence by source (unweighted):
DEP_SOURCE
CIDI99-04      0.933
PHQ9_05plus    0.094
<NA>             0.0
Name: DEP_HARMONIZED, dtype: Float64

Weighted prevalence 1999–2004 (CIDI, WTSCI2YR): 0.921
Weighted prevalence 2005+ (PHQ-9, WTMEC2YR):                         0.076

Sample (1999–2004 rows with CIDI, then 2005+ with PHQ-9):


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,SEX,CIDI_SCORE,CIDI_DEP_FLAG,PHQ9,PHQ9_GE10,DEP_SOURCE,DEP_HARMONIZED,WTSCI2YR,WTMEC2YR
11,12,1.0,37.0,M,5.0,1.0,,,CIDI99-04,1,186078.314366,95494.214052
19,20,1.0,23.0,F,5.0,1.0,,,CIDI99-04,1,32784.75114,16736.882281
33,34,1.0,38.0,F,5.0,1.0,,,CIDI99-04,1,50386.248286,27063.495057
65,66,1.0,37.0,M,5.0,1.0,,,CIDI99-04,1,54383.039478,27489.67342
68,69,1.0,27.0,M,5.0,1.0,,,CIDI99-04,1,78866.089966,34823.862122
80,81,1.0,30.0,M,5.0,1.0,,,CIDI99-04,1,176135.722403,95022.657961
96,97,1.0,32.0,M,5.0,1.0,,,CIDI99-04,1,86334.258174,40766.798504
106,107,1.0,30.0,F,5.0,1.0,,,CIDI99-04,1,37883.834696,16803.503699
119,120,1.0,30.0,F,5.0,1.0,,,CIDI99-04,1,186788.235958,98327.632014
142,143,1.0,22.0,F,5.0,1.0,,,CIDI99-04,1,4969.041484,2544.098678


In [41]:
# --- Peek a table reliably ---
try:
    from IPython.display import display
except Exception:
    display = None

peek_cols = [
    "SEQN","SDDSRVYR","AGE_YR","SEX",
    "CIDI_SCORE","CIDI_DEP_FLAG",
    "PHQ9","PHQ9_GE10",
    "DEP_SOURCE","DEP_HARMONIZED",
    "WTSCI2YR","WTMEC2YR"
]
peek_cols = [c for c in peek_cols if c in cm2.columns]

peek = cm2[peek_cols].head(20)

if display is not None:
    display(peek)
else:
    # fallback for scripts/terminals
    print(peek.to_string(index=False))


Unnamed: 0,SEQN,SDDSRVYR,AGE_YR,SEX,CIDI_SCORE,CIDI_DEP_FLAG,PHQ9,PHQ9_GE10,DEP_SOURCE,DEP_HARMONIZED,WTSCI2YR,WTMEC2YR
0,1,1.0,2.0,F,,,,,,,,10982.898896
1,2,1.0,77.0,M,,,,,,,,28325.384898
2,3,1.0,10.0,F,,,,,,,,46192.256945
3,4,1.0,1.0,M,,,,,,,,10251.26002
4,5,1.0,49.0,M,,,,,,,,99445.065735
5,6,1.0,19.0,F,,,,,,,,39656.600444
6,7,1.0,59.0,F,,,,,,,,25525.423409
7,8,1.0,13.0,M,,,,,,,,31510.587866
8,9,1.0,11.0,F,,,,,,,,7575.870247
9,10,1.0,43.0,M,,,,,,,,22445.808572


## sanity check

In [42]:
# 1) How many have CIDI in 1999–2004?
m = (cm2["SDDSRVYR"] <= 4)
print("CIDI present 99–04:", cm2.loc[m, "CIDI_SCORE"].notna().sum())

# 2) Does DEP_HARMONIZED equal the era-specific flags?
print("≤2004 equal CIDI flag:", (cm2.loc[m, "DEP_HARMONIZED"] == cm2.loc[m, "CIDI_DEP_FLAG"]).mean())
print("≥2005 equal PHQ9_GE10:", (cm2.loc[~m, "DEP_HARMONIZED"] == cm2.loc[~m, "PHQ9_GE10"]).mean())

# 3) Weighted prevalence by era (subsample weight for CIDI; MEC weight for PHQ-9)
def wmean(x, w):
    x = pd.to_numeric(x, errors="coerce")
    w = pd.to_numeric(w, errors="coerce")
    ok = x.notna() & w.notna() & (w > 0)
    return float((x[ok] * w[ok]).sum() / w[ok].sum()) if ok.any() else float("nan")

print("99–04 weighted (CIDI, WTSCI2YR):", wmean(cm2.loc[m, "DEP_HARMONIZED"], cm2.loc[m, "WTSCI2YR"]))
print("2005+ weighted (PHQ9, WTMEC2YR):", wmean(cm2.loc[~m, "DEP_HARMONIZED"], cm2.loc[~m, "WTMEC2YR"]))


CIDI present 99–04: 2217
≤2004 equal CIDI flag: 1.0
≥2005 equal PHQ9_GE10: 1.0
99–04 weighted (CIDI, WTSCI2YR): 0.9208286855776959
2005+ weighted (PHQ9, WTMEC2YR): 0.07926053649632853
