<h1> 00 — merge demo, mort, sdoh</h1>

<h2>Shared environment and helper functions used across notebooks.</h2>
    

In [53]:
from pathlib import Path
import os, sys, warnings
import numpy as np
import pandas as pd

# -------------------------
# Root & existing folders (NO mkdir here)
# -------------------------
ROOT = Path(os.environ.get("SDOH_ROOT", "/Users/dengshuyue/Desktop/SDOH/analysis"))

CODE   = ROOT / "code"
DATA   = ROOT / "data"
OUT    = ROOT / "output"

# Data subfolders that you already have
NH_DEIT      = DATA / "nhanes_deit"
NH_BY_MOD    = DATA / "nhanes_by_module"
FPED_DIR     = DATA / "fped"
FNDDDS_DIR   = DATA / "fndds"
BPQ_DIR      = DATA / "bpq"
HEALTH_ACC   = DATA / "health_access"
HH_SIZE_DIR  = DATA / "household_size"
TMP_NORM_XPT = DATA / "tmp_norm_xpt"
LESS_IMP     = DATA / "less_important"

# Common files already present
FILES = {
    "demoall_csv":          DATA / "demoall.csv",
    "demoall_pkl":          DATA / "demoall.pkl",
    "hei9918_sas7bdat":     DATA / "hei9918.sas7bdat",
    "sodh_diet_mort_sas":   DATA / "sodh_diet_mort.sas7bdat",
    "sodh_diet_mort_pkl":   DATA / "SODH_diet_mort.pkl",
    # multiple CSV variants exist; we’ll glob when needed
}

# Output files/folders (already exist in your tree)
OUT_FILES = {
    "demo_summary_csv":     OUT / "demo_summary.csv",
    "demo_summary_r_csv":   OUT / "demo_summary_r.csv",
    "ahei_combined_csv":    DATA / "ahei_combined.csv",  # lives under data/
}
TABLES_FIGS = OUT  # you keep tables directly in output/

# -------------------------
# NHANES cycles
# - Keep it simple: Cox is in R; use explicit lists
# -------------------------
CYCLES_MORTALITY = [
    "1999-2000","2001-2002","2003-2004","2005-2006","2007-2008",
    "2009-2010","2011-2012","2013-2014","2015-2016","2017-2018",
]
CYCLES_NONMORT = [
    "2017-March 2020 (pre-pandemic)",  # P_DEMO.xpt style
    "August 2021–August 2023",         # DEMO_L.xpt
]
CYCLES_ALL = CYCLES_MORTALITY + CYCLES_NONMORT

# Optional suffix/prefix hints (use only if you need to load DEMO files by pattern)
CYCLE_SUFFIX = {
    "1999-2000": "",
    "2001-2002": "_B", "2003-2004": "_C", "2005-2006": "_D",
    "2007-2008": "_E", "2009-2010": "_F", "2011-2012": "_G",
    "2013-2014": "_H", "2015-2016": "_I", "2017-2018": "_J",
    "2017-March 2020 (pre-pandemic)": "P_",  # e.g., P_DEMO.xpt
    "August 2021–August 2023": "_L",         # e.g., DEMO_L.xpt
}
def cycle_suffix(label: str) -> str:
    return CYCLE_SUFFIX.get(label, "")

# -------------------------
# Small helpers
# -------------------------
def z(x):
    x = pd.Series(x, dtype="float64")
    return (x - x.mean(skipna=True)) / x.std(skipna=True)

def combine_wtmec(w, n_cycles: int):
    """
    When stacking cycles, divide 2-year MEC weights by the number of
    2-year cycles actually included in the stack you’re building.
    """
    return w / float(n_cycles)

def list_existing(paths):
    """Quickly check which paths exist (debug helper)."""
    return {k: (p if p.exists() else None) for k, p in paths.items()}

# -------------------------
# Display prefs
# -------------------------
pd.options.display.max_rows = 60
pd.options.display.max_columns = 120
warnings.filterwarnings("ignore")

print("Bootstrap loaded.")
print("ROOT:", ROOT)
print("Data dir exists:", DATA.exists())
print("Output dir exists:", OUT.exists())
print("Mortality cycles:", CYCLES_MORTALITY)
print("Non-mortality cycles:", CYCLES_NONMORT)


Bootstrap loaded.
ROOT: /Users/dengshuyue/Desktop/SDOH/analysis
Data dir exists: True
Output dir exists: True
Mortality cycles: ['1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008', '2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018']
Non-mortality cycles: ['2017-March 2020 (pre-pandemic)', 'August 2021–August 2023']


<h2>preview sas code and data </h2>

In [3]:
from pathlib import Path
import pandas as pd

BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data")

# Find DEMO for 2015–2016 ("I")
cands = list(BASE.rglob("DEMO_I.*"))
print("Candidates:", cands)

if not cands:
    # fallback: list any DEMO files you have
    print("Any DEMO files I can see:")
    print(list(BASE.rglob("DEMO*.*")))
else:
    p = cands[0]
    if p.suffix.lower() == ".xpt":
        df = pd.read_sas(p, format="xport")  # NHANES XPT format
    elif p.suffix.lower() == ".sas7bdat":
        import pyreadstat
        df, _ = pyreadstat.read_sas7bdat(p)
    else:
        raise ValueError(f"Unknown extension: {p.suffix}")

    pd.set_option("display.max_columns", None)
    display(df.head())
    print(df.columns.tolist())


Candidates: [PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/data/household_size/DEMO_I.xpt'), PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_deit/DEMO_I.xpt')]


Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,DMQMILIZ,DMQADFC,DMDBORN4,DMDCITZN,DMDYRSUS,DMDEDUC3,DMDEDUC2,DMDMARTL,RIDEXPRG,SIALANG,SIAPROXY,SIAINTRP,FIALANG,FIAPROXY,FIAINTRP,MIALANG,MIAPROXY,MIAINTRP,AIALANGA,DMDHHSIZ,DMDFMSIZ,DMDHHSZA,DMDHHSZB,DMDHHSZE,DMDHRGND,DMDHRAGE,DMDHRBR4,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,83732.0,9.0,2.0,1.0,62.0,,3.0,3.0,1.0,,2.0,,1.0,1.0,,,5.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,5.397605e-79,5.397605e-79,1.0,1.0,62.0,1.0,5.0,1.0,3.0,134671.370419,135629.507405,1.0,125.0,10.0,10.0,4.39
1,83733.0,9.0,2.0,1.0,53.0,,3.0,3.0,1.0,,2.0,,2.0,2.0,7.0,,3.0,3.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,5.397605e-79,1.0,53.0,2.0,3.0,3.0,,24328.560239,25282.425927,1.0,125.0,4.0,4.0,1.32
2,83734.0,9.0,2.0,1.0,78.0,,3.0,3.0,2.0,,1.0,2.0,1.0,1.0,,,3.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,,2.0,2.0,5.397605e-79,5.397605e-79,2.0,2.0,79.0,1.0,3.0,1.0,3.0,12400.008522,12575.838818,1.0,131.0,5.0,5.0,1.51
3,83735.0,9.0,2.0,2.0,56.0,,3.0,3.0,2.0,,2.0,,1.0,1.0,,,5.0,6.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,5.397605e-79,2.0,56.0,1.0,5.0,6.0,,102717.995647,102078.634508,1.0,131.0,10.0,10.0,5.0
4,83736.0,9.0,2.0,2.0,42.0,,4.0,4.0,2.0,,2.0,,1.0,1.0,,,4.0,3.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,5.397605e-79,2.0,5.397605e-79,2.0,42.0,1.0,4.0,3.0,,17627.674984,18234.736219,2.0,126.0,7.0,7.0,1.23


['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN', 'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMQADFC', 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY', 'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND', 'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR']


In [4]:

# Path to your .sas7bdat file
file_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data/less_important/gg.sas7bdat"

# Load the dataset using pandas (requires pyreadstat)
df = pd.read_sas(file_path, format="sas7bdat")

# Preview the data
pd.set_option('display.max_columns', None)
df.head()

# List all column names
print(df.columns.tolist())


['SEQN', 'DAYS', 'DR12IFDC', 'WTDRD1', 'DR12DRST', 'SDDSRVYR', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'DMDEDUC3', 'DMDEDUC2', 'INDFMPIR', 'DMDHREDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'DR12IKC2', 'CYCLE', 'WTDR2D', 'DR12FS', 'DMDHREDZ', 'age', 'race', 'edu', 'pedu', 'Incm', 'incm2', 'include', 'Weight16a', 'cycles', 'sex', 'age1', 'age2', 'age3', 'race2', 'race3', 'race4', 'weekend', '_NAME_', '_LABEL_', 'DRDAY1', 'DRDAY2', 'tkal1', 'tkal2', 'Tcal', 'DR12DAY', 'kcal1', 'kcal2', 'kcal3', 'kcal4', 'kcal12', 'wt1', 'wt2', 'wt3', 'wt4', 'wt12', 'pcte1', 'pcte2', 'pcte3', 'pcte4', 'pcte12', 'pctg1', 'pctg2', 'pctg3', 'pctg4', 'pctg12', 'kcals2', 'kcals5', 'kcals6', 'kcals9', 'kcals13', 'kcals14', 'kcals17', 'kcals20', 'kcals21', 'kcals22', 'kcals23', 'kcals25', 'kcals28', 'kcals29', 'kcals33', 'kcals36', 'kcals39', 'kcals41', 'kcals3', 'kcals37', 'kcals38', 'kcals40', 'kcals42', 'kcals1', 'kcals10', 'kcals16', 'kcals24', 'kcals15', 'kcals18', 'kcals7', 'kcals8', 'kcals19', 'kcals4

In [5]:
# Path to your .sas script file (SAS code, not dataset)
file_path = "/Users/dengshuyue/Desktop/SDOH/analysis/code/code_lu/Analysis1_COX_allcause.sas"

# Load the SAS script as plain text
with open(file_path, "r") as file:
    sas_code = file.read()

# Optionally preview the first 30 lines
sas_code_lines = sas_code.splitlines()
for line in sas_code_lines[:300]:
    print(line)



libname data "C:\Users\lwang18\Box\Projects\5_UPF_Mortality\data";
*%let home=C:\Users\LWANG18\Box\Projects\5_UPF_Mortality\results_revision ; 
%let home= C:\Users\lwang18\Box\Projects\Kroger project\Data for analysis\Scores;
 %let path= C:\Users\lwang18\Box\Projects\Kroger project\Data for analysis\Scores;

%let home= C:\Users\lwang18\OneDrive - Tufts\Desktop\Projects\Food Insecurity,;
libname out "C:\Users\lwang18\OneDrive - Tufts\Desktop\Projects\Food Insecurity,";

libname NHANES "C:\Users\LWANG18\Box\NHANES_Lu" ;

/** main analysis **/

%macro cox(data, dvar, evars, covars, death , out);
ods select all ; 
ODS OUTPUT PARAMETERESTIMATES=r0; 
proc surveyphreg data=&data;
	strata sdmvstra;
	cluster sdmvpsu;
	weight wt;
	class    sex (ref="1") race edu(ref="1") smk  pir(ref="3") SNAP(ref="0") FS ins2(ref="1") ins i_FCS_sdq hei2015q(ref="3") marriage  hoq065/param=ref;
	model py*&death(0)= &dvar &covars /rl ties=breslow;
run ;
data r0 ; set r0 ; 
HRCL=compress(round(hazardratio,0.01)||

<h2>Step 1: NHANES demographic data (fetch and merge)</h2>

In [6]:
# %% Step 1: Paths and cycle mapping (matches your project)
from pathlib import Path
import os
import pandas as pd
import numpy as np
import requests

BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA: Path = BASE / "data"
(DATA / "nhanes_by_module" / "DEMO").mkdir(parents=True, exist_ok=True)  # where we’ll save downloads

# --- Candidate upstream URLs for DEMO files (multiple fallbacks per cycle) ---
# Includes the “classic” /Nhanes/<cycle>/ path and the /Nchs/Data/Nhanes/Public/<year>/DataFiles/ path.
def nhanes_url_candidates():
    return {
        "1999-2000": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/DEMO.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO.XPT",
        ],
        "2001-2002": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2001/DataFiles/DEMO_B.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/DEMO_B.XPT",
        ],
        "2003-2004": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/DEMO_C.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2003-2004/DEMO_C.XPT",
        ],
        "2005-2006": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DEMO_D.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/DEMO_D.XPT",
        ],
        "2007-2008": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DEMO_E.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2007-2008/DEMO_E.XPT",
        ],
        "2009-2010": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2009/DataFiles/DEMO_F.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/DEMO_F.XPT",
        ],
        "2011-2012": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/DEMO_G.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/DEMO_G.XPT",
        ],
        "2013-2014": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2013/DataFiles/DEMO_H.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DEMO_H.XPT",
        ],
        "2015-2016": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2015/DataFiles/DEMO_I.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DEMO_I.XPT",
        ],
        "2017-2018": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DEMO_J.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT",
        ],
        # Special combined pre-pandemic release
        "2017-March 2020 (pre-pandemic)": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_DEMO.XPT",
        ],
        # August 2021–August 2023 (DEMO_L; include Q as fallback just in case)
        "August 2021–August 2023": [
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/DEMO_L.XPT",
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_Q.xpt",
            "https://wwwn.cdc.gov/Nchs/Nhanes/2021-2023/DEMO_Q.XPT",
        ],
    }

NHANES_URLS = nhanes_url_candidates()

# Build local filename candidates per cycle (try these names before downloading)
def candidates_from_urls(urls):
    out, seen = [], set()
    for url in urls:
        fname = Path(url).name
        for v in (fname, fname.upper(), fname.lower(), fname.capitalize()):
            if v not in seen:
                seen.add(v)
                out.append(v)
    return out

LOCAL_CANDIDATES = {cycle: candidates_from_urls(urls)
                    for cycle, urls in NHANES_URLS.items()}

# %% Step 2: helpers (local search + download + reader)
def find_first_under_data(patterns):
    """Search recursively under DATA for any of the provided filenames (case-sensitive per pattern list)."""
    for pattern in patterns:
        hits = list(DATA.rglob(pattern))
        if hits:
            return hits[0]
    return None

# define fuction to fetch from CDC web
def download_to(path: Path, url: str, timeout=90):
    """Download URL -> path with a basic retry; return local path."""
    path.parent.mkdir(parents=True, exist_ok=True)
    headers = {"User-Agent": "nhanes-fetch/1.0 (+https://cdc.gov)"}
    last_err = None
    for attempt in range(2):  # two tries per URL
        try:
            resp = requests.get(url, headers=headers, timeout=timeout, stream=True)
            resp.raise_for_status()
            tmp = path.with_suffix(path.suffix + ".downloading")
            with open(tmp, "wb") as f:
                for chunk in resp.iter_content(chunk_size=1 << 15):
                    if chunk:
                        f.write(chunk)
            tmp.rename(path)
            return path
        except Exception as e:
            last_err = e
    raise last_err

def ensure_demo_file(cycle_label: str) -> Path:
    """Return a local DEMO file path for the cycle (search first, then download from known URLs)."""
    local = find_first_under_data(LOCAL_CANDIDATES[cycle_label])
    if local:
        return local
    for url in NHANES_URLS[cycle_label]:
        out = DATA / "nhanes_by_module" / "DEMO" / Path(url).name
        try:
            print(f"⬇️  Downloading {cycle_label} from {url}")
            return download_to(out, url)
        except Exception as e:
            print(f"   ⚠️ Download failed from {url}: {e}")
    raise FileNotFoundError(f"No DEMO file found or downloaded for {cycle_label}")

def read_demo_file(p: Path) -> pd.DataFrame:
    """Read DEMO file (.xpt preferred, .sas7bdat fallback)."""
    if p.suffix.lower() == ".xpt":
        # Prefer pyreadstat if available (faster/labels), else pandas
        try:
            import pyreadstat
            df, _ = pyreadstat.read_xport(p)
        except Exception:
            df = pd.read_sas(p, format="xport")
    elif p.suffix.lower() == ".sas7bdat":
        try:
            import pyreadstat
            df, _ = pyreadstat.read_sas7bdat(p)
        except Exception:
            df = pd.read_sas(p, format="sas7bdat")
    else:
        raise ValueError(f"Unsupported file type: {p.suffix}")
    return df

# Which cycles to include in this DEMO build (OK to include through 2023 here)
DEMO_CYCLES = [
    "1999-2000","2001-2002","2003-2004","2005-2006","2007-2008",
    "2009-2010","2011-2012","2013-2014","2015-2016","2017-2018",
    "2017-March 2020 (pre-pandemic)","August 2021–August 2023",
]




# --- helpers to recode marital status ---
def recode_L_to_4(s):
    # DMDMARTL codes: 1=Married, 2=Never, 3=Widowed, 4=Divorced, 5=Separated, 6=Living with partner, 77/99=DK/Ref
    s = pd.to_numeric(s, errors="coerce")
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    out = out.where(~s.isin([1, 6]), 1)   # married/partner
    out = out.where(~s.isin([3, 4]), 2)   # widowed/divorced
    out = out.where(~(s == 2), 3)         # never married
    out = out.where(~(s == 5), 4)         # separated
    out = out.where(~s.isin([77, 99]), pd.NA)
    return out

def recode_L_to_3(s):
    # Collapse L into 3 categories to match Z:
    # 1 = married/partner; 2 = previously married (widowed/divorced/separated); 3 = never
    s = pd.to_numeric(s, errors="coerce")
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    out = out.where(~s.isin([1, 6]), 1)
    out = out.where(~s.isin([3, 4, 5]), 2)
    out = out.where(~(s == 2), 3)
    out = out.where(~s.isin([77, 99]), pd.NA)
    return out

def recode_Z_to_3(s):
    # DMDMARTZ codes (2021–2023): 1=married/partner, 2=previously married, 3=never, 77/99=DK/Ref
    s = pd.to_numeric(s, errors="coerce")
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    out = out.where(~(s == 1), 1)
    out = out.where(~(s == 2), 2)
    out = out.where(~(s == 3), 3)
    out = out.where(~s.isin([77, 99]), pd.NA)
    return out

def compute_marriage_cols(df_upper: pd.DataFrame):
    """Return (MARRIAGE, MARRIAGE3) given an uppercase-column DataFrame."""
    hasL = "DMDMARTL" in df_upper.columns
    hasZ = "DMDMARTZ" in df_upper.columns
    M4 = pd.Series(pd.NA, index=df_upper.index, dtype="Int64")
    M3 = pd.Series(pd.NA, index=df_upper.index, dtype="Int64")
    if hasL:
        M4 = recode_L_to_4(df_upper["DMDMARTL"])
        M3 = recode_L_to_3(df_upper["DMDMARTL"])
    elif hasZ:
        # Can't reconstruct a 4-cat from Z; leave M4 as NA
        M3 = recode_Z_to_3(df_upper["DMDMARTZ"])
    return M4, M3

# %% Step 3: Load (local-or-download), tag cycle, recode marriage per cycle, and stack
demo_dfs, missing = [], []
for cycle in DEMO_CYCLES:
    try:
        p = ensure_demo_file(cycle)
    except FileNotFoundError as e:
        print(f"⚠️ {e}")
        missing.append(cycle)
        continue

    df = read_demo_file(p)
    df.columns = [c.upper() for c in df.columns]
    df["CYCLE"] = cycle

    # Recode marital status for this cycle using whichever column exists
    M4, M3 = compute_marriage_cols(df)
    df["MARRIAGE"]  = M4          # 4-category where available; NA for 2021–2023
    df["MARRIAGE3"] = M3          # 3-category available for ALL cycles

    # Keep only relevant columns that exist
    keep = [c for c in ["SEQN","RIDAGEYR","SDDSRVYR","CYCLE","MARRIAGE","MARRIAGE3"] if c in df.columns]
    demo_dfs.append(df[keep])

if missing:
    print("⚠️ Missing cycles (not found and not downloaded):", missing)
if not demo_dfs:
    raise FileNotFoundError("No DEMO files were found or downloaded under your data/ directory.")

# %% Step 4: Combine
demo9923 = pd.concat(demo_dfs, ignore_index=True).copy()

# %% Step 5/6 merged: we already kept just the needed columns; print summary
print(f"✅ Total rows: {demo9923.shape[0]}")
print(f"✅ Unique SEQN: {demo9923['SEQN'].nunique()}")
print("Columns:", demo9923.columns.tolist())

# Optional quick check: 2021–2023 now has MARRIAGE3 populated
vc = demo9923.loc[demo9923["CYCLE"]=="August 2021–August 2023","MARRIAGE3"].value_counts(dropna=False)
print("\n2021–2023 MARRIAGE3 counts:\n", vc)

✅ Total rows: 128809
✅ Unique SEQN: 128809
Columns: ['SEQN', 'RIDAGEYR', 'SDDSRVYR', 'CYCLE', 'MARRIAGE', 'MARRIAGE3']

2021–2023 MARRIAGE3 counts:
 MARRIAGE3
<NA>    4150
1       4136
2       2022
3       1625
Name: count, dtype: Int64


In [66]:
# %% Step 3: Load (local-or-download), tag cycle, add survey design fields, recode marriage per cycle, and stack
import re

def pick_weight_col(df):
    """Pick a MEC 2-year weight column (prefer WTMEC2YR; regex; else WTINT2YR fallback)."""
    colsU = {c.upper(): c for c in df.columns}
    # 1) exact preferred
    if "WTMEC2YR" in colsU:
        return colsU["WTMEC2YR"], "WTMEC2YR"
    # 2) regex like "WT...MEC...2YR"
    for cU, c in colsU.items():
        if re.match(r"^WT.*MEC.*2YR$", cU):
            return c, "regex_WTMEC2YR"
    # 3) interview weight as last resort
    if "WTINT2YR" in colsU:
        return colsU["WTINT2YR"], "WTINT2YR_fallback"
    return None, None

demo_dfs, missing = [], []
for cycle in DEMO_CYCLES:
    try:
        p = ensure_demo_file(cycle)
    except FileNotFoundError as e:
        print(f"⚠️ {e}")
        missing.append(cycle)
        continue

    df = read_demo_file(p)
    df.columns = [c.upper() for c in df.columns]
    df["CYCLE"] = cycle

    # --- survey design fields ---
    # Normalize weight column to WTMEC2YR if possible
    wcol, wsrc = pick_weight_col(df)
    if wcol is not None and "WTMEC2YR" not in df.columns:
        df["WTMEC2YR"] = df[wcol]

    # --- marital status recodes (unchanged) ---
    M4, M3 = compute_marriage_cols(df)
    df["MARRIAGE"]  = M4
    df["MARRIAGE3"] = M3

    # Keep only relevant columns that exist
    keep = [c for c in [
        "SEQN","RIDAGEYR","SDDSRVYR","SDMVPSU","SDMVSTRA","WTMEC2YR",
        "CYCLE","MARRIAGE","MARRIAGE3"
    ] if c in df.columns]
    demo_dfs.append(df[keep])

if missing:
    print("⚠️ Missing cycles (not found and not downloaded):", missing)
if not demo_dfs:
    raise FileNotFoundError("No DEMO files were found or downloaded under your data/ directory.")

# %% Step 4: Combine and coerce types
demo9923 = pd.concat(demo_dfs, ignore_index=True).copy()
for c in ["SEQN","RIDAGEYR","SDDSRVYR","SDMVPSU","SDMVSTRA","WTMEC2YR"]:
    if c in demo9923.columns:
        demo9923[c] = pd.to_numeric(demo9923[c], errors="coerce")

demo9923 = demo9923.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

# %% Step 5: Save tidy stacked DEMO with survey design fields
INTERIM = DATA.parent / "interim"
INTERIM.mkdir(parents=True, exist_ok=True)
demo_out = INTERIM / "demo_9923.parquet"
demo9923.to_parquet(demo_out, index=False)
print(f"✅ Saved DEMO with survey design → {demo_out}")

# %% Step 6: Quick summaries
print(f"✅ Total rows: {demo9923.shape[0]}")
print(f"✅ Unique SEQN: {demo9923['SEQN'].nunique()}")
print("Columns:", demo9923.columns.tolist())

# Missingness report for design fields
for k in ["SDDSRVYR","SDMVPSU","SDMVSTRA","WTMEC2YR"]:
    if k in demo9923.columns:
        print(f"{k} NA %:", round(demo9923[k].isna().mean()*100, 2), "%")
    else:
        print(f"{k} missing entirely")

# Optional: verify 2021–2023 marriage3 populated
if "MARRIAGE3" in demo9923.columns:
    vc = demo9923.loc[demo9923["CYCLE"]=="August 2021–August 2023","MARRIAGE3"].value_counts(dropna=False)
    print("\n2021–2023 MARRIAGE3 counts:\n", vc)


✅ Saved DEMO with survey design → /Users/dengshuyue/Desktop/SDOH/analysis/interim/demo_9923.parquet
✅ Total rows: 128809
✅ Unique SEQN: 128809
Columns: ['SEQN', 'RIDAGEYR', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'CYCLE', 'MARRIAGE', 'MARRIAGE3']
SDDSRVYR NA %: 0.0 %
SDMVPSU NA %: 0.0 %
SDMVSTRA NA %: 0.0 %
WTMEC2YR NA %: 12.08 %

2021–2023 MARRIAGE3 counts:
 MARRIAGE3
<NA>    4150
1       4136
2       2022
3       1625
Name: count, dtype: Int64


<h3>save and check</h3>

In [67]:
# Simple saves without pyarrow (plain CSV)
out_pkl = DATA / "demo9923.pkl"   # fastest round-trip in Python
out_csv = DATA / "demo9923.csv"   # plain CSV (max compatibility)

demo9923.to_pickle(out_pkl)
demo9923.to_csv(out_csv, index=False)  # no compression

print("Saved:", out_pkl)
print("Saved:", out_csv)



Saved: /Users/dengshuyue/Desktop/SDOH/analysis/data/demo9923.pkl
Saved: /Users/dengshuyue/Desktop/SDOH/analysis/data/demo9923.csv


In [8]:
print("\nCounts by CYCLE:")
print(demo9923["CYCLE"].value_counts(dropna=False))

print("\nCounts by SDDSRVYR (raw numeric code, may be NaN for newer releases):")
print(demo9923["SDDSRVYR"].value_counts(dropna=False).sort_index())



Counts by CYCLE:
CYCLE
2017-March 2020 (pre-pandemic)    15560
August 2021–August 2023           11933
2001-2002                         11039
2009-2010                         10537
2005-2006                         10348
2013-2014                         10175
2007-2008                         10149
2003-2004                         10122
2015-2016                          9971
1999-2000                          9965
2011-2012                          9756
2017-2018                          9254
Name: count, dtype: int64

Counts by SDDSRVYR (raw numeric code, may be NaN for newer releases):
SDDSRVYR
1.0      9965
2.0     11039
3.0     10122
4.0     10348
5.0     10149
6.0     10537
7.0      9756
8.0     10175
9.0      9971
10.0     9254
12.0    11933
66.0    15560
Name: count, dtype: int64


In [69]:
demo9923.tail(16)
demo9923["SEQN"].agg(["min", "max", "nunique"])

min             1.0
max        142310.0
nunique    128809.0
Name: SEQN, dtype: float64

<h2> Step 2: merge demo with mortality </h2>

In [10]:
# %% Mortality: locate, read, prep, and merge with DEMO (uses demo9923)
import pandas as pd
import numpy as np
from pathlib import Path

# --- Paths -------------------------------------------------------
ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT / "data"

# --- Locate mortality file (search recursively under data/) -----
MORT_PATTERNS = [
    "*mortality*.sas7bdat", "*mort*.sas7bdat",
    "*mortality*.xpt",      "*mort*.xpt",
    "*mortality*.csv",      "*mort*.csv"
]

def find_first(base: Path, patterns):
    for pat in patterns:
        hits = list(base.rglob(pat))
        if hits:
            return hits[0]
    return None

mort_path = find_first(DATA, MORT_PATTERNS)
if mort_path is None:
    raise FileNotFoundError(
        f"No mortality file found under {DATA}. "
        "Place a file like 'NHANES_1999_2019_LMF_public.csv' or 'mortality9918.sas7bdat' there."
    )
print("Using mortality file:", mort_path)

# --- Load mortality (handles .xpt / .sas7bdat / .csv) ----------
suffix = mort_path.suffix.lower()
if suffix == ".xpt":
    try:
        import pyreadstat
        mort, _ = pyreadstat.read_xport(mort_path)
    except Exception:
        mort = pd.read_sas(mort_path, format="xport")
elif suffix == ".sas7bdat":
    try:
        import pyreadstat
        mort, _ = pyreadstat.read_sas7bdat(mort_path)
    except Exception:
        mort = pd.read_sas(mort_path, format="sas7bdat")
elif suffix == ".csv":
    mort = pd.read_csv(mort_path)
else:
    raise ValueError(f"Unsupported mortality file type: {suffix}")

mort.columns = mort.columns.str.upper()

# --- Keep relevant columns if present ---------------------------
keep_cols = [c for c in [
    "SEQN","ELIGSTAT","MORTSTAT","PERMTH_EXM","PERMTH_INT",
    "UCOD_LEADING","DIABETES","HYPERTEN","DODQTR","DODYEAR"
] if c in mort.columns]
mort = mort[keep_cols].copy()

# --- Eligibility and time/event construction --------------------
if "ELIGSTAT" in mort.columns:
    mort = mort[mort["ELIGSTAT"] == 1].copy()

# default: exam-based time; switch to interview by changing TIME_COL
TIME_COL = "PERMTH_EXM" if "PERMTH_EXM" in mort.columns else "PERMTH_INT"
if TIME_COL not in mort.columns:
    raise ValueError("Neither PERMTH_EXM nor PERMTH_INT found in mortality file.")

mort["TIME_Y"] = pd.to_numeric(mort[TIME_COL], errors="coerce") / 12.0
mort["EVENT"]  = (mort["MORTSTAT"] == 1).astype(int)
mort = mort[(mort["TIME_Y"].notna()) & (mort["TIME_Y"] >= 0)].copy()

# --- Ensure DEMO columns & types (demo9923) ---------------------
demo9923.columns = demo9923.columns.str.upper()
if "SDDSRVYR" not in demo9923.columns:
    raise ValueError("SDDSRVYR not found in demo9923. Ensure your DEMO build carries SDDSRVYR.")
demo9923["SDDSRVYR"] = pd.to_numeric(demo9923["SDDSRVYR"], errors="coerce")

# --- Merge age + cycle into mortality ---------------------------
cols_to_merge = [c for c in ["SEQN","RIDAGEYR","SDDSRVYR"] if c in demo9923.columns]
mort_with_demo = mort.merge(demo9923[cols_to_merge], on="SEQN", how="left")

# Optional: adults (≥20)
# mort_with_demo = mort_with_demo.dropna(subset=["RIDAGEYR"])
# mort_with_demo = mort_with_demo[mort_with_demo["RIDAGEYR"] >= 20]

# --- Cycle counts and summary -----------------------------------
cycle_counts = mort_with_demo["SDDSRVYR"].value_counts(dropna=False).sort_index()

cycle_map = {
    1: "1999–2000",  2: "2001–2002",  3: "2003–2004",
    4: "2005–2006",  5: "2007–2008",  6: "2009–2010",
    7: "2011–2012",  8: "2013–2014",  9: "2015–2016",
    10: "2017–2018", 11: "2017–Mar 2020 (pre-pandemic)", 12: "Aug 2021–Aug 2023"
}

print("\n📊 NHANES cycles in mortality-linked data:")
print(cycle_counts.rename(index=cycle_map))
print(f"\nTotal records: {int(cycle_counts.sum())}")

print("\n⏱️  Survival summary:")
print("  N (unique SEQN):", mort_with_demo["SEQN"].nunique())
print("  Events (%):", round(100 * mort_with_demo["EVENT"].mean(), 2))
print("  TIME_Y (min / median / max):",
      np.nanmin(mort_with_demo["TIME_Y"]),
      np.nanmedian(mort_with_demo["TIME_Y"]),
      np.nanmax(mort_with_demo["TIME_Y"]))


Using mortality file: /Users/dengshuyue/Desktop/SDOH/analysis/data/less_important/mortality9918.sas7bdat

📊 NHANES cycles in mortality-linked data:
SDDSRVYR
1999–2000    4973
2001–2002    5586
2003–2004    5293
2005–2006    5332
2007–2008    5989
2009–2010    6346
2011–2012    5603
2013–2014    5913
2015–2016    5720
2017–2018    5498
Name: count, dtype: int64

Total records: 56253

⏱️  Survival summary:
  N (unique SEQN): 56253
  Events (%): 14.87
  TIME_Y (min / median / max): 0.0 9.416666666666666 20.75


<h2>Step 3: merge demo_mort with sodh</h2>


<h3> OCQ / HOQ / HIQ / FSQ — early (1999–2002) + main (2003–2018), adult filter, tidy outputs </h3>


In [23]:
# %% Prereqs & helpers
import pandas as pd, numpy as np, os
from pathlib import Path

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT / "data"
MOD  = DATA / "nhanes_by_module"

def read_any(p: Path) -> pd.DataFrame:
    s = p.suffix.lower()
    if s == ".xpt":
        import pyreadstat
        df, _ = pyreadstat.read_xport(str(p))
    elif s == ".sas7bdat":
        df = pd.read_sas(str(p), format="sas7bdat", encoding="latin1")
    elif s == ".csv":
        df = pd.read_csv(p)
    elif s == ".parquet":
        df = pd.read_parquet(p)
    else:
        raise ValueError(f"Unsupported file: {p}")
    df.columns = df.columns.str.upper()
    return df

def filter_adults(df: pd.DataFrame) -> pd.DataFrame:
    """Adults ≥20 using demo9923 already in memory (SEQN, RIDAGEYR)."""
    if "demo9923" not in globals():
        raise RuntimeError("demo9923 not found in memory; load/build it first.")
    demo = demo9923.copy()
    demo.columns = demo.columns.str.upper()
    age = demo[["SEQN","RIDAGEYR"]].dropna()
    age = age[age["RIDAGEYR"] >= 20]
    return df.merge(age, on="SEQN", how="inner")

def s_or_false(df: pd.DataFrame, col: str):
    return df[col] if col in df.columns else pd.Series(False, index=df.index)


In [24]:
# %% OCQ: 1999–2018 (employment recode)
OCQ = MOD / "ocq"
ocq_main = OCQ / "ocq.sas7bdat"
ocq_early_files = [(OCQ/"OCQ.xpt",1), (OCQ/"OCQ_B.xpt",2)]

def recode_employment(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["EMPLOY"] = np.nan
    if "OCD150" in df:
        df.loc[df["OCD150"] == 1, "EMPLOY"] = 1
        df.loc[df["OCD150"] == 3, "EMPLOY"] = 2
    if "OCQ380" in df:
        df.loc[df["OCQ380"] == 5, "EMPLOY"] = 2
        df.loc[df["OCQ380"] == 3, "EMPLOY"] = 3
        df.loc[df["OCQ380"].isin([4,6]), "EMPLOY"] = 4
        df.loc[df["OCQ380"].isin([1,2,7]), "EMPLOY"] = 5
    df["UNEMPLOYMENT"] = (df["EMPLOY"] == 2).astype("Int64")
    keep = [c for c in ["SEQN","EMPLOY","UNEMPLOYMENT","SDDSRVYR"] if c in df.columns]
    return df[keep]

# early (adult filter applied)
parts = []
for p, cyc in ocq_early_files:
    if p.exists():
        df = read_any(p)
        df["SDDSRVYR"] = cyc
        df = filter_adults(df)
        parts.append(recode_employment(df))
ocq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

# main (adult filter applied)
if not ocq_main.exists():
    raise FileNotFoundError(f"Missing {ocq_main}")
ocq_main_df = filter_adults(read_any(ocq_main))
ocq_main_df = recode_employment(ocq_main_df)

# combine + hygiene
ocq = pd.concat([ocq_early, ocq_main_df], ignore_index=True)
ocq["SEQN"] = pd.to_numeric(ocq["SEQN"], errors="coerce").astype("Int64")
if "SDDSRVYR" in ocq: ocq["SDDSRVYR"] = pd.to_numeric(ocq["SDDSRVYR"], errors="coerce").astype("Int64")
ocq = ocq.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("OCQ:", ocq.shape)


OCQ: (55081, 4)


In [25]:
# %% HOQ: 1999–2018 (housing subset)
HOQ = MOD / "hoq"
hoq_main = HOQ / "hoq.sas7bdat"
hoq_early_files = [(HOQ/"HOQ.xpt",1), (HOQ/"HOQ_B.xpt",2), (HOQ/"hoq.sas7bdat",1), (HOQ/"hoq_b.sas7bdat",2)]

def preprocess_hoq(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "HOQ065" in df: df.loc[df["HOQ065"].isin([7,9]), "HOQ065"] = np.nan
    keep = [c for c in ["SEQN","HOD050","HOQ065","SDDSRVYR"] if c in df.columns]
    return df[keep]

parts = []
seen = set()
for p, cyc in hoq_early_files:
    if p.exists() and cyc not in seen:
        df = read_any(p); df["SDDSRVYR"] = cyc
        df = filter_adults(df)
        parts.append(preprocess_hoq(df)); seen.add(cyc)
hoq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

if not hoq_main.exists():
    raise FileNotFoundError(f"Missing {hoq_main}")
hoq_main_df = preprocess_hoq(filter_adults(read_any(hoq_main)))

hoq_all = pd.concat([hoq_early, hoq_main_df], ignore_index=True)
for c in ("SEQN","SDDSRVYR"):
    if c in hoq_all: hoq_all[c] = pd.to_numeric(hoq_all[c], errors="coerce").astype("Int64")
hoq_all = hoq_all.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("HOQ:", hoq_all.shape)


HOQ: (55081, 4)


In [26]:
# %% HIQ/HIQS: 1999–2018 (insurance category INS)
HIQ = MOD / "hiq"
hiq_main = HIQ / "hiqs.sas7bdat"
hiq_early_files = [(HIQ/"HIQ.xpt",1), (HIQ/"HIQ_B.xpt",2), (HIQ/"hiq.sas7bdat",1), (HIQ/"hiq_b.sas7bdat",2)]

# stack early + main, adult filter
parts, seen = [], set()
for p, cyc in hiq_early_files:
    if p.exists() and cyc not in seen:
        df = read_any(p); df["SDDSRVYR"] = cyc
        parts.append(filter_adults(df)); seen.add(cyc)
hiq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

if not hiq_main.exists():
    raise FileNotFoundError(f"Missing {hiq_main}")
hiqs = filter_adults(read_any(hiq_main))

hiq_all = pd.concat([hiq_early, hiqs], ignore_index=True)
hiq_all.columns = hiq_all.columns.str.upper()

# build INS (0 none, 1 private, 2 Medicare, 3 Medicaid/both, 5 other)
ins = pd.DataFrame({"SEQN": hiq_all["SEQN"]})
if "SDDSRVYR" in hiq_all: ins["SDDSRVYR"] = hiq_all["SDDSRVYR"]
ins["INS"] = np.nan

cond_private = (s_or_false(hiq_all,"HIQ031A") == 14) | (s_or_false(hiq_all,"HID030A") == 1)
ins.loc[cond_private, "INS"] = 1

cond_med = (
    (s_or_false(hiq_all,"HIQ031B") == 15) &
    (s_or_false(hiq_all,"HIQ031D") != 17) &
    (s_or_false(hiq_all,"HIQ031E") != 18)
) | (
    (s_or_false(hiq_all,"HID030B") == 1) &
    (s_or_false(hiq_all,"HID030C") != 1)
)
ins.loc[cond_med, "INS"] = 2

cond_mcaid_only = (
    ((s_or_false(hiq_all,"HIQ031D") == 17) | (s_or_false(hiq_all,"HIQ031E") == 18)) &
    (s_or_false(hiq_all,"HIQ031B") != 15)
) | (
    (s_or_false(hiq_all,"HID030B") != 1) &
    (s_or_false(hiq_all,"HID030C") == 1)
)
cond_both = (
    (s_or_false(hiq_all,"HIQ031B") == 15) & (s_or_false(hiq_all,"HIQ031D") == 17)
) | (
    (s_or_false(hiq_all,"HID030B") == 1) & (s_or_false(hiq_all,"HID030C") == 1)
)
ins.loc[cond_mcaid_only | cond_both, "INS"] = 3

other_cols = [c for c in ["HIQ031C","HIQ031F","HIQ031G","HIQ031H","HIQ031I"] if c in hiq_all.columns]
cond_other = hiq_all[other_cols].eq(1).any(axis=1) if other_cols else pd.Series(False, index=hiq_all.index)
cond_other = cond_other | (s_or_false(hiq_all,"HID030D") == 1)
ins.loc[cond_other, "INS"] = 5

none_conds = []
if "HIQ011" in hiq_all: none_conds.append(hiq_all["HIQ011"] == 2)
if "HID010" in hiq_all: none_conds.append(hiq_all["HID010"] == 2)
if none_conds: ins.loc[np.logical_or.reduce(none_conds), "INS"] = 0

for c in ("SEQN","SDDSRVYR"):
    if c in ins: ins[c] = pd.to_numeric(ins[c], errors="coerce").astype("Int64")
ins = ins.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("INS:", ins.shape)


INS: (55081, 3)


In [27]:
# %% FSQ/FSQS: 1999–2018 (SNAP & FS)
FSQ = MOD / "fsq"
fsq_main = FSQ / "fsqs.sas7bdat"
fsq_early_files = [(FSQ/"FSQ.xpt",1), (FSQ/"FSQ_B.xpt",2), (FSQ/"fsq.sas7bdat",1), (FSQ/"fsq_b.sas7bdat",2)]

parts, seen = [], set()
for p, cyc in fsq_early_files:
    if p.exists() and cyc not in seen:
        df = read_any(p); df["SDDSRVYR"] = cyc
        parts.append(filter_adults(df)); seen.add(cyc)
fsq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

if not fsq_main.exists():
    raise FileNotFoundError(f"Missing {fsq_main}")
fsqs = filter_adults(read_any(fsq_main))

fsq_all = pd.concat([fsq_early, fsqs], ignore_index=True)
fsq_all.columns = fsq_all.columns.str.upper()

snap = pd.DataFrame({"SEQN": fsq_all["SEQN"]})
if "SDDSRVYR" in fsq_all: snap["SDDSRVYR"] = fsq_all["SDDSRVYR"]
if "FSDHH" in fsq_all:   snap["FSDHH"] = fsq_all["FSDHH"]

snap["SNAP"] = np.nan
if "FSQ165" in fsq_all: snap.loc[fsq_all["FSQ165"] == 2, "SNAP"] = 0
if "FSQ012" in fsq_all:
    snap.loc[fsq_all["FSQ012"] == 1, "SNAP"] = 1
    snap.loc[fsq_all["FSQ012"] == 2, "SNAP"] = 0
if "FSQ171" in fsq_all:
    snap.loc[fsq_all["FSQ171"] == 1, "SNAP"] = 1
    snap.loc[fsq_all["FSQ171"] == 2, "SNAP"] = 0
if "FSD170N" in fsq_all: snap.loc[fsq_all["FSD170N"] >= 1, "SNAP"] = 1
if "FSQ170" in fsq_all:
    snap.loc[fsq_all["FSQ170"] == 1, "SNAP"] = 1
    snap.loc[(fsq_all["FSQ170"] == 2) & (fsq_all.get("FSD170N", pd.Series(index=fsq_all.index)) < 1), "SNAP"] = 0
if "FSD200" in fsq_all: snap.loc[fsq_all["FSD200"] == 1, "SNAP"] = 1

snap["FS"] = np.nan
if "FSDHH" in fsq_all:
    snap.loc[fsq_all["FSDHH"].isin([1,2]), "FS"] = 1
    snap.loc[fsq_all["FSDHH"] > 2,        "FS"] = 0

for c in ("SEQN","SDDSRVYR"):
    if c in snap: snap[c] = pd.to_numeric(snap[c], errors="coerce").astype("Int64")
snap = snap[[c for c in ["SEQN","SNAP","FSDHH","FS","SDDSRVYR"] if c in snap.columns]]
snap = snap.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("SNAP/FS:", snap.shape)


SNAP/FS: (55081, 5)


<h3> Audit — cycles present & inner vs left-merge coverage </h3>

In [28]:
# %% Audit helpers + run
def cycles(df, name):
    if "SDDSRVYR" in df.columns and df["SDDSRVYR"].notna().any():
        print(f"{name} cycles:", sorted(df["SDDSRVYR"].dropna().astype(int).unique()))
    else:
        tmp = df.merge(demo9923[["SEQN","SDDSRVYR"]], on="SEQN", how="left")
        if tmp["SDDSRVYR"].notna().any():
            print(f"{name} cycles (via DEMO):", sorted(tmp["SDDSRVYR"].dropna().astype(int).unique()))
        else:
            print(f"{name} cycles: (none found)")

def coverage(base, addon, name):
    base_u = base["SEQN"].dropna().nunique()
    inner_u = base.merge(addon[["SEQN"]].drop_duplicates(), on="SEQN", how="inner")["SEQN"].nunique()
    left_u  = base.merge(addon[["SEQN"]].drop_duplicates(),  on="SEQN", how="left")["SEQN"].nunique()
    print(f"{name:>5} | base={base_u:,} | inner keeps={inner_u:,} | left keeps={left_u:,}")

# Choose a base for audit (prefers mortality, else diet)
if "mort_with_demo" in globals():
    base_df = mort_with_demo
elif "SODH_diet_mort" in globals():
    base_df = SODH_diet_mort
else:
    base_df = demo9923

for (n, df) in [("OCQ", ocq), ("HOQ", hoq_all), ("HIQ", ins), ("FSQ", snap)]:
    cycles(df, n)
    coverage(base_df, df, n)


OCQ cycles: [np.int64(1), np.int64(2)]
  OCQ | base=56,253 | inner keeps=52,287 | left keeps=56,253
HOQ cycles: [np.int64(1), np.int64(2)]
  HOQ | base=56,253 | inner keeps=52,287 | left keeps=56,253
HIQ cycles: [np.int64(1), np.int64(2)]
  HIQ | base=56,253 | inner keeps=52,287 | left keeps=56,253
FSQ cycles: [np.int64(1), np.int64(2)]
  FSQ | base=56,253 | inner keeps=52,287 | left keeps=56,253


<h3> Left-merge onto mortality base and save </h3>

In [54]:
# check if output path is correct 
import os
from pathlib import Path

print("ROOT =", ROOT)
print("OUT  =", OUT)
print("SDOH_ROOT env =", os.environ.get("SDOH_ROOT"))

# ✅ Force the correct OUT (one-time reassignment in this kernel)
OUT = ROOT / "output"
print("OUT (fixed) =", OUT)


ROOT = /Users/dengshuyue/Desktop/SDOH/analysis
OUT  = /Users/dengshuyue/Desktop/SDOH/analysis/output
SDOH_ROOT env = None
OUT (fixed) = /Users/dengshuyue/Desktop/SDOH/analysis/output


In [52]:
# %% Build master (LEFT merges onto mort_with_demo) and save — refined
import pandas as pd
import numpy as np

if "mort_with_demo" not in globals():
    print("⚠️ mort_with_demo not found — skip merge/save.")
else:
    def _key(df: pd.DataFrame) -> pd.DataFrame:
        d = df.copy()
        d.columns = d.columns.str.upper()
        d["SEQN"] = pd.to_numeric(d["SEQN"], errors="coerce").astype("Int64")
        return d.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

    base = _key(mort_with_demo)

    # Pick only needed columns if present
    ocq_m  = _key(ocq)      [[c for c in ("SEQN","EMPLOY","UNEMPLOYMENT") if c in ocq.columns]]      if "ocq"      in globals() else None
    hoq_m  = _key(hoq_all)  [[c for c in ("SEQN","HOD050","HOQ065")        if c in hoq_all.columns]]  if "hoq_all"  in globals() else None
    ins_m  = _key(ins)      [[c for c in ("SEQN","INS")                    if c in ins.columns]]      if "ins"      in globals() else None
    snap_m = _key(snap)     [[c for c in ("SEQN","SNAP","FSDHH","FS")      if c in snap.columns]]     if "snap"     in globals() else None

    # Merge in sequence with one-to-one validation
    master = base
    for name, piece in [("OCQ", ocq_m), ("HOQ", hoq_m), ("INS", ins_m), ("FSQ", snap_m)]:
        if piece is None:
            print(f"ℹ️  {name}: not available — skipped")
            continue
        try:
            master = master.merge(piece, on="SEQN", how="left", validate="one_to_one")
        except Exception as e:
            raise RuntimeError(f"Merge failed for {name} (check duplicate SEQN).") from e

    print("✅ master shape:", master.shape)

    # Quick audit (non-missing coverage for key SDOH fields)
    audit_cols = [c for c in ["EMPLOY","UNEMPLOYMENT","HOD050","HOQ065","INS","SNAP","FS"] if c in master.columns]
    if audit_cols:
        cov = (master[audit_cols].notna().mean()*100).round(1).sort_values(ascending=False)
        print("\nNon-missing coverage (%):")
        for k, v in cov.items(): print(f"  {k}: {v}%")

    # Save (use predefined OUT)
    OUT.mkdir(parents=True, exist_ok=True)
    base_name = "nhanes_mort_demo_sdoh_1999_2018"
    master.to_parquet(OUT / f"{base_name}.parquet", index=False)
    master.to_csv(    OUT / f"{base_name}.csv",     index=False)
    print("\nSaved:\n ", OUT / f"{base_name}.parquet", "\n ", OUT / f"{base_name}.csv")


✅ master shape: (56253, 20)

Non-missing coverage (%):
  UNEMPLOYMENT: 92.9%
  HOD050: 91.6%
  HOQ065: 91.5%
  INS: 87.7%
  EMPLOY: 86.8%
  SNAP: 76.3%
  FS: 74.7%

Saved:
  /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.parquet 
  /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.csv


In [65]:
# %% Build master (LEFT merges onto mort_with_demo) and save — refined
from pathlib import Path
import pandas as pd
import numpy as np

# If OUT wasn't defined earlier, set it here
ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT = ROOT / "out"
OUT.mkdir(parents=True, exist_ok=True)

if "mort_with_demo" not in globals():
    print("⚠️ mort_with_demo not found — skip merge/save.")
else:
    def _key(df: pd.DataFrame) -> pd.DataFrame:
        d = df.copy()
        d.columns = d.columns.str.upper()
        # normalize SEQN
        d["SEQN"] = pd.to_numeric(d["SEQN"], errors="coerce")
        # if it's float but all .0, move to Int64
        if pd.api.types.is_float_dtype(d["SEQN"]) and ((d["SEQN"] % 1) == 0).all():
            d["SEQN"] = d["SEQN"].astype("Int64")
        else:
            d["SEQN"] = d["SEQN"].astype("Int64")
        return d.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

    base = _key(mort_with_demo)

    # helper to pick columns AFTER uppercasing
    def _pick_cols(df_up: pd.DataFrame, candidates: tuple[str, ...]) -> pd.DataFrame:
        keep = [c for c in candidates if c in df_up.columns]
        return df_up[keep] if keep else df_up[["SEQN"]]

    ocq_m  = _pick_cols(_key(ocq),      ("SEQN","EMPLOY","UNEMPLOYMENT")) if "ocq"      in globals() else None
    hoq_m  = _pick_cols(_key(hoq_all),  ("SEQN","HOD050","HOQ065"))        if "hoq_all"  in globals() else None
    ins_m  = _pick_cols(_key(ins),      ("SEQN","INS"))                    if "ins"      in globals() else None
    snap_m = _pick_cols(_key(snap),     ("SEQN","SNAP","FSDHH","FS"))      if "snap"     in globals() else None

    # Merge in sequence with one-to-one validation
    master = base
    for name, piece in [("OCQ", ocq_m), ("HOQ", hoq_m), ("INS", ins_m), ("FSQ", snap_m)]:
        if piece is None:
            print(f"ℹ️  {name}: not available — skipped")
            continue
        # make sure no dupes in piece (already handled by _key → but safe)
        if piece["SEQN"].duplicated().any():
            raise RuntimeError(f"{name}: duplicate SEQN detected after cleaning.")
        try:
            master = master.merge(piece, on="SEQN", how="left", validate="one_to_one")
        except Exception as e:
            raise RuntimeError(f"Merge failed for {name} (check duplicate SEQN).") from e

    print("✅ master shape:", master.shape)

    # Quick audit (non-missing coverage for key SDOH fields)
    audit_cols = [c for c in ["EMPLOY","UNEMPLOYMENT","HOD050","HOQ065","INS","SNAP","FS"] if c in master.columns]
    if audit_cols:
        cov = (master[audit_cols].notna().mean()*100).round(1).sort_values(ascending=False)
        print("\nNon-missing coverage (%):")
        for k, v in cov.items():
            print(f"  {k}: {v}%")
    else:
        print("ℹ️ No audit columns present among EMPLOY/UNEMPLOYMENT/HOD050/HOQ065/INS/SNAP/FS.")

    # Save
    base_name = "nhanes_mort_demo_sdoh_1999_2018"
    master.to_parquet(OUT / f"{base_name}.parquet", index=False)
    master.to_csv(    OUT / f"{base_name}.csv",     index=False)
    print("\nSaved:\n ", OUT / f"{base_name}.parquet", "\n ", OUT / f"{base_name}.csv")


✅ master shape: (56253, 20)

Non-missing coverage (%):
  UNEMPLOYMENT: 92.9%
  HOD050: 91.6%
  HOQ065: 91.5%
  INS: 87.7%
  EMPLOY: 86.8%
  SNAP: 76.3%
  FS: 74.7%

Saved:
  /Users/dengshuyue/Desktop/SDOH/analysis/out/nhanes_mort_demo_sdoh_1999_2018.parquet 
  /Users/dengshuyue/Desktop/SDOH/analysis/out/nhanes_mort_demo_sdoh_1999_2018.csv


In [58]:
# keep a clear in-memory handle for downstream cells
nhanes_mort_demo_sdoh = master.copy()

# TEMP compatibility alias for older notebooks
nhanes_mort_demo_soc_9918 = nhanes_mort_demo_sdoh
df = nhanes_mort_demo_sdoh

In [59]:


# Use in-memory object if available; else load from disk
try:
    df = nhanes_mort_demo_soc_9918
except NameError:
    ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
    candidates = [
        ROOT / "analysis" / "output" / "nhanes_mort_demo_soc_1999_2018.parquet",
        ROOT / "analysis" / "output" / "mort_with_demo_plus_soc.parquet",
    ]
    for p in candidates:
        if p.exists():
            df = pd.read_parquet(p)
            print(f"Loaded from: {p}")
            break
    else:
        raise FileNotFoundError("Couldn’t find the saved table in expected locations.")

# Ensure SEQN exists and is numeric
if "SEQN" not in df.columns:
    raise KeyError("SEQN column not found.")
s = pd.to_numeric(df["SEQN"], errors="coerce")

print(f"Rows: {len(df):,} | Unique SEQN: {s.nunique(dropna=True):,} | "
      f"Missing SEQN: {s.isna().sum():,} | Duplicates: {df.duplicated('SEQN').sum():,}")
print(f"SEQN range: {int(s.min())} → {int(s.max())}")

# Optional: cycles present (if available)
if "SDDSRVYR" in df.columns:
    print("Cycles:\n", df["SDDSRVYR"].value_counts(dropna=False).sort_index())


Rows: 56,253 | Unique SEQN: 56,253 | Missing SEQN: 0 | Duplicates: 0
SEQN range: 2 → 102956
Cycles:
 SDDSRVYR
1.0     4973
2.0     5586
3.0     5293
4.0     5332
5.0     5989
6.0     6346
7.0     5603
8.0     5913
9.0     5720
10.0    5498
Name: count, dtype: int64


<h3> Optional Check for old files and moved to old </h3>

In [61]:
from pathlib import Path
from datetime import datetime
import pandas as pd

# Correct output folder (no double "analysis")
out = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")

# Try new names first, then legacy
candidates = [
    out / "nhanes_mort_demo_sdoh_1999_2018.parquet",
    out / "nhanes_mort_demo_sdoh_1999_2018.csv",
    out / "nhanes_mort_demo_soc_1999_2018.parquet",  # legacy
    out / "nhanes_mort_demo_soc_1999_2018.csv",      # legacy
]

# Show which of the candidates exist, with timestamps and sizes
print("Looking in:", out)
found_any = False
for p in candidates:
    if p.exists():
        ts = datetime.fromtimestamp(p.stat().st_mtime).strftime("%Y-%m-%d %H:%M:%S")
        print(f"  ✓ {p.name:40s} | modified: {ts} | size: {p.stat().st_size:,} bytes")
        found_any = True
    else:
        print(f"  - {p.name:40s} (missing)")

# Load the first existing file (prefer Parquet)
df = None
for p in candidates:
    if p.exists():
        df = pd.read_parquet(p) if p.suffix == ".parquet" else pd.read_csv(p)
        print("\nLoaded:", p)
        break

if df is None:
    print("\n⚠️ No saved table found in expected locations. "
          "Re-run the merge/save cell in 00_demo_mort_sdoh.ipynb.")
else:
    print("Shape:", df.shape)
    print("Cols (first 10):", df.columns[:10].tolist())


Looking in: /Users/dengshuyue/Desktop/SDOH/analysis/output
  ✓ nhanes_mort_demo_sdoh_1999_2018.parquet  | modified: 2025-09-09 13:20:01 | size: 774,526 bytes
  ✓ nhanes_mort_demo_sdoh_1999_2018.csv      | modified: 2025-09-09 13:20:01 | size: 4,562,121 bytes
  - nhanes_mort_demo_soc_1999_2018.parquet   (missing)
  - nhanes_mort_demo_soc_1999_2018.csv       (missing)

Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.parquet
Shape: (56253, 20)
Cols (first 10): ['SEQN', 'ELIGSTAT', 'MORTSTAT', 'PERMTH_EXM', 'PERMTH_INT', 'UCOD_LEADING', 'DIABETES', 'HYPERTEN', 'TIME_Y', 'EVENT']


In [62]:
from pathlib import Path
from datetime import datetime

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT  = ROOT / "analysis" / "output"
DATA = ROOT / "data"

patterns = {
    OUT:  ["mort_with_demo_plus_soc.*",
           "nhanes_mort_demo_soc_*.parquet",
           "nhanes_mort_demo_soc_*.csv"],
    DATA: ["SODH_diet_mort*"],
}

def keep_newest_per_ext(paths):
    by_ext = {}
    for p in paths:
        by_ext.setdefault(p.suffix.lower(), []).append(p)
    keep = set()
    for ext, files in by_ext.items():
        keep.add(max(files, key=lambda x: x.stat().st_mtime))
    return keep

def fmt_size(num_bytes: int) -> str:
    for unit in ("B","KB","MB","GB","TB"):
        if num_bytes < 1024 or unit == "TB":
            return f"{num_bytes:,.0f} {unit}"
        num_bytes /= 1024

# gather matches
all_matches = []
for base, globs in patterns.items():
    for pat in globs:
        all_matches.extend(sorted(base.glob(pat)))

# decide which to keep/delete
TO_KEEP = keep_newest_per_ext(all_matches)
TO_DELETE = [p for p in all_matches if p not in TO_KEEP]

print("✅ Keeping (newest per extension):")
keep_total = 0
for p in sorted(TO_KEEP):
    stat = p.stat()
    ts = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
    sz = fmt_size(stat.st_size); keep_total += stat.st_size
    print(f"  ✓ {p}  |  {ts}  |  {sz}")
print(f"  ↳ total size kept: {fmt_size(keep_total)}")

print("\n🗑️ Candidates to delete (older versions):")
del_total = 0
for p in sorted(TO_DELETE):
    stat = p.stat()
    ts = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
    sz = fmt_size(stat.st_size); del_total += stat.st_size
    print(f"  - {p}  |  {ts}  |  {sz}")
print(f"  ↳ total size to delete: {fmt_size(del_total)}")

(len(TO_KEEP), len(TO_DELETE))


✅ Keeping (newest per extension):
  ✓ /Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort.pkl  |  2025-07-18 10:39:20  |  27 MB
  ✓ /Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort_depr2.csv  |  2025-09-08 16:16:55  |  21 MB
  ↳ total size kept: 48 MB

🗑️ Candidates to delete (older versions):
  - /Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort6.csv  |  2025-08-10 11:39:07  |  14 MB
  ↳ total size to delete: 14 MB


(2, 1)

In [63]:
from pathlib import Path
from datetime import datetime
import shutil

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT / "data"
OLD  = DATA / "old"
OLD.mkdir(parents=True, exist_ok=True)

# 1) Find the SODH diet CSVs under data/
cands = sorted(DATA.glob("SODH_diet_mort*.csv"))

# 2) Exclude the two you want to keep
exclude_stems = {"SODH_diet_mort_depr2", "SODH_diet_mort6"}
to_move = [p for p in cands if p.stem not in exclude_stems]

# --- Preview ---
def fmt_size(n):
    for u in ("B","KB","MB","GB","TB"):
        if n < 1024: return f"{n:,.0f} {u}"
        n /= 1024

if not to_move:
    print("Nothing to move.")
else:
    total = 0
    print("Will move to /data/old:")
    for p in to_move:
        st = p.stat()
        total += st.st_size
        ts = datetime.fromtimestamp(st.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
        print(f"  • {p.name}  |  {fmt_size(st.st_size)}  |  {ts}")
    print(f"Total: {fmt_size(total)}")

# --- Move (flip to True to execute) ---
CONFIRM_MOVE = False  # <-- set True to actually move

if CONFIRM_MOVE and to_move:
    for p in to_move:
        target = OLD / p.name
        if target.exists():  # avoid overwrite if a same-named file already there
            stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            target = OLD / f"{p.stem}_{stamp}{p.suffix}"
        shutil.move(str(p), str(target))
        print(f"📦 moved: {p.name} -> {target.name}")


Nothing to move.


In [64]:
# Recompute to_move safely
cands = sorted(DATA.glob("SODH_diet_mort*.csv"))
exclude_stems = {"SODH_diet_mort_depr2", "SODH_diet_mort6"}
to_move = [p for p in cands if p.stem not in exclude_stems]

# Execute move
for p in to_move:
    target = OLD / p.name
    if target.exists():  # avoid overwrite
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        target = OLD / f"{p.stem}_{stamp}{p.suffix}"
    shutil.move(str(p), str(target))
    print(f"📦 moved: {p.name} -> {target}")

# Quick verify
print("\nNow in /data/old:")
for p in sorted(OLD.glob("SODH_diet_mort*.csv")):
    print(" -", p.name)


Now in /data/old:
 - SODH_diet_mort.csv
 - SODH_diet_mort2.csv
 - SODH_diet_mort3.csv
 - SODH_diet_mort4.csv
 - SODH_diet_mort5.csv
 - SODH_diet_mort_depr.csv
