<h1> 00 — Bootstrap (paths, cycles, helpers)</h1>

<h2>Shared environment and helper functions used across notebooks.</h2>
    

In [1]:
from pathlib import Path
import os, sys, warnings
import numpy as np
import pandas as pd

# -------------------------
# Root & existing folders (NO mkdir here)
# -------------------------
# Project root: the SDOH_ROOT environment variable wins when it is set,
# otherwise fall back to the local default below.
ROOT = Path(os.getenv("SDOH_ROOT", "/Users/dengshuyue/Desktop/SDOH/analysis"))

# Top-level project folders (referenced only; nothing is created here).
CODE, DATA, OUT = (ROOT / folder for folder in ("code", "data", "output"))

# Data subfolders that you already have
NH_DEIT      = DATA.joinpath("nhanes_deit")
NH_BY_MOD    = DATA.joinpath("nhanes_by_module")
FPED_DIR     = DATA.joinpath("fped")
FNDDDS_DIR   = DATA.joinpath("fndds")
BPQ_DIR      = DATA.joinpath("bpq")
HEALTH_ACC   = DATA.joinpath("health_access")
HH_SIZE_DIR  = DATA.joinpath("household_size")
TMP_NORM_XPT = DATA.joinpath("tmp_norm_xpt")
LESS_IMP     = DATA.joinpath("less_important")

# Common input files, keyed by a short name.
# (multiple CSV variants exist; we’ll glob when needed)
FILES = dict(
    demoall_csv=DATA / "demoall.csv",
    demoall_pkl=DATA / "demoall.pkl",
    hei9918_sas7bdat=DATA / "hei9918.sas7bdat",
    sodh_diet_mort_sas=DATA / "sodh_diet_mort.sas7bdat",
    sodh_diet_mort_pkl=DATA / "SODH_diet_mort.pkl",
)

# Output files, keyed by a short name.
OUT_FILES = dict(
    demo_summary_csv=OUT / "demo_summary.csv",
    demo_summary_r_csv=OUT / "demo_summary_r.csv",
    ahei_combined_csv=DATA / "ahei_combined.csv",  # lives under data/, not output/
)
TABLES_FIGS = OUT  # tables are kept directly in output/

# -------------------------
# NHANES cycles
# - Keep it simple: Cox is in R; use explicit lists
# -------------------------
# Ten consecutive two-year cycles, labelled "<start>-<start+1>" from 1999-2000 to 2017-2018.
CYCLES_MORTALITY = [f"{start}-{start + 1}" for start in range(1999, 2018, 2)]

# Releases kept separate from the mortality list above.
CYCLES_NONMORT = [
    "2017-March 2020 (pre-pandemic)",  # P_DEMO.xpt style
    "August 2021–August 2023",         # DEMO_L.xpt
]
CYCLES_ALL = [*CYCLES_MORTALITY, *CYCLES_NONMORT]

# Optional suffix/prefix hints (use only if you need to load DEMO files by pattern).
# The first cycle has no suffix; the following nine use "_B" ... "_J".
CYCLE_SUFFIX = dict(
    zip(CYCLES_MORTALITY, [""] + [f"_{letter}" for letter in "BCDEFGHIJ"])
)
CYCLE_SUFFIX.update({
    "2017-March 2020 (pre-pandemic)": "P_",  # a prefix, e.g. P_DEMO.xpt
    "August 2021–August 2023": "_L",         # e.g. DEMO_L.xpt
})
def cycle_suffix(label: str) -> str:
    """
    Look up the file-name hint registered for a cycle label in CYCLE_SUFFIX.

    Returns the registered string, or "" when the label is not in the table.
    Note that the pre-pandemic entry is a prefix ("P_"), not a suffix.
    """
    if label in CYCLE_SUFFIX:
        return CYCLE_SUFFIX[label]
    return ""

# -------------------------
# Small helpers
# -------------------------
def z(x):
    """
    Standardize values to z-scores: (value - mean) / sample standard deviation.

    The input is converted to a float64 Series; missing values are ignored when
    computing the mean and standard deviation and stay missing in the result.
    """
    values = pd.Series(x, dtype="float64")
    center = values.mean(skipna=True)
    spread = values.std(skipna=True)
    return values.sub(center).div(spread)

def combine_wtmec(w, n_cycles: int):
    """
    Rescale 2-year MEC weights for a stack of several cycles.

    When stacking cycles, divide 2-year MEC weights by the number of
    2-year cycles actually included in the stack you’re building.

    Parameters
    ----------
    w : scalar, array or Series of weights.
    n_cycles : number of 2-year cycles in the stack; must be > 0.

    Returns
    -------
    ``w / n_cycles`` (same kind of object as ``w``).

    Raises
    ------
    ValueError
        If ``n_cycles`` is zero, negative or NaN. Without this check a Series
        input would silently become inf or negative weights.

    NOTE(review): this is a plain division. Releases that are not a single
    2-year cycle (e.g. the 2017-March 2020 file) presumably need their own
    weighting rule -- confirm against the NHANES analytic guidelines.
    """
    divisor = float(n_cycles)
    if not divisor > 0:  # also rejects NaN
        raise ValueError(f"n_cycles must be a positive number of cycles, got {n_cycles!r}")
    return w / divisor

def list_existing(paths):
    """
    Debug helper: report which of the given paths are present on disk.

    `paths` is a mapping of name -> path object. The result has the same keys;
    each value is the original path when it exists and None when it does not.
    """
    found = {}
    for key, candidate in paths.items():
        found[key] = candidate if candidate.exists() else None
    return found

# -------------------------
# Display prefs
# -------------------------
# Pandas display limits for this session.
pd.set_option("display.max_rows", 60)
pd.set_option("display.max_columns", 120)
# Silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Startup report: resolved root, whether the expected folders exist, cycle lists.
print("Bootstrap loaded.")
for _label, _value in (
    ("ROOT:", ROOT),
    ("Data dir exists:", DATA.exists()),
    ("Output dir exists:", OUT.exists()),
    ("Mortality cycles:", CYCLES_MORTALITY),
    ("Non-mortality cycles:", CYCLES_NONMORT),
):
    print(_label, _value)


Bootstrap loaded.
ROOT: /Users/dengshuyue/Desktop/SDOH/analysis
Data dir exists: True
Output dir exists: True
Mortality cycles: ['1999-2000', '2001-2002', '2003-2004', '2005-2006', '2007-2008', '2009-2010', '2011-2012', '2013-2014', '2015-2016', '2017-2018']
Non-mortality cycles: ['2017-March 2020 (pre-pandemic)', 'August 2021–August 2023']


<h2>Preview SAS code and data</h2>

In [2]:
from pathlib import Path
import pandas as pd

BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data")

# Find DEMO for 2015–2016 ("I").
# Sort the hits: rglob yields files in filesystem order, so when several copies
# exist under data/ the "first" one would otherwise vary between machines/runs.
cands = sorted(BASE.rglob("DEMO_I.*"))
print("Candidates:", cands)

if not cands:
    # fallback: list any DEMO files you have
    print("Any DEMO files I can see:")
    print(sorted(BASE.rglob("DEMO*.*")))
else:
    p = cands[0]
    if p.suffix.lower() == ".xpt":
        df = pd.read_sas(p, format="xport")  # NHANES XPT format
    elif p.suffix.lower() == ".sas7bdat":
        import pyreadstat
        df, _ = pyreadstat.read_sas7bdat(p)
    else:
        raise ValueError(f"Unknown extension: {p.suffix}")

    # Show every column for this preview only; a global set_option would
    # silently change how all later cells render their tables.
    with pd.option_context("display.max_columns", None):
        display(df.head())
    print(df.columns.tolist())


Candidates: [PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/data/household_size/DEMO_I.xpt'), PosixPath('/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_deit/DEMO_I.xpt')]


Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,DMQMILIZ,DMQADFC,DMDBORN4,DMDCITZN,DMDYRSUS,DMDEDUC3,DMDEDUC2,DMDMARTL,RIDEXPRG,SIALANG,SIAPROXY,SIAINTRP,FIALANG,FIAPROXY,FIAINTRP,MIALANG,MIAPROXY,MIAINTRP,AIALANGA,DMDHHSIZ,DMDFMSIZ,DMDHHSZA,DMDHHSZB,DMDHHSZE,DMDHRGND,DMDHRAGE,DMDHRBR4,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR
0,83732.0,9.0,2.0,1.0,62.0,,3.0,3.0,1.0,,2.0,,1.0,1.0,,,5.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,5.397605e-79,5.397605e-79,1.0,1.0,62.0,1.0,5.0,1.0,3.0,134671.370419,135629.507405,1.0,125.0,10.0,10.0,4.39
1,83733.0,9.0,2.0,1.0,53.0,,3.0,3.0,1.0,,2.0,,2.0,2.0,7.0,,3.0,3.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,5.397605e-79,1.0,53.0,2.0,3.0,3.0,,24328.560239,25282.425927,1.0,125.0,4.0,4.0,1.32
2,83734.0,9.0,2.0,1.0,78.0,,3.0,3.0,2.0,,1.0,2.0,1.0,1.0,,,3.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,,2.0,2.0,5.397605e-79,5.397605e-79,2.0,2.0,79.0,1.0,3.0,1.0,3.0,12400.008522,12575.838818,1.0,131.0,5.0,5.0,1.51
3,83735.0,9.0,2.0,2.0,56.0,,3.0,3.0,2.0,,2.0,,1.0,1.0,,,5.0,6.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,5.397605e-79,2.0,56.0,1.0,5.0,6.0,,102717.995647,102078.634508,1.0,131.0,10.0,10.0,5.0
4,83736.0,9.0,2.0,2.0,42.0,,4.0,4.0,2.0,,2.0,,1.0,1.0,,,4.0,3.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,5.0,5.0,5.397605e-79,2.0,5.397605e-79,2.0,42.0,1.0,4.0,3.0,,17627.674984,18234.736219,2.0,126.0,7.0,7.0,1.23


['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN', 'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMQADFC', 'DMDBORN4', 'DMDCITZN', 'DMDYRSUS', 'DMDEDUC3', 'DMDEDUC2', 'DMDMARTL', 'RIDEXPRG', 'SIALANG', 'SIAPROXY', 'SIAINTRP', 'FIALANG', 'FIAPROXY', 'FIAINTRP', 'MIALANG', 'MIAPROXY', 'MIAINTRP', 'AIALANGA', 'DMDHHSIZ', 'DMDFMSIZ', 'DMDHHSZA', 'DMDHHSZB', 'DMDHHSZE', 'DMDHRGND', 'DMDHRAGE', 'DMDHRBR4', 'DMDHREDU', 'DMDHRMAR', 'DMDHSEDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'INDHHIN2', 'INDFMIN2', 'INDFMPIR']


In [3]:

# Path to your .sas7bdat file
file_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data/less_important/gg.sas7bdat"

# Load the dataset with pandas' built-in SAS reader (pd.read_sas does not need pyreadstat)
df = pd.read_sas(file_path, format="sas7bdat")

# Preview the data. A bare `df.head()` in the middle of a cell is evaluated and
# thrown away, so display it explicitly; the option is scoped to this preview
# so it does not change how later cells render.
with pd.option_context("display.max_columns", None):
    display(df.head())

# List all column names
print(df.columns.tolist())


['SEQN', 'DAYS', 'DR12IFDC', 'WTDRD1', 'DR12DRST', 'SDDSRVYR', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH1', 'DMDEDUC3', 'DMDEDUC2', 'INDFMPIR', 'DMDHREDU', 'WTINT2YR', 'WTMEC2YR', 'SDMVPSU', 'SDMVSTRA', 'DR12IKC2', 'CYCLE', 'WTDR2D', 'DR12FS', 'DMDHREDZ', 'age', 'race', 'edu', 'pedu', 'Incm', 'incm2', 'include', 'Weight16a', 'cycles', 'sex', 'age1', 'age2', 'age3', 'race2', 'race3', 'race4', 'weekend', '_NAME_', '_LABEL_', 'DRDAY1', 'DRDAY2', 'tkal1', 'tkal2', 'Tcal', 'DR12DAY', 'kcal1', 'kcal2', 'kcal3', 'kcal4', 'kcal12', 'wt1', 'wt2', 'wt3', 'wt4', 'wt12', 'pcte1', 'pcte2', 'pcte3', 'pcte4', 'pcte12', 'pctg1', 'pctg2', 'pctg3', 'pctg4', 'pctg12', 'kcals2', 'kcals5', 'kcals6', 'kcals9', 'kcals13', 'kcals14', 'kcals17', 'kcals20', 'kcals21', 'kcals22', 'kcals23', 'kcals25', 'kcals28', 'kcals29', 'kcals33', 'kcals36', 'kcals39', 'kcals41', 'kcals3', 'kcals37', 'kcals38', 'kcals40', 'kcals42', 'kcals1', 'kcals10', 'kcals16', 'kcals24', 'kcals15', 'kcals18', 'kcals7', 'kcals8', 'kcals19', 'kcals4

In [4]:
# Path to your .sas script file (SAS code, not dataset)
file_path = "/Users/dengshuyue/Desktop/SDOH/analysis/code/code_lu/Analysis1_COX_allcause.sas"

# How many lines of the script to print below.
N_PREVIEW_LINES = 300

# Load the SAS script as plain text. The encoding is stated explicitly so the
# read does not depend on the machine's locale; undecodable bytes are replaced
# instead of aborting the preview.
with open(file_path, "r", encoding="utf-8", errors="replace") as file:
    sas_code = file.read()

# Preview the first N_PREVIEW_LINES lines
sas_code_lines = sas_code.splitlines()
for line in sas_code_lines[:N_PREVIEW_LINES]:
    print(line)



libname data "C:\Users\lwang18\Box\Projects\5_UPF_Mortality\data";
*%let home=C:\Users\LWANG18\Box\Projects\5_UPF_Mortality\results_revision ; 
%let home= C:\Users\lwang18\Box\Projects\Kroger project\Data for analysis\Scores;
 %let path= C:\Users\lwang18\Box\Projects\Kroger project\Data for analysis\Scores;

%let home= C:\Users\lwang18\OneDrive - Tufts\Desktop\Projects\Food Insecurity,;
libname out "C:\Users\lwang18\OneDrive - Tufts\Desktop\Projects\Food Insecurity,";

libname NHANES "C:\Users\LWANG18\Box\NHANES_Lu" ;

/** main analysis **/

%macro cox(data, dvar, evars, covars, death , out);
ods select all ; 
ODS OUTPUT PARAMETERESTIMATES=r0; 
proc surveyphreg data=&data;
	strata sdmvstra;
	cluster sdmvpsu;
	weight wt;
	class    sex (ref="1") race edu(ref="1") smk  pir(ref="3") SNAP(ref="0") FS ins2(ref="1") ins i_FCS_sdq hei2015q(ref="3") marriage  hoq065/param=ref;
	model py*&death(0)= &dvar &covars /rl ties=breslow;
run ;
data r0 ; set r0 ; 
HRCL=compress(round(hazardratio,0.01)||

<h2>Step 1: NHANES demographic data (fetch and merge)</h2>

In [28]:
# %% Step 1: Paths and cycle mapping (matches your project)
from pathlib import Path
import os
import pandas as pd
import numpy as np
import requests

# Project base folder and its data/ directory for this cell.
BASE = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA: Path = BASE.joinpath("data")
# Make sure the download target folder exists (where we’ll save downloads).
DATA.joinpath("nhanes_by_module", "DEMO").mkdir(parents=True, exist_ok=True)

# --- Candidate upstream URLs for DEMO files (multiple fallbacks per cycle) ---
# Includes the “classic” /Nhanes/<cycle>/ path and the /Nchs/Data/Nhanes/Public/<year>/DataFiles/ path.
def nhanes_url_candidates():
    """
    Build the candidate DEMO download URLs for every cycle label.

    Returns a dict mapping cycle label -> list of URLs, in the order they
    should be tried. For each file stem the newer
    /Nchs/Data/Nhanes/Public/<year>/DataFiles/ location comes first, followed
    by the classic /Nchs/Nhanes/<cycle>/ location. The last cycle lists
    DEMO_L first and DEMO_Q as an extra fallback.
    """
    public_tpl = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{year}/DataFiles/{stem}.xpt"
    classic_tpl = "https://wwwn.cdc.gov/Nchs/Nhanes/{folder}/{stem}.XPT"

    # (cycle label, year folder of the public path, classic folder, file stems)
    layout = [
        ("1999-2000", 1999, "1999-2000", ("DEMO",)),
        ("2001-2002", 2001, "2001-2002", ("DEMO_B",)),
        ("2003-2004", 2003, "2003-2004", ("DEMO_C",)),
        ("2005-2006", 2005, "2005-2006", ("DEMO_D",)),
        ("2007-2008", 2007, "2007-2008", ("DEMO_E",)),
        ("2009-2010", 2009, "2009-2010", ("DEMO_F",)),
        ("2011-2012", 2011, "2011-2012", ("DEMO_G",)),
        ("2013-2014", 2013, "2013-2014", ("DEMO_H",)),
        ("2015-2016", 2015, "2015-2016", ("DEMO_I",)),
        ("2017-2018", 2017, "2017-2018", ("DEMO_J",)),
        # Special combined pre-pandemic release
        ("2017-March 2020 (pre-pandemic)", 2017, "2017-2020", ("P_DEMO",)),
        # August 2021–August 2023 (DEMO_L; include Q as fallback just in case)
        ("August 2021–August 2023", 2021, "2021-2023", ("DEMO_L", "DEMO_Q")),
    ]

    candidates = {}
    for label, year, folder, stems in layout:
        candidates[label] = [
            template.format(year=year, folder=folder, stem=stem)
            for stem in stems
            for template in (public_tpl, classic_tpl)
        ]
    return candidates


NHANES_URLS = nhanes_url_candidates()

# Build local filename candidates per cycle (try these names before downloading)
def candidates_from_urls(urls):
    """
    Derive local file-name candidates from a list of download URLs.

    For each URL the final path component is taken and offered in four
    spellings: as given, upper-case, lower-case and capitalized. Duplicates
    are dropped while the order of first appearance is kept.
    """
    ordered = {}  # dict keys double as an insertion-ordered set
    for url in urls:
        base = Path(url).name
        for spelling in (base, base.upper(), base.lower(), base.capitalize()):
            ordered.setdefault(spelling, None)
    return list(ordered)

# cycle label -> local file names to look for before downloading
LOCAL_CANDIDATES = dict(
    zip(NHANES_URLS, map(candidates_from_urls, NHANES_URLS.values()))
)

# %% Step 2: helpers (local search + download + reader)
def find_first_under_data(patterns):
    """
    Search recursively under DATA for the first pattern that has a match.

    Patterns are tried in the order given. When a pattern matches several
    files (e.g. copies of the same DEMO file in different subfolders), the
    smallest path in sort order is returned, so the choice does not depend on
    the order in which the filesystem happens to list directories.

    Returns the matching Path, or None when no pattern matches anything.
    """
    for pattern in patterns:
        hit = min(DATA.rglob(pattern), default=None)
        if hit is not None:
            return hit
    return None

# define function to fetch from the CDC web site
def download_to(path: Path, url: str, timeout=90):
    """
    Download `url` to `path`, trying twice, and return `path`.

    The body is streamed into a sibling "<name>.downloading" file and moved
    onto `path` only after the transfer completed, so `path` never holds a
    partial download.

    Parameters
    ----------
    path : destination file; its parent directory is created if needed.
    url : address to fetch.
    timeout : passed to requests.get.

    Raises
    ------
    The exception from the last failed attempt (HTTP error status, network
    error, or a file-system error while writing).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    headers = {"User-Agent": "nhanes-fetch/1.0 (+https://cdc.gov)"}
    tmp = path.with_suffix(path.suffix + ".downloading")
    last_err = None
    for _attempt in range(2):  # two tries per URL
        try:
            # Context manager: a streamed response keeps its connection open
            # until it is closed explicitly.
            with requests.get(url, headers=headers, timeout=timeout, stream=True) as resp:
                resp.raise_for_status()
                with open(tmp, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=1 << 15):
                        if chunk:
                            f.write(chunk)
            # replace() overwrites an existing target on every platform.
            tmp.replace(path)
            return path
        except Exception as e:
            last_err = e
            # Do not leave a half-written temp file behind.
            try:
                tmp.unlink()
            except OSError:
                pass
    raise last_err

def ensure_demo_file(cycle_label: str) -> Path:
    """
    Return a local DEMO file for `cycle_label`, downloading it if necessary.

    A copy already present under DATA (any of the cycle's candidate file
    names) is preferred. Otherwise each known URL is tried in order and the
    first successful download, saved under nhanes_by_module/DEMO, is returned.

    Raises FileNotFoundError when nothing is found locally and every download
    fails.
    """
    existing = find_first_under_data(LOCAL_CANDIDATES[cycle_label])
    if existing:
        return existing

    for source_url in NHANES_URLS[cycle_label]:
        target = DATA.joinpath("nhanes_by_module", "DEMO", Path(source_url).name)
        try:
            print(f"⬇️  Downloading {cycle_label} from {source_url}")
            return download_to(target, source_url)
        except Exception as err:
            # Best effort: report and move on to the next candidate URL.
            print(f"   ⚠️ Download failed from {source_url}: {err}")

    raise FileNotFoundError(f"No DEMO file found or downloaded for {cycle_label}")

def read_demo_file(p: Path) -> pd.DataFrame:
    """
    Load a DEMO file into a DataFrame.

    Supports .xpt and .sas7bdat (extension compared case-insensitively).
    pyreadstat is tried first; if importing or reading with it fails for any
    reason, pandas.read_sas is used instead.

    Raises ValueError for any other extension.
    """
    ext = p.suffix.lower()
    if ext not in (".xpt", ".sas7bdat"):
        raise ValueError(f"Unsupported file type: {p.suffix}")

    if ext == ".xpt":
        try:
            import pyreadstat
            frame, _ = pyreadstat.read_xport(p)
        except Exception:
            frame = pd.read_sas(p, format="xport")
        return frame

    try:
        import pyreadstat
        frame, _ = pyreadstat.read_sas7bdat(p)
    except Exception:
        frame = pd.read_sas(p, format="sas7bdat")
    return frame

# Which cycles to include in this DEMO build (OK to include through 2023 here)
# Ten two-year cycles ("1999-2000" ... "2017-2018") ...
DEMO_CYCLES = [f"{first}-{first + 1}" for first in range(1999, 2018, 2)]
# ... followed by the two later releases.
DEMO_CYCLES += [
    "2017-March 2020 (pre-pandemic)",
    "August 2021–August 2023",
]




# --- helpers to recode marital status ---
def recode_L_to_4(s):
    """
    Collapse DMDMARTL into a 4-level marital-status code (nullable Int64).

    DMDMARTL codes per the NHANES DEMO codebook:
        1=Married, 2=Widowed, 3=Divorced, 4=Separated,
        5=Never married, 6=Living with partner, 77=Refused, 99=Don't know

    Output categories:
        1 = married / living with partner   (codes 1, 6)
        2 = widowed / divorced              (codes 2, 3)
        3 = never married                   (code 5)
        4 = separated                       (code 4)
        <NA> = refused, don't know, missing or any other value

    `s` is a Series; non-numeric entries are coerced to missing. The result
    keeps the index of `s`.
    """
    s = pd.to_numeric(s, errors="coerce")
    # Start all-missing and fill in only recognised codes, so 77/99 and
    # anything unexpected stay <NA>.
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    out = out.mask(s.isin([1, 6]), 1)   # married/partner
    out = out.mask(s.isin([2, 3]), 2)   # widowed/divorced
    out = out.mask(s == 5, 3)           # never married
    out = out.mask(s == 4, 4)           # separated
    return out

def recode_L_to_3(s):
    """
    Collapse DMDMARTL into the 3-level scheme used by DMDMARTZ (nullable Int64).

    DMDMARTL codes per the NHANES DEMO codebook:
        1=Married, 2=Widowed, 3=Divorced, 4=Separated,
        5=Never married, 6=Living with partner, 77=Refused, 99=Don't know

    Output categories:
        1 = married / living with partner                          (codes 1, 6)
        2 = previously married: widowed / divorced / separated     (codes 2, 3, 4)
        3 = never married                                          (code 5)
        <NA> = refused, don't know, missing or any other value

    `s` is a Series; non-numeric entries are coerced to missing. The result
    keeps the index of `s`.
    """
    s = pd.to_numeric(s, errors="coerce")
    # Start all-missing and fill in only recognised codes, so 77/99 and
    # anything unexpected stay <NA>.
    out = pd.Series(pd.NA, index=s.index, dtype="Int64")
    out = out.mask(s.isin([1, 6]), 1)
    out = out.mask(s.isin([2, 3, 4]), 2)
    out = out.mask(s == 5, 3)
    return out

def recode_Z_to_3(s):
    """
    Pass DMDMARTZ through as a nullable Int64 3-level marital-status code.

    Per the original note, DMDMARTZ is coded 1=married/partner,
    2=previously married, 3=never, 77/99=DK/Ref. Values 1, 2 and 3 are kept;
    77, 99, missing and anything else become <NA>. The result keeps the index
    of `s`.
    """
    codes = pd.to_numeric(s, errors="coerce")
    result = pd.Series(pd.NA, index=codes.index, dtype="Int64")
    for category in (1, 2, 3):
        result = result.mask(codes == category, category)
    # Explicitly blank the don't-know / refused codes.
    result = result.mask(codes.isin([77, 99]), pd.NA)
    return result

def compute_marriage_cols(df_upper: pd.DataFrame):
    """
    Return (MARRIAGE, MARRIAGE3) for a DataFrame whose column names are upper-case.

    - DMDMARTL present: both the 4-level and the 3-level recode are derived from it.
    - otherwise, DMDMARTZ present: only the 3-level recode is available; the
      4-level series is all <NA> (it cannot be reconstructed from Z).
    - neither present: both series are all <NA>.

    Both results are nullable Int64 Series on the frame's index.
    """
    columns = df_upper.columns

    if "DMDMARTL" in columns:
        raw = df_upper["DMDMARTL"]
        return recode_L_to_4(raw), recode_L_to_3(raw)

    blank4 = pd.Series(pd.NA, index=df_upper.index, dtype="Int64")
    if "DMDMARTZ" in columns:
        return blank4, recode_Z_to_3(df_upper["DMDMARTZ"])

    blank3 = pd.Series(pd.NA, index=df_upper.index, dtype="Int64")
    return blank4, blank3

# %% Step 3: Load (local-or-download), tag cycle, recode marriage per cycle, and stack
# Build one trimmed DataFrame per cycle; cycles whose file can neither be found
# locally nor downloaded are collected in `missing` and skipped.
# NOTE(review): DEMO_CYCLES lists both "2017-2018" and the combined
# "2017-March 2020 (pre-pandemic)" release; presumably these cover some of the
# same survey period -- confirm before pooling them in one analysis.
demo_dfs, missing = [], []
for cycle in DEMO_CYCLES:
    try:
        p = ensure_demo_file(cycle)
    except FileNotFoundError as e:
        print(f"⚠️ {e}")
        missing.append(cycle)
        continue

    df = read_demo_file(p)
    # Normalise column names so the lookups below work regardless of file casing.
    df.columns = [c.upper() for c in df.columns]
    df["CYCLE"] = cycle

    # Recode marital status for this cycle using whichever column exists
    M4, M3 = compute_marriage_cols(df)
    df["MARRIAGE"]  = M4          # 4-category; all NA for files without DMDMARTL
    df["MARRIAGE3"] = M3          # 3-category; from DMDMARTL or DMDMARTZ, whichever exists

    # Keep only relevant columns that exist
    keep = [c for c in ["SEQN","RIDAGEYR","SDDSRVYR","CYCLE","MARRIAGE","MARRIAGE3"] if c in df.columns]
    demo_dfs.append(df[keep])

if missing:
    print("⚠️ Missing cycles (not found and not downloaded):", missing)
# Nothing loaded at all: stop here rather than concatenating an empty list.
if not demo_dfs:
    raise FileNotFoundError("No DEMO files were found or downloaded under your data/ directory.")

# %% Step 4: Combine
# Stack all cycles row-wise with a fresh 0..n-1 index.
demo9923 = pd.concat(demo_dfs, ignore_index=True).copy()

# %% Step 5/6 merged: we already kept just the needed columns; print summary
print(f"✅ Total rows: {demo9923.shape[0]}")
print(f"✅ Unique SEQN: {demo9923['SEQN'].nunique()}")
print("Columns:", demo9923.columns.tolist())

# Optional quick check: MARRIAGE3 distribution (including missing) for the 2021–2023 rows
vc = demo9923.loc[demo9923["CYCLE"]=="August 2021–August 2023","MARRIAGE3"].value_counts(dropna=False)
print("\n2021–2023 MARRIAGE3 counts:\n", vc)

✅ Total rows: 128809
✅ Unique SEQN: 128809
Columns: ['SEQN', 'RIDAGEYR', 'SDDSRVYR', 'CYCLE', 'MARRIAGE', 'MARRIAGE3']

2021–2023 MARRIAGE3 counts:
 MARRIAGE3
<NA>    4150
1       4136
2       2022
3       1625
Name: count, dtype: Int64


<h3>save and check</h3>

In [29]:
# Simple saves without pyarrow (plain CSV)
# Two copies of the stacked DEMO table: a pickle for fast Python round-trips
# and an uncompressed CSV (no index column) for maximum compatibility.
out_pkl, out_csv = DATA / "demo9923.pkl", DATA / "demo9923.csv"

demo9923.to_pickle(out_pkl)
demo9923.to_csv(out_csv, index=False)

for _saved in (out_pkl, out_csv):
    print("Saved:", _saved)



Saved: /Users/dengshuyue/Desktop/SDOH/analysis/data/demo9923.pkl
Saved: /Users/dengshuyue/Desktop/SDOH/analysis/data/demo9923.csv


In [24]:
# Row counts per cycle label (missing values included in the tally).
print("\nCounts by CYCLE:", demo9923["CYCLE"].value_counts(dropna=False), sep="\n")

# Row counts per raw SDDSRVYR code, ordered by code.
print(
    "\nCounts by SDDSRVYR (raw numeric code, may be NaN for newer releases):",
    demo9923["SDDSRVYR"].value_counts(dropna=False).sort_index(),
    sep="\n",
)



Counts by CYCLE:
CYCLE
2017-March 2020 (pre-pandemic)    15560
August 2021–August 2023           11933
2001-2002                         11039
2009-2010                         10537
2005-2006                         10348
2013-2014                         10175
2007-2008                         10149
2003-2004                         10122
2015-2016                          9971
1999-2000                          9965
2011-2012                          9756
2017-2018                          9254
Name: count, dtype: int64

Counts by SDDSRVYR (raw numeric code, may be NaN for newer releases):
SDDSRVYR
1.0      9965
2.0     11039
3.0     10122
4.0     10348
5.0     10149
6.0     10537
7.0      9756
8.0     10175
9.0      9971
10.0     9254
12.0    11933
66.0    15560
Name: count, dtype: int64


In [36]:
# Show the last 16 rows of the stacked DEMO table.
demo9923.tail(16)   

Unnamed: 0,SEQN,RIDAGEYR,SDDSRVYR,CYCLE,MARRIAGE,MARRIAGE3
128793,142295.0,80.0,12.0,August 2021–August 2023,,2.0
128794,142296.0,1.0,12.0,August 2021–August 2023,,
128795,142297.0,76.0,12.0,August 2021–August 2023,,2.0
128796,142298.0,60.0,12.0,August 2021–August 2023,,1.0
128797,142299.0,33.0,12.0,August 2021–August 2023,,1.0
128798,142300.0,46.0,12.0,August 2021–August 2023,,1.0
128799,142301.0,80.0,12.0,August 2021–August 2023,,2.0
128800,142302.0,70.0,12.0,August 2021–August 2023,,3.0
128801,142303.0,69.0,12.0,August 2021–August 2023,,2.0
128802,142304.0,14.0,12.0,August 2021–August 2023,,


Candidate columns for marital status: ['DMDMARTZ']

 DMDMARTZ
DMDMARTZ
NaN     4141
1.0     4136
2.0     2022
3.0     1625
99.0       5
77.0       4
Name: count, dtype: int64


<h2> Step 2: merge demo with mortality </h2>

In [39]:
# %% Mortality: locate, read, prep, and merge with DEMO (uses demo9923)
import pandas as pd
import numpy as np
from pathlib import Path

# --- Paths -------------------------------------------------------
ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT.joinpath("data")

# --- Locate mortality file (search recursively under data/) -----
# Glob patterns in priority order: SAS dataset, then XPT, then CSV; within
# each format the more specific "*mortality*" name is tried before "*mort*".
MORT_PATTERNS = [
    f"*{stem}*.{ext}"
    for ext in ("sas7bdat", "xpt", "csv")
    for stem in ("mortality", "mort")
]

def find_first(base: Path, patterns):
    """
    Return the first file under `base` (searched recursively) matching any pattern.

    Patterns are tried in the order given. When one pattern matches several
    files, the smallest path in sort order is returned, so the result does not
    depend on the order in which the filesystem lists directory entries.

    Returns None when no pattern matches anything.
    """
    for pat in patterns:
        hit = min(base.rglob(pat), default=None)
        if hit is not None:
            return hit
    return None

# Pick the mortality file: first hit for the highest-priority pattern.
mort_path = find_first(DATA, MORT_PATTERNS)
if mort_path is None:
    raise FileNotFoundError(
        f"No mortality file found under {DATA}. "
        "Place a file like 'NHANES_1999_2019_LMF_public.csv' or 'mortality9918.sas7bdat' there."
    )
print("Using mortality file:", mort_path)

# --- Load mortality (handles .xpt / .sas7bdat / .csv) ----------
# For the SAS formats pyreadstat is tried first; any failure (including a
# missing pyreadstat) falls back to pandas.read_sas.
suffix = mort_path.suffix.lower()
if suffix == ".xpt":
    try:
        import pyreadstat
        mort, _ = pyreadstat.read_xport(mort_path)
    except Exception:
        mort = pd.read_sas(mort_path, format="xport")
elif suffix == ".sas7bdat":
    try:
        import pyreadstat
        mort, _ = pyreadstat.read_sas7bdat(mort_path)
    except Exception:
        mort = pd.read_sas(mort_path, format="sas7bdat")
elif suffix == ".csv":
    mort = pd.read_csv(mort_path)
else:
    raise ValueError(f"Unsupported mortality file type: {suffix}")

# Upper-case the column names so the fixed names below match.
mort.columns = mort.columns.str.upper()

# --- Keep relevant columns if present ---------------------------
keep_cols = [c for c in [
    "SEQN","ELIGSTAT","MORTSTAT","PERMTH_EXM","PERMTH_INT",
    "UCOD_LEADING","DIABETES","HYPERTEN","DODQTR","DODYEAR"
] if c in mort.columns]
mort = mort[keep_cols].copy()

# --- Eligibility and time/event construction --------------------
# Keep only rows with ELIGSTAT == 1 when that column is available.
if "ELIGSTAT" in mort.columns:
    mort = mort[mort["ELIGSTAT"] == 1].copy()

# default: exam-based time; switch to interview by changing TIME_COL
TIME_COL = "PERMTH_EXM" if "PERMTH_EXM" in mort.columns else "PERMTH_INT"
if TIME_COL not in mort.columns:
    raise ValueError("Neither PERMTH_EXM nor PERMTH_INT found in mortality file.")

# Follow-up time: TIME_COL divided by 12 (presumably months -> years).
mort["TIME_Y"] = pd.to_numeric(mort[TIME_COL], errors="coerce") / 12.0
# Event indicator: 1 where MORTSTAT == 1, otherwise 0 (a missing MORTSTAT also gives 0).
# NOTE: MORTSTAT is required here; unlike the other columns it is not guarded.
mort["EVENT"]  = (mort["MORTSTAT"] == 1).astype(int)
# Drop rows whose follow-up time is missing or negative.
mort = mort[(mort["TIME_Y"].notna()) & (mort["TIME_Y"] >= 0)].copy()

# --- Ensure DEMO columns & types (demo9923) ---------------------
# Relies on `demo9923` from the DEMO build cell and modifies it in place
# (upper-cased column names, numeric SDDSRVYR).
demo9923.columns = demo9923.columns.str.upper()
if "SDDSRVYR" not in demo9923.columns:
    raise ValueError("SDDSRVYR not found in demo9923. Ensure your DEMO build carries SDDSRVYR.")
demo9923["SDDSRVYR"] = pd.to_numeric(demo9923["SDDSRVYR"], errors="coerce")

# --- Merge age + cycle into mortality ---------------------------
# Left join: every mortality row is kept; rows without a DEMO match get NaN.
cols_to_merge = [c for c in ["SEQN","RIDAGEYR","SDDSRVYR"] if c in demo9923.columns]
mort_with_demo = mort.merge(demo9923[cols_to_merge], on="SEQN", how="left")

# Optional: adults (≥20)
# mort_with_demo = mort_with_demo.dropna(subset=["RIDAGEYR"])
# mort_with_demo = mort_with_demo[mort_with_demo["RIDAGEYR"] >= 20]

# --- Cycle counts and summary -----------------------------------
# Rows per SDDSRVYR code (unmatched rows counted under NaN), ordered by code.
cycle_counts = mort_with_demo["SDDSRVYR"].value_counts(dropna=False).sort_index()

# SDDSRVYR code -> human-readable cycle label (used to relabel the counts).
# The combined 2017–March 2020 pre-pandemic file carries SDDSRVYR = 66, not 11.
cycle_map = {
    1: "1999–2000",  2: "2001–2002",  3: "2003–2004",
    4: "2005–2006",  5: "2007–2008",  6: "2009–2010",
    7: "2011–2012",  8: "2013–2014",  9: "2015–2016",
    10: "2017–2018", 66: "2017–Mar 2020 (pre-pandemic)", 12: "Aug 2021–Aug 2023"
}

# Counts per cycle, relabelled from SDDSRVYR codes to readable names.
print("\n📊 NHANES cycles in mortality-linked data:")
print(cycle_counts.rename(index=cycle_map))
print(f"\nTotal records: {int(cycle_counts.sum())}")

# Headline survival numbers: sample size, percent with EVENT == 1, follow-up range.
print("\n⏱️  Survival summary:")
print("  N (unique SEQN):", mort_with_demo["SEQN"].nunique())
print("  Events (%):", round(100 * mort_with_demo["EVENT"].mean(), 2))
print(
    "  TIME_Y (min / median / max):",
    *(stat(mort_with_demo["TIME_Y"]) for stat in (np.nanmin, np.nanmedian, np.nanmax)),
)


Using mortality file: /Users/dengshuyue/Desktop/SDOH/analysis/data/less_important/mortality9918.sas7bdat

📊 NHANES cycles in mortality-linked data:
SDDSRVYR
1999–2000    4973
2001–2002    5586
2003–2004    5293
2005–2006    5332
2007–2008    5989
2009–2010    6346
2011–2012    5603
2013–2014    5913
2015–2016    5720
2017–2018    5498
Name: count, dtype: int64

Total records: 56253

⏱️  Survival summary:
  N (unique SEQN): 56253
  Events (%): 14.87
  TIME_Y (min / median / max): 0.0 9.416666666666666 20.75


<h2>Step 3: Merge demo_mort with SDOH variables (BEGIN HERE!)</h2>

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

# -----------------------------
# Step 0: Setup (NEW PATHS)
# -----------------------------
base_module_path = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_by_module")

# One consolidated SAS dataset per questionnaire module.
ocq_path = base_module_path.joinpath("ocq", "ocq.sas7bdat")     # main OCQ (2003–2018)
hoq_path = base_module_path.joinpath("hoq", "hoq.sas7bdat")     # main HOQ (2003–2018)
hiq_path = base_module_path.joinpath("hiq", "hiqs.sas7bdat")    # insurance (HIQS for later years)
fsq_path = base_module_path.joinpath("fsq", "fsqs.sas7bdat")    # food security (FSQS for later years)

# Step 1: Load demoall (unchanged) and make sure it has the two columns needed below
demoall = pd.read_pickle("/Users/dengshuyue/Desktop/SDOH/analysis/data/demoall.pkl")
if not {"SEQN", "RIDAGEYR"}.issubset(demoall.columns):
    raise ValueError("demoall is missing SEQN or RIDAGEYR")

# Step 2: Filter demoall to adults (age ≥ 20); rows missing SEQN or age are dropped first
age_df = (
    demoall.loc[:, ["SEQN", "RIDAGEYR"]]
    .dropna()
    .loc[lambda frame: frame["RIDAGEYR"] >= 20]
)

# Helper: Filter any df to age ≥ 20 using demoall
def filter_adults(df):
    """
    Keep only the rows of `df` whose SEQN appears in `age_df` (adults, age ≥ 20).

    Implemented as an inner merge on SEQN, so the result also gains the
    RIDAGEYR column from `age_df`.
    """
    return pd.merge(df, age_df, how="inner", on="SEQN")

# -----------------------------
# Step 3: Employment (OCQ)
# -----------------------------
if not ocq_path.exists():
    raise FileNotFoundError(f"OCQ file not found at: {ocq_path}")

# Adults-only OCQ records.
ocq = filter_adults(pd.read_sas(str(ocq_path), format="sas7bdat", encoding="latin1"))
# Derive `employ` (codes 1-5) from OCD150 and OCQ380. The assignments run in
# order and later ones overwrite earlier ones for rows matching both.
ocq['employ'] = np.nan
ocq.loc[ocq['OCD150'] == 1, 'employ'] = 1
ocq.loc[(ocq['OCD150'] == 3) | (ocq['OCQ380'] == 5), 'employ'] = 2
ocq.loc[ocq['OCQ380'] == 3, 'employ'] = 3
ocq.loc[ocq['OCQ380'].isin([4, 6]), 'employ'] = 4
ocq.loc[ocq['OCQ380'].isin([1, 2, 7]), 'employ'] = 5
# Indicator for employ == 2. Rows where `employ` is missing get 0 here, not NaN.
# NOTE(review): confirm that treating missing as "not unemployed" is intended.
ocq['unemployment'] = (ocq['employ'] == 2).astype(int)
ocq = ocq[['SEQN', 'employ', 'unemployment']]

# -----------------------------
# Step 4: Housing (HOQ)
# -----------------------------
if not hoq_path.exists():
    raise FileNotFoundError(f"HOQ file not found at: {hoq_path}")

# Adults-only HOQ records.
hoq = filter_adults(pd.read_sas(str(hoq_path), format="sas7bdat", encoding="latin1"))
if "HOQ065" in hoq.columns:
    # Blank out codes 7 and 9 (presumably refused / don't know -- confirm in the codebook).
    hoq["HOQ065"] = hoq["HOQ065"].where(~hoq["HOQ065"].isin([7, 9]))
# Keep the id plus whichever of the two housing variables are present.
keep_hoq = [name for name in ("SEQN", "HOD050", "HOQ065") if name in hoq.columns]
hoq = hoq.loc[:, keep_hoq]

# -----------------------------
# Step 5: Insurance (HIQS)
# -----------------------------
if not hiq_path.exists():
    raise FileNotFoundError(f"HIQS file not found at: {hiq_path}")

# Adults-only insurance records. `ins` shares the row index of `hiqs`, so the
# boolean masks built from `hiqs` below line up with it.
hiqs = filter_adults(pd.read_sas(str(hiq_path), format="sas7bdat", encoding="latin1"))
ins = pd.DataFrame({'SEQN': hiqs['SEQN']})
ins['ins'] = np.nan

# Category codes written to ins['ins'], applied in this order; a later
# assignment overwrites an earlier one for rows that match both:
#   1 = private, 2 = Medicare, 3 = Medicaid (incl. Medicare+Medicaid),
#   5 = other, 0 = no insurance.
# Every mask starts as an all-False Series so that a file lacking the guarded
# columns simply assigns nothing (a scalar False would be used as a row label
# by .loc instead of as a mask).

# Private
if 'HIQ031A' in hiqs: ins.loc[(hiqs['HIQ031A'] == 14), 'ins'] = 1
if 'HID030A' in hiqs: ins.loc[(hiqs['HID030A'] == 1), 'ins'] = 1

# Medicare (without Medicaid)
cond_med = pd.Series(False, index=hiqs.index)
if set(['HIQ031B','HIQ031D','HIQ031E']).issubset(hiqs.columns):
    cond_med = cond_med | ((hiqs['HIQ031B'] == 15) & (hiqs['HIQ031D'] != 17) & (hiqs['HIQ031E'] != 18))
if set(['HID030B','HID030C']).issubset(hiqs.columns):
    cond_med = cond_med | ((hiqs['HID030B'] == 1) & (hiqs['HID030C'] != 1))
ins.loc[cond_med, 'ins'] = 2

# Medicaid (without Medicare)
cond_mcaid = pd.Series(False, index=hiqs.index)
if set(['HIQ031B','HIQ031D','HIQ031E']).issubset(hiqs.columns):
    cond_mcaid = cond_mcaid | (((hiqs['HIQ031D'] == 17) | (hiqs['HIQ031E'] == 18)) & (hiqs['HIQ031B'] != 15))
if set(['HID030B','HID030C']).issubset(hiqs.columns):
    cond_mcaid = cond_mcaid | ((hiqs['HID030B'] != 1) & (hiqs['HID030C'] == 1))
ins.loc[cond_mcaid, 'ins'] = 3

# Medicaid (both present): Medicare + Medicaid rows are also coded 3
if set(['HIQ031B','HIQ031D']).issubset(hiqs.columns):
    ins.loc[((hiqs['HIQ031B'] == 15) & (hiqs['HIQ031D'] == 17)), 'ins'] = 3
if set(['HID030B','HID030C']).issubset(hiqs.columns):
    ins.loc[((hiqs['HID030B'] == 1) & (hiqs['HID030C'] == 1)), 'ins'] = 3

# Other insurance
# NOTE(review): the HIQ031A/B/D/E tests above compare against 14/15/17/18,
# while these HIQ031* columns are compared against 1 -- confirm their coding.
cols_other = [c for c in ['HIQ031C','HIQ031F','HIQ031G','HIQ031H','HIQ031I'] if c in hiqs.columns]
cond_other = pd.Series(False, index=hiqs.index)
if cols_other:
    cond_other = cond_other | hiqs[cols_other].eq(1).any(axis=1)
if 'HID030D' in hiqs:
    cond_other = cond_other | (hiqs['HID030D'] == 1)
ins.loc[cond_other, 'ins'] = 5

# No insurance (applied last, so it overrides any category set above)
conds_none = []
if 'HIQ011' in hiqs: conds_none.append(hiqs['HIQ011'] == 2)
if 'HID010' in hiqs: conds_none.append(hiqs['HID010'] == 2)
if conds_none:
    ins.loc[np.logical_or.reduce(conds_none), 'ins'] = 0


# -----------------------------
# Step 6: SNAP & Food Security (FSQS)
# -----------------------------
if not fsq_path.exists():
    raise FileNotFoundError(f"FSQS file not found: {fsq_path}")

# Adults-only food-security records. `snap` shares the row index of `fsqs`.
fsqs = filter_adults(pd.read_sas(str(fsq_path), format="sas7bdat", encoding="latin1"))
snap = pd.DataFrame({'SEQN': fsqs['SEQN']})
# Always carry an FSDHH column (all-NaN when the file lacks it) so the final
# column selection cannot fail with a KeyError.
snap['FSDHH'] = fsqs['FSDHH'] if 'FSDHH' in fsqs else np.nan

# SNAP indicator (1 = yes, 0 = no), built from whichever question variants are
# present. Assignments run in order; later ones overwrite earlier ones.
snap['SNAP'] = np.nan
if 'FSQ165' in fsqs: snap.loc[fsqs['FSQ165'] == 2, 'SNAP'] = 0
if 'FSQ012' in fsqs:
    snap.loc[fsqs['FSQ012'] == 1, 'SNAP'] = 1
    snap.loc[fsqs['FSQ012'] == 2, 'SNAP'] = 0
if 'FSQ171' in fsqs:
    snap.loc[fsqs['FSQ171'] == 1, 'SNAP'] = 1
    snap.loc[fsqs['FSQ171'] == 2, 'SNAP'] = 0
if 'FSD170N' in fsqs: snap.loc[fsqs['FSD170N'] >= 1, 'SNAP'] = 1
if 'FSQ170' in fsqs:
    snap.loc[fsqs['FSQ170'] == 1, 'SNAP'] = 1
    # The "no" answer only counts when FSD170N is present and below 1; without
    # FSD170N nothing is set (same outcome as comparing against all-missing).
    if 'FSD170N' in fsqs:
        snap.loc[(fsqs['FSQ170'] == 2) & (fsqs['FSD170N'] < 1), 'SNAP'] = 0
if 'FSD200' in fsqs: snap.loc[fsqs['FSD200'] == 1, 'SNAP'] = 1

# FS indicator from FSDHH: 1 for codes 1-2, 0 for codes above 2.
snap['FS'] = np.nan
if 'FSDHH' in fsqs:
    snap.loc[fsqs['FSDHH'].isin([1, 2]), 'FS'] = 1
    snap.loc[fsqs['FSDHH'] > 2, 'FS'] = 0

snap = snap[['SEQN', 'SNAP', 'FSDHH', 'FS']]


In [None]:

# Before merging the earlier cycles (99-03 or 99-01), first check their SEQN

import pyreadstat
from pathlib import Path

ocq_dir = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_by_module/ocq")

# (file name, survey years) for the early OCQ transport files to inspect
files = [
    ("OCQ.xpt",   "1999–2000"),
    ("OCQ_B.xpt", "2001–2002"),
    ("OCQ_C.xpt", "2003–2004"),
]

# Diagnostic only: print shape and SEQN range/uniqueness per file so the
# cycles can be checked before they are stacked.
for fname, years in files:
    path = ocq_dir / fname
    if not path.exists():
        print(f"⚠️ File not found: {path}")
        continue

    # str(path): same calling convention as the other read_xport calls in this notebook
    df, meta = pyreadstat.read_xport(str(path))
    df.columns = df.columns.str.upper()

    print(f"\n✅ Loaded {fname} ({years})")
    print("Shape:", df.shape)

    # A file without SEQN should not abort the checks for the remaining files
    if "SEQN" not in df.columns:
        print("⚠️ SEQN column not found; skipping SEQN checks")
        continue

    seqn = df["SEQN"]
    print("SEQN min:", seqn.min(), "SEQN max:", seqn.max())
    print("Unique SEQN:", seqn.nunique())
    print("First 5 SEQN:", seqn.head().tolist())


In [None]:
# first start with ocq 
import pandas as pd
import numpy as np
from pathlib import Path
import pyreadstat

# --- Paths ---
# ocq_dir holds both the consolidated SAS file and the early-cycle XPT files.
ocq_dir = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_by_module/ocq")
ocq_main_path = ocq_dir.joinpath("ocq.sas7bdat")  # 2003–2018 consolidated
# 1999–2002: (path, SDDSRVYR cycle number), numbered 1, 2 in file order
ocq_early_paths = [
    (ocq_dir.joinpath(fname), cycle)
    for cycle, fname in enumerate(("OCQ.xpt", "OCQ_B.xpt"), start=1)
]

def recode_employment(df: pd.DataFrame) -> pd.DataFrame:
    """Derive EMPLOY and UNEMPLOYMENT from the OCQ items present in ``df``.

    Works on a copy with upper-cased column names; the input is not modified.
    EMPLOY starts as NaN and is filled by the rules below, applied in order so
    that a later matching rule overwrites an earlier one. UNEMPLOYMENT is 1
    where EMPLOY == 2 and 0 otherwise.

    Returns only the columns among SEQN, EMPLOY, UNEMPLOYMENT, SDDSRVYR that
    exist, in that order.
    """
    out = df.copy()
    out.columns = out.columns.str.upper()
    out["EMPLOY"] = np.nan

    # (source column, matching codes, EMPLOY value)
    rules = (
        ("OCD150", [1], 1),
        ("OCD150", [3], 2),
        ("OCQ380", [5], 2),
        ("OCQ380", [3], 3),
        ("OCQ380", [4, 6], 4),
        ("OCQ380", [1, 2, 7], 5),
    )
    for source, codes, employ_value in rules:
        if source in out:
            out.loc[out[source].isin(codes), "EMPLOY"] = employ_value

    # NOTE(review): rows whose EMPLOY stays NaN get UNEMPLOYMENT = 0 here —
    # confirm that missing should count as "not unemployed".
    out["UNEMPLOYMENT"] = out["EMPLOY"].eq(2).astype(int)

    wanted = ("SEQN", "EMPLOY", "UNEMPLOYMENT", "SDDSRVYR")
    return out[[name for name in wanted if name in out.columns]]

# --- A) Read early cycles (1999–2002) ---
# Each early file is tagged with its cycle number and recoded on its own.
early_parts = []
for p, cyc in ocq_early_paths:
    if p.exists():
        df, _ = pyreadstat.read_xport(str(p))
        df.columns = df.columns.str.upper()
        df["SDDSRVYR"] = cyc
        early_parts.append(recode_employment(df))
    else:
        print(f"⚠️ Missing early OCQ file: {p}")
ocq_early = pd.concat(early_parts, ignore_index=True) if early_parts else pd.DataFrame()

# --- B) Read main consolidated OCQ (2003–2018) from the SAME folder ---
if not ocq_main_path.exists():
    raise FileNotFoundError(f"Main OCQ not found: {ocq_main_path}")
ocq_main_raw = pd.read_sas(str(ocq_main_path), format="sas7bdat", encoding="latin1")
ocq_main_raw.columns = ocq_main_raw.columns.str.upper()

# Optional adult filter: look the helper up explicitly instead of catching
# NameError around the call, which would also hide a NameError raised inside
# the helper itself.
# NOTE(review): only the main file is filtered; the early cycles are stacked unfiltered.
_adult_filter = globals().get("filter_adults")
if callable(_adult_filter):
    ocq_main_raw = _adult_filter(ocq_main_raw)
else:
    print("ℹ️ filter_adults is not defined; main OCQ left unfiltered")

# If SDDSRVYR isn't present in the main file, you can skip it or infer later via SEQN.
ocq_main = recode_employment(ocq_main_raw)

# --- C) Combine to final OCQ (1999–2018) ---
# Leave empty frames out of the concat so they cannot influence result dtypes.
_parts = [part for part in (ocq_early, ocq_main) if not part.empty]
ocq = pd.concat(_parts, ignore_index=True) if _parts else ocq_main.copy()

# Hygiene: nullable integer ids, one row per SEQN (first occurrence kept)
ocq["SEQN"] = pd.to_numeric(ocq["SEQN"], errors="coerce").astype("Int64")
if "SDDSRVYR" in ocq.columns:
    ocq["SDDSRVYR"] = pd.to_numeric(ocq["SDDSRVYR"], errors="coerce").astype("Int64")
ocq = ocq.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

# --- D) Quick checks ---
print("Final OCQ shape:", ocq.shape)
print("SEQN range:", ocq["SEQN"].min(), "→", ocq["SEQN"].max())
if "SDDSRVYR" in ocq.columns:
    print("Cycles present:\n", ocq["SDDSRVYR"].value_counts().sort_index())


In [None]:
# hoq (housing)

import pandas as pd
import numpy as np
from pathlib import Path

# --- Paths ---
hoq_dir = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_by_module/hoq")
hoq_main_path = hoq_dir / "hoq.sas7bdat"  # your 2003–2018 consolidated file
# Early-cycle candidates as (path, SDDSRVYR cycle number); for each cycle the
# first candidate that exists is the one that gets loaded.
# There is deliberately no "hoq.sas7bdat" fallback for cycle 1: that is the
# same path as hoq_main_path, so whenever HOQ.xpt was missing the consolidated
# file would be loaded and tagged as cycle 1.
early_candidates = [
    (hoq_dir / "HOQ.xpt",   1),  # 1999–2000
    (hoq_dir / "HOQ_B.xpt", 2),  # 2001–2002
    # if you happen to have a SAS version instead of the XPT, we’ll try that too:
    (hoq_dir / "hoq_b.sas7bdat", 2),
]

def read_any(path: Path) -> pd.DataFrame:
    """Load a ``.xpt`` or ``.sas7bdat`` file and upper-case its column names.

    The suffix check is case-insensitive. Raises ValueError for any other
    suffix.
    """
    ext = path.suffix.lower()
    if ext not in (".xpt", ".sas7bdat"):
        raise ValueError(f"Unsupported file: {path}")

    if ext == ".xpt":
        import pyreadstat  # only needed for transport files
        frame = pyreadstat.read_xport(str(path))[0]
    else:
        frame = pd.read_sas(str(path), format="sas7bdat", encoding="latin1")

    frame.columns = frame.columns.str.upper()
    return frame

def preprocess_hoq(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` limited to the core HOQ columns.

    HOQ065 values of 7 or 9 are replaced by NaN when the column exists
    (presumably the refused / don't-know codes — TODO confirm). Only the
    columns among SEQN, HOD050, HOQ065, SDDSRVYR that exist are returned, in
    that order. The input frame is left untouched.
    """
    cleaned = df.copy()
    if "HOQ065" in cleaned.columns:
        to_blank = cleaned["HOQ065"].isin([7, 9])
        cleaned.loc[to_blank, "HOQ065"] = np.nan

    core = ("SEQN", "HOD050", "HOQ065", "SDDSRVYR")
    return cleaned[[name for name in core if name in cleaned.columns]]

# --- A) Early cycles (99–02) ---
# For each cycle, load the first candidate file that exists.
early_parts = []
seen_cycles = set()
for p, cyc in early_candidates:
    # Never load the consolidated main file as an early cycle: its rows would be
    # tagged with the wrong SDDSRVYR and, being stacked first, would win the
    # SEQN de-duplication below.
    if p == hoq_main_path:
        continue
    if p.exists() and cyc not in seen_cycles:
        df = read_any(p)
        df["SDDSRVYR"] = cyc
        early_parts.append(preprocess_hoq(df))
        seen_cycles.add(cyc)
hoq_early = pd.concat(early_parts, ignore_index=True) if early_parts else pd.DataFrame()

# --- B) Main consolidated HOQ (2003–2018) ---
if not hoq_main_path.exists():
    raise FileNotFoundError(f"Main HOQ not found: {hoq_main_path}")
hoq_main_raw = read_any(hoq_main_path)
# (Optional) adult filter: look the helper up explicitly instead of catching
# NameError around the call, which would also hide a NameError raised inside
# the helper itself.
_adult_filter = globals().get("filter_adults")
if callable(_adult_filter):
    hoq_main_raw = _adult_filter(hoq_main_raw)
else:
    print("ℹ️ filter_adults is not defined; main HOQ left unfiltered")
hoq_main = preprocess_hoq(hoq_main_raw)

# --- C) Combine to final HOQ (1999–2018) ---
# Leave empty frames out of the concat so they cannot influence result dtypes.
_parts = [part for part in (hoq_early, hoq_main) if not part.empty]
hoq_all = pd.concat(_parts, ignore_index=True) if _parts else hoq_main.copy()

# Hygiene: everything below keys on SEQN, so fail with a clear message if it is absent
if "SEQN" not in hoq_all.columns:
    raise KeyError("SEQN column missing from combined HOQ data")
hoq_all["SEQN"] = pd.to_numeric(hoq_all["SEQN"], errors="coerce").astype("Int64")
if "SDDSRVYR" in hoq_all:
    hoq_all["SDDSRVYR"] = pd.to_numeric(hoq_all["SDDSRVYR"], errors="coerce").astype("Int64")
hoq_all = hoq_all.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

# --- D) Quick checks ---
print("Final HOQ shape:", hoq_all.shape)
print("SEQN range:", hoq_all["SEQN"].min(), "→", hoq_all["SEQN"].max())
if "SDDSRVYR" in hoq_all:
    cycle_map = {1:"1999–2000",2:"2001–2002",3:"2003–2004",4:"2005–2006",5:"2007–2008",
                 6:"2009–2010",7:"2011–2012",8:"2013–2014",9:"2015–2016",10:"2017–2018"}
    counts = hoq_all["SDDSRVYR"].value_counts().sort_index()
    print("Cycles present:\n", counts.rename(index=cycle_map))

# (Optional) Save for reuse — written next to the inputs in hoq_dir
hoq_all.to_csv(hoq_dir / "hoq_99_18.csv", index=False)


In [None]:
# hiqs 

# -----------------------------
# Insurance (HIQ/HIQS) — add 1999–2002
# -----------------------------
from pathlib import Path
import pandas as pd
import numpy as np

base_module_path = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_by_module")
hiq_dir = base_module_path.joinpath("hiq")

# Main consolidated later years (HIQS for 2003–2018)
hiq_main_path = hiq_dir.joinpath("hiqs.sas7bdat")

# Early cycles as (path, SDDSRVYR cycle number). XPT files are listed first so
# they are preferred; the sas7bdat entries are fallbacks.
early_candidates = [
    (hiq_dir.joinpath(fname), cycle)
    for fname, cycle in (
        ("HIQ.xpt", 1),         # 1999–2000
        ("HIQ_B.xpt", 2),       # 2001–2002
        ("hiq.sas7bdat", 1),    # fallback (rare)
        ("hiq_b.sas7bdat", 2),
    )
]

def read_any(p: Path) -> pd.DataFrame:
    """Read an XPT or SAS7BDAT file into a DataFrame with upper-cased columns.

    The reader is chosen from the (case-insensitive) file suffix; any other
    suffix raises ValueError.
    """
    kind = p.suffix.lower()
    if kind == ".sas7bdat":
        table = pd.read_sas(str(p), format="sas7bdat", encoding="latin1")
    elif kind == ".xpt":
        import pyreadstat  # imported lazily: only the XPT branch needs it
        table, _meta = pyreadstat.read_xport(str(p))
    else:
        raise ValueError(f"Unsupported file: {p}")
    table.columns = table.columns.str.upper()
    return table

# A) Early cycles (99–02): for each cycle, load the first candidate file that exists
early_parts, seen = [], set()
for p, cyc in early_candidates:
    if p.exists() and cyc not in seen:
        df = read_any(p)
        df["SDDSRVYR"] = cyc
        early_parts.append(df)
        seen.add(cyc)
hiq_early_raw = pd.concat(early_parts, ignore_index=True) if early_parts else pd.DataFrame()

# B) Main consolidated (03–18)
if not hiq_main_path.exists():
    raise FileNotFoundError(f"HIQS file not found at: {hiq_main_path}")
hiq_main_raw = read_any(hiq_main_path)

# C) Filter to adults using your helper
hiq_early = filter_adults(hiq_early_raw) if not hiq_early_raw.empty else hiq_early_raw
hiqs       = filter_adults(hiq_main_raw)

# D) Stack early + main (empty frames are left out so they cannot influence dtypes)
_parts = [part for part in (hiq_early, hiqs) if not part.empty]
hiq_all = pd.concat(_parts, ignore_index=True) if _parts else hiqs.copy().reset_index(drop=True)
hiq_all.columns = hiq_all.columns.str.upper()

# E) Compute insurance 'ins' on the combined data (guards for varying columns).
#    Rules run top to bottom and later assignments overwrite earlier ones:
#    1 private, 2 Medicare only, 3 Medicaid (alone or with Medicare), 5 other, 0 none.
ins = pd.DataFrame({"SEQN": hiq_all["SEQN"]})
if "SDDSRVYR" in hiq_all.columns:
    ins["SDDSRVYR"] = hiq_all["SDDSRVYR"]
ins["ins"] = np.nan

# Every mask starts as an all-False Series aligned to hiq_all, so a category
# whose source columns are all missing selects no rows (instead of handing the
# scalar False to .loc as a row label).
_no_rows = pd.Series(False, index=hiq_all.index)

# Private
if "HIQ031A" in hiq_all: ins.loc[hiq_all["HIQ031A"] == 14, "ins"] = 1
if "HID030A" in hiq_all: ins.loc[hiq_all["HID030A"] == 1,  "ins"] = 1

# Medicare (without Medicaid)
cond_med = _no_rows
if {"HIQ031B","HIQ031D","HIQ031E"}.issubset(hiq_all.columns):
    cond_med = cond_med | (
        (hiq_all["HIQ031B"] == 15) & (hiq_all["HIQ031D"] != 17) & (hiq_all["HIQ031E"] != 18)
    )
if {"HID030B","HID030C"}.issubset(hiq_all.columns):
    cond_med = cond_med | ((hiq_all["HID030B"] == 1) & (hiq_all["HID030C"] != 1))
ins.loc[cond_med, "ins"] = 2

# Medicaid (without Medicare)
cond_mcaid = _no_rows
if {"HIQ031B","HIQ031D","HIQ031E"}.issubset(hiq_all.columns):
    cond_mcaid = cond_mcaid | (
        ((hiq_all["HIQ031D"] == 17) | (hiq_all["HIQ031E"] == 18)) & (hiq_all["HIQ031B"] != 15)
    )
if {"HID030B","HID030C"}.issubset(hiq_all.columns):
    cond_mcaid = cond_mcaid | ((hiq_all["HID030B"] != 1) & (hiq_all["HID030C"] == 1))
ins.loc[cond_mcaid, "ins"] = 3

# Medicaid when both present
if {"HIQ031B","HIQ031D"}.issubset(hiq_all.columns):
    ins.loc[(hiq_all["HIQ031B"] == 15) & (hiq_all["HIQ031D"] == 17), "ins"] = 3
if {"HID030B","HID030C"}.issubset(hiq_all.columns):
    ins.loc[(hiq_all["HID030B"] == 1) & (hiq_all["HID030C"] == 1), "ins"] = 3

# Other insurance
# NOTE(review): HIQ031A/B/D/E are matched against 14/15/17/18 above, while these
# sibling HIQ031* columns are matched against 1 — confirm the coding against
# the codebook.
other_cols = [c for c in ["HIQ031C","HIQ031F","HIQ031G","HIQ031H","HIQ031I"] if c in hiq_all.columns]
cond_other = _no_rows
if other_cols:
    cond_other = cond_other | hiq_all[other_cols].eq(1).any(axis=1)
if "HID030D" in hiq_all:
    cond_other = cond_other | (hiq_all["HID030D"] == 1)
ins.loc[cond_other, "ins"] = 5

# No insurance (applied last, so it overrides every category above)
conds_none = []
if "HIQ011" in hiq_all: conds_none.append(hiq_all["HIQ011"] == 2)
if "HID010" in hiq_all: conds_none.append(hiq_all["HID010"] == 2)
if conds_none:
    ins.loc[np.logical_or.reduce(conds_none), "ins"] = 0

# Tidy up: nullable integer ids, one row per SEQN (first occurrence kept)
ins["SEQN"] = pd.to_numeric(ins["SEQN"], errors="coerce").astype("Int64")
if "SDDSRVYR" in ins.columns:
    ins["SDDSRVYR"] = pd.to_numeric(ins["SDDSRVYR"], errors="coerce").astype("Int64")
ins = ins.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

print("Insurance table shape:", ins.shape)
if "SDDSRVYR" in ins.columns:
    print("Cycles present:\n", ins["SDDSRVYR"].value_counts().sort_index())

# print(ins["SEQN"])

# (Optional) Save for reuse — written next to the inputs in hiq_dir
ins.to_csv(hiq_dir / "ins_99_18.csv", index=False)


In [None]:
# fsq
# -----------------------------
# Step 6: SNAP & Food Security (FSQ/FSQS) — add 1999–2002
# -----------------------------
from pathlib import Path
import pandas as pd
import numpy as np

base_module_path = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data/nhanes_by_module")
fsq_dir = base_module_path.joinpath("fsq")

# Main later-years file (FSQS for 2003–2018)
fsq_main_path = fsq_dir.joinpath("fsqs.sas7bdat")

# Early cycles as (path, SDDSRVYR cycle number). XPT files are listed first so
# they are preferred; the sas7bdat entries are fallbacks.
early_candidates = [
    (fsq_dir.joinpath(fname), cycle)
    for fname, cycle in (
        ("FSQ.xpt", 1),         # 1999–2000
        ("FSQ_B.xpt", 2),       # 2001–2002
        ("fsq.sas7bdat", 1),    # fallback (rare)
        ("fsq_b.sas7bdat", 2),
    )
]

def read_any(p: Path) -> pd.DataFrame:
    """Load ``p`` (``.xpt`` or ``.sas7bdat``) and return it with upper-cased columns.

    The suffix comparison ignores case. A ValueError is raised for every other
    suffix.
    """
    suffix = p.suffix.lower()
    if suffix not in {".xpt", ".sas7bdat"}:
        raise ValueError(f"Unsupported file: {p}")

    if suffix == ".xpt":
        import pyreadstat  # deferred so SAS-only runs do not need it
        loaded = pyreadstat.read_xport(str(p))[0]
    else:
        loaded = pd.read_sas(str(p), format="sas7bdat", encoding="latin1")

    loaded.columns = loaded.columns.str.upper()
    return loaded

# A) Early cycles (99–02): for each cycle, load the first candidate file that exists
early_parts, seen = [], set()
for p, cyc in early_candidates:
    if p.exists() and cyc not in seen:
        df = read_any(p)
        df["SDDSRVYR"] = cyc
        early_parts.append(df)
        seen.add(cyc)
fsq_early_raw = pd.concat(early_parts, ignore_index=True) if early_parts else pd.DataFrame()

# B) Main consolidated (03–18)
if not fsq_main_path.exists():
    raise FileNotFoundError(f"FSQS file not found: {fsq_main_path}")
fsq_main_raw = read_any(fsq_main_path)

# C) Filter to adults using your helper
fsq_early = filter_adults(fsq_early_raw) if not fsq_early_raw.empty else fsq_early_raw
fsqs       = filter_adults(fsq_main_raw)

# D) Stack early + main (empty frames are left out so they cannot influence dtypes)
_parts = [part for part in (fsq_early, fsqs) if not part.empty]
fsq_all = pd.concat(_parts, ignore_index=True) if _parts else fsqs.copy().reset_index(drop=True)
fsq_all.columns = fsq_all.columns.str.upper()

# E) Build SNAP/FS outputs (guards for varying columns)
snap = pd.DataFrame({"SEQN": fsq_all["SEQN"]})
if "SDDSRVYR" in fsq_all: snap["SDDSRVYR"] = fsq_all["SDDSRVYR"]
if "FSDHH" in fsq_all:   snap["FSDHH"] = fsq_all["FSDHH"]

# SNAP flag: rules run top to bottom, so later rules overwrite earlier ones
snap["SNAP"] = np.nan
if "FSQ165" in fsq_all: snap.loc[fsq_all["FSQ165"] == 2, "SNAP"] = 0
if "FSQ012" in fsq_all:
    snap.loc[fsq_all["FSQ012"] == 1, "SNAP"] = 1
    snap.loc[fsq_all["FSQ012"] == 2, "SNAP"] = 0
if "FSQ171" in fsq_all:
    snap.loc[fsq_all["FSQ171"] == 1, "SNAP"] = 1
    snap.loc[fsq_all["FSQ171"] == 2, "SNAP"] = 0
if "FSD170N" in fsq_all: snap.loc[fsq_all["FSD170N"] >= 1, "SNAP"] = 1
if "FSQ170" in fsq_all:
    # When FSD170N is absent, compare against an explicit all-NaN float Series
    # so the "< 1" test is simply False for every row.
    if "FSD170N" in fsq_all:
        fsd170n = fsq_all["FSD170N"]
    else:
        fsd170n = pd.Series(np.nan, index=fsq_all.index, dtype="float64")
    snap.loc[fsq_all["FSQ170"] == 1, "SNAP"] = 1
    snap.loc[(fsq_all["FSQ170"] == 2) & (fsd170n < 1), "SNAP"] = 0
if "FSD200" in fsq_all: snap.loc[fsq_all["FSD200"] == 1, "SNAP"] = 1

# FS: 1 when FSDHH is 1 or 2, 0 when FSDHH > 2
snap["FS"] = np.nan
if "FSDHH" in fsq_all:
    snap.loc[fsq_all["FSDHH"].isin([1, 2]), "FS"] = 1
    snap.loc[fsq_all["FSDHH"] > 2, "FS"] = 0

# Final columns
snap = snap[[c for c in ["SEQN", "SNAP", "FSDHH", "FS", "SDDSRVYR"] if c in snap.columns]]

# Hygiene: nullable integer ids, one row per SEQN (first occurrence kept)
snap["SEQN"] = pd.to_numeric(snap["SEQN"], errors="coerce").astype("Int64")
if "SDDSRVYR" in snap.columns:
    snap["SDDSRVYR"] = pd.to_numeric(snap["SDDSRVYR"], errors="coerce").astype("Int64")
snap = snap.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

# Quick checks
print("SNAP table shape:", snap.shape)
if "SDDSRVYR" in snap.columns:
    print("Cycles present:\n", snap["SDDSRVYR"].value_counts().sort_index())

print(snap["SEQN"])


In [None]:

#  SEQN check — i.scores, covar, covariates, gg (dietwt), mortality9918

import pandas as pd
import numpy as np
import os
from pathlib import Path

# Set your folder
folder_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data/less_important"

def _standardize_seqn(df):
    """Return ``df``'s SEQN column as a nullable Int64 Series, or None if absent.

    Side effect: the column names of ``df`` are upper-cased in place, so the
    caller's frame is modified. Values that cannot be parsed as numbers
    become <NA>.
    """
    df.columns = df.columns.str.upper()
    if "SEQN" in df.columns:
        return pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
    return None

def summarize_seqn(name, seqn):
    """Print a short report about a SEQN Series under the heading ``name``.

    ``seqn`` may be None, in which case only a "not found" line is printed.
    Otherwise the row and null counts are printed, followed (when at least one
    value is present) by dtype, min/max, unique and duplicate counts, and the
    first and last five non-null values.
    """
    print(f"\n{name}")
    if seqn is None:
        print("  ❌ SEQN column not found")
        return

    print(f"  Rows: {len(seqn)}")
    print(f"  Null SEQN: {seqn.isna().sum()}")
    if not seqn.notna().any():
        # nothing but nulls: the remaining statistics would be meaningless
        return

    present = seqn.dropna()
    nunq = seqn.nunique(dropna=True)
    dups = seqn.notna().sum() - nunq  # non-null rows beyond the first of each id
    print(f"  dtype: {seqn.dtype}")
    print(f"  Min / Max: {seqn.min()} / {seqn.max()}")
    print(f"  Unique SEQN: {nunq}  |  Duplicates: {dups}")
    # small sample
    print(f"  Head SEQN: {list(present.head(5))}")
    print(f"  Tail SEQN: {list(present.tail(5))}")

# Each section below loads one file from folder_path and prints its SEQN summary.
# Load and summary share one try block on purpose: any failure is printed and
# the cell moves on to the next file. On failure the frame variable may be left
# unbound (or keep a value from an earlier run).
# Note: _standardize_seqn upper-cases the loaded frame's column names in place.

# 1) i.scores.xlsx
try:
    scores = pd.read_excel(os.path.join(folder_path, "i.scores.xlsx"), engine="openpyxl")
    # handle seqn casing
    if "seqn" in scores.columns and "SEQN" not in scores.columns:
        scores = scores.rename(columns={"seqn": "SEQN"})
    summarize_seqn("i.scores.xlsx", _standardize_seqn(scores))
except Exception as e:
    # best-effort diagnostic: report and continue with the next file
    print("\n[i.scores.xlsx] ⚠️", e)

# 2) covar.sas7bdat
try:
    covar = pd.read_sas(os.path.join(folder_path, "covar.sas7bdat"),
                        format="sas7bdat", encoding="latin1")
    summarize_seqn("covar.sas7bdat", _standardize_seqn(covar))
except Exception as e:
    # best-effort diagnostic: report and continue with the next file
    print("\n[covar.sas7bdat] ⚠️", e)

# 3) covariates.csv
try:
    covariates1_raw = pd.read_csv(os.path.join(folder_path, "covariates.csv"))
    # handle seqn casing (same as for i.scores.xlsx)
    if "seqn" in covariates1_raw.columns and "SEQN" not in covariates1_raw.columns:
        covariates1_raw = covariates1_raw.rename(columns={"seqn": "SEQN"})
    summarize_seqn("covariates.csv", _standardize_seqn(covariates1_raw))
except Exception as e:
    # best-effort diagnostic: report and continue with the next file
    print("\n[covariates.csv] ⚠️", e)

# 4) gg.sas7bdat (diet weights)
try:
    dietwt = pd.read_sas(os.path.join(folder_path, "gg.sas7bdat"),
                         format="sas7bdat", encoding="latin1")
    summarize_seqn("gg.sas7bdat", _standardize_seqn(dietwt))
except Exception as e:
    # best-effort diagnostic: report and continue with the next file
    print("\n[gg.sas7bdat] ⚠️", e)

# 5) mortality9918.sas7bdat
try:
    mort = pd.read_sas(os.path.join(folder_path, "mortality9918.sas7bdat"),
                       format="sas7bdat", encoding="latin1")
    summarize_seqn("mortality9918.sas7bdat", _standardize_seqn(mort))
except Exception as e:
    # best-effort diagnostic: report and continue
    print("\n[mortality9918.sas7bdat] ⚠️", e)


In [None]:
# Reload the merged table from folder_path (a variable set in an earlier cell).
SODH_diet_mort = pd.read_pickle(os.path.join(folder_path, "SODH_diet_mort.pkl"))

# NOTE(review): score_mort is not defined in this cell or in the cells shown
# just above it — confirm it exists on a fresh Restart & Run All, and whether
# SODH_diet_mort was the frame meant to be exported here.
score_mort.to_csv("/Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort_depr2.csv", index=False)

# Row count versus distinct participant ids: equal numbers mean one row per SEQN
total_rows = len(SODH_diet_mort)
unique_ids = SODH_diet_mort['SEQN'].nunique()

print(f" Total rows: {total_rows}")
print(f" Unique SEQN values: {unique_ids}")


In [None]:
import pandas as pd

# Source pickle and destination CSV (same folder, same file stem)
pkl_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort.pkl"
csv_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort.csv"

# Round-trip: unpickle the frame, then write it back out without the index.
# Unpickling can execute arbitrary code, so only load files you produced yourself.
df = pd.read_pickle(pkl_path)
df.to_csv(csv_path, index=False)

print("✅ Conversion complete: .pkl → .csv")
