<h1> 00 — merge demo, mort, sdoh</h1>

<h2>Shared environment and helper functions used across notebooks.</h2>
    

## Step 1: NHANES demographic data load and find need column

## merge above needed column to demo_cov_depression 

## Step 3: merge demo_mort_depr with below sdoh


<h3> OCQ / HOQ / HIQ / FSQ — early (1999–2002) + main (2003–2018), adult filter, tidy outputs </h3>


In [23]:
# %% Prereqs & helpers
import pandas as pd, numpy as np, os
from pathlib import Path

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT / "data"
MOD  = DATA / "nhanes_by_module"

def read_any(p: Path) -> pd.DataFrame:
    s = p.suffix.lower()
    if s == ".xpt":
        import pyreadstat
        df, _ = pyreadstat.read_xport(str(p))
    elif s == ".sas7bdat":
        df = pd.read_sas(str(p), format="sas7bdat", encoding="latin1")
    elif s == ".csv":
        df = pd.read_csv(p)
    elif s == ".parquet":
        df = pd.read_parquet(p)
    else:
        raise ValueError(f"Unsupported file: {p}")
    df.columns = df.columns.str.upper()
    return df

def filter_adults(df: pd.DataFrame) -> pd.DataFrame:
    """Adults ≥20 using demo9923 already in memory (SEQN, RIDAGEYR)."""
    if "demo9923" not in globals():
        raise RuntimeError("demo9923 not found in memory; load/build it first.")
    demo = demo9923.copy()
    demo.columns = demo.columns.str.upper()
    age = demo[["SEQN","RIDAGEYR"]].dropna()
    age = age[age["RIDAGEYR"] >= 20]
    return df.merge(age, on="SEQN", how="inner")

def s_or_false(df: pd.DataFrame, col: str):
    return df[col] if col in df.columns else pd.Series(False, index=df.index)


In [24]:
# %% OCQ: 1999–2018 (employment recode)
OCQ = MOD / "ocq"
ocq_main = OCQ / "ocq.sas7bdat"
ocq_early_files = [(OCQ/"OCQ.xpt",1), (OCQ/"OCQ_B.xpt",2)]

def recode_employment(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["EMPLOY"] = np.nan
    if "OCD150" in df:
        df.loc[df["OCD150"] == 1, "EMPLOY"] = 1
        df.loc[df["OCD150"] == 3, "EMPLOY"] = 2
    if "OCQ380" in df:
        df.loc[df["OCQ380"] == 5, "EMPLOY"] = 2
        df.loc[df["OCQ380"] == 3, "EMPLOY"] = 3
        df.loc[df["OCQ380"].isin([4,6]), "EMPLOY"] = 4
        df.loc[df["OCQ380"].isin([1,2,7]), "EMPLOY"] = 5
    df["UNEMPLOYMENT"] = (df["EMPLOY"] == 2).astype("Int64")
    keep = [c for c in ["SEQN","EMPLOY","UNEMPLOYMENT","SDDSRVYR"] if c in df.columns]
    return df[keep]

# early (adult filter applied)
parts = []
for p, cyc in ocq_early_files:
    if p.exists():
        df = read_any(p)
        df["SDDSRVYR"] = cyc
        df = filter_adults(df)
        parts.append(recode_employment(df))
ocq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

# main (adult filter applied)
if not ocq_main.exists():
    raise FileNotFoundError(f"Missing {ocq_main}")
ocq_main_df = filter_adults(read_any(ocq_main))
ocq_main_df = recode_employment(ocq_main_df)

# combine + hygiene
ocq = pd.concat([ocq_early, ocq_main_df], ignore_index=True)
ocq["SEQN"] = pd.to_numeric(ocq["SEQN"], errors="coerce").astype("Int64")
if "SDDSRVYR" in ocq: ocq["SDDSRVYR"] = pd.to_numeric(ocq["SDDSRVYR"], errors="coerce").astype("Int64")
ocq = ocq.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("OCQ:", ocq.shape)


OCQ: (55081, 4)


In [25]:
# %% HOQ: 1999–2018 (housing subset)
HOQ = MOD / "hoq"
hoq_main = HOQ / "hoq.sas7bdat"
hoq_early_files = [(HOQ/"HOQ.xpt",1), (HOQ/"HOQ_B.xpt",2), (HOQ/"hoq.sas7bdat",1), (HOQ/"hoq_b.sas7bdat",2)]

def preprocess_hoq(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "HOQ065" in df: df.loc[df["HOQ065"].isin([7,9]), "HOQ065"] = np.nan
    keep = [c for c in ["SEQN","HOD050","HOQ065","SDDSRVYR"] if c in df.columns]
    return df[keep]

parts = []
seen = set()
for p, cyc in hoq_early_files:
    if p.exists() and cyc not in seen:
        df = read_any(p); df["SDDSRVYR"] = cyc
        df = filter_adults(df)
        parts.append(preprocess_hoq(df)); seen.add(cyc)
hoq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

if not hoq_main.exists():
    raise FileNotFoundError(f"Missing {hoq_main}")
hoq_main_df = preprocess_hoq(filter_adults(read_any(hoq_main)))

hoq_all = pd.concat([hoq_early, hoq_main_df], ignore_index=True)
for c in ("SEQN","SDDSRVYR"):
    if c in hoq_all: hoq_all[c] = pd.to_numeric(hoq_all[c], errors="coerce").astype("Int64")
hoq_all = hoq_all.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("HOQ:", hoq_all.shape)


HOQ: (55081, 4)


In [26]:
# %% HIQ/HIQS: 1999–2018 (insurance category INS)
HIQ = MOD / "hiq"
hiq_main = HIQ / "hiqs.sas7bdat"
hiq_early_files = [(HIQ/"HIQ.xpt",1), (HIQ/"HIQ_B.xpt",2), (HIQ/"hiq.sas7bdat",1), (HIQ/"hiq_b.sas7bdat",2)]

# stack early + main, adult filter
parts, seen = [], set()
for p, cyc in hiq_early_files:
    if p.exists() and cyc not in seen:
        df = read_any(p); df["SDDSRVYR"] = cyc
        parts.append(filter_adults(df)); seen.add(cyc)
hiq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

if not hiq_main.exists():
    raise FileNotFoundError(f"Missing {hiq_main}")
hiqs = filter_adults(read_any(hiq_main))

hiq_all = pd.concat([hiq_early, hiqs], ignore_index=True)
hiq_all.columns = hiq_all.columns.str.upper()

# build INS (0 none, 1 private, 2 Medicare, 3 Medicaid/both, 5 other)
ins = pd.DataFrame({"SEQN": hiq_all["SEQN"]})
if "SDDSRVYR" in hiq_all: ins["SDDSRVYR"] = hiq_all["SDDSRVYR"]
ins["INS"] = np.nan

cond_private = (s_or_false(hiq_all,"HIQ031A") == 14) | (s_or_false(hiq_all,"HID030A") == 1)
ins.loc[cond_private, "INS"] = 1

cond_med = (
    (s_or_false(hiq_all,"HIQ031B") == 15) &
    (s_or_false(hiq_all,"HIQ031D") != 17) &
    (s_or_false(hiq_all,"HIQ031E") != 18)
) | (
    (s_or_false(hiq_all,"HID030B") == 1) &
    (s_or_false(hiq_all,"HID030C") != 1)
)
ins.loc[cond_med, "INS"] = 2

cond_mcaid_only = (
    ((s_or_false(hiq_all,"HIQ031D") == 17) | (s_or_false(hiq_all,"HIQ031E") == 18)) &
    (s_or_false(hiq_all,"HIQ031B") != 15)
) | (
    (s_or_false(hiq_all,"HID030B") != 1) &
    (s_or_false(hiq_all,"HID030C") == 1)
)
cond_both = (
    (s_or_false(hiq_all,"HIQ031B") == 15) & (s_or_false(hiq_all,"HIQ031D") == 17)
) | (
    (s_or_false(hiq_all,"HID030B") == 1) & (s_or_false(hiq_all,"HID030C") == 1)
)
ins.loc[cond_mcaid_only | cond_both, "INS"] = 3

other_cols = [c for c in ["HIQ031C","HIQ031F","HIQ031G","HIQ031H","HIQ031I"] if c in hiq_all.columns]
cond_other = hiq_all[other_cols].eq(1).any(axis=1) if other_cols else pd.Series(False, index=hiq_all.index)
cond_other = cond_other | (s_or_false(hiq_all,"HID030D") == 1)
ins.loc[cond_other, "INS"] = 5

none_conds = []
if "HIQ011" in hiq_all: none_conds.append(hiq_all["HIQ011"] == 2)
if "HID010" in hiq_all: none_conds.append(hiq_all["HID010"] == 2)
if none_conds: ins.loc[np.logical_or.reduce(none_conds), "INS"] = 0

for c in ("SEQN","SDDSRVYR"):
    if c in ins: ins[c] = pd.to_numeric(ins[c], errors="coerce").astype("Int64")
ins = ins.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("INS:", ins.shape)


INS: (55081, 3)


In [27]:
# %% FSQ/FSQS: 1999–2018 (SNAP & FS)
FSQ = MOD / "fsq"
fsq_main = FSQ / "fsqs.sas7bdat"
fsq_early_files = [(FSQ/"FSQ.xpt",1), (FSQ/"FSQ_B.xpt",2), (FSQ/"fsq.sas7bdat",1), (FSQ/"fsq_b.sas7bdat",2)]

parts, seen = [], set()
for p, cyc in fsq_early_files:
    if p.exists() and cyc not in seen:
        df = read_any(p); df["SDDSRVYR"] = cyc
        parts.append(filter_adults(df)); seen.add(cyc)
fsq_early = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

if not fsq_main.exists():
    raise FileNotFoundError(f"Missing {fsq_main}")
fsqs = filter_adults(read_any(fsq_main))

fsq_all = pd.concat([fsq_early, fsqs], ignore_index=True)
fsq_all.columns = fsq_all.columns.str.upper()

snap = pd.DataFrame({"SEQN": fsq_all["SEQN"]})
if "SDDSRVYR" in fsq_all: snap["SDDSRVYR"] = fsq_all["SDDSRVYR"]
if "FSDHH" in fsq_all:   snap["FSDHH"] = fsq_all["FSDHH"]

snap["SNAP"] = np.nan
if "FSQ165" in fsq_all: snap.loc[fsq_all["FSQ165"] == 2, "SNAP"] = 0
if "FSQ012" in fsq_all:
    snap.loc[fsq_all["FSQ012"] == 1, "SNAP"] = 1
    snap.loc[fsq_all["FSQ012"] == 2, "SNAP"] = 0
if "FSQ171" in fsq_all:
    snap.loc[fsq_all["FSQ171"] == 1, "SNAP"] = 1
    snap.loc[fsq_all["FSQ171"] == 2, "SNAP"] = 0
if "FSD170N" in fsq_all: snap.loc[fsq_all["FSD170N"] >= 1, "SNAP"] = 1
if "FSQ170" in fsq_all:
    snap.loc[fsq_all["FSQ170"] == 1, "SNAP"] = 1
    snap.loc[(fsq_all["FSQ170"] == 2) & (fsq_all.get("FSD170N", pd.Series(index=fsq_all.index)) < 1), "SNAP"] = 0
if "FSD200" in fsq_all: snap.loc[fsq_all["FSD200"] == 1, "SNAP"] = 1

snap["FS"] = np.nan
if "FSDHH" in fsq_all:
    snap.loc[fsq_all["FSDHH"].isin([1,2]), "FS"] = 1
    snap.loc[fsq_all["FSDHH"] > 2,        "FS"] = 0

for c in ("SEQN","SDDSRVYR"):
    if c in snap: snap[c] = pd.to_numeric(snap[c], errors="coerce").astype("Int64")
snap = snap[[c for c in ["SEQN","SNAP","FSDHH","FS","SDDSRVYR"] if c in snap.columns]]
snap = snap.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])
print("SNAP/FS:", snap.shape)


SNAP/FS: (55081, 5)


<h3> Audit — cycles present & inner vs left-merge coverage </h3>

In [28]:
# %% Audit helpers + run
def cycles(df, name):
    if "SDDSRVYR" in df.columns and df["SDDSRVYR"].notna().any():
        print(f"{name} cycles:", sorted(df["SDDSRVYR"].dropna().astype(int).unique()))
    else:
        tmp = df.merge(demo9923[["SEQN","SDDSRVYR"]], on="SEQN", how="left")
        if tmp["SDDSRVYR"].notna().any():
            print(f"{name} cycles (via DEMO):", sorted(tmp["SDDSRVYR"].dropna().astype(int).unique()))
        else:
            print(f"{name} cycles: (none found)")

def coverage(base, addon, name):
    base_u = base["SEQN"].dropna().nunique()
    inner_u = base.merge(addon[["SEQN"]].drop_duplicates(), on="SEQN", how="inner")["SEQN"].nunique()
    left_u  = base.merge(addon[["SEQN"]].drop_duplicates(),  on="SEQN", how="left")["SEQN"].nunique()
    print(f"{name:>5} | base={base_u:,} | inner keeps={inner_u:,} | left keeps={left_u:,}")

# Choose a base for audit (prefers mortality, else diet)
if "mort_with_demo" in globals():
    base_df = mort_with_demo
elif "SODH_diet_mort" in globals():
    base_df = SODH_diet_mort
else:
    base_df = demo9923

for (n, df) in [("OCQ", ocq), ("HOQ", hoq_all), ("HIQ", ins), ("FSQ", snap)]:
    cycles(df, n)
    coverage(base_df, df, n)


OCQ cycles: [np.int64(1), np.int64(2)]
  OCQ | base=56,253 | inner keeps=52,287 | left keeps=56,253
HOQ cycles: [np.int64(1), np.int64(2)]
  HOQ | base=56,253 | inner keeps=52,287 | left keeps=56,253
HIQ cycles: [np.int64(1), np.int64(2)]
  HIQ | base=56,253 | inner keeps=52,287 | left keeps=56,253
FSQ cycles: [np.int64(1), np.int64(2)]
  FSQ | base=56,253 | inner keeps=52,287 | left keeps=56,253


<h3> Left-merge onto mortality base and save </h3>

In [54]:
# check if output path is correct 
import os
from pathlib import Path

print("ROOT =", ROOT)
print("OUT  =", OUT)
print("SDOH_ROOT env =", os.environ.get("SDOH_ROOT"))

# ✅ Force the correct OUT (one-time reassignment in this kernel)
OUT = ROOT / "output"
print("OUT (fixed) =", OUT)


ROOT = /Users/dengshuyue/Desktop/SDOH/analysis
OUT  = /Users/dengshuyue/Desktop/SDOH/analysis/output
SDOH_ROOT env = None
OUT (fixed) = /Users/dengshuyue/Desktop/SDOH/analysis/output


In [52]:
# %% Build master (LEFT merges onto mort_with_demo) and save — refined
import pandas as pd
import numpy as np

if "mort_with_demo" not in globals():
    print("⚠️ mort_with_demo not found — skip merge/save.")
else:
    def _key(df: pd.DataFrame) -> pd.DataFrame:
        d = df.copy()
        d.columns = d.columns.str.upper()
        d["SEQN"] = pd.to_numeric(d["SEQN"], errors="coerce").astype("Int64")
        return d.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

    base = _key(mort_with_demo)

    # Pick only needed columns if present
    ocq_m  = _key(ocq)      [[c for c in ("SEQN","EMPLOY","UNEMPLOYMENT") if c in ocq.columns]]      if "ocq"      in globals() else None
    hoq_m  = _key(hoq_all)  [[c for c in ("SEQN","HOD050","HOQ065")        if c in hoq_all.columns]]  if "hoq_all"  in globals() else None
    ins_m  = _key(ins)      [[c for c in ("SEQN","INS")                    if c in ins.columns]]      if "ins"      in globals() else None
    snap_m = _key(snap)     [[c for c in ("SEQN","SNAP","FSDHH","FS")      if c in snap.columns]]     if "snap"     in globals() else None

    # Merge in sequence with one-to-one validation
    master = base
    for name, piece in [("OCQ", ocq_m), ("HOQ", hoq_m), ("INS", ins_m), ("FSQ", snap_m)]:
        if piece is None:
            print(f"ℹ️  {name}: not available — skipped")
            continue
        try:
            master = master.merge(piece, on="SEQN", how="left", validate="one_to_one")
        except Exception as e:
            raise RuntimeError(f"Merge failed for {name} (check duplicate SEQN).") from e

    print("✅ master shape:", master.shape)

    # Quick audit (non-missing coverage for key SDOH fields)
    audit_cols = [c for c in ["EMPLOY","UNEMPLOYMENT","HOD050","HOQ065","INS","SNAP","FS"] if c in master.columns]
    if audit_cols:
        cov = (master[audit_cols].notna().mean()*100).round(1).sort_values(ascending=False)
        print("\nNon-missing coverage (%):")
        for k, v in cov.items(): print(f"  {k}: {v}%")

    # Save (use predefined OUT)
    OUT.mkdir(parents=True, exist_ok=True)
    base_name = "nhanes_mort_demo_sdoh_1999_2018"
    master.to_parquet(OUT / f"{base_name}.parquet", index=False)
    master.to_csv(    OUT / f"{base_name}.csv",     index=False)
    print("\nSaved:\n ", OUT / f"{base_name}.parquet", "\n ", OUT / f"{base_name}.csv")


✅ master shape: (56253, 20)

Non-missing coverage (%):
  UNEMPLOYMENT: 92.9%
  HOD050: 91.6%
  HOQ065: 91.5%
  INS: 87.7%
  EMPLOY: 86.8%
  SNAP: 76.3%
  FS: 74.7%

Saved:
  /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.parquet 
  /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.csv


In [65]:
# %% Build master (LEFT merges onto mort_with_demo) and save — refined
from pathlib import Path
import pandas as pd
import numpy as np

# If OUT wasn't defined earlier, set it here
ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT = ROOT / "out"
OUT.mkdir(parents=True, exist_ok=True)

if "mort_with_demo" not in globals():
    print("⚠️ mort_with_demo not found — skip merge/save.")
else:
    def _key(df: pd.DataFrame) -> pd.DataFrame:
        d = df.copy()
        d.columns = d.columns.str.upper()
        # normalize SEQN
        d["SEQN"] = pd.to_numeric(d["SEQN"], errors="coerce")
        # if it's float but all .0, move to Int64
        if pd.api.types.is_float_dtype(d["SEQN"]) and ((d["SEQN"] % 1) == 0).all():
            d["SEQN"] = d["SEQN"].astype("Int64")
        else:
            d["SEQN"] = d["SEQN"].astype("Int64")
        return d.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

    base = _key(mort_with_demo)

    # helper to pick columns AFTER uppercasing
    def _pick_cols(df_up: pd.DataFrame, candidates: tuple[str, ...]) -> pd.DataFrame:
        keep = [c for c in candidates if c in df_up.columns]
        return df_up[keep] if keep else df_up[["SEQN"]]

    ocq_m  = _pick_cols(_key(ocq),      ("SEQN","EMPLOY","UNEMPLOYMENT")) if "ocq"      in globals() else None
    hoq_m  = _pick_cols(_key(hoq_all),  ("SEQN","HOD050","HOQ065"))        if "hoq_all"  in globals() else None
    ins_m  = _pick_cols(_key(ins),      ("SEQN","INS"))                    if "ins"      in globals() else None
    snap_m = _pick_cols(_key(snap),     ("SEQN","SNAP","FSDHH","FS"))      if "snap"     in globals() else None

    # Merge in sequence with one-to-one validation
    master = base
    for name, piece in [("OCQ", ocq_m), ("HOQ", hoq_m), ("INS", ins_m), ("FSQ", snap_m)]:
        if piece is None:
            print(f"ℹ️  {name}: not available — skipped")
            continue
        # make sure no dupes in piece (already handled by _key → but safe)
        if piece["SEQN"].duplicated().any():
            raise RuntimeError(f"{name}: duplicate SEQN detected after cleaning.")
        try:
            master = master.merge(piece, on="SEQN", how="left", validate="one_to_one")
        except Exception as e:
            raise RuntimeError(f"Merge failed for {name} (check duplicate SEQN).") from e

    print("✅ master shape:", master.shape)

    # Quick audit (non-missing coverage for key SDOH fields)
    audit_cols = [c for c in ["EMPLOY","UNEMPLOYMENT","HOD050","HOQ065","INS","SNAP","FS"] if c in master.columns]
    if audit_cols:
        cov = (master[audit_cols].notna().mean()*100).round(1).sort_values(ascending=False)
        print("\nNon-missing coverage (%):")
        for k, v in cov.items():
            print(f"  {k}: {v}%")
    else:
        print("ℹ️ No audit columns present among EMPLOY/UNEMPLOYMENT/HOD050/HOQ065/INS/SNAP/FS.")

    # Save
    base_name = "nhanes_mort_demo_sdoh_1999_2018"
    master.to_parquet(OUT / f"{base_name}.parquet", index=False)
    master.to_csv(    OUT / f"{base_name}.csv",     index=False)
    print("\nSaved:\n ", OUT / f"{base_name}.parquet", "\n ", OUT / f"{base_name}.csv")


✅ master shape: (56253, 20)

Non-missing coverage (%):
  UNEMPLOYMENT: 92.9%
  HOD050: 91.6%
  HOQ065: 91.5%
  INS: 87.7%
  EMPLOY: 86.8%
  SNAP: 76.3%
  FS: 74.7%

Saved:
  /Users/dengshuyue/Desktop/SDOH/analysis/out/nhanes_mort_demo_sdoh_1999_2018.parquet 
  /Users/dengshuyue/Desktop/SDOH/analysis/out/nhanes_mort_demo_sdoh_1999_2018.csv


In [58]:
# keep a clear in-memory handle for downstream cells
nhanes_mort_demo_sdoh = master.copy()

# TEMP compatibility alias for older notebooks
nhanes_mort_demo_soc_9918 = nhanes_mort_demo_sdoh
df = nhanes_mort_demo_sdoh

In [59]:


# Use in-memory object if available; else load from disk
try:
    df = nhanes_mort_demo_soc_9918
except NameError:
    ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
    candidates = [
        ROOT / "analysis" / "output" / "nhanes_mort_demo_soc_1999_2018.parquet",
        ROOT / "analysis" / "output" / "mort_with_demo_plus_soc.parquet",
    ]
    for p in candidates:
        if p.exists():
            df = pd.read_parquet(p)
            print(f"Loaded from: {p}")
            break
    else:
        raise FileNotFoundError("Couldn’t find the saved table in expected locations.")

# Ensure SEQN exists and is numeric
if "SEQN" not in df.columns:
    raise KeyError("SEQN column not found.")
s = pd.to_numeric(df["SEQN"], errors="coerce")

print(f"Rows: {len(df):,} | Unique SEQN: {s.nunique(dropna=True):,} | "
      f"Missing SEQN: {s.isna().sum():,} | Duplicates: {df.duplicated('SEQN').sum():,}")
print(f"SEQN range: {int(s.min())} → {int(s.max())}")

# Optional: cycles present (if available)
if "SDDSRVYR" in df.columns:
    print("Cycles:\n", df["SDDSRVYR"].value_counts(dropna=False).sort_index())


Rows: 56,253 | Unique SEQN: 56,253 | Missing SEQN: 0 | Duplicates: 0
SEQN range: 2 → 102956
Cycles:
 SDDSRVYR
1.0     4973
2.0     5586
3.0     5293
4.0     5332
5.0     5989
6.0     6346
7.0     5603
8.0     5913
9.0     5720
10.0    5498
Name: count, dtype: int64


<h3> Optional Check for old files and moved to old </h3>

In [61]:
from pathlib import Path
from datetime import datetime
import pandas as pd

# Correct output folder (no double "analysis")
out = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")

# Try new names first, then legacy
candidates = [
    out / "nhanes_mort_demo_sdoh_1999_2018.parquet",
    out / "nhanes_mort_demo_sdoh_1999_2018.csv",
    out / "nhanes_mort_demo_soc_1999_2018.parquet",  # legacy
    out / "nhanes_mort_demo_soc_1999_2018.csv",      # legacy
]

# Show which of the candidates exist, with timestamps and sizes
print("Looking in:", out)
found_any = False
for p in candidates:
    if p.exists():
        ts = datetime.fromtimestamp(p.stat().st_mtime).strftime("%Y-%m-%d %H:%M:%S")
        print(f"  ✓ {p.name:40s} | modified: {ts} | size: {p.stat().st_size:,} bytes")
        found_any = True
    else:
        print(f"  - {p.name:40s} (missing)")

# Load the first existing file (prefer Parquet)
df = None
for p in candidates:
    if p.exists():
        df = pd.read_parquet(p) if p.suffix == ".parquet" else pd.read_csv(p)
        print("\nLoaded:", p)
        break

if df is None:
    print("\n⚠️ No saved table found in expected locations. "
          "Re-run the merge/save cell in 00_demo_mort_sdoh.ipynb.")
else:
    print("Shape:", df.shape)
    print("Cols (first 10):", df.columns[:10].tolist())


Looking in: /Users/dengshuyue/Desktop/SDOH/analysis/output
  ✓ nhanes_mort_demo_sdoh_1999_2018.parquet  | modified: 2025-09-09 13:20:01 | size: 774,526 bytes
  ✓ nhanes_mort_demo_sdoh_1999_2018.csv      | modified: 2025-09-09 13:20:01 | size: 4,562,121 bytes
  - nhanes_mort_demo_soc_1999_2018.parquet   (missing)
  - nhanes_mort_demo_soc_1999_2018.csv       (missing)

Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.parquet
Shape: (56253, 20)
Cols (first 10): ['SEQN', 'ELIGSTAT', 'MORTSTAT', 'PERMTH_EXM', 'PERMTH_INT', 'UCOD_LEADING', 'DIABETES', 'HYPERTEN', 'TIME_Y', 'EVENT']


In [62]:
from pathlib import Path
from datetime import datetime

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT  = ROOT / "analysis" / "output"
DATA = ROOT / "data"

patterns = {
    OUT:  ["mort_with_demo_plus_soc.*",
           "nhanes_mort_demo_soc_*.parquet",
           "nhanes_mort_demo_soc_*.csv"],
    DATA: ["SODH_diet_mort*"],
}

def keep_newest_per_ext(paths):
    by_ext = {}
    for p in paths:
        by_ext.setdefault(p.suffix.lower(), []).append(p)
    keep = set()
    for ext, files in by_ext.items():
        keep.add(max(files, key=lambda x: x.stat().st_mtime))
    return keep

def fmt_size(num_bytes: int) -> str:
    for unit in ("B","KB","MB","GB","TB"):
        if num_bytes < 1024 or unit == "TB":
            return f"{num_bytes:,.0f} {unit}"
        num_bytes /= 1024

# gather matches
all_matches = []
for base, globs in patterns.items():
    for pat in globs:
        all_matches.extend(sorted(base.glob(pat)))

# decide which to keep/delete
TO_KEEP = keep_newest_per_ext(all_matches)
TO_DELETE = [p for p in all_matches if p not in TO_KEEP]

print("✅ Keeping (newest per extension):")
keep_total = 0
for p in sorted(TO_KEEP):
    stat = p.stat()
    ts = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
    sz = fmt_size(stat.st_size); keep_total += stat.st_size
    print(f"  ✓ {p}  |  {ts}  |  {sz}")
print(f"  ↳ total size kept: {fmt_size(keep_total)}")

print("\n🗑️ Candidates to delete (older versions):")
del_total = 0
for p in sorted(TO_DELETE):
    stat = p.stat()
    ts = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
    sz = fmt_size(stat.st_size); del_total += stat.st_size
    print(f"  - {p}  |  {ts}  |  {sz}")
print(f"  ↳ total size to delete: {fmt_size(del_total)}")

(len(TO_KEEP), len(TO_DELETE))


✅ Keeping (newest per extension):
  ✓ /Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort.pkl  |  2025-07-18 10:39:20  |  27 MB
  ✓ /Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort_depr2.csv  |  2025-09-08 16:16:55  |  21 MB
  ↳ total size kept: 48 MB

🗑️ Candidates to delete (older versions):
  - /Users/dengshuyue/Desktop/SDOH/analysis/data/SODH_diet_mort6.csv  |  2025-08-10 11:39:07  |  14 MB
  ↳ total size to delete: 14 MB


(2, 1)

In [63]:
from pathlib import Path
from datetime import datetime
import shutil

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
DATA = ROOT / "data"
OLD  = DATA / "old"
OLD.mkdir(parents=True, exist_ok=True)

# 1) Find the SODH diet CSVs under data/
cands = sorted(DATA.glob("SODH_diet_mort*.csv"))

# 2) Exclude the two you want to keep
exclude_stems = {"SODH_diet_mort_depr2", "SODH_diet_mort6"}
to_move = [p for p in cands if p.stem not in exclude_stems]

# --- Preview ---
def fmt_size(n):
    for u in ("B","KB","MB","GB","TB"):
        if n < 1024: return f"{n:,.0f} {u}"
        n /= 1024

if not to_move:
    print("Nothing to move.")
else:
    total = 0
    print("Will move to /data/old:")
    for p in to_move:
        st = p.stat()
        total += st.st_size
        ts = datetime.fromtimestamp(st.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
        print(f"  • {p.name}  |  {fmt_size(st.st_size)}  |  {ts}")
    print(f"Total: {fmt_size(total)}")

# --- Move (flip to True to execute) ---
CONFIRM_MOVE = False  # <-- set True to actually move

if CONFIRM_MOVE and to_move:
    for p in to_move:
        target = OLD / p.name
        if target.exists():  # avoid overwrite if a same-named file already there
            stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
            target = OLD / f"{p.stem}_{stamp}{p.suffix}"
        shutil.move(str(p), str(target))
        print(f"📦 moved: {p.name} -> {target.name}")


Nothing to move.


In [64]:
# Recompute to_move safely
cands = sorted(DATA.glob("SODH_diet_mort*.csv"))
exclude_stems = {"SODH_diet_mort_depr2", "SODH_diet_mort6"}
to_move = [p for p in cands if p.stem not in exclude_stems]

# Execute move
for p in to_move:
    target = OLD / p.name
    if target.exists():  # avoid overwrite
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        target = OLD / f"{p.stem}_{stamp}{p.suffix}"
    shutil.move(str(p), str(target))
    print(f"📦 moved: {p.name} -> {target}")

# Quick verify
print("\nNow in /data/old:")
for p in sorted(OLD.glob("SODH_diet_mort*.csv")):
    print(" -", p.name)


Now in /data/old:
 - SODH_diet_mort.csv
 - SODH_diet_mort2.csv
 - SODH_diet_mort3.csv
 - SODH_diet_mort4.csv
 - SODH_diet_mort5.csv
 - SODH_diet_mort_depr.csv
