# 01 — Check the previously merged file and preview the covariate (COV) files (NHANES)

Run this first — it executes the `00_demo_mort_sdoh.ipynb` notebook:
```python
%run 00_demo_mort_sdoh.ipynb
```

In [2]:

# If not already installed, install pyreadstat (used to read .sas7bdat files)
!pip install pyreadstat
!pip install openpyxl

# Import core packages
import pandas as pd
import os
import numpy as np
import pyreadstat
from scipy.stats import sem
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")


# Paths (absolute locations on the author's machine)
# data_path   -> the analysis data directory
# output_path -> demo_summary.csv inside the analysis output directory
data_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data"
output_path = "/Users/dengshuyue/Desktop/SDOH/analysis/output/demo_summary.csv"


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
from pathlib import Path
import pandas as pd

# Output folder that is searched for a previously saved merged file.
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")

# Candidate files, tried in this order (parquet before CSV).
cands = [
    OUT/"nhanes_mort_demo_sdoh_1999_2018.parquet",
    OUT/"nhanes_mort_demo_soc_1999_2018.parquet",
    OUT/"nhanes_mort_demo_sdoh_1999_2018.csv",
    OUT/"nhanes_mort_demo_soc_1999_2018.csv",
]

# First candidate that exists on disk, or None when there is none.
demo_mort_sodh = None
p = next((c for c in cands if c.exists()), None)
if p is None:
    raise FileNotFoundError("No saved nhanes_mort_demo_* file found in OUT.")

# The file suffix decides which pandas reader is used.
if p.suffix == ".parquet":
    demo_mort_sodh = pd.read_parquet(p)
else:
    demo_mort_sodh = pd.read_csv(p)
print("Loaded:", p)


Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.parquet


In [12]:
# Report the number of rows (i.e., participants) in the dataset
n_rows = len(demo_mort_sodh)
print(f"Total sample size: {n_rows:,}")

# Report the smallest and largest value of the age column (RIDAGEYR)
age = demo_mort_sodh['RIDAGEYR']
min_age, max_age = age.min(), age.max()
print(f"Age range: {min_age} to {max_age} years")



Total sample size: 56,253
Age range: 18.0 to 85.0 years


<h2>check demo</h2>

In [14]:
# Data folder used by the rest of the notebook
DATA = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data")

# Option 1: load the pickle (fastest, preserves dtypes)
demo9923 = pd.read_pickle(DATA.joinpath("demo9923.pkl"))

# Option 2: load the CSV instead (max compatibility)
# demo9923 = pd.read_csv(DATA / "demo9923.csv")

# Sanity check: the first line printed should be True
demo_columns = demo9923.columns
print("RIDAGEYR" in demo_columns)
print(demo_columns)

True
Index(['SEQN', 'RIDAGEYR', 'SDDSRVYR', 'CYCLE', 'MARRIAGE', 'MARRIAGE3'], dtype='object')


<h2>check cov</h2>

In [18]:
# Load mort_with_demo first so the covariates can be merged onto it
from pathlib import Path
import pandas as pd

OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")

# Files to try, in priority order
candidates = [
    OUT/"mort_with_demo.parquet",
    OUT/"nhanes_mort_demo_sdoh_1999_2018.parquet",  # your later master also works
    OUT/"mort_with_demo.csv",
]

# Take the first candidate that exists; only report (do not raise) when none does,
# because the merge cell checks for `mort_with_demo` itself.
p = next((c for c in candidates if c.exists()), None)
if p is None:
    print("No saved mort_with_demo file found.")
else:
    if p.suffix == ".parquet":
        mort_with_demo = pd.read_parquet(p)
    else:
        mort_with_demo = pd.read_csv(p)
    print("Loaded mort_with_demo from:", p)


Loaded mort_with_demo from: /Users/dengshuyue/Desktop/SDOH/analysis/output/nhanes_mort_demo_sdoh_1999_2018.parquet


In [24]:
# # %% Inventory & peek: DATA/cov
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np

# If DATA isn't defined already, set it:
# Use the default data folder only when DATA was not defined by an earlier cell
if "DATA" not in globals():
    DATA = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data")

# Folder holding the covariate files; stop right away when it is missing
COV_DIR = DATA.joinpath("cov")
if not COV_DIR.exists():
    raise FileNotFoundError(f"cov folder not found: {COV_DIR}")

# File suffixes included in the inventory below
EXSTS = {".csv", ".parquet", ".pkl", ".sas7bdat", ".xpt"}

def fmt_size(n):
    """Format a byte count as a rounded, comma-grouped string with a unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (GB) is reached, then rendered with no decimals, e.g. ``"22 MB"``.
    """
    units = ("B", "KB", "MB", "GB")
    last = len(units) - 1
    idx = 0
    # Step up one unit at a time; GB is the ceiling, so larger values stay in GB.
    while idx < last and not (n < 1024):
        n = n / 1024
        idx += 1
    return f"{n:,.0f} {units[idx]}"

def peek_file(p: Path):
    """Inspect a data file cheaply: column names, column count and a small sample.

    The reader is chosen from the lower-cased file suffix. Where the reader
    allows it, only a few rows are read.

    Parameters
    ----------
    p : Path
        File to inspect (.csv, .parquet, .pkl, .sas7bdat or .xpt).

    Returns
    -------
    tuple
        ``(cols, ncols, sample)``: the list of column names, its length, and a
        DataFrame with at most three rows. For parquet files read through
        pyarrow the sample contains only the first ten columns, and is None
        when sampling fails. An unsupported suffix yields ``([], None, None)``.

    Notes
    -----
    Errors are not raised. A warning is printed and whatever was gathered
    before the failure is returned.
    """
    suf = p.suffix.lower()
    cols, ncols, sample = [], None, None

    try:
        if suf == ".csv":
            df = pd.read_csv(p, nrows=5, low_memory=False)
            cols = df.columns.tolist(); ncols = len(cols); sample = df.head(3)
        elif suf == ".parquet":
            try:
                import pyarrow.parquet as pq
                pf = pq.ParquetFile(p)
                # Column names come from the schema, so no data is read for them.
                cols = list(pf.schema_arrow.names); ncols = len(cols)
                # Light sample: read only the first row group (first 10 columns)
                # instead of every row of the file.
                try:
                    if pf.num_row_groups > 0:
                        tbl = pf.read_row_group(0, columns=cols[:10], use_threads=False)
                    else:
                        # No row groups: the file holds no rows, so a full read is cheap.
                        tbl = pq.read_table(p, columns=cols[:10], use_threads=False)
                    sample = tbl.to_pandas().head(3)
                except Exception:
                    sample = None
            except Exception:
                # fallback (may read whole file—ok if small)
                df = pd.read_parquet(p)
                cols = df.columns.tolist(); ncols = len(cols); sample = df.head(3)
        elif suf == ".pkl":
            # NOTE: unpickling can execute arbitrary code; only use on trusted files.
            df = pd.read_pickle(p)  # loads fully; ok if manageable
            cols = df.columns.tolist(); ncols = len(cols); sample = df.head(3)
        elif suf in {".sas7bdat", ".xpt"}:
            try:
                import pyreadstat
                if suf == ".sas7bdat":
                    df, _ = pyreadstat.read_sas7bdat(p, row_limit=5)
                else:
                    df, _ = pyreadstat.read_xport(p, row_limit=5)
            except Exception:
                # pandas fallback (reads the whole file, then keeps 5 rows; may be slower)
                fmt = "sas7bdat" if suf == ".sas7bdat" else "xport"
                df = pd.read_sas(p, format=fmt)
                df = df.head(5)
            cols = df.columns.tolist(); ncols = len(cols); sample = df.head(3)
        else:
            cols, ncols, sample = [], None, None
    except Exception as e:
        print(f"   ⚠️ Could not peek {p.name}: {e}")

    return cols, ncols, sample

# Inventory: supported files in COV_DIR, most recently modified first
files = [p for p in COV_DIR.iterdir() if p.is_file() and p.suffix.lower() in EXSTS]
files.sort(key=lambda x: x.stat().st_mtime, reverse=True)

if not files:
    print("No covariate files found in", COV_DIR)

# (an empty list simply skips the loop)
for p in files:
    # Header line: name | size | last-modified timestamp
    stat = p.stat()
    sz = fmt_size(stat.st_size)
    ts = datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M:%S")
    print(f"\n— {p.name}  |  {sz}  |  {ts}")

    cols, ncols, sample = peek_file(p)

    # ncols stays None when the file could not be read or is unsupported
    if ncols is not None:
        more = ' ...' if ncols > 25 else ''
        print(f"   Columns ({ncols}): {cols[:25]}{more}")
        # Case-insensitive presence check for the key analysis fields
        upper_names = {c.upper() for c in cols}
        key_fields = {}
        for k in ["SEQN","RIDAGEYR","SDDSRVYR","EVENT","TIME_Y"]:
            key_fields[k] = k in upper_names
        print("   Key fields:", key_fields)

    if isinstance(sample, pd.DataFrame):
        print("   Sample rows:")
        display(sample)  # in notebooks this renders a nice table

# nhanes_primary_anal_full_singleimputation_v2: No smoking-related columns found 


— nhanes_primary_anal_full_singleimputation_v2.csv  |  22 MB  |  2025-09-09 12:01:49
   Columns (75): ['X', 'SEQN', 'SDDSRVYR', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'SDMVPSU', 'SDMVSTRA', 'age', 'sex', 're', 'edu', 'pir', 'tchol', 'hdl', 'ldl', 'tg', 'wc', 'bmi', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi'] ...
   Key fields: {'SEQN': True, 'RIDAGEYR': False, 'SDDSRVYR': True, 'EVENT': False, 'TIME_Y': False}
   Sample rows:


Unnamed: 0,X,SEQN,SDDSRVYR,WTINT2YR,WTMEC2YR,WTSAF2YR,SDMVPSU,SDMVSTRA,age,sex,...,poor_all,optimal_all_sec,poor_all_sec,MetS_hdl,MetS_triglycerides,MetS_bp,MetS_wc,MetS_fpg,MetS_count,MetS
0,1,1,1,9727.078709,10982.898896,75131.2,1,5,2,2,...,0,0,0,0,0,0,0,1,1,0
1,2,2,1,26678.636376,28325.384898,60586.147294,3,1,77,1,...,0,0,0,0,0,0,0,0,0,0
2,3,3,1,43621.680548,46192.256945,121969.841152,2,7,10,2,...,0,0,0,1,1,0,0,0,2,0



— BMX_9918.csv  |  2 MB  |  2025-09-09 12:01:43
   Columns (4): ['Unnamed: 0', 'seqn', 'BMXWT', 'BMXHT']
   Key fields: {'SEQN': True, 'RIDAGEYR': False, 'SDDSRVYR': False, 'EVENT': False, 'TIME_Y': False}
   Sample rows:


Unnamed: 0.1,Unnamed: 0,seqn,BMXWT,BMXHT
0,1,1,12.5,91.6
1,2,2,75.4,174.0
2,3,3,32.9,136.6



— smk_9918.csv  |  2 MB  |  2025-09-09 12:01:40
   Columns (7): ['Unnamed: 0', 'seqn', 'smk', 'smk_yr', 'pack', 'pack_yr', 'smk_avg']
   Key fields: {'SEQN': True, 'RIDAGEYR': False, 'SDDSRVYR': False, 'EVENT': False, 'TIME_Y': False}
   Sample rows:


Unnamed: 0.1,Unnamed: 0,seqn,smk,smk_yr,pack,pack_yr,smk_avg
0,1,2,1,7777,7777.0,,
1,2,5,2,7777,7777.0,,
2,3,7,2,22,1.0,8030.0,



— totalpa_9918_imputed.csv  |  8 MB  |  2025-09-09 12:01:37
   Columns (38): ['SEQN', 'sddsrvyr', 'ltpa', 'SDMVPSU', 'SDMVSTRA', 'age', 'sex', 're', 'edu', 'pir', 'tchol', 'hdl', 'ldl', 'tg', 'wc', 'bmi', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'cancer', 'emphysema'] ...
   Key fields: {'SEQN': True, 'RIDAGEYR': False, 'SDDSRVYR': True, 'EVENT': False, 'TIME_Y': False}
   Sample rows:


Unnamed: 0,SEQN,sddsrvyr,ltpa,SDMVPSU,SDMVSTRA,age,sex,re,edu,pir,...,copd,sbp,dbp,dm_rx,chol_rx,angina_rx,htn_rx,roseQ,metscore,imp
0,2,1,0.0,3,1,77,1,3,5,5.0,...,2,100.666667,56.666667,0,0,0,0,0,60,1
1,5,1,41.066667,2,8,49,1,3,5,5.0,...,2,122.0,82.666667,0,0,0,1,0,1920,1
2,7,1,3.033333,2,4,59,2,4,2,1.04,...,2,125.333333,80.0,0,0,0,0,0,0,1


In [21]:
# %% Load covariate CSVs from DATA/cov and (optionally) merge
from pathlib import Path
import pandas as pd
import numpy as np

# Requires DATA from an earlier cell, e.g. DATA = Path("/Users/dengshuyue/Desktop/SDOH/analysis/data")
COV_DIR = DATA.joinpath("cov")
if not COV_DIR.exists():
    raise FileNotFoundError(f"cov folder not found: {COV_DIR}")

# Every CSV directly inside COV_DIR, in sorted path order
cov_files = list(COV_DIR.glob("*.csv"))
cov_files.sort()
if len(cov_files) == 0:
    raise FileNotFoundError(f"No CSVs in {COV_DIR}")

def _prep(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    d.columns = d.columns.str.upper()
    if "SEQN" not in d.columns:
        raise ValueError("SEQN not found in a covariate file.")
    d["SEQN"] = pd.to_numeric(d["SEQN"], errors="coerce").astype("Int64")
    return d.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"])

# Read every covariate CSV, clean it with _prep, and store it as cov[<file stem>]
# NOTE(review): index columns saved into the CSVs (e.g. "Unnamed: 0") are kept
# as ordinary columns here — confirm whether they should be dropped.
cov = {}
print("Found covariate files:")
for p in cov_files:
    df = _prep(pd.read_csv(p, low_memory=False))
    cov[p.stem] = df
    # one-line summary per file
    n_rows, n_cols = df.shape
    n_ids = df["SEQN"].nunique()
    print(f"  • {p.name:30s} | rows={n_rows:,} | cols={n_cols} | SEQN uniq={n_ids:,}")

# ---------- OPTIONAL MERGE ----------
# Base frame: the first of these names that exists in the notebook namespace
base = None
for candidate in ("mort_with_demo", "demo9923"):
    if candidate in globals():
        base = _prep(globals()[candidate])
        base_name = candidate
        break

if base is not None:
    merged = base
    for name, df in cov.items():
        # Columns (other than SEQN) already present in `merged` are renamed to
        # "<FILE STEM>__<column>" so the join cannot produce clashing names
        taken = set(merged.columns)
        renames = {
            c: f"{name.upper()}__{c}"
            for c in df.columns
            if c != "SEQN" and c in taken
        }
        if renames:
            df = df.rename(columns=renames)
        # Left join keeps every base row; validate guards against duplicated SEQN
        merged = pd.merge(merged, df, on="SEQN", how="left", validate="one_to_one")
        print(f"  + merged {name}: now {merged.shape}")

    print(f"\n✅ covariates merged onto {base_name}. Final shape: {merged.shape}")
    # keep the result available to later cells
    cov_merged = merged
else:
    print("\nℹ️ Skipping merge (no base df in memory). Use `cov` dict as needed.")


Found covariate files:
  • BMX_9918.csv                   | rows=96,766 | cols=4 | SEQN uniq=96,766
  • nhanes_primary_anal_full_singleimputation_v2.csv | rows=101,316 | cols=75 | SEQN uniq=101,316
  • smk_9918.csv                   | rows=55,081 | cols=7 | SEQN uniq=55,081
  • totalpa_9918_imputed.csv       | rows=55,081 | cols=38 | SEQN uniq=55,081
  + merged BMX_9918: now (56253, 23)
  + merged nhanes_primary_anal_full_singleimputation_v2: now (56253, 97)
  + merged smk_9918: now (56253, 103)
  + merged totalpa_9918_imputed: now (56253, 140)

✅ covariates merged onto mort_with_demo. Final shape: (56253, 140)


In [22]:
# Rename RIDAGEYR in covar to avoid overwriting demoall's RIDAGEYR
# covar = covar.rename(columns={"RIDAGEYR": "RIDAGEYR_covar"})


In [None]:
from functools import reduce

# ✅ Step 0: Merge all datasets by SEQN (each one is left-joined onto the running result)
# NOTE(review): none of the frames listed below is created in the cells shown in
# this notebook — confirm they are defined before running this cell.
merge_list = [demoall, hoq_all, ocq, mort, snap, ins, covar, dietwt, scores2, covariates1]


def _left_join_on_seqn(left, right):
    """Left-join ``right`` onto ``left`` using the SEQN column."""
    return pd.merge(left, right, on="SEQN", how="left")


score_mort = reduce(_left_join_on_seqn, merge_list)

In [None]:
# ✅ Step 1: Keep adults only (age ≥ 20)
age_col = "RIDAGEYR"
if age_col not in score_mort.columns:
    raise ValueError("RIDAGEYR is missing after merge — check if demoall was merged correctly.")
is_adult = score_mort[age_col].ge(20)
score_mort = score_mort.loc[is_adult]

In [1]:
import nbformat, textwrap, sys

# Print the source of every code cell of 01_extract_clean_core.ipynb,
# each preceded by a "# %% [cell N]" marker (N counts all cells, starting at 1).
nb = nbformat.read("01_extract_clean_core.ipynb", as_version=4)
for i, cell in enumerate(nb.cells, start=1):
    if cell.cell_type != "code":
        continue
    print(f"\n# %% [cell {i}]")
    print(cell.source)



# %% [cell 3]

# If not already installed, install pyreadstat (used to read .sas7bdat files)
!pip install pyreadstat
!pip install openpyxl

# Import core packages
import pandas as pd
import os
import numpy as np
import pyreadstat
from scipy.stats import sem
import statsmodels.api as sm


#path 
output_path = "/Users/dengshuyue/Desktop/SDOH/analysis/output/demo_summary.csv"
data_path = "/Users/dengshuyue/Desktop/SDOH/analysis/data"

# %% [cell 4]
# Display total number of rows (i.e., participants) in the dataset
print(f"Total sample size: {df.shape[0]:,}")

# Check the min and max age
min_age = df['RIDAGEYR'].min()
max_age = df['RIDAGEYR'].max()
print(f"Age range: {min_age} to {max_age} years")



# %% [cell 6]
print("RIDAGEYR" in demoall.columns)  # Should print True
print(demoall.columns)

# %% [cell 7]
# Rename RIDAGEYR in covar to avoid overwriting demoall's RIDAGEYR
covar = covar.rename(columns={"RIDAGEYR": "RIDAGEYR_covar"})


# %% [cell 8]
from functools import reduce

# ✅ Step 