## Setup

In [121]:
import os
from pathlib import Path

import pandas as pd

# Project root. Set the SDOH_ROOT environment variable to run this notebook on
# another machine; when it is unset the original local path is used unchanged.
ROOT = Path(os.environ.get("SDOH_ROOT", "/Users/dengshuyue/Desktop/SDOH/analysis"))
OUT  = ROOT / "output"

MY_PATH = OUT / "demo_mt_cov_dp_sdoh.parquet"
LU_PATH = ROOT / "data/cov/nhanes_primary_anal_full_singleimputation_v2.csv"

# Fail early with a readable message rather than a deep reader traceback.
for _input_path in (MY_PATH, LU_PATH):
    if not _input_path.is_file():
        raise FileNotFoundError(
            f"Input file not found: {_input_path} (set SDOH_ROOT to point at the project root)"
        )

# The two covariate tables compared throughout the notebook.
df_my_cov_1999_2023 = pd.read_parquet(MY_PATH)
df_lu_cov_1999_2018 = pd.read_csv(LU_PATH)

print("mine:", df_my_cov_1999_2023.shape, "| lu:", df_lu_cov_1999_2018.shape)
print("Loaded:", MY_PATH)
print("Loaded:", LU_PATH)



mine: (128809, 82) | lu: (101316, 75)
Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/output/demo_mt_cov_dp_sdoh.parquet
Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/data/cov/nhanes_primary_anal_full_singleimputation_v2.csv


In [122]:
df_my_cov_1999_2023.head(10)

Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,AGE_YR,RIAGENDR,SEX,FEMALE,SMK_STATUS,...,FS_HH,FS_ADULT,FS_FINAL,HHFDSEC,ADFDSEC,FS_HH4,FS_ADULT4,FS_SOURCE_HH,FS_SOURCE_FINAL,SNAP_SOURCE
0,1,1.0,1.0,5.0,10982.898896,2.0,2,F,1,,...,,,,,,,,,,
1,2,1.0,3.0,1.0,28325.384898,77.0,1,M,0,NEVER,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
2,3,1.0,2.0,7.0,46192.256945,10.0,2,F,1,,...,,,,,,,,,,
3,4,1.0,1.0,2.0,10251.26002,1.0,1,M,0,,...,,,,,,,,,,
4,5,1.0,2.0,8.0,99445.065735,49.0,1,M,0,FORMER,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
5,6,1.0,2.0,2.0,39656.600444,19.0,2,F,1,,...,,,,,,,,,,
6,7,1.0,2.0,4.0,25525.423409,59.0,2,F,1,FORMER,...,,,,,,,,,,
7,8,1.0,1.0,6.0,31510.587866,13.0,1,M,0,,...,,,,,,,,,,
8,9,1.0,2.0,9.0,7575.870247,11.0,2,F,1,,...,,,,,,,,,,
9,10,1.0,1.0,7.0,22445.808572,43.0,1,M,0,CURRENT,...,,,,,,,,,,


#### Helpers (used later)

In [211]:
# 1) HELPERS (robust + imports np)
import numpy as np
import pandas as pd

def _norm_str_col(series: pd.Series) -> pd.Series:
    """Lowercase + strip + turn 'nan' into actual NaN."""
    s = series.astype(str).str.strip().str.lower()
    return s.replace({"nan": np.nan})

def _num_summary(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Lightweight numeric summary for a set of columns."""
    rows = []
    for c in cols:
        s = pd.to_numeric(df[c], errors="coerce")
        rows.append({
            "column": c,
            "n": len(s),
            "na_rate": float(s.isna().mean()),
            "min": np.nanmin(s.values),
            "p25": np.nanpercentile(s.values, 25),
            "median": np.nanmedian(s.values),
            "p75": np.nanpercentile(s.values, 75),
            "max": np.nanmax(s.values),
            "mean": np.nanmean(s.values),
            "std": np.nanstd(s.values),
            "unique_non_na": int(s.nunique(dropna=True)),
        })
    return pd.DataFrame(rows)

def _binary_sig(series: pd.Series) -> str | None:
    """Detect common binary encodings.

    Returns one of "0/1", "yes/no", "true/false", "male/female" when every
    non-missing value (normalized through ``_norm_str_col``) belongs to that
    pair, otherwise None. A column with no non-missing values returns None.
    """
    vals = set(_norm_str_col(series).dropna().unique())
    if not vals:
        # The empty set is a subset of every pair, so an all-missing column
        # would otherwise be reported as "0/1".
        return None
    # Float-coded flags stringify as "0.0"/"1.0"; fold them onto "0"/"1".
    vals = {v.removesuffix(".0") for v in vals}
    if vals <= {"0","1"}: return "0/1"
    if vals <= {"yes","no"}: return "yes/no"
    if vals <= {"true","false"}: return "true/false"
    if vals <= {"male","female"}: return "male/female"
    return None


#### Column set differences 

In [212]:
# 2) Compare the two frames' column names as sets

cols_my, cols_lu = (
    set(frame.columns) for frame in (df_my_cov_1999_2023, df_lu_cov_1999_2018)
)

# Sorted so the printed previews are stable and easy to scan
audit_only_in_lu = sorted(cols_lu.difference(cols_my))
audit_only_in_my = sorted(cols_my.difference(cols_lu))
audit_in_both    = sorted(cols_my.intersection(cols_lu))

print(f"[Columns] only_in_lu: {len(audit_only_in_lu)} | only_in_my: {len(audit_only_in_my)} | in_both: {len(audit_in_both)}")
for _label, _names in (("• only_in_lu (first 20):", audit_only_in_lu),
                       ("• only_in_my (first 20):", audit_only_in_my)):
    print(_label, _names[:20])



[Columns] only_in_lu: 69 | only_in_my: 76 | in_both: 6
• only_in_lu (first 20): ['MetS', 'MetS_bp', 'MetS_count', 'MetS_fpg', 'MetS_hdl', 'MetS_triglycerides', 'MetS_wc', 'WTINT2YR', 'WTSAF2YR', 'X', 'adiposity_pri', 'adiposity_sec', 'age', 'age_cat', 'angina', 'angina_rx', 'asthma', 'bmi', 'bp_pri', 'bp_sec']
• only_in_my (first 20): ['ADFDSEC', 'AGE_YR', 'ALCOHOL_CAT', 'BMI', 'BMI_CLAS', 'BMXHT', 'BMXWT', 'CANCER', 'CENSORED', 'CIDI_12M_MDE', 'CIDI_SCORE_RAW', 'CIGS_PER_DAY', 'DBP', 'DEP_HARMONIZED', 'DEP_IMP', 'DEP_SOURCE', 'DIABETES', 'DMDHHSIZ', 'DPQ_CAT', 'DRINKS_PER_DAY']


In [213]:
# Column count and full column list: mine first, then Lu's
for _frame in (df_my_cov_1999_2023, df_lu_cov_1999_2018):
    _names = _frame.columns.tolist()
    print(len(_frame.columns), "columns")
    print(_names)

82 columns
['SEQN', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'AGE_YR', 'RIAGENDR', 'SEX', 'FEMALE', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER', 'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'LTPA', 'METSCORE', 'IMP', 'BMXWT', 'BMXHT', 'BMI', 'BMI_CLAS', 'DIABETES', 'HTN', 'HIGH_CHOL', 'CVD', 'CANCER', 'SBP', 'DBP', 'TCHOL', 'HDL', 'LDL', 'TG', 'DMDHHSIZ', 'ELIGSTAT', 'MORTSTAT', 'PERMTH_EXM', 'PERMTH_INT', 'UCOD_LEADING', 'IS_POST2018', 'IS_ADULT', 'MORTALITY_COVERED', 'EVENT', 'CENSORED', 'FU_YRS_EXM', 'FU_YRS_INT', 'UCOD_LABEL', 'PHQ9', 'PHQ9_GE10', 'DPQ_CAT', 'DEP_IMP', 'CIDI_SCORE_RAW', 'CIDI_12M_MDE', 'WTSCI2YR', 'DEP_HARMONIZED', 'DEP_SOURCE', 'PIR', 'PIR_CAT', 'INDFMINC', 'EDU', 'EDU_CAT', 'RACE_ETH', 'MARITAL', 'MARITAL_CAT', 'EMPLOY', 'UNEMPLOYMENT', 'HOD050', 'HOQ065', 'INS', 'SNAP', 'FSDHH', 'FS', 'FS_HH', 'FS_ADULT', 'FS_FINAL', 'HHFDSEC', 'ADFDSEC', 'FS_HH4', 'FS_ADULT4', 'FS_SOURCE_HH', 'FS_SOURCE_FINAL', 'SNAP_SOURCE']
75 columns
['X', 'SEQN', 'SDDSRVYR', 'WTIN

In [214]:
# %whos DataFrame


In [215]:
# df_my_cov_1999_2023[['MORTALITY_COVERED', 'EVENT', "UCOD_LABEL", "CANCER"]].head(20)

## Align column names with Lu's dataset

In [216]:
import re, difflib
import pandas as pd

# === Inputs (your two originals) ===
# Copies of the two loaded frames, so the originals stay untouched
mine, lu = df_my_cov_1999_2023.copy(), df_lu_cov_1999_2018.copy()

# Column names as plain lists
lu_cols = lu.columns.tolist()
mine_cols = mine.columns.tolist()

# ---------- Helpers ----------
def norm(s: str) -> str:
    """Lower-case ``s`` and strip everything except digits and a-z.

    For example AGE_YR -> ageyr and RACE_ETH -> raceeth. Non-string input is
    returned unchanged.
    """
    if not isinstance(s, str):
        return s
    lowered = s.lower()
    return re.sub(r'[^0-9a-z]+', '', lowered)

# Normalized name -> Lu column; when two Lu columns normalize to the same key
# the one that appears first in lu_cols is kept.
lu_norm_to_name = {}
for c in lu_cols:
    key = norm(c)
    if key not in lu_norm_to_name:
        lu_norm_to_name[key] = c

# Hand-written overrides (expand if mismatches show up).
# YOUR column name -> Lu column name
synonyms = {
    # demographics / socioeconomic
    'AGE_YR':'age', 'SEX':'sex', 'RACE_ETH':'re', 'EDU':'edu', 'PIR':'pir',
    # lipids, anthropometry, blood pressure
    'TCHOL':'tchol', 'HDL':'hdl', 'LDL':'ldl', 'TG':'tg',
    'WC':'wc', 'BMI':'bmi', 'SBP':'sbp', 'DBP':'dbp',
    # conditions (Lu keeps "CVD" upper-case)
    'DIABETES':'diabetes', 'CVD':'CVD', 'CANCER':'cancer',
    # medication / symptom flags
    'DM_RX':'dm_rx', 'CHOL_RX':'chol_rx', 'HTN_RX':'htn_rx',
    'ANGINA_RX':'angina_rx', 'ANGINA':'angina',
    # categorical recodes
    'AGE_CAT':'age_cat', 'PIR_CAT':'pir_cat', 'EDU2':'edu2',
    # MetS components
    'METS_HDL':'MetS_hdl', 'METS_TRIGLYCERIDES':'MetS_triglycerides',
    'METS_BP':'MetS_bp', 'METS_WC':'MetS_wc', 'METS_FPG':'MetS_fpg',
    'METS_COUNT':'MetS_count',
    'ROSEQ':'roseQ', 'NO_NA':'no_na', 'LUNG_DISEASE':'lung_disease',
    # *_pri / *_sec columns
    'BP_PRI':'bp_pri', 'GLUCOSE_PRI':'glucose_pri', 'LIPID_PRI':'lipid_pri',
    'ADIPOSITY_PRI':'adiposity_pri', 'CVD_PRI':'cvd_pri',
    'BP_SEC':'bp_sec', 'GLUCOSE_SEC':'glucose_sec', 'LIPID_SEC':'lipid_sec',
    'ADIPOSITY_SEC':'adiposity_sec', 'CVD_SEC':'cvd_sec',
    # common admin/weight vars (identity entries)
    'WTMEC2YR':'WTMEC2YR', 'SDDSRVYR':'SDDSRVYR',
    'SDMVPSU':'SDMVPSU', 'SDMVSTRA':'SDMVSTRA',
}

# IDs/keys that already match and must never be renamed
protect_exact = {'SEQN', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR'}

# ---------- Build mapping (your -> lu) ----------
mapping = {}          # YOUR column -> Lu column, applied afterwards
used_targets = set()  # Lu names already claimed (prevents two src -> one dst)


def _rename_candidates(src):
    """Yield Lu names to try for ``src``, most trusted first (None = no proposal)."""
    # 1) explicit synonym
    yield synonyms.get(src)
    # 2) first Lu column equal to src ignoring case
    yield next(
        (dst for dst in lu_cols if isinstance(dst, str) and dst.lower() == src.lower()),
        None,
    )
    # 3) same name after normalization
    yield lu_norm_to_name.get(norm(src))
    # 4) closest fuzzy match above a strict cutoff
    yield next(iter(difflib.get_close_matches(src, lu_cols, n=1, cutoff=0.92)), None)


for src in mine_cols:
    # Protected keys, audit (*_lu) columns and names Lu already uses stay as-is
    if src in protect_exact or src.endswith('_lu') or src in lu_cols:
        continue

    for dst in _rename_candidates(src):
        if dst is None or dst not in lu_cols:
            continue
        # Skip a target that is taken or would collide with one of your columns
        if dst in used_targets or dst in mine_cols:
            continue
        mapping[src] = dst
        used_targets.add(dst)
        break

# ---------- Apply rename ----------
# Rename, then order columns: Lu's order first, then your extras, then *_lu
mine_renamed = mine.rename(columns=mapping).copy()

ordered = [name for name in lu_cols if name in mine_renamed.columns]
extras, audit = [], []
for name in mine_renamed.columns:
    if name.endswith('_lu'):
        audit.append(name)
    elif name not in ordered:
        extras.append(name)

df_my_cov_aligned = mine_renamed[ordered + extras + audit].copy()

# Report what was renamed and which Lu columns have no counterpart
renamed_pairs = sorted(mapping.items(), key=lambda pair: pair[0].lower())
missing_in_mine = [name for name in lu_cols if name not in df_my_cov_aligned.columns]

print(f"Renamed {len(renamed_pairs)} columns automatically.")
print("Examples:", renamed_pairs[:10])
print("Still missing from your data (present in Lu):", missing_in_mine)
print("Final order starts with:", df_my_cov_aligned.columns[:15].tolist())


Renamed 15 columns automatically.
Examples: [('AGE_YR', 'age'), ('BMI', 'bmi'), ('CANCER', 'cancer'), ('DBP', 'dbp'), ('DIABETES', 'diabetes'), ('EDU', 'edu'), ('HDL', 'hdl'), ('LDL', 'ldl'), ('PIR', 'pir'), ('PIR_CAT', 'pir_cat')]
Still missing from your data (present in Lu): ['X', 'WTINT2YR', 'WTSAF2YR', 'wc', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na', 'age_cat', 'edu2', 'lung_disease', 'tchol_hdl', 'angina', 'lipid_pri', 'adiposity_pri', 'bp_pri', 'glucose_pri', 'cvd_pri', 'lipid_sec', 'adiposity_sec', 'bp_sec', 'glucose_sec', 'cvd_sec', 'optimal_pri_count', 'intermediate_pri_count', 'poor_pri_count', 'optimal_sec_count', 'intermediate_sec_count', 'poor_sec_count', 'optimal_all', 'poor_all', 'optimal_all_sec', 'poor_all_sec', 'MetS_hdl', 'MetS_triglycerides', 'MetS_bp', 'MetS_wc', 'MetS_fpg', 'MetS_count', 'MetS']
Final order starts with: ['SEQN', 'SDDSRVYR'

#### Add missing columns by merging from Lu

In [217]:
import re, pandas as pd

# === 0) Start from the two originals ===
# Fresh copies of both loaded frames
mine, lu = df_my_cov_1999_2023.copy(), df_lu_cov_1999_2018.copy()

# === 1) Auto-rename YOUR columns to Lu's names (case/underscore-insensitive + synonyms) ===
def norm(s: str) -> str:
    """Lower-case ``s`` and keep only digits and a-z; non-strings pass through unchanged."""
    if isinstance(s, str):
        return re.sub(r'[^0-9a-z]+', '', s.lower())
    return s

# Column names of both frames as plain lists
lu_cols = list(lu.columns)
mine_cols = list(mine.columns)

# First pass: normalized name index for Lu.
# setdefault keeps the first Lu column seen for each normalized key.
lu_norm_to_name = {}
for c in lu_cols:
    lu_norm_to_name.setdefault(norm(c), c)

# Synonyms (YOUR -> Lu): explicit overrides tried before any automatic match
synonyms = {
    'AGE_YR':'age','SEX':'sex','RACE_ETH':'re','EDU':'edu','PIR':'pir',
    'TCHOL':'tchol','HDL':'hdl','LDL':'ldl','TG':'tg',
    'WC':'wc','BMI':'bmi','SBP':'sbp','DBP':'dbp',
    'DIABETES':'diabetes','CVD':'CVD','CANCER':'cancer',
    'DM_RX':'dm_rx','CHOL_RX':'chol_rx','HTN_RX':'htn_rx','ANGINA_RX':'angina_rx','ANGINA':'angina',
    'AGE_CAT':'age_cat','PIR_CAT':'pir_cat','EDU2':'edu2',
    'METS_HDL':'MetS_hdl','METS_TRIGLYCERIDES':'MetS_triglycerides','METS_BP':'MetS_bp',
    'METS_WC':'MetS_wc','METS_FPG':'MetS_fpg','METS_COUNT':'MetS_count',
    'ROSEQ':'roseQ','NO_NA':'no_na','LUNG_DISEASE':'lung_disease',
    'BP_PRI':'bp_pri','GLUCOSE_PRI':'glucose_pri','LIPID_PRI':'lipid_pri','ADIPOSITY_PRI':'adiposity_pri',
    'CVD_PRI':'cvd_pri','BP_SEC':'bp_sec','GLUCOSE_SEC':'glucose_sec','LIPID_SEC':'lipid_sec',
    'ADIPOSITY_SEC':'adiposity_sec','CVD_SEC':'cvd_sec',
    # admin/weights that already match: identity entries, never consulted
    # because the mapping loop skips protect_exact names before the lookup
    'WTMEC2YR':'WTMEC2YR','SDDSRVYR':'SDDSRVYR','SDMVPSU':'SDMVPSU','SDMVSTRA':'SDMVSTRA'
}

# Keys/design/weight columns the mapping loop must leave untouched
protect_exact = {'SEQN','SDDSRVYR','SDMVPSU','SDMVSTRA','WTMEC2YR'}

mapping = {}
used_targets = set()
for src in mine_cols:
    # Protected keys, *_lu audit columns and names Lu already uses are left alone
    if src in protect_exact or src.endswith('_lu') or src in lu_cols:
        continue

    # Proposals from most to least trusted; None means "no proposal"
    proposals = (
        synonyms.get(src),                                              # synonym
        next((dst for dst in lu_cols
              if isinstance(dst, str) and dst.lower() == src.lower()),
             None),                                                     # case-insensitive
        lu_norm_to_name.get(norm(src)),                                 # normalized
    )
    for dst in proposals:
        if dst is None or dst not in lu_cols:
            continue
        # A target may be claimed once and must not clash with an existing column
        if dst in used_targets or dst in mine.columns:
            continue
        mapping[src] = dst
        used_targets.add(dst)
        break

mine = mine.rename(columns=mapping)

# === 2) Identify Lu columns you still lack, and merge ONLY those in ===
# Lu columns that are still absent after the rename
missing = [c for c in lu_cols if c not in mine.columns]

# Both frames must carry the merge keys
_merge_keys = ['SEQN','SDDSRVYR']
for k in _merge_keys:
    if not (k in mine.columns and k in lu.columns):
        raise KeyError(f"Key {k} missing in one of the frames")

# Keys plus the missing columns only; keep the first row of any duplicated key
lu_sub = lu[_merge_keys + missing].copy()
dup_ct = lu_sub.duplicated(_merge_keys).sum()
if dup_ct:
    print(f"[warn] Dropping {dup_ct} duplicate rows on keys in Lu subset")
    lu_sub = lu_sub.drop_duplicates(_merge_keys, keep='first')

# Left join: every row of `mine` is kept; no suffixes needed because the
# added columns do not exist in `mine`
aligned = mine.merge(lu_sub, on=_merge_keys, how='left')

# Lu's column order first, remaining columns after it
order = [c for c in lu_cols if c in aligned.columns]
extras = [c for c in aligned.columns if c not in order]
df_my_cov_aligned = aligned[order + extras].copy()

# Quick report
_ellipsis = "..." if len(missing)>20 else ""
print(f"Auto-renamed {len(mapping)} columns to Lu names.")
print("Filled from Lu (newly added):", missing[:20], _ellipsis)
still_missing = [c for c in lu_cols if c not in df_my_cov_aligned.columns]  # should be empty
print("Still missing Lu cols:", still_missing)
print("Final starts with:", df_my_cov_aligned.columns[:15].tolist())


Auto-renamed 15 columns to Lu names.
Filled from Lu (newly added): ['X', 'WTINT2YR', 'WTSAF2YR', 'wc', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx'] ...
Still missing Lu cols: []
Final starts with: ['X', 'SEQN', 'SDDSRVYR', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'SDMVPSU', 'SDMVSTRA', 'age', 'sex', 're', 'edu', 'pir', 'tchol', 'hdl']


#### Clean and check the merged file

In [218]:
# Remove the 'X' column when present; errors='ignore' makes this a no-op otherwise
df_my_cov_aligned = df_my_cov_aligned.drop(columns=['X'], errors='ignore')


In [219]:
import pandas as pd

# Binary flags -> nullable 8-bit integers (unparseable entries become <NA>)
bin_cols = ['dm_self','chf','chd','mi','stroke','emphysema','bronchitis','asthma',
            'copd','dm_rx','chol_rx','angina_rx','htn_rx','angina']
# Labs/metrics -> plain numeric (unparseable entries become NaN)
num_cols = ['wc','hba1c','fpg','tchol_hdl','MetS_hdl','MetS_triglycerides',
            'MetS_bp','MetS_wc','MetS_fpg','MetS_count']

for c in [name for name in df_my_cov_aligned.columns if name in bin_cols]:
    as_number = pd.to_numeric(df_my_cov_aligned[c], errors='coerce')
    df_my_cov_aligned[c] = as_number.astype('Int8')

for c in [name for name in df_my_cov_aligned.columns if name in num_cols]:
    df_my_cov_aligned[c] = pd.to_numeric(df_my_cov_aligned[c], errors='coerce')


In [220]:
# check 
df_my_cov_aligned.head(10)

Unnamed: 0,SEQN,SDDSRVYR,WTINT2YR,WTMEC2YR,WTSAF2YR,SDMVPSU,SDMVSTRA,age,sex,re,...,FS_HH,FS_ADULT,FS_FINAL,HHFDSEC,ADFDSEC,FS_HH4,FS_ADULT4,FS_SOURCE_HH,FS_SOURCE_FINAL,SNAP_SOURCE
0,1,1.0,9727.078709,10982.898896,75131.2,1.0,5.0,2.0,F,Other Hispanic,...,,,,,,,,,,
1,2,1.0,26678.636376,28325.384898,60586.147294,3.0,1.0,77.0,M,Mexican American,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
2,3,1.0,43621.680548,46192.256945,121969.841152,2.0,7.0,10.0,F,Mexican American,...,,,,,,,,,,
3,4,1.0,10346.119327,10251.26002,4624.687273,1.0,2.0,1.0,M,Other Hispanic,...,,,,,,,,,,
4,5,1.0,91050.84662,99445.065735,234895.20565,2.0,8.0,49.0,M,Mexican American,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
5,6,1.0,36508.250375,39656.600444,13379.8,2.0,2.0,19.0,F,NH Black,...,,,,,,,,,,
6,7,1.0,22352.08862,25525.423409,57661.621988,2.0,4.0,59.0,F,Other Hispanic,...,,,,,,,,,,
7,8,1.0,31600.089655,31510.587866,76026.438279,1.0,6.0,13.0,M,Mexican American,...,,,,,,,,,,
8,9,1.0,7529.435502,7575.870247,14694.924957,2.0,9.0,11.0,F,Other Hispanic,...,,,,,,,,,,
9,10,1.0,21071.164059,22445.808572,60202.416895,1.0,7.0,43.0,M,Other Hispanic,...,,,,,,,,,,


## Keep the columns needed for this analysis

In [221]:
import pandas as pd

df = df_my_cov_aligned.copy()

# Final table / missing checks
needed_core = [
    "RIDAGEYR", "SEX", "RACE", "household_size",
    "SMK_AVG", "SMK", "ALCG2", "met_hr",
    "bmic", "DIABE", "HYPERTEN", "chol_rx", "CVD", "cancer",
    "probable_depression", "sdoh_score", "ahei_total",
]

# Inputs for sdoh_score, then inputs for HYPERTEN
needed_build = [
    "unemployment2", "pir", "SNAP", "EDU", "sdoh_access", "ins", "HOQ065", "marriage",
    "BPQ020", "BPQ050A", "sbp", "dbp",
]

needed_survey = ["sdmvpsu", "sdmvstra", "wt", "wt10"]  # keep wt10 if you still reference it
needed_keys   = ["SEQN", "SDDSRVYR"]

needed_optional = [
    "FS", "SNAP3", "dm_rx", "angina", "lung_disease", "MORTSTAT",
    "hba1c", "hdl", "ldl", "tg",
]

NEEDED = [*needed_core, *needed_build, *needed_survey, *needed_keys, *needed_optional]

# Split the wish-list into columns the frame has and columns it lacks
present, missing = [], []
for c in NEEDED:
    (present if c in df.columns else missing).append(c)

print(f"Keeping {len(present)} columns; missing {len(missing)}:")
print("Missing:", missing)

df_desc = df[present].copy()

# Sanity peek: shape plus the ten columns with the highest missing share
print("df_desc shape:", df_desc.shape)
print("NA rates (top 10):")
_na_rates = df_desc.isna().mean().sort_values(ascending=False)
print(_na_rates.head(10).round(3))


Keeping 19 columns; missing 26:
Missing: ['RIDAGEYR', 'SEX', 'RACE', 'household_size', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr', 'bmic', 'DIABE', 'HYPERTEN', 'probable_depression', 'sdoh_score', 'ahei_total', 'unemployment2', 'EDU', 'sdoh_access', 'ins', 'marriage', 'BPQ020', 'BPQ050A', 'sdmvpsu', 'sdmvstra', 'wt', 'wt10', 'SNAP3']
df_desc shape: (128809, 19)
NA rates (top 10):
SNAP            0.587
HOQ065          0.581
MORTSTAT        0.541
FS              0.518
chol_rx         0.213
dm_rx           0.213
ldl             0.213
hdl             0.213
hba1c           0.213
lung_disease    0.213
dtype: float64


In [222]:
# 1) Plain list (sorted)
cols = (df_my_cov_aligned.columns.tolist())
print(f"{len(cols)} columns:\n", cols)


135 columns:
 ['SEQN', 'SDDSRVYR', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'SDMVPSU', 'SDMVSTRA', 'age', 'sex', 're', 'edu', 'pir', 'tchol', 'hdl', 'ldl', 'tg', 'wc', 'bmi', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'cancer', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'sbp', 'dbp', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na', 'age_cat', 'pir_cat', 'edu2', 'CVD', 'lung_disease', 'diabetes', 'tchol_hdl', 'angina', 'lipid_pri', 'adiposity_pri', 'bp_pri', 'glucose_pri', 'cvd_pri', 'lipid_sec', 'adiposity_sec', 'bp_sec', 'glucose_sec', 'cvd_sec', 'optimal_pri_count', 'intermediate_pri_count', 'poor_pri_count', 'optimal_sec_count', 'intermediate_sec_count', 'poor_sec_count', 'optimal_all', 'poor_all', 'optimal_all_sec', 'poor_all_sec', 'MetS_hdl', 'MetS_triglycerides', 'MetS_bp', 'MetS_wc', 'MetS_fpg', 'MetS_count', 'MetS', 'RIAGENDR', 'FEMALE', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER', 'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'LTPA', 'METSCORE'

#### Adjust column names

In [223]:
import pandas as pd
import numpy as np

# Work on a copy: the aliasing below adds columns to `df` only,
# leaving df_my_cov_aligned as it was.
df = df_my_cov_aligned.copy()

# ---------- helpers ----------
def ci_pick(*names, frame=None):
    """Return the first column of ``frame`` that matches any of ``names``.

    Names are tried in the order given; for each one an exact match wins,
    otherwise a case-insensitive match is used.

    Parameters
    ----------
    *names : str
        Candidate column names, highest priority first.
    frame : pd.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df`` so existing
        calls keep working.

    Returns
    -------
    The matching column label, or None when nothing matches.
    """
    frame = df if frame is None else frame
    # Non-string labels have no .lower(); leave them out of the
    # case-insensitive index instead of crashing.
    lowmap = {c.lower(): c for c in frame.columns if isinstance(c, str)}
    for n in names:
        if n in frame.columns:
            return n
        if isinstance(n, str) and n.lower() in lowmap:
            return lowmap[n.lower()]
    return None

def pick_best(cands, frame=None):
    """Return the candidate column with the highest share of non-missing values.

    Each candidate is resolved through ``ci_pick``; unresolved candidates are
    ignored. On a tie the candidate listed first wins.

    Parameters
    ----------
    cands : iterable of str
        Candidate column names.
    frame : pd.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df``.

    Returns
    -------
    The chosen column label, or None when no candidate exists in the frame.
    """
    frame = df if frame is None else frame
    cols = []
    for c in cands:
        col = ci_pick(c, frame=frame)
        # Several candidates may resolve to the same column; score it once.
        if col and col not in cols:
            cols.append(col)
    if not cols:
        return None
    cov = {c: frame[c].notna().mean() for c in cols}
    return max(cov, key=cov.get)  # highest coverage

# canonical name -> source column it was copied from (printed in the report)
created = {}

# ---------- 1) alias to canonical (adds BPQ+MM fallbacks, AHEI fallbacks) ----------
# Each canonical name lists candidate source columns. pick_best() resolves them
# case-insensitively and returns the one with the highest non-missing share,
# so list order only breaks ties.
aliases = {
    # IDs / survey
    "RIDAGEYR": ["RIDAGEYR","AGE_YR","age"],
    "SEX":      ["SEX","sex","RIAGENDR"],
    "RACE":     ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "sdmvpsu":  ["sdmvpsu","SDMVPSU"],
    "sdmvstra": ["sdmvstra","SDMVSTRA"],
    # NOTE(review): wt and wt10 both fall back to WTMEC2YR, so they become
    # identical copies of it. If wt10 is meant to be a differently scaled
    # weight, this copy is not equivalent -- confirm before use.
    "wt":       ["wt","WTMEC2YR"],
    "wt10":     ["wt10","WTMEC2YR"],

    # behavior
    "SMK_AVG":  ["SMK_AVG","CIGS_PER_DAY"],
    # NOTE(review): chosen by coverage, not list order; the recorded run
    # resolved SMK to FORMER_SMOKER rather than SMK_STATUS -- confirm this
    # is the intended source.
    "SMK":      ["SMK","FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":    ["ALCG2","ALCOHOL_CAT"],
    "met_hr":   ["met_hr","METSCORE","LTPA"],

    # clinical
    "bmic":     ["bmic","BMI_CLAS"],
    "DIABE":    ["DIABE","DIABETES","diabetes"],
    "chol_rx":  ["chol_rx"],
    "CVD":      ["CVD"],
    "cancer":   ["cancer"],

    # outcomes / scores
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],
    "ahei_total": ["ahei_total","AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],

    # building blocks (for possible compute later)
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "pir":           ["pir"],
    "SNAP":          ["SNAP"],
    "EDU":           ["EDU","EDU_CAT","edu","edu2"],
    "sdoh_access":   ["sdoh_access","HUQ_ACCESS","huq_access"],
    "ins":           ["ins","INS"],
    "HOQ065":        ["HOQ065"],
    "marriage":      ["marriage","MARITAL","MARITAL_CAT"],

    # HTN components (case-insensitive)
    "BPQ020":        ["BPQ020","bpq020"],
    "BPQ050A":       ["BPQ050A","bpq050a"],
    "sbp":           ["sbp","SBP"],
    "dbp":           ["dbp","DBP"],

    # SNAP3 label variant
    "SNAP3":         ["SNAP3","SNAP"],
}

# Copy (not rename) the chosen source into the canonical name when they differ.
# Order matters: a column added here is visible to the lookups of later targets.
for target, cands in aliases.items():
    src = pick_best(cands)
    if src:
        if target != src:
            df[target] = df[src]
            created[target] = src

# Normalize SEX: numeric codes 1/2 become "Male"/"Female"; text labels are
# stripped and capitalized.
if "SEX" in df.columns:
    sex_raw = df["SEX"]
    if pd.api.types.is_numeric_dtype(sex_raw):
        # Codes other than 1/2 keep their original value.
        df["SEX"] = sex_raw.map({1: "Male", 2: "Female"}).fillna(sex_raw)
    else:
        sex_txt = sex_raw.astype(str).str.strip().str.capitalize()
        # astype(str) turns a missing value into the text "nan" (capitalized
        # to "Nan"), which would then count as a third sex category; put the
        # missing values back.
        df["SEX"] = sex_txt.where(sex_raw.notna(), np.nan)
        # NOTE(review): short labels such as "F"/"M" are kept as they are and
        # are not expanded to "Female"/"Male" -- confirm what downstream expects.

# Derive HYPERTEN only when the frame does not already provide it (your R rule)
if "HYPERTEN" not in df.columns:

    def _column_or_all_nan(*names):
        """Return the first matching column, or an all-NaN Series on df's index."""
        found = ci_pick(*names)
        if found:
            return df[found]
        return pd.Series(np.nan, index=df.index)

    bpq020  = _column_or_all_nan("BPQ020","bpq020")
    bpq050a = _column_or_all_nan("BPQ050A","bpq050a")
    sbp     = _column_or_all_nan("sbp","SBP")
    dbp     = _column_or_all_nan("dbp","DBP")

    # 1 when any criterion is met; 0 when none is met but at least one input
    # is observed; NaN when all four inputs are missing
    _any_criterion = (bpq020==1) | (bpq050a==1) | (sbp>=130) | (dbp>=85)
    _any_observed  = bpq020.notna() | bpq050a.notna() | sbp.notna() | dbp.notna()
    df["HYPERTEN"] = np.where(_any_criterion, 1, np.where(_any_observed, 0, np.nan))

# ---------- 2) keep BOTH canonical + source columns ----------
# Source columns to retain next to each canonical column (when both exist)
source_groups = {
    "SMK_AVG": ["CIGS_PER_DAY"],
    "SMK":     ["FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":   ["ALCOHOL_CAT"],
    "met_hr":  ["METSCORE","LTPA"],
    "bmic":    ["BMI_CLAS","BMI"],        # keep BMI if you like for context
    "DIABE":   ["DIABETES","diabetes"],
    "sbp":     ["SBP"],
    "dbp":     ["DBP"],
    "ahei_total": ["AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],
}

# Core variables your R script needs
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "BPQ020","BPQ050A","sdmvpsu","sdmvstra","wt","wt10","SNAP3",
  "SEQN","SDDSRVYR"
]

# Canonical columns that exist, plus their source columns when present
retain = {c for c in needed_core if c in df.columns}
for canon, sources in source_groups.items():
    if canon in df.columns:
        for s in sources:
            s_real = ci_pick(s)
            if s_real:
                retain.add(s_real)

# Select with a mask in df's own column order. Indexing with list(retain)
# would take the order from a set of strings, which can differ between
# kernel sessions and reshuffle df_desc's columns from run to run.
df_desc = df.loc[:, df.columns.isin(retain)].copy()

# ---------- 3) report ----------
still_missing = [c for c in needed_core if c not in df_desc.columns]
print("Aliases/derivations created:", created)
print(f"df_desc columns kept: {len(df_desc.columns)}")
print("Still missing (not found in df):", still_missing)
print("Smoking-related kept:",
      [c for c in df_desc.columns if c.upper() in {"SMK_AVG","CIGS_PER_DAY","SMK","FORMER_SMOKER","SMK_STATUS"}])


Aliases/derivations created: {'RIDAGEYR': 'age', 'SEX': 'sex', 'RACE': 're', 'household_size': 'DMDHHSIZ', 'sdmvpsu': 'SDMVPSU', 'sdmvstra': 'SDMVSTRA', 'wt': 'WTMEC2YR', 'wt10': 'WTMEC2YR', 'SMK_AVG': 'CIGS_PER_DAY', 'SMK': 'FORMER_SMOKER', 'ALCG2': 'ALCOHOL_CAT', 'met_hr': 'METSCORE', 'bmic': 'BMI_CLAS', 'DIABE': 'diabetes', 'probable_depression': 'DEP_HARMONIZED', 'unemployment2': 'UNEMPLOYMENT', 'EDU': 'edu', 'ins': 'INS', 'marriage': 'MARITAL', 'SNAP3': 'SNAP'}
df_desc columns kept: 40
Still missing (not found in df): ['sdoh_score', 'ahei_total', 'sdoh_access', 'BPQ020', 'BPQ050A']
Smoking-related kept: ['SMK_STATUS', 'SMK', 'SMK_AVG', 'FORMER_SMOKER', 'CIGS_PER_DAY']


In [224]:
import pandas as pd
import numpy as np

# Start again from a fresh copy of the aligned frame; the aliasing below
# adds columns to `df` only.
df = df_my_cov_aligned.copy()

# ---------- helpers ----------
def ci_pick(*names, frame=None):
    """Return the first column of ``frame`` that matches any of ``names``.

    Names are tried in the order given; for each one an exact match wins,
    otherwise a case-insensitive match is used.

    Parameters
    ----------
    *names : str
        Candidate column names, highest priority first.
    frame : pd.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df`` so existing
        calls keep working.

    Returns
    -------
    The matching column label, or None when nothing matches.
    """
    frame = df if frame is None else frame
    # Non-string labels have no .lower(); leave them out of the
    # case-insensitive index instead of crashing.
    lowmap = {c.lower(): c for c in frame.columns if isinstance(c, str)}
    for n in names:
        if n in frame.columns:
            return n
        if isinstance(n, str) and n.lower() in lowmap:
            return lowmap[n.lower()]
    return None

def pick_best(cands, frame=None):
    """Return the candidate column with the highest share of non-missing values.

    Each candidate is resolved through ``ci_pick``; unresolved candidates are
    ignored. On a tie the candidate listed first wins.

    Parameters
    ----------
    cands : iterable of str
        Candidate column names.
    frame : pd.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df``.

    Returns
    -------
    The chosen column label, or None when no candidate exists in the frame.
    """
    frame = df if frame is None else frame
    cols = []
    for c in cands:
        col = ci_pick(c, frame=frame)
        # Several candidates may resolve to the same column; score it once.
        if col and col not in cols:
            cols.append(col)
    if not cols:
        return None
    cov = {c: frame[c].notna().mean() for c in cols}
    return max(cov, key=cov.get)

# canonical name -> source column it was copied from (printed in the report)
created = {}

# ---------- 1) alias to canonical (NO coalescing of weights) ----------
# Each canonical name lists candidate source columns. pick_best() resolves them
# case-insensitively and returns the one with the highest non-missing share,
# so list order only breaks ties.
aliases = {
    # IDs / survey
    "RIDAGEYR": ["RIDAGEYR","AGE_YR","age"],
    "SEX":      ["SEX","sex","RIAGENDR"],
    "RACE":     ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "sdmvpsu":  ["sdmvpsu","SDMVPSU"],
    "sdmvstra": ["sdmvstra","SDMVSTRA"],
    # keep user's wt/wt10 AS-IS if present (no fallback to another weight)
    "wt":       ["wt"],
    "wt10":     ["wt10"],
    # keep ALL NHANES weights separately (standardize casing only)
    "WTINT2YR": ["WTINT2YR","wtint2yr"],
    "WTMEC2YR": ["WTMEC2YR","wtmec2yr"],
    "WTSAF2YR": ["WTSAF2YR","wtsaf2yr"],
    "WTDRD1":   ["WTDRD1","wtdrd1","wtdrd1d"],
    "WTDR2D":   ["WTDR2D","wtdr2d"],

    # behavior
    "SMK_AVG":  ["SMK_AVG","CIGS_PER_DAY"],
    # NOTE(review): chosen by coverage, not list order; the recorded run
    # resolved SMK to FORMER_SMOKER rather than SMK_STATUS -- confirm this
    # is the intended source.
    "SMK":      ["SMK","FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":    ["ALCG2","ALCOHOL_CAT"],
    "met_hr":   ["met_hr","METSCORE","LTPA"],

    # clinical
    "bmic":     ["bmic","BMI_CLAS"],
    "DIABE":    ["DIABE","DIABETES","diabetes"],
    "chol_rx":  ["chol_rx"],
    "CVD":      ["CVD"],
    "cancer":   ["cancer"],

    # outcomes / scores
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],
    "ahei_total": ["ahei_total","AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],

    # building blocks
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "pir":           ["pir"],
    "SNAP":          ["SNAP"],
    "EDU":           ["EDU","EDU_CAT","edu","edu2"],
    "sdoh_access":   ["sdoh_access","HUQ_ACCESS","huq_access"],
    "ins":           ["ins","INS"],
    "HOQ065":        ["HOQ065"],
    "marriage":      ["marriage","MARITAL","MARITAL_CAT"],

    # HTN components (case-insensitive)
    "BPQ020":        ["BPQ020","bpq020"],
    "BPQ050A":       ["BPQ050A","bpq050a"],
    "sbp":           ["sbp","SBP"],
    "dbp":           ["dbp","DBP"],

    # SNAP3 alias
    "SNAP3":         ["SNAP3","SNAP"],
}

# Copy (not rename) the chosen source into the canonical name when they differ.
# Order matters: a column added here is visible to the lookups of later targets.
for target, cands in aliases.items():
    src = pick_best(cands)
    if src:
        if target != src:
            df[target] = df[src]
            created[target] = src

# Normalize SEX: numeric codes 1/2 become "Male"/"Female"; text labels are
# stripped and capitalized.
if "SEX" in df.columns:
    sex_raw = df["SEX"]
    if pd.api.types.is_numeric_dtype(sex_raw):
        # Codes other than 1/2 keep their original value.
        df["SEX"] = sex_raw.map({1: "Male", 2: "Female"}).fillna(sex_raw)
    else:
        sex_txt = sex_raw.astype(str).str.strip().str.capitalize()
        # astype(str) turns a missing value into the text "nan" (capitalized
        # to "Nan"), which would then count as a third sex category; put the
        # missing values back.
        df["SEX"] = sex_txt.where(sex_raw.notna(), np.nan)
        # NOTE(review): short labels such as "F"/"M" are kept as they are and
        # are not expanded to "Female"/"Male" -- confirm what downstream expects.

# Derive HYPERTEN only when the frame does not already provide it (same rule)
if "HYPERTEN" not in df.columns:

    def _column_or_all_nan(*names):
        """Return the first matching column, or an all-NaN Series on df's index."""
        found = ci_pick(*names)
        if found:
            return df[found]
        return pd.Series(np.nan, index=df.index)

    bpq020  = _column_or_all_nan("BPQ020","bpq020")
    bpq050a = _column_or_all_nan("BPQ050A","bpq050a")
    sbp     = _column_or_all_nan("sbp","SBP")
    dbp     = _column_or_all_nan("dbp","DBP")

    # 1 when any criterion is met; 0 when none is met but at least one input
    # is observed; NaN when all four inputs are missing
    _any_criterion = (bpq020==1) | (bpq050a==1) | (sbp>=130) | (dbp>=85)
    _any_observed  = bpq020.notna() | bpq050a.notna() | sbp.notna() | dbp.notna()
    df["HYPERTEN"] = np.where(_any_criterion, 1, np.where(_any_observed, 0, np.nan))

# ---------- 2) keep BOTH canonical + source columns ----------
# Source columns to retain next to each canonical column (when both exist)
source_groups = {
    "SMK_AVG":   ["CIGS_PER_DAY"],
    "SMK":       ["FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":     ["ALCOHOL_CAT"],
    "met_hr":    ["METSCORE","LTPA"],
    "bmic":      ["BMI_CLAS","BMI"],
    "DIABE":     ["DIABETES","diabetes"],
    "sbp":       ["SBP"],
    "dbp":       ["DBP"],
    "ahei_total":["AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],
    # weights: keep raw variants too if present
    "WTINT2YR":  ["wtint2yr"],
    "WTMEC2YR":  ["wtmec2yr","wt","wt10"],  # keep user's wt/wt10 alongside
    "WTSAF2YR":  ["wtsaf2yr"],
    "WTDRD1":    ["wtdrd1","wtdrd1d"],
    "WTDR2D":    ["wtdr2d"],
}

# Core variables your R script needs + all distinct weights
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "BPQ020","BPQ050A","sdmvpsu","sdmvstra","SEQN","SDDSRVYR",
  # weights (keep all if present)
  "WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","wt","wt10","WTINT4YR","WTMEC4YR"
]

# Canonical columns that exist, plus their source columns when present
retain = {c for c in needed_core if c in df.columns}
for canon, sources in source_groups.items():
    if canon in df.columns:
        for s in sources:
            s_real = ci_pick(s)
            if s_real:
                retain.add(s_real)

# Select with a mask in df's own column order. Indexing with list(retain)
# would take the order from a set of strings, which can differ between
# kernel sessions and reshuffle df_desc's columns from run to run.
df_desc = df.loc[:, df.columns.isin(retain)].copy()

# ---------- 3) report ----------
still_missing = [c for c in needed_core if c not in df_desc.columns]
print("Aliases/derivations created:", created)
print(f"df_desc columns kept: {len(df_desc.columns)}")
print("Still missing (not found in df):", still_missing)

# quick peek at which weights you have
weight_cols = [c for c in ["WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","wt","wt10","WTINT4YR","WTMEC4YR"] if c in df_desc.columns]
print("Weights present in df_desc:", weight_cols)


Aliases/derivations created: {'RIDAGEYR': 'age', 'SEX': 'sex', 'RACE': 're', 'household_size': 'DMDHHSIZ', 'sdmvpsu': 'SDMVPSU', 'sdmvstra': 'SDMVSTRA', 'SMK_AVG': 'CIGS_PER_DAY', 'SMK': 'FORMER_SMOKER', 'ALCG2': 'ALCOHOL_CAT', 'met_hr': 'METSCORE', 'bmic': 'BMI_CLAS', 'DIABE': 'diabetes', 'probable_depression': 'DEP_HARMONIZED', 'unemployment2': 'UNEMPLOYMENT', 'EDU': 'edu', 'ins': 'INS', 'marriage': 'MARITAL', 'SNAP3': 'SNAP'}
df_desc columns kept: 40
Still missing (not found in df): ['sdoh_score', 'ahei_total', 'sdoh_access', 'BPQ020', 'BPQ050A', 'WTDRD1', 'WTDR2D', 'wt', 'wt10', 'WTINT4YR', 'WTMEC4YR']
Weights present in df_desc: ['WTINT2YR', 'WTMEC2YR', 'WTSAF2YR']


#### Add the 2017–March 2020 pre-pandemic weights

In [225]:
import pandas as pd
import numpy as np

# Base DF (work on a copy so a failed run leaves df_my_cov_aligned untouched)
df = df_my_cov_aligned.copy()

# NHANES P_DEMO (2017–Mar 2020 pre-pandemic); the second URL is a fallback mirror
urls = [
    "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.xpt",
    "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_DEMO.XPT",
]

demo = None
download_errors = []
for u in urls:
    try:
        demo = pd.read_sas(u, format="xport", encoding="utf-8")
        break
    except Exception as exc:
        # Best-effort: fall through to the next URL, but remember why this one
        # failed so the final error is diagnosable instead of silent.
        download_errors.append(f"{u}: {exc!r}")
if demo is None:
    raise RuntimeError(
        "Failed to download P_DEMO.xpt from both URLs.\n" + "\n".join(download_errors)
    )

# Keep + standardize
keep = ["SEQN", "WTINTPRP", "WTMECPRP", "SDMVPSU", "SDMVSTRA", "SDDSRVYR"]
demo = demo[keep].copy()

# Optional sanity check: SDDSRVYR==66 for this file
# assert demo["SDDSRVYR"].dropna().eq(66).all()

# Map to your 4-year convention
demo = demo.rename(columns={
    "WTINTPRP": "WTINT4YR",
    "WTMECPRP": "WTMEC4YR",
})

# Every non-key column coming from P_DEMO is coalesced into the base frame.
# SDDSRVYR is included: previously it was merged in but never handled, which
# left a stray SDDSRVYR_src column and made a second run of this cell fail on
# a duplicate-suffix collision.
fill_cols = ["WTINT4YR", "WTMEC4YR", "SDMVPSU", "SDMVSTRA", "SDDSRVYR"]

# Remove leftovers from an earlier run so the cell is idempotent.
stale = [f"{c}_src" for c in fill_cols if f"{c}_src" in df.columns]
if stale:
    df = df.drop(columns=stale)

# Ensure target cols exist pre-merge (so we can suffix the right-hand side)
for col in fill_cols:
    if col not in df.columns:
        df[col] = pd.NA

# Merge (right-hand columns get *_src); validate guards against duplicate SEQN
# in P_DEMO silently multiplying rows of the base frame.
df = df.merge(demo, on=["SEQN"], how="left", suffixes=("", "_src"), validate="m:1")

# Fill only missing (mask assignment avoids FutureWarning), then drop *_src
for col in fill_cols:
    src = f"{col}_src"
    if src in df.columns:
        mask = df[col].isna() & df[src].notna()
        if mask.any():
            df.loc[mask, col] = df.loc[mask, src]
        df.drop(columns=[src], inplace=True)

# Dtypes: weights as float; PSU/STRATA as nullable int
for col in ["WTINT4YR", "WTMEC4YR", "WTINT2YR", "WTMEC2YR", "WTSAF2YR", "WTDRD1", "WTDR2D"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")  # float

for col in ["SDMVPSU", "SDMVSTRA"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

# Quick audit
have = [c for c in ["WTINT4YR","WTMEC4YR","SDMVPSU","SDMVSTRA"] if c in df.columns]
print("Added/updated:", have)
print("Non-missing rates:", {c: float(df[c].notna().mean()) for c in have})

df_my_cov_aligned = df



Added/updated: ['WTINT4YR', 'WTMEC4YR', 'SDMVPSU', 'SDMVSTRA']
Non-missing rates: {'WTINT4YR': 0.12079901249136318, 'WTMEC4YR': 0.12079901249136318, 'SDMVPSU': 1.0, 'SDMVSTRA': 1.0}


#### Add the AHEI score built in R

In [226]:
import pandas as pd
from pathlib import Path

# Folder holding the AHEI exports; the first candidate file found is used.
DATA = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
cand = [DATA/"ahei_1999_2018_combined.csv", DATA/"ahei_9904_wjfrt_ssbfix.csv"]

ahei_path = None
for candidate in cand:
    if candidate.exists():
        ahei_path = candidate
        break
if ahei_path is None:
    raise FileNotFoundError("No AHEI combined file found. Please write ahei_1999_2018_combined.csv in R.")

# Header-only read: resolve the real spelling of the columns we need,
# ignoring case.
hdr = pd.read_csv(ahei_path, nrows=0, low_memory=False)
lower_map = {}
for original_name in hdr.columns:
    lower_map[original_name.lower()] = original_name

seqn_col = lower_map.get("seqn")
tot_col = None
for key in ("ahei_all", "ahei_all_recomp"):
    if key in lower_map:
        tot_col = lower_map[key]
        break
if not (seqn_col and tot_col):
    raise ValueError(f"Couldn’t find SEQN or AHEI total in {ahei_path.name}. "
                     f"Columns present include: {list(hdr.columns)[:12]} ...")

# Second pass: load just the two columns with an explicit SEQN dtype.
# pyarrow is tried first; any failure falls back to the default parser.
read_kwargs = dict(usecols=[seqn_col, tot_col], dtype={seqn_col: "Int64"})
try:
    ahei = pd.read_csv(ahei_path, engine="pyarrow", **read_kwargs)
except Exception:
    ahei = pd.read_csv(ahei_path, low_memory=False, **read_kwargs)

# Canonical names, numeric score, one row per SEQN (last occurrence wins).
ahei = (
    ahei.rename(columns={seqn_col: "SEQN", tot_col: "ahei_total"})
        .assign(ahei_total=lambda t: pd.to_numeric(t["ahei_total"], errors="coerce"))
        .dropna(subset=["SEQN"])
        .drop_duplicates(subset="SEQN", keep="last")
)

# Left-join onto a copy of the aligned frame.
df = df_my_cov_aligned.copy()
df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce")
if "ahei_total" in df.columns:
    pre = df["ahei_total"].notna().sum()
else:
    pre = 0

df = df.merge(ahei, on="SEQN", how="left", suffixes=("", "_ahei"))
if "ahei_total_ahei" in df.columns:
    # An ahei_total column already existed: keep its values, fill gaps from the file.
    df["ahei_total"] = df["ahei_total"].mask(df["ahei_total"].isna(), df["ahei_total_ahei"])
    df = df.drop(columns=["ahei_total_ahei"])

post = df["ahei_total"].notna().sum()
print(f"✓ ahei_total merged from {ahei_path.name}. Non-missing: {post} (added {post-pre}).")

df_my_cov_aligned = df


✓ ahei_total merged from ahei_1999_2018_combined.csv. Non-missing: 46169 (added 46169).


#### check again what is missing 

In [237]:
# NHANES survey weights we expect to find (listed once, reused below).
nhanes_weights = ["WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","WTINT4YR","WTMEC4YR"]

# Trimmed version of the earlier needed_core: analysis variables, design
# columns, then the NHANES weights.
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "sdmvpsu","sdmvstra","SEQN","SDDSRVYR",
] + nhanes_weights

# Split the wish-list into columns the aligned frame has and columns it lacks.
have_cols = set(df_my_cov_aligned.columns)
present, missing = [], []
for name in needed_core:
    if name in have_cols:
        present.append(name)
    else:
        missing.append(name)
weights_present = [w for w in nhanes_weights if w in have_cols]

print("Present:", len(present), "Missing:", len(missing))
print("Still missing:", missing)
print("NHANES weights present:", weights_present)


Present: 14 Missing: 22
Still missing: ['RIDAGEYR', 'SEX', 'RACE', 'household_size', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr', 'bmic', 'DIABE', 'HYPERTEN', 'probable_depression', 'sdoh_score', 'unemployment2', 'EDU', 'sdoh_access', 'ins', 'marriage', 'sdmvpsu', 'sdmvstra', 'WTDRD1', 'WTDR2D']
NHANES weights present: ['WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR']


In [241]:
import textwrap
print(textwrap.fill(", ".join(map(str, df_my_cov_aligned.columns)), width=100))

SEQN, SDDSRVYR, WTINT2YR, WTMEC2YR, WTSAF2YR, SDMVPSU, SDMVSTRA, age, sex, re, edu, pir, tchol, hdl,
ldl, tg, wc, bmi, dm_self, hba1c, fpg, chf, chd, mi, stroke, cancer, emphysema, bronchitis, asthma,
re2, copd, sbp, dbp, dm_rx, chol_rx, angina_rx, htn_rx, roseQ, no_na, age_cat, pir_cat, edu2, CVD,
lung_disease, diabetes, tchol_hdl, angina, lipid_pri, adiposity_pri, bp_pri, glucose_pri, cvd_pri,
lipid_sec, adiposity_sec, bp_sec, glucose_sec, cvd_sec, optimal_pri_count, intermediate_pri_count,
poor_pri_count, optimal_sec_count, intermediate_sec_count, poor_sec_count, optimal_all, poor_all,
optimal_all_sec, poor_all_sec, MetS_hdl, MetS_triglycerides, MetS_bp, MetS_wc, MetS_fpg, MetS_count,
MetS, RIAGENDR, FEMALE, SMK_STATUS, CIGS_PER_DAY, PACK_YEARS, FORMER_SMOKER, DRINKS_PER_DAY,
ALCOHOL_CAT, LTPA, METSCORE, IMP, BMXWT, BMXHT, BMI_CLAS, HTN, HIGH_CHOL, DMDHHSIZ, ELIGSTAT,
MORTSTAT, PERMTH_EXM, PERMTH_INT, UCOD_LEADING, IS_POST2018, IS_ADULT, MORTALITY_COVERED, EVENT,
CENSORED, FU_YRS_EX

#### adjust column name and check missingness 

In [242]:
import pandas as pd

# Canonical set (HTN stays; BPQ020/050A removed per your earlier choice)
# These are the column names the downstream analysis expects to find.
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "sdmvpsu","sdmvstra","SEQN","SDDSRVYR",
  "WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","WTINT4YR","WTMEC4YR"
]

# Map your existing columns → canonical (no coalescing; just alias-by-best)
# Each list is tried left to right and the first candidate found in the frame
# is copied under the canonical name, so candidate order encodes preference.
# NOTE(review): some fallbacks differ in kind from the canonical variable
# (e.g. "bmi" for the class variable "bmic", "LTPA" for "met_hr") — confirm
# they are acceptable stand-ins before relying on them.
aliases = {
    "RIDAGEYR": ["RIDAGEYR","age"],
    "SEX":      ["SEX","RIAGENDR","sex"],
    "RACE":     ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "SMK_AVG":  ["SMK_AVG","CIGS_PER_DAY"],
    "SMK":      ["SMK","SMK_STATUS","FORMER_SMOKER"],
    "ALCG2":    ["ALCG2","ALCOHOL_CAT"],
    "met_hr":   ["met_hr","METSCORE","LTPA"],
    "bmic":     ["bmic","BMI_CLAS","bmi"],
    "DIABE":    ["DIABE","diabetes","DIABETES","dm_self"],
    "HYPERTEN": ["HYPERTEN","HTN"],
    "chol_rx":  ["chol_rx"],
    "CVD":      ["CVD"],
    "cancer":   ["cancer"],
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],  # no alternative source listed; stays missing unless already present
    "ahei_total": ["ahei_total","AHEI","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "pir":      ["pir"],
    "SNAP":     ["SNAP"],
    "EDU":      ["EDU","edu","edu2","EDU_CAT"],
    "sdoh_access": ["sdoh_access","HOD050","HOQ065"],  # NOTE(review): HOD050/HOQ065 are grouped under the "HOQ" block in this notebook; confirm they are valid access proxies
    "ins":      ["ins","INS"],
    "HOQ065":   ["HOQ065"],
    "marriage": ["marriage","MARITAL","MARITAL_CAT"],
    "sdmvpsu":  ["sdmvpsu","SDMVPSU"],
    "sdmvstra": ["sdmvstra","SDMVSTRA"],
    "SEQN":     ["SEQN"],
    "SDDSRVYR": ["SDDSRVYR"],
    # weights
    "WTINT2YR": ["WTINT2YR"],
    "WTMEC2YR": ["WTMEC2YR"],
    "WTSAF2YR": ["WTSAF2YR","WTSCI2YR"],  # fasting-subsample weight; WTSCI2YR fallback is unverified — TODO confirm
    "WTDRD1":   ["WTDRD1"],               # no alternative source listed
    "WTDR2D":   ["WTDR2D"],               # no alternative source listed
    "WTINT4YR": ["WTINT4YR","WTINTPRP"],
    "WTMEC4YR": ["WTMEC4YR","WTMECPRP"],
}

def ci_pick(df, names):
    """Return the first column of ``df`` that matches a candidate in ``names``.

    Candidates are tried in order. For each one an exact match is preferred;
    otherwise a case-insensitive match is accepted and the frame's own
    spelling of the column is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    names : iterable of str
        Candidate column names, most preferred first.

    Returns
    -------
    The matching column label, or ``None`` when no candidate matches.
    """
    # str() keeps the lookup from crashing on non-string labels (e.g. integer
    # column names). If two labels differ only by case, the later one wins.
    low = {str(c).lower(): c for c in df.columns}
    for n in names:
        if n in df.columns:
            return n
        key = str(n).lower()
        if key in low:
            return low[key]
    return None

# Copy the best available source column under each canonical name that the
# frame does not already have; `created` records canonical -> source.
created = {}
for target, cands in aliases.items():
    src = ci_pick(df_my_cov_aligned, cands)
    already_there = target in df_my_cov_aligned.columns
    if already_there or not src:
        continue
    df_my_cov_aligned[target] = df_my_cov_aligned[src]
    created[target] = src

# Re-check the wish-list against the (now extended) frame.
present, missing = [], []
for name in needed_core:
    if name in df_my_cov_aligned.columns:
        present.append(name)
    else:
        missing.append(name)

print(f"Present: {len(present)}  Missing: {len(missing)}")
print("Aliased (canonical ← source):", dict(created))
print("Still missing:", missing)


Present: 33  Missing: 3
Aliased (canonical ← source): {'RIDAGEYR': 'age', 'SEX': 'sex', 'RACE': 're2', 'household_size': 'DMDHHSIZ', 'SMK_AVG': 'CIGS_PER_DAY', 'SMK': 'SMK_STATUS', 'ALCG2': 'ALCOHOL_CAT', 'met_hr': 'METSCORE', 'bmic': 'BMI_CLAS', 'DIABE': 'diabetes', 'HYPERTEN': 'HTN', 'probable_depression': 'DEP_HARMONIZED', 'unemployment2': 'UNEMPLOYMENT', 'EDU': 'edu', 'sdoh_access': 'HOD050', 'ins': 'INS', 'marriage': 'MARITAL', 'sdmvpsu': 'SDMVPSU', 'sdmvstra': 'SDMVSTRA'}
Still missing: ['sdoh_score', 'WTDRD1', 'WTDR2D']


In [244]:
# Columns expected from each questionnaire block, keyed by block label.
blocks = {
    "OCQ": ["EMPLOY","UNEMPLOYMENT"],
    "HOQ": ["HOD050","HOQ065"],
    "HIQ": ["INS"],
    "FSQ": ["FSDHH","HHFDSEC","ADFDSEC","FS_HH4","FS_ADULT4","FS_HH","FS_ADULT","FS_FINAL","FS_SOURCE_HH","FS_SOURCE_FINAL","SNAP"],
}

# Report, per block, which expected columns the aligned frame has or lacks.
for name, cols in blocks.items():
    present, missing = [], []
    for c in cols:
        if c in df_my_cov_aligned.columns:
            present.append(c)
        else:
            missing.append(c)
    print(f"{name} → present: {present} | missing: {missing}")


OCQ → present: ['EMPLOY', 'UNEMPLOYMENT'] | missing: []
HOQ → present: ['HOD050', 'HOQ065'] | missing: []
HIQ → present: ['INS'] | missing: []
FSQ → present: ['FSDHH', 'HHFDSEC', 'ADFDSEC', 'FS_HH4', 'FS_ADULT4', 'FS_HH', 'FS_ADULT', 'FS_FINAL', 'FS_SOURCE_HH', 'FS_SOURCE_FINAL', 'SNAP'] | missing: []


In [243]:
df_my_cov_aligned.head(10)

Unnamed: 0,SEQN,SDDSRVYR,WTINT2YR,WTMEC2YR,WTSAF2YR,SDMVPSU,SDMVSTRA,age,sex,re,...,DIABE,HYPERTEN,probable_depression,unemployment2,EDU,sdoh_access,ins,marriage,sdmvpsu,sdmvstra
0,1,1.0,9727.078709,10982.898896,75131.2,1,5,2.0,F,Other Hispanic,...,0,0,,,,,,,1,5
1,2,1.0,26678.636376,28325.384898,60586.147294,3,1,77.0,M,Mexican American,...,0,0,,0.0,5.0,2.0,1.0,,3,1
2,3,1.0,43621.680548,46192.256945,121969.841152,2,7,10.0,F,Mexican American,...,0,0,,,3.0,,,,2,7
3,4,1.0,10346.119327,10251.26002,4624.687273,1,2,1.0,M,Other Hispanic,...,0,0,,,,,,,1,2
4,5,1.0,91050.84662,99445.065735,234895.20565,2,8,49.0,M,Mexican American,...,0,1,,0.0,5.0,7.0,1.0,1.0,2,8
5,6,1.0,36508.250375,39656.600444,13379.8,2,2,19.0,F,NH Black,...,0,0,,,15.0,,,5.0,2,2
6,7,1.0,22352.08862,25525.423409,57661.621988,2,4,59.0,F,Other Hispanic,...,0,0,,0.0,2.0,,,1.0,2,4
7,8,1.0,31600.089655,31510.587866,76026.438279,1,6,13.0,M,Mexican American,...,0,0,,,5.0,,,,1,6
8,9,1.0,7529.435502,7575.870247,14694.924957,2,9,11.0,F,Other Hispanic,...,0,0,,,5.0,,,,2,9
9,10,1.0,21071.164059,22445.808572,60202.416895,1,7,43.0,M,Other Hispanic,...,0,1,,0.0,3.0,4.0,2.0,4.0,1,7


#### Keep the important columns in a new frame, `df_my_cov_aligned_short`

In [247]:
import pandas as pd
import numpy as np

# Snapshot of the aligned frame; the short frame below is built from this copy.
src = df_my_cov_aligned.copy()

# ---- 1) Canonical columns to keep (core + key derived + requested raw fields) ----
# Also fixes the column order of the final short frame.
canonical_order = [
    # IDs / survey design
    "SEQN","SDDSRVYR","sdmvpsu","sdmvstra",
    # demographics
    "RIDAGEYR","SEX","RACE","household_size","EDU","pir",
    # behavior (canonical)
    "SMK_AVG","SMK","ALCG2","met_hr",
    # behavior (raw fields to keep)
    "SMK_STATUS","CIGS_PER_DAY","PACK_YEARS","FORMER_SMOKER","DRINKS_PER_DAY","ALCOHOL_CAT",
    # clinical
    "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
    # scores/outcomes
    "probable_depression","sdoh_score","ahei_total",
    # SDOH / access / insurance / marital / SNAP / FS
    "unemployment2","sdoh_access","ins","HOQ065","marriage","SNAP","FS",
    # weights (interview/exam 2-yr, fasting subsample, pre-pandemic 4-yr)
    "WTINT2YR","WTMEC2YR","WTSAF2YR","WTINT4YR","WTMEC4YR",
]

# ---- 2) Alias map: canonical -> possible sources in your frame ----
# Candidates are tried left to right; the first one found in `src` is used.
aliases = {
    "SEQN": ["SEQN"],
    "SDDSRVYR": ["SDDSRVYR","SDDSRVYR_src"],  # *_src is the suffix used by the P_DEMO merge cell

    "sdmvpsu": ["sdmvpsu","SDMVPSU"],
    "sdmvstra":["sdmvstra","SDMVSTRA"],

    "RIDAGEYR": ["RIDAGEYR","age"],
    "SEX": ["SEX","RIAGENDR","sex"],
    "RACE": ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "EDU": ["EDU","edu","edu2","EDU_CAT"],
    "pir": ["pir"],

    # canonical behavior
    "SMK_AVG": ["SMK_AVG","CIGS_PER_DAY"],
    "SMK": ["SMK","SMK_STATUS","FORMER_SMOKER"],
    "ALCG2": ["ALCG2","ALCOHOL_CAT"],
    "met_hr": ["met_hr","METSCORE","LTPA"],

    # raw behavior fields (keep as-is)
    "SMK_STATUS": ["SMK_STATUS"],
    "CIGS_PER_DAY": ["CIGS_PER_DAY"],
    "PACK_YEARS": ["PACK_YEARS"],
    "FORMER_SMOKER": ["FORMER_SMOKER"],
    "DRINKS_PER_DAY": ["DRINKS_PER_DAY"],
    "ALCOHOL_CAT": ["ALCOHOL_CAT"],

    # clinical
    "bmic": ["bmic","BMI_CLAS","bmi"],
    "DIABE": ["DIABE","diabetes","DIABETES","dm_self"],
    "HYPERTEN": ["HYPERTEN","HTN"],
    "chol_rx": ["chol_rx"],
    "CVD": ["CVD"],
    "cancer": ["cancer"],

    # outcomes/scores
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],
    "ahei_total": ["ahei_total","AHEI","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],

    # SDOH etc.
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "sdoh_access": ["sdoh_access"],  # NOTE(review): the earlier aliasing cell reported sdoh_access ← HOD050; confirm that source is intended
    "ins": ["ins","INS"],
    "HOQ065": ["HOQ065"],
    "marriage": ["marriage","MARITAL","MARITAL_CAT"],

    "SNAP": ["SNAP"],
    "FS": ["FS_FINAL","FS"],  # FS_FINAL is tried before FS

    # weights
    "WTINT2YR": ["WTINT2YR"],
    "WTMEC2YR": ["WTMEC2YR"],
    "WTSAF2YR": ["WTSAF2YR","WTSCI2YR"],  # WTSCI2YR fallback is unverified — TODO confirm
    "WTINT4YR": ["WTINT4YR","WTINTPRP"],
    "WTMEC4YR": ["WTMEC4YR","WTMECPRP"],
}

def ci_pick(df, names):
    """Return the first column of ``df`` that matches a candidate in ``names``.

    Candidates are tried in order. For each one an exact match is preferred;
    otherwise a case-insensitive match is accepted and the frame's own
    spelling of the column is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    names : iterable of str
        Candidate column names, most preferred first.

    Returns
    -------
    The matching column label, or ``None`` when no candidate matches.
    """
    # str() keeps the lookup from crashing on non-string labels (e.g. integer
    # column names). If two labels differ only by case, the later one wins.
    low = {str(c).lower(): c for c in df.columns}
    for n in names:
        if n in df.columns:
            return n
        key = str(n).lower()
        if key in low:
            return low[key]
    return None

# ---- 3) Build df_short with canonical names, copying from the best source ----
# First resolve canonical -> source column, then pull the columns in one go.
created_from = {}
for canon, cands in aliases.items():
    match = ci_pick(src, cands)
    if match is not None:
        created_from[canon] = match

short_cols = {canon: src[source_name] for canon, source_name in created_from.items()}

df_my_cov_aligned_short = pd.DataFrame(short_cols)

# ---- 4) Light harmonization / typing ----
# SEX: numeric codes 1/2 become "Male"/"Female"; text values are trimmed and
# capitalized. Both branches end up as pandas "string" dtype.
# NOTE(review): text codes such as "M"/"F" are NOT expanded to "Male"/"Female",
# so the two branches can yield different labels — confirm what downstream expects.
if "SEX" in df_my_cov_aligned_short.columns:
    s = df_my_cov_aligned_short["SEX"]
    if pd.api.types.is_numeric_dtype(s):
        df_my_cov_aligned_short["SEX"] = s.map({1:"Male", 2:"Female"}).astype("string")
    else:
        # astype("string") keeps missing values as <NA>. The previous
        # astype(str) turned NaN into the text "nan", which capitalize() made
        # "Nan" — a fake category that also counted as non-missing.
        df_my_cov_aligned_short["SEX"] = s.astype("string").str.strip().str.capitalize()

# Binary-ish ints: coerce to numbers (unparseable -> NA), store as nullable Int64
for col_bin in ["DIABE","HYPERTEN","CVD","cancer","SNAP","FS","unemployment2","FORMER_SMOKER"]:
    if col_bin in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col_bin] = pd.to_numeric(df_my_cov_aligned_short[col_bin], errors="coerce").astype("Int64")

# Continuous behavior: plain numeric, unparseable values become NaN
for col_num in ["CIGS_PER_DAY","PACK_YEARS","DRINKS_PER_DAY","SMK_AVG","met_hr","pir"]:
    if col_num in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col_num] = pd.to_numeric(df_my_cov_aligned_short[col_num], errors="coerce")

# PSU/STRATA → nullable ints; weights → float
for col in ["sdmvpsu","sdmvstra"]:
    if col in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col] = pd.to_numeric(df_my_cov_aligned_short[col], errors="coerce").astype("Int64")

for col in ["WTINT2YR","WTMEC2YR","WTSAF2YR","WTINT4YR","WTMEC4YR"]:
    if col in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col] = pd.to_numeric(df_my_cov_aligned_short[col], errors="coerce")

# Keep final column order (only those that exist)
cols_final = [c for c in canonical_order if c in df_my_cov_aligned_short.columns]
df_my_cov_aligned_short = df_my_cov_aligned_short[cols_final].copy()

# ---- 5) Report coverage ----
# Canonical names that did not make it into the short frame, in canonical order.
kept_names = set(df_my_cov_aligned_short.columns)
missing_after = []
for name in canonical_order:
    if name not in kept_names:
        missing_after.append(name)

print("df_my_cov_aligned_short shape:", df_my_cov_aligned_short.shape)
print("Created (canonical ← source):", created_from)
print("Still missing after aliasing:", missing_after)


df_my_cov_aligned_short shape: (128809, 40)
Created (canonical ← source): {'SEQN': 'SEQN', 'SDDSRVYR': 'SDDSRVYR', 'sdmvpsu': 'sdmvpsu', 'sdmvstra': 'sdmvstra', 'RIDAGEYR': 'RIDAGEYR', 'SEX': 'SEX', 'RACE': 'RACE', 'household_size': 'household_size', 'EDU': 'EDU', 'pir': 'pir', 'SMK_AVG': 'SMK_AVG', 'SMK': 'SMK', 'ALCG2': 'ALCG2', 'met_hr': 'met_hr', 'SMK_STATUS': 'SMK_STATUS', 'CIGS_PER_DAY': 'CIGS_PER_DAY', 'PACK_YEARS': 'PACK_YEARS', 'FORMER_SMOKER': 'FORMER_SMOKER', 'DRINKS_PER_DAY': 'DRINKS_PER_DAY', 'ALCOHOL_CAT': 'ALCOHOL_CAT', 'bmic': 'bmic', 'DIABE': 'DIABE', 'HYPERTEN': 'HYPERTEN', 'chol_rx': 'chol_rx', 'CVD': 'CVD', 'cancer': 'cancer', 'probable_depression': 'probable_depression', 'ahei_total': 'ahei_total', 'unemployment2': 'unemployment2', 'sdoh_access': 'sdoh_access', 'ins': 'ins', 'HOQ065': 'HOQ065', 'marriage': 'marriage', 'SNAP': 'SNAP', 'FS': 'FS_FINAL', 'WTINT2YR': 'WTINT2YR', 'WTMEC2YR': 'WTMEC2YR', 'WTSAF2YR': 'WTSAF2YR', 'WTINT4YR': 'WTINT4YR', 'WTMEC4YR': 'WTMEC4

In [227]:
#### Checked why SMK_AVG etc missing 
## Pre-2018 cycles have ~10–13% non-missing (expected: only current smokers report cigs/day)


## Check which columns are missing post-2018

In [260]:
 df_my_cov_aligned.loc[df_my_cov_aligned["SDDSRVYR"].eq(1), ["DMDHHSIZ","probable_depression","RIDAGEYR", "CIDI_12M_MDE"]].tail(20)


Unnamed: 0,DMDHHSIZ,probable_depression,RIDAGEYR,CIDI_12M_MDE
9945,3.0,,22.0,
9946,7.0,0.0,35.0,0.0
9947,4.0,,2.0,
9948,4.0,,39.0,
9949,3.0,,14.0,
9950,6.0,,15.0,
9951,1.0,,23.0,
9952,7.0,,14.0,
9953,5.0,,36.0,
9954,4.0,,37.0,


In [251]:

# df_my_cov_aligned.loc[df_my_cov_aligned["SDDSRVYR"].eq(66), ["DMDHHSIZ","WTINT4YR","WTMEC4YR"]].tail(9000)
# df_my_cov_aligned[["DMDHHSIZ",'WTINT4YR', 'WTMEC4YR']].tail(20)
# df_my_cov_aligned[["DMDHHSIZ", "household_size", "HOQ065"]].tail(10) 

# df_lu_cov_1999_2018[[  'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR']].head(10) 
# 'SEQN','SDDSRVYR', 'SDMVPSU', 'SDMVSTRA'

In [253]:
import pandas as pd
import numpy as np

# Work on a copy; make the cycle code numeric so grouping and sorting behave.
df = df_my_cov_aligned_short.copy()
df['SDDSRVYR'] = pd.to_numeric(df['SDDSRVYR'], errors='coerce')


def _share_nonmissing(s):
    """Fraction of non-missing entries in one column within one cycle."""
    return s.notna().mean()


# 1) Cycle counts (rows per SDDSRVYR, missing cycle codes kept as their own group)
cycle_sizes = df.groupby('SDDSRVYR', dropna=False).size()
cycle_counts = cycle_sizes.rename('n').reset_index().sort_values('SDDSRVYR')
print("Distinct cycles present (SDDSRVYR):", cycle_counts['SDDSRVYR'].tolist())
display(cycle_counts)

# 2) Block coverage (non-missing rates)
key_groups = {
    "design": ["sdmvpsu","sdmvstra","WTINT2YR","WTMEC2YR","WTSAF2YR","WTINT4YR","WTMEC4YR"],
    "demo":   ["RIDAGEYR","SEX","RACE","household_size","EDU","pir"],
    "behav":  ["SMK","ALCG2","met_hr","SMK_STATUS","CIGS_PER_DAY","DRINKS_PER_DAY"],
    "clin":   ["bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer"],
    "scores": ["probable_depression","ahei_total"],  # sdoh_score later
    "sdoh":   ["unemployment2","sdoh_access","ins","HOQ065","marriage","SNAP","FS"],
}

rows = []
for block, cols in key_groups.items():
    cols_present = [name for name in cols if name in df.columns]
    if len(cols_present) == 0:
        continue
    # one row per cycle, one column per variable, values = non-missing share
    by_cycle = df.groupby('SDDSRVYR', dropna=False)[cols_present]
    nm = by_cycle.agg(_share_nonmissing)
    # unweighted average of the per-column shares within this block
    nm = nm.assign(block_mean=nm.mean(axis=1)).reset_index()
    nm.insert(1, 'block', block)
    rows.append(nm)

coverage = pd.concat(rows, ignore_index=True)

# 2a) Wide summary: average non-missing per block per cycle
block_wide = coverage.pivot(index='SDDSRVYR', columns='block', values='block_mean')
block_wide = block_wide.sort_index()
print("\nAverage non-missing rate per block (by cycle):")
display((block_wide*100).round(1))

# 3) Sentinel columns: a handful of variables tracked individually
sentinel_cols = [
    c for c in ["WTMEC2YR","WTMEC4YR","FS","SNAP","probable_depression","ahei_total","met_hr","EDU","ins"]
    if c in df.columns
]
if sentinel_cols:
    sentinel = df.groupby('SDDSRVYR', dropna=False)[sentinel_cols].agg(_share_nonmissing)
    sentinel = sentinel.rename(columns=lambda c: f"{c}_nonmiss")
    sentinel = sentinel.reset_index().sort_values('SDDSRVYR')
    print("\nSentinel column non-missing rates (by cycle):")
    display((sentinel.set_index('SDDSRVYR')*100).round(1))


Distinct cycles present (SDDSRVYR): [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 66.0]


Unnamed: 0,SDDSRVYR,n
0,1.0,9965
1,2.0,11039
2,3.0,10122
3,4.0,10348
4,5.0,10149
5,6.0,10537
6,7.0,9756
7,8.0,10175
8,9.0,9971
9,10.0,9254



Average non-missing rate per block (by cycle):


block,behav,clin,demo,design,scores,sdoh
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,37.6,97.5,94.9,71.4,44.1,43.7
2.0,37.9,96.9,95.9,71.4,44.6,44.1
3.0,38.6,97.6,96.2,71.4,41.2,51.5
4.0,37.6,97.7,96.0,71.4,41.0,49.8
5.0,45.7,97.9,95.7,71.4,45.1,57.7
6.0,46.4,98.2,95.7,71.4,46.6,58.2
7.0,45.2,98.0,95.8,71.4,44.0,56.3
8.0,45.5,98.2,96.1,71.4,44.0,55.7
9.0,45.3,98.0,95.6,71.4,43.1,55.3
10.0,41.8,97.8,95.3,71.4,44.8,57.4



Sentinel column non-missing rates (by cycle):


Unnamed: 0_level_0,WTMEC2YR_nonmiss,WTMEC4YR_nonmiss,FS_nonmiss,SNAP_nonmiss,probable_depression_nonmiss,ahei_total_nonmiss,met_hr_nonmiss,EDU_nonmiss,ins_nonmiss
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,100.0,0.0,47.6,4.8,7.2,81.0,49.0,84.3,47.6
2.0,100.0,0.0,45.8,4.2,7.4,81.8,49.0,82.8,47.7
3.0,100.0,0.0,47.6,48.9,6.8,75.5,49.8,82.8,49.0
4.0,100.0,0.0,47.6,47.5,51.5,30.5,48.1,81.3,45.4
5.0,100.0,0.0,57.9,57.9,59.1,31.1,58.5,83.1,55.5
6.0,100.0,0.0,58.3,58.3,60.4,32.9,59.0,83.6,55.4
7.0,100.0,0.0,56.7,56.5,57.6,30.5,57.0,83.6,53.3
8.0,100.0,0.0,55.9,55.9,58.2,29.9,56.7,84.2,53.1
9.0,100.0,0.0,55.3,55.0,57.5,28.7,57.4,83.9,51.3
10.0,100.0,0.0,56.7,56.2,59.8,29.9,60.2,85.1,55.3


In [None]:
## NOTE: From 1999–2004, NHANES used a CIDI subsample (adults 20–39y), so depression data appear ~90% missing overall

#### 1) check missingness for all column 

In [277]:
df = df_my_cov_aligned_short.copy()

# Identifier-like columns are left out of the audit.
IGNORE = {"SEQN", "SDDSRVYR"}
cols_all = df.columns[~df.columns.isin(list(IGNORE))].tolist()


def _pct_present(g):
    """Percent of non-missing values per column for one cycle's rows."""
    return g.notna().mean() * 100


# WIDE: one row per cycle, one column per variable, values = % non-missing
per_cycle = df.groupby("SDDSRVYR")[cols_all]
rate_wide = per_cycle.apply(_pct_present)
rate_wide = rate_wide.round(1).sort_index()

# LONG: one row per (cycle, column), ordered by coverage within each cycle
tidy = rate_wide.reset_index()
tidy = tidy.melt(id_vars="SDDSRVYR", var_name="column", value_name="pct_nonmiss")
rate_long = tidy.sort_values(["SDDSRVYR","pct_nonmiss","column"])

print(rate_wide.head())
print(rate_long.head())


          sdmvpsu  sdmvstra  RIDAGEYR    SEX   RACE  household_size   EDU  \
SDDSRVYR                                                                    
1.0         100.0     100.0     100.0  100.0  100.0           100.0  84.3   
2.0         100.0     100.0     100.0  100.0  100.0           100.0  82.8   
3.0         100.0     100.0     100.0  100.0  100.0           100.0  82.8   
4.0         100.0     100.0     100.0  100.0  100.0           100.0  81.3   
5.0         100.0     100.0     100.0  100.0  100.0           100.0  83.1   

           pir  SMK_AVG   SMK  ...   ins  HOQ065  marriage  SNAP    FS  \
SDDSRVYR                       ...                                       
1.0       85.1     10.0  48.8  ...  47.6    48.0      60.9   4.8  47.6   
2.0       92.8     10.6  48.9  ...  47.7    48.1      65.7   4.2  45.8   
3.0       94.2     11.0  49.7  ...  49.0    49.2      66.8  48.9  47.6   
4.0       94.8     10.4  48.1  ...  45.4    47.6      64.7  47.5  47.6   
5.0       91.2  

In [None]:
#### 2) first focus on post 2018 missing 

In [278]:
# Restrict the short frame to the cycles collected after 2018.
df = df_my_cov_aligned_short.copy()
POST = {12.0, 66.0}              # 12 = Aug 2021–Aug 2023; 66 = 2017–Mar 2020 pre-pandemic
df_post = df[df['SDDSRVYR'].isin(POST)]


In [279]:
# Identifier-like columns are left out of the audit.
IGNORE = {'SEQN','SDDSRVYR'}
cols_all = df_post.columns[~df_post.columns.isin(list(IGNORE))].tolist()


def _pct_present_post(g):
    """Percent of non-missing values per column for one post-2018 cycle."""
    return g.notna().mean() * 100


# One row per post-2018 cycle, one column per variable, values = % non-missing
rate_wide_post = df_post.groupby('SDDSRVYR')[cols_all].apply(_pct_present_post)
rate_wide_post = rate_wide_post.round(1).sort_index()
print(rate_wide_post.head())   # wide table


          sdmvpsu  sdmvstra  RIDAGEYR    SEX  RACE  household_size   EDU  \
SDDSRVYR                                                                   
12.0        100.0     100.0     100.0  100.0   0.0           100.0  65.3   
66.0        100.0     100.0     100.0  100.0   0.0             0.0  59.3   

           pir  SMK_AVG  SMK  ...  ins  HOQ065  marriage  SNAP    FS  \
SDDSRVYR                      ...                                      
12.0      82.9      0.0  0.0  ...  5.6     0.0       0.0   0.0   0.0   
66.0      85.9      0.0  0.0  ...  9.4     0.0       0.0  54.5  54.9   

          WTINT2YR  WTMEC2YR  WTSAF2YR  WTINT4YR  WTMEC4YR  
SDDSRVYR                                                    
12.0           0.0     100.0       0.0       0.0       0.0  
66.0           0.0       0.0       0.0     100.0     100.0  

[2 rows x 38 columns]


In [280]:
# Flag every (cycle, column) cell that has no data at all.
zero_mask = rate_wide_post == 0.0
any_zero = zero_mask.any()
print("Columns with 0% non-missing by cycle:\n", zero_mask.loc[:, any_zero].transpose())

# Columns that are empty in every post-2018 cycle
consistently_zero = zero_mask.all(axis=0)
print("\nConsistently 0% (both 12 & 66):\n", consistently_zero.index[consistently_zero].tolist())

# Same two views for "low" coverage; THRESH is the cut-off in percent
THRESH = 10.0
low_mask = rate_wide_post < THRESH
any_low = low_mask.any()
print("\nColumns <10% non-missing by cycle:\n", low_mask.loc[:, any_low].transpose())

# Columns that are below the cut-off in every post-2018 cycle
consistently_low = low_mask.all(axis=0)
print("\nConsistently <10% (both 12 & 66):\n", consistently_low.index[consistently_low].tolist())


Columns with 0% non-missing by cycle:
 SDDSRVYR         12.0   66.0
RACE             True   True
household_size  False   True
SMK_AVG          True   True
SMK              True   True
SMK_STATUS       True   True
CIGS_PER_DAY     True   True
PACK_YEARS       True   True
FORMER_SMOKER    True   True
DRINKS_PER_DAY   True   True
bmic             True   True
DIABE            True   True
HYPERTEN         True   True
chol_rx          True   True
CVD              True   True
cancer           True   True
ahei_total       True   True
sdoh_access     False   True
HOQ065           True   True
marriage         True   True
SNAP             True  False
FS               True  False
WTINT2YR         True   True
WTMEC2YR        False   True
WTSAF2YR         True   True
WTINT4YR         True  False
WTMEC4YR         True  False

Consistently 0% (both 12 & 66):
 ['RACE', 'SMK_AVG', 'SMK', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER', 'DRINKS_PER_DAY', 'bmic', 'DIABE', 'HYPERTEN', 'chol_rx'

## Fetch missing post 2018   

In [281]:
import io, requests, pandas as pd

def fetch_xpt(year_folder, filebase, timeout=60):  # year_folder: "2017" or "2021"
    """Download one NHANES XPT file from the CDC public folder as a DataFrame.

    Parameters
    ----------
    year_folder : str
        Folder segment of the CDC URL, e.g. "2017" or "2021".
    filebase : str
        File name without extension, e.g. "P_DEMO".
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    pandas.DataFrame
        Parsed file with all column names upper-cased.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{year_folder}/DataFiles/{filebase}.xpt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Parse from memory so nothing is written to disk.
    df = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    df.columns = [c.upper() for c in df.columns]
    return df

# Examples: download the demographics and health-insurance files for both post-2018 releases.
prepandemic_demo = fetch_xpt("2017", "P_DEMO")   # 2017–Mar 2020 combined (SDDSRVYR=66).
prepandemic_hiq  = fetch_xpt("2017", "P_HIQ")    # Health insurance, 2017–Mar 2020.
demographics_21  = fetch_xpt("2021", "DEMO_L")   # 2021–2023.
hiq_21           = fetch_xpt("2021", "HIQ_L")    # 2021–2023.


## Fetch missing: weights 

In [313]:
df = df_my_cov_aligned_short.copy()

In [314]:
df.columns 

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       'household_size', 'EDU', 'pir', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER',
       'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'bmic', 'DIABE', 'HYPERTEN', 'chol_rx',
       'CVD', 'cancer', 'probable_depression', 'ahei_total', 'unemployment2',
       'sdoh_access', 'ins', 'HOQ065', 'marriage', 'SNAP', 'FS', 'WTINT2YR',
       'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR'],
      dtype='object')

In [317]:
df = df_my_cov_aligned_short.copy()


def _pct_present_4yr(g):
    """Percent non-missing (one decimal) per 4-year weight for one cycle."""
    return (g.notna().mean() * 100).round(1)


# One row per cycle showing how complete the two 4-year weights are.
four_year_weights = ["WTINT4YR","WTMEC4YR"]
cov4 = df.groupby("SDDSRVYR")[four_year_weights].apply(_pct_present_4yr)
print("4-year weight non-missing (%) by cycle:\n", cov4)


4-year weight non-missing (%) by cycle:
           WTINT4YR  WTMEC4YR
SDDSRVYR                    
1.0            0.0       0.0
2.0            0.0       0.0
3.0            0.0       0.0
4.0            0.0       0.0
5.0            0.0       0.0
6.0            0.0       0.0
7.0            0.0       0.0
8.0            0.0       0.0
9.0            0.0       0.0
10.0           0.0       0.0
12.0           0.0       0.0
66.0         100.0     100.0


In [320]:
import pandas as pd

df = df_my_cov_aligned_short.copy()
cols = ["WTINT2YR","WTMEC2YR","WTSAF2YR"]
present = [name for name in cols if name in df.columns]

# Reuse one grouped view of the weight columns for both summaries.
weights_by_cycle = df.groupby("SDDSRVYR")[present]

# % non-missing by cycle
pct_nonmiss = weights_by_cycle.apply(lambda g: g.notna().mean() * 100)
pct_nonmiss = pct_nonmiss.round(1).sort_index()
print("Percent non-missing by cycle (%):\n", pct_nonmiss)

# (optional) counts non-missing + total N per cycle
nonmiss_n = weights_by_cycle.apply(lambda g: g.notna().sum())
nonmiss_n = nonmiss_n.astype("Int64")
N = df.groupby("SDDSRVYR").size().rename("N")
print("\nNon-missing counts and total N per cycle:\n", nonmiss_n.join(N))


Percent non-missing by cycle (%):
           WTINT2YR  WTMEC2YR  WTSAF2YR
SDDSRVYR                              
1.0          100.0     100.0     100.0
2.0          100.0     100.0     100.0
3.0          100.0     100.0     100.0
4.0          100.0     100.0     100.0
5.0          100.0     100.0     100.0
6.0          100.0     100.0     100.0
7.0          100.0     100.0     100.0
8.0          100.0     100.0     100.0
9.0          100.0     100.0     100.0
10.0         100.0     100.0     100.0
12.0           0.0     100.0       0.0
66.0           0.0       0.0       0.0

Non-missing counts and total N per cycle:
           WTINT2YR  WTMEC2YR  WTSAF2YR      N
SDDSRVYR                                     
1.0           9965      9965      9965   9965
2.0          11039     11039     11039  11039
3.0          10122     10122     10122  10122
4.0          10348     10348     10348  10348
5.0          10149     10149     10149  10149
6.0          10537     10537     10537  10537
7.0    

#### fetch weight from demo for post 2018

In [325]:
import io, requests, pandas as pd, numpy as np

def fetch_xpt(year_folder, filebase, timeout=60):
    """Download one NHANES XPT file from the CDC site and load it as a DataFrame.

    Parameters
    ----------
    year_folder : str
        Folder segment of the download URL (e.g. "2017", "2021").
    filebase : str
        File name without the ".xpt" extension (e.g. "DEMO_L").
    timeout : float or tuple, default 60
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the notebook kernel indefinitely.

    Returns
    -------
    pandas.DataFrame
        The file contents with upper-cased column names and ``SEQN`` converted
        to nullable ``Int64``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{year_folder}/DataFiles/{filebase}.xpt"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    out = pd.read_sas(io.BytesIO(response.content), format="xport", encoding="latin1")
    out.columns = [c.upper() for c in out.columns]
    # SEQN is the join key used everywhere else in the notebook; keep it as a nullable integer.
    out["SEQN"] = pd.to_numeric(out["SEQN"], errors="coerce").astype("Int64")
    return out

def safe_cov(df, cols):
    """Percent of non-missing values per SDDSRVYR cycle for the requested columns.

    Columns from `cols` that are not in `df` are ignored; if none exist an empty
    DataFrame is returned. Otherwise the result has one row per cycle (sorted)
    and one column per available name, rounded to one decimal.
    """
    available = [name for name in cols if name in df.columns]
    if len(available) == 0:
        return pd.DataFrame()

    def _pct_present(group):
        # mean of the boolean "is present" mask -> fraction, scaled to percent
        return group.notna().mean().mul(100).round(1)

    per_cycle = df.groupby("SDDSRVYR")[available].apply(_pct_present)
    return per_cycle.sort_index()

# Work on a copy; the main frame is rebound only at the very end of this cell.
df = df_my_cov_aligned_short.copy()
df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")


def _first_weight_map(year_folder, filebases, weight_col):
    """Return `weight_col` indexed by SEQN from the first listed file that carries it, else None."""
    for base in filebases:
        try:
            lab = fetch_xpt(year_folder, base)
        except Exception as e:
            # best effort: a failed download just moves on to the next candidate file
            print(f"[warn] {base} fetch failed: {e}")
            continue
        if weight_col in lab.columns:
            return lab.set_index("SEQN")[weight_col]
    return None


def _weight_or_nan(frame, col):
    """Return frame[col] as float64, or an all-NaN Series when the column is absent."""
    if col not in frame.columns:
        return pd.Series(np.nan, index=frame.index, dtype="float64")
    return pd.to_numeric(frame[col], errors="coerce").astype("float64")


# ---- cycle 12 (Aug 2021–Aug 2023): DEMO_L -> WTINT2YR/WTMEC2YR ; lab file -> WTPH2YR ; fasting labs -> WTSAF2YR
mask12 = df["SDDSRVYR"].eq(12.0)

# core interview/exam weights
demo12 = fetch_xpt("2021", "DEMO_L").set_index("SEQN")
for col in ["WTINT2YR", "WTMEC2YR"]:
    if col in demo12:
        df.loc[mask12, col] = df.loc[mask12, "SEQN"].map(demo12[col])

# phlebotomy weight (WTPH2YR) is included in blood lab files (e.g., HDL_L, TCHOL_L, CBC_L)
WTPH2YR_map = _first_weight_map("2021", ["HDL_L", "TCHOL_L", "CBC_L", "GHB_L"], "WTPH2YR")
if WTPH2YR_map is not None:
    df.loc[mask12, "WTPH2YR"] = df.loc[mask12, "SEQN"].map(WTPH2YR_map)

# fasting weight (WTSAF2YR) lives in fasting lab components (GLU_L and/or INS_L); NOT in DEMO/FASTQX_L
WTSAF2YR_map = _first_weight_map("2021", ["GLU_L", "INS_L"], "WTSAF2YR")
if WTSAF2YR_map is not None:
    df.loc[mask12, "WTSAF2YR"] = df.loc[mask12, "SEQN"].map(WTSAF2YR_map)

# ---- cycle 66 (2017–Mar 2020 pre-pandemic): P_DEMO -> WTINTPRP/WTMECPRP ; fasting lab -> WTSAFPRP
mask66 = df["SDDSRVYR"].eq(66.0)
demo66 = fetch_xpt("2017", "P_DEMO").set_index("SEQN")
for col in ["WTINTPRP", "WTMECPRP"]:
    if col in demo66:
        df.loc[mask66, col] = df.loc[mask66, "SEQN"].map(demo66[col])

WTSAFPRP_map = _first_weight_map("2017", ["P_GLU", "P_INS", "P_TRIGLY"], "WTSAFPRP")
if WTSAFPRP_map is not None:
    df.loc[mask66, "WTSAFPRP"] = df.loc[mask66, "SEQN"].map(WTSAFPRP_map)

# ---- treat 0 weights as missing (eligible but no specimen)
for w in ["WTMEC2YR", "WTSAF2YR", "WTPH2YR", "WTMECPRP", "WTSAFPRP"]:
    if w in df.columns:
        df[w] = df[w].mask(df[w].eq(0))

# ---- unified convenience weights
# Cycle 66 takes the *PRP weight, every other cycle the *2YR weight. A weight
# column that could not be fetched is treated as all-NaN instead of None, so the
# unified columns always stay numeric.
df["wt_int"]        = _weight_or_nan(df, "WTINTPRP").where(mask66, _weight_or_nan(df, "WTINT2YR"))
df["wt_mec"]        = _weight_or_nan(df, "WTMECPRP").where(mask66, _weight_or_nan(df, "WTMEC2YR"))
df["wt_fasting"]    = _weight_or_nan(df, "WTSAFPRP").where(mask66, _weight_or_nan(df, "WTSAF2YR"))
df["wt_phlebotomy"] = _weight_or_nan(df, "WTPH2YR").where(mask12)   # only 2021–2023

# ---- quick coverage check
print("Post-fetch coverage (%):\n",
      safe_cov(df, ["WTINT2YR","WTMEC2YR","WTPH2YR","WTSAF2YR",
                    "WTINTPRP","WTMECPRP","WTSAFPRP",
                    "wt_int","wt_mec","wt_fasting","wt_phlebotomy"]))

# Publish the enriched copy as the main frame.
df_my_cov_aligned_short = df


Post-fetch coverage (%):
           WTINT2YR  WTMEC2YR  WTPH2YR  WTSAF2YR  WTINTPRP  WTMECPRP  WTSAFPRP  \
SDDSRVYR                                                                        
1.0          100.0      93.1      0.0      86.4       0.0       0.0       0.0   
2.0          100.0      94.9      0.0      88.2       0.0       0.0       0.0   
3.0          100.0      95.3      0.0      88.2       0.0       0.0       0.0   
4.0          100.0      96.2      0.0      87.0       0.0       0.0       0.0   
5.0          100.0      96.2      0.0      86.2       0.0       0.0       0.0   
6.0          100.0      97.3      0.0      87.7       0.0       0.0       0.0   
7.0          100.0      95.7      0.0      88.1       0.0       0.0       0.0   
8.0          100.0      96.4      0.0      87.9       0.0       0.0       0.0   
9.0          100.0      95.7      0.0      87.1       0.0       0.0       0.0   
10.0         100.0      94.1      0.0      88.0       0.0       0.0       0.0   
12

#### Sanity check for weight fetching

In [327]:
# sanity warnings
# Share of cycle-12 (2021–23) rows that lack a phlebotomy weight. The share is
# taken over cycle-12 rows only: averaging over the whole frame dilutes it with
# every other cycle, so the 0.4 threshold is unreachable whenever cycle 12 makes
# up less than 40% of the rows.
is_cycle12 = df["SDDSRVYR"].eq(12.0)
if is_cycle12.any() and df.loc[is_cycle12, "wt_phlebotomy"].isna().mean() > 0.4:
    print("[warn] Many 2021–23 rows lack WTPH2YR. Make sure you merged from a blood lab used in your sample.")

# example: ensure fasting weight used only when a fasting analyte is present
fasting_vars_present = any(c in df.columns for c in ["GLU", "INS", "TRIGLY"])  # adapt to your var names
if not fasting_vars_present and df["wt_fasting"].notna().any():
    print("[note] wt_fasting present but no fasting analytes detected in frame.")


[note] wt_fasting present but no fasting analytes detected in frame.


## Fetch missing: ins

#### Step 1 — pre-check coverage for ins (cycles 12 & 66)

In [282]:
import pandas as pd
import numpy as np

# Scratch copy for the pre-fill coverage check; the main frame is not modified here.
df = df_my_cov_aligned_short.copy()

# SDDSRVYR codes of the two cycles inspected in this section.
POST = {12.0, 66.0}
def pct_nonmiss(d, col):
    """Percent of non-missing values of `col` within each SDDSRVYR cycle (one decimal)."""
    def _share_present(values):
        return values.notna().mean() * 100

    per_cycle = d.groupby("SDDSRVYR")[col].apply(_share_present)
    return per_cycle.round(1)

# Coverage of `ins` restricted to the cycles listed in POST, before any refill.
post_rows = df[df.SDDSRVYR.isin(POST)]
print("Before fill — % non-missing 'ins':")
print(pct_nonmiss(post_rows, "ins"))


Before fill — % non-missing 'ins':
SDDSRVYR
12.0    5.6
66.0    9.4
Name: ins, dtype: float64


In [283]:
# Display the current `ins` column of the main frame (pandas abbreviates the output).
df_my_cov_aligned_short["ins"]

0         <NA>
1            1
2         <NA>
3         <NA>
4            1
          ... 
128804    <NA>
128805    <NA>
128806    <NA>
128807    <NA>
128808    <NA>
Name: ins, Length: 128809, dtype: Int64

#### Step 2 helper: fetch + standardize

In [294]:
import pandas as pd, numpy as np, io, requests

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60) -> pd.DataFrame:
    """Download one NHANES XPT file from the CDC site and load it as a DataFrame.

    Parameters
    ----------
    year_folder : str
        Folder segment of the download URL (e.g. "2017", "2021").
    filebase : str
        File name without the ".xpt" extension (e.g. "HIQ_L").
    timeout : float, default 60
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the notebook kernel indefinitely.

    Returns
    -------
    pandas.DataFrame
        The file contents with upper-cased column names and ``SEQN`` converted
        to nullable ``Int64``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{year_folder}/DataFiles/{filebase}.xpt"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    out = pd.read_sas(io.BytesIO(response.content), format="xport", encoding="latin1")
    out.columns = [c.upper() for c in out.columns]
    # SEQN is the join key used everywhere else in the notebook; keep it as a nullable integer.
    out["SEQN"] = pd.to_numeric(out["SEQN"], errors="coerce").astype("Int64")
    return out

def choose_hiq_source_for_cycle(df, cycle):
    """Pick the HIQ file whose SEQN values overlap most with one cycle of `df`.

    Both candidate files (2017/P_HIQ and 2021/HIQ_L) are fetched; a candidate
    that fails to load is reported and skipped. Returns the fetched DataFrame
    with the largest overlap (first candidate wins ties), or None when nothing
    could be loaded or no SEQN matches.
    """
    in_cycle = df["SDDSRVYR"].eq(float(cycle))
    cycle_ids = pd.Index(df.loc[in_cycle, "SEQN"].dropna().astype("Int64"))

    # (overlap, year folder, file base, table) for every candidate that loaded
    scored = []
    for year_folder, filebase in (("2017", "P_HIQ"), ("2021", "HIQ_L")):
        try:
            table = fetch_xpt(year_folder, filebase)
            n_shared = len(cycle_ids.intersection(table["SEQN"]))
        except Exception as e:
            print(f"[warn] {year_folder}/{filebase} fetch failed: {e}")
            continue
        scored.append((n_shared, year_folder, filebase, table))

    # max() keeps the first of several equally good candidates
    top = max(scored, key=lambda entry: entry[0]) if scored else None
    if top is None or top[0] == 0:
        print(f"[cycle {cycle}] No matching HIQ (overlap=0).")
        return None

    best_overlap, yr, fb, hiq = top
    print(f"[cycle {cycle}] Using {yr}/{fb} (overlap={best_overlap})")
    return hiq


#### Step 3 — overwrite ins for a cycle 

In [297]:
def overwrite_ins_for_cycle(df_in: pd.DataFrame, cycle: int) -> pd.DataFrame:
    """Hard-overwrite `ins` for one survey cycle with HIQ011 from the matching HIQ file.

    Every row with ``SDDSRVYR == cycle`` gets its `ins` value replaced by the
    recoded HIQ011 of the same SEQN (NA where there is no match or the code is
    neither 1 nor 2). The previous values of those rows are kept in `ins_prev`
    for auditing. `df_in` itself is not modified.

    Parameters
    ----------
    df_in : DataFrame with at least `SEQN` and `SDDSRVYR`; `ins` is created
        (all NA) when absent.
    cycle : SDDSRVYR code of the cycle to overwrite.

    Returns
    -------
    A modified copy of `df_in`. If no usable HIQ file is found the copy is
    returned without overwriting anything.
    """
    df_out = df_in.copy()
    df_out["SEQN"] = pd.to_numeric(df_out["SEQN"], errors="coerce").astype("Int64")
    if "ins" not in df_out.columns:
        # typed all-NA column, so the audit below works exactly as for a populated column
        df_out["ins"] = pd.Series(pd.NA, index=df_out.index, dtype="Int8")

    hiq = choose_hiq_source_for_cycle(df_out, cycle)
    if hiq is None or "HIQ011" not in hiq.columns:
        return df_out

    # HIQ011 code 1 -> 1, code 2 -> 0, anything else -> NA
    # (presumably 1 = covered, 2 = not covered; confirm against the HIQ codebook)
    src = hiq.set_index("SEQN")["HIQ011"].map({1: 1, 2: 0}).astype("Int8")

    sel_idx = df_out.index[df_out["SDDSRVYR"].eq(float(cycle))]

    # Snapshot of the values about to be replaced. It is taken from the working
    # copy so it also exists when `ins` was only just created above.
    prev = df_out.loc[sel_idx, "ins"].astype("Int8")
    df_out.loc[sel_idx, "ins_prev"] = prev

    # aligned overwrite for the whole cycle
    mapped = df_out.loc[sel_idx, "SEQN"].map(src)
    df_out.loc[sel_idx, "ins"] = mapped.values

    # compact dtype; keep the existing dtype if the values cannot be cast
    try:
        df_out["ins"] = df_out["ins"].astype("Int8")
    except (TypeError, ValueError):
        pass

    # tiny audit (NA-safe: missing values are compared as -1)
    before = prev.notna().mean() * 100
    after = df_out.loc[sel_idx, "ins"].notna().mean() * 100
    changed = (df_out.loc[sel_idx, "ins"].fillna(-1) != prev.fillna(-1)).sum()
    print(f"[cycle {cycle}] non-miss ins: {before:.1f}% → {after:.1f}% | rows changed: {changed}")
    return df_out


#### Step 4 — run for 12 and 66 + quick check

In [298]:
def pct_nonmiss(d, col):
    """Return, per SDDSRVYR cycle, the percentage of rows where `col` is not missing (one decimal)."""
    by_cycle = d.groupby("SDDSRVYR")[col]
    pct_present = by_cycle.apply(lambda values: values.notna().mean() * 100)
    return pct_present.round(1)

# Trial run on a scratch copy: coverage of `ins` before and after the overwrite.
df = df_my_cov_aligned_short.copy()
rows_before = df[df.SDDSRVYR.isin([12.0,66.0])]
print("Before:", pct_nonmiss(rows_before, "ins"))

# Cycle 12 first, then cycle 66; each call returns a new frame.
for cycle_code in (12, 66):
    df = overwrite_ins_for_cycle(df, cycle_code)

rows_after = df[df.SDDSRVYR.isin([12.0,66.0])]
print("After :", pct_nonmiss(rows_after, "ins"))


Before: SDDSRVYR
12.0    5.6
66.0    9.4
Name: ins, dtype: float64
[cycle 12] Using 2021/HIQ_L (overlap=11933)
[cycle 12] non-miss ins: 5.6% → 99.5% | rows changed: 11203
[cycle 66] Using 2017/P_HIQ (overlap=15560)
[cycle 66] non-miss ins: 9.4% → 99.8% | rows changed: 14056
After : SDDSRVYR
12.0    99.5
66.0    99.8
Name: ins, dtype: float64


#### Step 5 - merge back 

In [306]:
# 1) Re-derive `ins` for cycles 12 and 66 on a scratch copy (each call overwrites a whole cycle).
df_ins = df_my_cov_aligned_short.copy()
for cyc in (12, 66):
    df_ins = overwrite_ins_for_cycle(df_ins, cyc)

# 2) Copy the refreshed values of those two cycles into the main frame.
mask = df_my_cov_aligned_short['SDDSRVYR'].isin([12.0, 66.0])
refreshed_ins = df_ins.loc[mask, 'ins'].astype('Int8')
df_my_cov_aligned_short.loc[mask, 'ins'] = refreshed_ins

# The audit column is not wanted in the main frame; dropping is a no-op when it is absent.
df_my_cov_aligned_short.drop(columns=['ins_prev'], errors='ignore', inplace=True)

# 3) Coverage of `ins` in the two cycles after the merge.
post_2018_rows = df_my_cov_aligned_short[df_my_cov_aligned_short.SDDSRVYR.isin([12.0,66.0])]
print(pct_nonmiss(post_2018_rows, "ins"))


[cycle 12] Using 2021/HIQ_L (overlap=11933)
[cycle 12] non-miss ins: 5.6% → 99.5% | rows changed: 11203
[cycle 66] Using 2017/P_HIQ (overlap=15560)
[cycle 66] non-miss ins: 9.4% → 99.8% | rows changed: 14056
SDDSRVYR
12.0    99.5
66.0    99.8
Name: ins, dtype: float64


#### Step 6 — check merged ins result

In [329]:
import numpy as np
import pandas as pd

# Scratch copy used by the insurance summaries computed in this cell.
df = df_my_cov_aligned_short.copy()

def pct_ins_by_cycle(df, weight_col=None, mask=None):
    # keep valid 0/1 ins
    base = df.loc[df['ins'].isin([0, 1])].copy()

    # align mask to base index if provided
    if mask is not None:
        mask_aligned = pd.Series(mask, index=df.index).reindex(base.index, fill_value=False)
        base = base.loc[mask_aligned]

    if weight_col is None:
        out = (base.groupby('SDDSRVYR', observed=True)['ins']
                   .mean()
                   .mul(100)
                   .round(1)
                   .rename('pct_ins_unweighted'))
        return out

    def wmean(g):
        if weight_col not in g:
            return np.nan
        w = g[weight_col]
        x = g['ins'].astype(float)
        m = x.notna() & w.notna() & (w > 0)
        if not m.any():
            return np.nan
        return np.average(x[m], weights=w[m]) * 100.0

    out = (base.groupby('SDDSRVYR', observed=True)
               .apply(wmean, include_groups=False)
               .round(1)
               .rename(f'pct_ins_wt_{weight_col}'))
    return out

# Three views of insurance coverage: unweighted, interview-weighted (unified
# wt_int weight), and interview-weighted restricted to adults (RIDAGEYR >= 18).
unw = pct_ins_by_cycle(df)
wt = pct_ins_by_cycle(df, weight_col='wt_int')
adults_mask = df['RIDAGEYR'] >= 18
wt_adult = pct_ins_by_cycle(df, weight_col='wt_int', mask=adults_mask)

for caption, table in (
    ("Unweighted % insured by cycle:\n", unw),
    ("\nWeighted % insured by cycle (wt_int):\n", wt),
    ("\nWeighted % insured by cycle — adults 18+ (wt_int):\n", wt_adult),
):
    print(caption, table)


Unweighted % insured by cycle:
 SDDSRVYR
1.0     69.8
2.0     72.4
3.0     68.4
4.0     67.0
5.0     63.4
6.0     61.1
7.0     63.1
8.0     66.8
9.0     70.0
10.0    70.5
12.0    92.7
66.0    88.1
Name: pct_ins_unweighted, dtype: Float64

Weighted % insured by cycle (wt_int):
 SDDSRVYR
1.0     76.4
2.0     77.6
3.0     75.2
4.0     75.0
5.0     74.1
6.0     72.0
7.0     71.9
8.0     74.0
9.0     79.2
10.0    77.0
12.0    91.7
66.0    88.6
Name: pct_ins_wt_wt_int, dtype: float64

Weighted % insured by cycle — adults 18+ (wt_int):
 SDDSRVYR
1.0     76.4
2.0     77.6
3.0     75.2
4.0     75.0
5.0     74.1
6.0     72.0
7.0     71.9
8.0     74.0
9.0     79.2
10.0    77.0
12.0    90.4
66.0    86.7
Name: pct_ins_wt_wt_int, dtype: float64


## Fetch missing: marriage from DEMO

In [303]:
import pandas as pd, numpy as np, io, requests

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60) -> pd.DataFrame:
    """Download one NHANES XPT file from the CDC site and load it as a DataFrame.

    Parameters
    ----------
    year_folder : str
        Folder segment of the download URL (e.g. "2017", "2021").
    filebase : str
        File name without the ".xpt" extension (e.g. "P_DEMO").
    timeout : float, default 60
        Forwarded to ``requests.get``. Without a timeout a stalled connection
        would block the notebook kernel indefinitely.

    Returns
    -------
    pandas.DataFrame
        The file contents with upper-cased column names and ``SEQN`` converted
        to nullable ``Int64``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{year_folder}/DataFiles/{filebase}.xpt"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    out = pd.read_sas(io.BytesIO(response.content), format="xport", encoding="latin1")
    out.columns = [c.upper() for c in out.columns]
    # SEQN is the join key used everywhere else in the notebook; keep it as a nullable integer.
    out["SEQN"] = pd.to_numeric(out["SEQN"], errors="coerce").astype("Int64")
    return out

def choose_demo_source_for_cycle(df, cycle):
    """Return the DEMO table (2017/P_DEMO or 2021/DEMO_L) that best matches one cycle.

    "Best" is the candidate sharing the most SEQN values with the rows of `df`
    whose SDDSRVYR equals `cycle`; the first candidate wins ties. Candidates
    that cannot be fetched are reported and skipped. Returns None when no
    candidate loads or none shares a SEQN with the cycle.
    """
    wanted = df["SDDSRVYR"].eq(float(cycle))
    cycle_seqn = pd.Index(df.loc[wanted, "SEQN"].dropna().astype("Int64"))

    winner = None
    winner_overlap = -1
    # pre-pandemic file first, then the 2021 file
    for year_folder, filebase in [("2017", "P_DEMO"), ("2021", "DEMO_L")]:
        try:
            table = fetch_xpt(year_folder, filebase)
            shared = len(cycle_seqn.intersection(table["SEQN"]))
        except Exception as e:
            print(f"[warn] {year_folder}/{filebase} fetch failed: {e}")
            continue
        if shared > winner_overlap:
            winner, winner_overlap = (year_folder, filebase, table), shared

    if winner is None or winner_overlap == 0:
        print(f"[cycle {cycle}] No matching DEMO (overlap=0).")
        return None

    yr, fb, demo = winner
    print(f"[cycle {cycle}] Using {yr}/{fb} (overlap={winner_overlap})")
    return demo


#### Step 1 — helpers (fetch + choose correct DEMO file)

In [345]:
import pandas as pd
import numpy as np
# --- helper: pull marital status from a DEMO file, tolerant to name changes
def get_marriage_series(year_folder: str, demo_file: str) -> pd.Series:
    """
    Fetch a DEMO file and return its marital-status codes as an Int64 Series
    indexed by SEQN.

    Several candidate column names are tried in a fixed order; the Series is
    named after the first one found. Raises KeyError when the file contains
    none of them.
    """
    demo = fetch_xpt(year_folder, demo_file).set_index("SEQN")

    known_names = ("DMDMARTZ", "P_MARITL", "DMDMARTL", "MARITALZ", "MARITAL")
    found = next((name for name in known_names if name in demo.columns), None)
    if found is None:
        raise KeyError(f"No marital-status column found in {year_folder}/{demo_file}.xpt")

    codes = pd.to_numeric(demo[found], errors="coerce").astype("Int64")
    codes.name = found
    return codes

def overwrite_marriage_for_cycle(df_in: pd.DataFrame,
                                 cycle_code: int,
                                 year_folder: str,
                                 demo_file: str,
                                 add_labels: bool = True,
                                 keep_prev: bool = True) -> pd.DataFrame:
    """
    HARD OVERWRITE: replace ALL rows' `marriage` for SDDSRVYR == cycle_code
    with codes from DEMO (joined on SEQN). Keeps 77/99 as-is.

    Parameters
    ----------
    df_in : frame with `SEQN` and `SDDSRVYR`; `marriage` is created (all NA)
        when absent. `df_in` itself is not modified.
    cycle_code : SDDSRVYR code of the cycle to overwrite.
    year_folder, demo_file : location of the DEMO file, passed to
        `get_marriage_series`.
    add_labels : also write a text label per code into `marriage_label`
        (rows of this cycle only).
    keep_prev : store the replaced values in `marriage_prev` for auditing.

    Returns
    -------
    A modified copy of `df_in`.
    """
    df = df_in.copy()
    df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")

    # ensure target column exists
    if "marriage" not in df.columns:
        df["marriage"] = pd.Series(pd.NA, index=df.index, dtype="Int64")

    mask = df["SDDSRVYR"].eq(float(cycle_code))
    idx = df.index[mask]

    # Snapshot of the values about to be replaced. It is taken from the working
    # copy so the audit also works when `marriage` was only just created above
    # and when keep_prev is False.
    prev = df.loc[idx, "marriage"].copy()
    before = prev.notna().mean() * 100
    if keep_prev:
        df.loc[idx, "marriage_prev"] = prev

    # source from DEMO
    src = get_marriage_series(year_folder, demo_file)  # Int64, indexed by SEQN
    src_col = src.name
    seqn = df.loc[idx, "SEQN"]
    overlap = seqn.isin(src.index).sum()

    # map & overwrite entire cycle (NA where SEQN has no match in DEMO)
    mapped = seqn.map(src)
    df.loc[idx, "marriage"] = mapped.astype("Int64").values

    after = df.loc[idx, "marriage"].notna().mean() * 100
    # NA-safe comparison: missing values are compared as -1
    changed = (df.loc[idx, "marriage"].fillna(-1) != prev.fillna(-1)).sum()

    print(f"[cycle {cycle_code}] Using {year_folder}/{demo_file} col={src_col} (overlap={overlap})")
    print(f"[cycle {cycle_code}] marriage non-miss: {before:.1f}% → {after:.1f}% | rows changed: {changed}")

    if add_labels:
        # The meaning of codes 1-3 depends on the source column. DMDMARTZ is the
        # collapsed 3-level coding, so the detailed labels would mislabel code 2
        # as "Widowed" and code 3 as "Divorced".
        if src_col == "DMDMARTZ":
            labels = {
                1: "Married/Living with partner",
                2: "Widowed/Divorced/Separated",
                3: "Never married",
                77: "Refused", 99: "Don't know"
            }
        else:
            # NOTE(review): every other source column is assumed to use the
            # detailed 6-level coding; confirm against the file's codebook.
            labels = {
                1: "Married", 2: "Widowed", 3: "Divorced", 4: "Separated",
                5: "Never married", 6: "Living with partner",
                77: "Refused", 99: "Don't know"
            }
        df.loc[idx, "marriage_label"] = df.loc[idx, "marriage"].map(labels)

    return df


#### Step 2 - Build a correct marriage3 

In [346]:
# Recode tables for the 3-level marital-status variable.
# - detailed_map:  used for a cycle in which any of the codes 4, 5 or 6 occurs
#                  (1 and 6 -> 1; 2, 3 and 4 -> 2; 5 -> 3)
# - collapsed_map: used otherwise; codes 1, 2, 3 are kept unchanged
# Codes missing from the chosen table (e.g. 77/99) become NA.
detailed_map  = {1:1, 6:1, 2:2, 3:2, 4:2, 5:3}
collapsed_map = {1:1, 2:2, 3:3}

def make_marriage3_by_cycle(df: pd.DataFrame, src_col: str = "marriage") -> pd.Series:
    """Build the 3-level marital status from `src_col`, choosing the recode table per cycle.

    Returns an Int64 Series aligned to `df.index`; rows that belong to no
    SDDSRVYR group, or whose code is not in the chosen table, are NA.
    """
    result = pd.Series(pd.NA, index=df.index, dtype="Int64")
    detailed_only = {4, 5, 6}
    # the coding is decided separately for every cycle
    for _, rows in df.groupby("SDDSRVYR").groups.items():
        codes = df.loc[rows, src_col]
        seen = set(codes.dropna().unique().tolist())
        recode = detailed_map if (detailed_only & seen) else collapsed_map
        result.loc[rows] = codes.map(recode).astype("Int64")
    return result


#### Step 3 - Run the overwrite for your two special cycles, then build marriage3

In [347]:
# Each call returns a modified copy, which is bound back to the main frame.
# Cycle 12 is sourced from 2021/DEMO_L, cycle 66 from 2017/P_DEMO.
for _cycle_code, _year_folder, _demo_file in ((12, "2021", "DEMO_L"), (66, "2017", "P_DEMO")):
    df_my_cov_aligned_short = overwrite_marriage_for_cycle(
        df_my_cov_aligned_short, _cycle_code, _year_folder, _demo_file,
        add_labels=True, keep_prev=True
    )

# Build the 3-level variable across all cycles
marriage3 = make_marriage3_by_cycle(df_my_cov_aligned_short, "marriage")
df_my_cov_aligned_short["marriage3"] = marriage3


[cycle 12] Using 2021/DEMO_L col=DMDMARTZ (overlap=11933)
[cycle 12] marriage non-miss: 65.3% → 65.3% | rows changed: 0
[cycle 66] Using 2017/P_DEMO col=DMDMARTZ (overlap=15560)
[cycle 66] marriage non-miss: 59.3% → 59.3% | rows changed: 0


#### Step 4 - Sanity checks 

In [348]:
# Share of adults (RIDAGEYR >= 18) with a non-missing `marriage`, per cycle.
adults_mask = df_my_cov_aligned_short["RIDAGEYR"] >= 18
adult_rows = df_my_cov_aligned_short.loc[adults_mask]
cov_adult = (
    adult_rows.groupby("SDDSRVYR", observed=True)["marriage"]
    .apply(lambda s: s.notna().mean()*100, include_groups=False)
    .round(1)
)
print("Adult (18+) marriage coverage by cycle (% non-missing):\n", cov_adult)

# 77/99 among non-missing adults
def code_shares(s: pd.Series) -> pd.Series:
    """Percent of the non-missing values of `s` equal to 77 ('p77') and to 99 ('p99'), two decimals.

    Both entries are NaN when `s` has no non-missing value.
    """
    observed = s.dropna()
    if observed.empty:
        return pd.Series({"p77": np.nan, "p99": np.nan})
    shares = {
        f"p{code}": (observed.eq(code).mean()*100).round(2)
        for code in (77, 99)
    }
    return pd.Series(shares)

# Share of the 77 and 99 codes among adults with a non-missing `marriage`, per cycle.
adult_marriage_by_cycle = (
    df_my_cov_aligned_short.loc[adults_mask]
    .groupby("SDDSRVYR", observed=True)["marriage"]
)
adult_codes = adult_marriage_by_cycle.apply(code_shares, include_groups=False)
print("\nAdult 77/99 (%) among non-missing:\n", adult_codes)

# Counts of every marriage3 level (missing included) per cycle, all ages.
marriage3_by_cycle = df_my_cov_aligned_short.groupby("SDDSRVYR", observed=True)["marriage3"]
dist3_all = marriage3_by_cycle.value_counts(dropna=False).sort_index()
print("\nCollapsed 3-level distribution (all cycles):\n", dist3_all)


Adult (18+) marriage coverage by cycle (% non-missing):
 SDDSRVYR
1.0      89.8
2.0      99.9
3.0      99.9
4.0     100.0
5.0      95.3
6.0      95.3
7.0      94.8
8.0      94.4
9.0      95.4
10.0     95.1
12.0     95.6
66.0     95.2
Name: marriage, dtype: float64

Adult 77/99 (%) among non-missing:
 SDDSRVYR     
1.0       p77    0.14
          p99    0.04
2.0       p77    0.07
          p99    0.00
3.0       p77    0.00
          p99    0.00
4.0       p77    0.13
          p99    0.00
5.0       p77    0.07
          p99    0.00
6.0       p77    0.05
          p99    0.02
7.0       p77    0.11
          p99    0.02
8.0       p77    0.03
          p99    0.02
9.0       p77    0.03
          p99    0.02
10.0      p77    0.11
          p99    0.00
12.0      p77    0.05
          p99    0.06
66.0      p77    0.09
          p99    0.02
Name: marriage, dtype: float64

Collapsed 3-level distribution (all cycles):
 SDDSRVYR  marriage3
1.0       1            2717
          2            1030
  

In [349]:
def wt_prop_by_cycle(df: pd.DataFrame, col: str, value, weight: str = "wt_int", mask=None) -> pd.Series:
    """
    Weighted % of df[col] == value within each SDDSRVYR.
    Denominator = sum of weights among rows with non-missing df[col] in that cycle.
    """
    base = df
    if mask is not None:
        base = base.loc[pd.Series(mask, index=df.index)]
    base = base.loc[base[weight].notna() & (base[weight] > 0)]

    def prop(g: pd.DataFrame):
        w = g[weight]
        den = w[g[col].notna()].sum()
        if den == 0:
            return np.nan
        num = w[g[col].eq(value)].sum()
        return (num / den) * 100.0

    return (base.groupby("SDDSRVYR", observed=True)
                .apply(prop, include_groups=False)
                .round(1))

# Example: interview-weighted share of adults with marriage3 == 1, per cycle.
adult_weighted_married = wt_prop_by_cycle(
    df_my_cov_aligned_short,
    col="marriage3",
    value=1,
    weight="wt_int",
    mask=adults_mask,
)
adult_weighted_married = adult_weighted_married.rename("pct_married3_wt")

print("\nAdult weighted % married/partner (marriage3==1) by cycle:\n", adult_weighted_married)



Adult weighted % married/partner (marriage3==1) by cycle:
 SDDSRVYR
1.0     59.9
2.0     62.1
3.0     61.1
4.0     63.5
5.0     63.4
6.0     63.3
7.0     61.4
8.0     62.1
9.0     63.9
10.0    62.0
12.0    60.3
66.0    61.6
Name: pct_married3_wt, dtype: float64


## save df_my_cov_aligned_short 

In [350]:
from pathlib import Path
# Output folder for the hand-off file; created (with parents) when missing.
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
OUT.mkdir(parents=True, exist_ok=True)

# Write the aligned covariate frame without its index and report the target path.
handoff = OUT.joinpath("cov_concise_99_23.parquet")
df_my_cov_aligned_short.to_parquet(path=handoff, index=False)
print("✓ Saved:", handoff)

✓ Saved: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_concise_99_23.parquet
