## set up 

In [121]:
from pathlib import Path
import pandas as pd

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT  = ROOT / "output"

MY_PATH = OUT / "demo_mt_cov_dp_sdoh.parquet"
LU_PATH = ROOT / "data/cov/nhanes_primary_anal_full_singleimputation_v2.csv"

df_my_cov_1999_2023 = pd.read_parquet(MY_PATH)
df_lu_cov_1999_2018 = pd.read_csv(LU_PATH)

print("mine:", df_my_cov_1999_2023.shape, "| lu:", df_lu_cov_1999_2018.shape)
print("Loaded:", MY_PATH)
print("Loaded:", LU_PATH)



mine: (128809, 82) | lu: (101316, 75)
Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/output/demo_mt_cov_dp_sdoh.parquet
Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/data/cov/nhanes_primary_anal_full_singleimputation_v2.csv


In [122]:
df_my_cov_1999_2023.head(10)

Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,AGE_YR,RIAGENDR,SEX,FEMALE,SMK_STATUS,...,FS_HH,FS_ADULT,FS_FINAL,HHFDSEC,ADFDSEC,FS_HH4,FS_ADULT4,FS_SOURCE_HH,FS_SOURCE_FINAL,SNAP_SOURCE
0,1,1.0,1.0,5.0,10982.898896,2.0,2,F,1,,...,,,,,,,,,,
1,2,1.0,3.0,1.0,28325.384898,77.0,1,M,0,NEVER,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
2,3,1.0,2.0,7.0,46192.256945,10.0,2,F,1,,...,,,,,,,,,,
3,4,1.0,1.0,2.0,10251.26002,1.0,1,M,0,,...,,,,,,,,,,
4,5,1.0,2.0,8.0,99445.065735,49.0,1,M,0,FORMER,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
5,6,1.0,2.0,2.0,39656.600444,19.0,2,F,1,,...,,,,,,,,,,
6,7,1.0,2.0,4.0,25525.423409,59.0,2,F,1,FORMER,...,,,,,,,,,,
7,8,1.0,1.0,6.0,31510.587866,13.0,1,M,0,,...,,,,,,,,,,
8,9,1.0,2.0,9.0,7575.870247,11.0,2,F,1,,...,,,,,,,,,,
9,10,1.0,1.0,7.0,22445.808572,43.0,1,M,0,CURRENT,...,,,,,,,,,,


#### 1) Helpers (used later)

In [211]:
# 1) HELPERS (robust + imports np)
import numpy as np
import pandas as pd

def _norm_str_col(series: pd.Series) -> pd.Series:
    """Lowercase + strip + turn 'nan' into actual NaN."""
    s = series.astype(str).str.strip().str.lower()
    return s.replace({"nan": np.nan})

def _num_summary(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Lightweight numeric summary for a set of columns."""
    rows = []
    for c in cols:
        s = pd.to_numeric(df[c], errors="coerce")
        rows.append({
            "column": c,
            "n": len(s),
            "na_rate": float(s.isna().mean()),
            "min": np.nanmin(s.values),
            "p25": np.nanpercentile(s.values, 25),
            "median": np.nanmedian(s.values),
            "p75": np.nanpercentile(s.values, 75),
            "max": np.nanmax(s.values),
            "mean": np.nanmean(s.values),
            "std": np.nanstd(s.values),
            "unique_non_na": int(s.nunique(dropna=True)),
        })
    return pd.DataFrame(rows)

def _binary_sig(series: pd.Series) -> str | None:
    """Detect common binary encodings."""
    vals = set(_norm_str_col(series).dropna().unique())
    if vals <= {"0","1"}: return "0/1"
    if vals <= {"yes","no"}: return "yes/no"
    if vals <= {"true","false"}: return "true/false"
    if vals <= {"male","female"}: return "male/female"
    return None


#### 2) Column set differences (a)

In [212]:
# 2) COLUMN SET DIFFERENCES (a)

cols_my = set(df_my_cov_1999_2023.columns)
cols_lu = set(df_lu_cov_1999_2018.columns)

audit_only_in_lu = sorted(cols_lu - cols_my)
audit_only_in_my = sorted(cols_my - cols_lu)
audit_in_both    = sorted(cols_my & cols_lu)

print(f"[Columns] only_in_lu: {len(audit_only_in_lu)} | only_in_my: {len(audit_only_in_my)} | in_both: {len(audit_in_both)}")
print("• only_in_lu (first 20):", audit_only_in_lu[:20])
print("• only_in_my (first 20):", audit_only_in_my[:20])



[Columns] only_in_lu: 69 | only_in_my: 76 | in_both: 6
• only_in_lu (first 20): ['MetS', 'MetS_bp', 'MetS_count', 'MetS_fpg', 'MetS_hdl', 'MetS_triglycerides', 'MetS_wc', 'WTINT2YR', 'WTSAF2YR', 'X', 'adiposity_pri', 'adiposity_sec', 'age', 'age_cat', 'angina', 'angina_rx', 'asthma', 'bmi', 'bp_pri', 'bp_sec']
• only_in_my (first 20): ['ADFDSEC', 'AGE_YR', 'ALCOHOL_CAT', 'BMI', 'BMI_CLAS', 'BMXHT', 'BMXWT', 'CANCER', 'CENSORED', 'CIDI_12M_MDE', 'CIDI_SCORE_RAW', 'CIGS_PER_DAY', 'DBP', 'DEP_HARMONIZED', 'DEP_IMP', 'DEP_SOURCE', 'DIABETES', 'DMDHHSIZ', 'DPQ_CAT', 'DRINKS_PER_DAY']


In [213]:
# mine
print(len(df_my_cov_1999_2023.columns), "columns")
print(df_my_cov_1999_2023.columns.tolist())

# lu
print(len(df_lu_cov_1999_2018.columns), "columns")
print(df_lu_cov_1999_2018.columns.tolist())

82 columns
['SEQN', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'AGE_YR', 'RIAGENDR', 'SEX', 'FEMALE', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER', 'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'LTPA', 'METSCORE', 'IMP', 'BMXWT', 'BMXHT', 'BMI', 'BMI_CLAS', 'DIABETES', 'HTN', 'HIGH_CHOL', 'CVD', 'CANCER', 'SBP', 'DBP', 'TCHOL', 'HDL', 'LDL', 'TG', 'DMDHHSIZ', 'ELIGSTAT', 'MORTSTAT', 'PERMTH_EXM', 'PERMTH_INT', 'UCOD_LEADING', 'IS_POST2018', 'IS_ADULT', 'MORTALITY_COVERED', 'EVENT', 'CENSORED', 'FU_YRS_EXM', 'FU_YRS_INT', 'UCOD_LABEL', 'PHQ9', 'PHQ9_GE10', 'DPQ_CAT', 'DEP_IMP', 'CIDI_SCORE_RAW', 'CIDI_12M_MDE', 'WTSCI2YR', 'DEP_HARMONIZED', 'DEP_SOURCE', 'PIR', 'PIR_CAT', 'INDFMINC', 'EDU', 'EDU_CAT', 'RACE_ETH', 'MARITAL', 'MARITAL_CAT', 'EMPLOY', 'UNEMPLOYMENT', 'HOD050', 'HOQ065', 'INS', 'SNAP', 'FSDHH', 'FS', 'FS_HH', 'FS_ADULT', 'FS_FINAL', 'HHFDSEC', 'ADFDSEC', 'FS_HH4', 'FS_ADULT4', 'FS_SOURCE_HH', 'FS_SOURCE_FINAL', 'SNAP_SOURCE']
75 columns
['X', 'SEQN', 'SDDSRVYR', 'WTIN

In [214]:
# %whos DataFrame


In [215]:
# df_my_cov_1999_2023[['MORTALITY_COVERED', 'EVENT', "UCOD_LABEL", "CANCER"]].head(20)

## Align column same as lu 

In [216]:
import re, difflib
import pandas as pd

# === Inputs (your two originals) ===
mine = df_my_cov_1999_2023.copy()
lu   = df_lu_cov_1999_2018.copy()

lu_cols = list(lu.columns)
mine_cols = list(mine.columns)

# ---------- Helpers ----------
def norm(s: str) -> str:
    # Lower, drop non-alnum (so AGE_YR -> ageyr; RACE_ETH -> raceeth)
    return re.sub(r'[^0-9a-z]+', '', s.lower()) if isinstance(s, str) else s

lu_norm_to_name = {}
for c in lu_cols:
    lu_norm_to_name.setdefault(norm(c), c)  # keep first occurrence

# Minimal synonyms (expand as needed if you see mismatches)
# YOUR column name -> Lu column name
synonyms = {
    'AGE_YR':'age',
    'SEX':'sex',
    'RACE_ETH':'re',
    'EDU':'edu',
    'PIR':'pir',
    'TCHOL':'tchol',
    'HDL':'hdl',
    'LDL':'ldl',
    'TG':'tg',
    'WC':'wc',
    'BMI':'bmi',
    'SBP':'sbp',
    'DBP':'dbp',
    'DIABETES':'diabetes',
    'CVD':'CVD',          # Lu uses uppercase "CVD" in your list
    'CANCER':'cancer',
    'DM_RX':'dm_rx',
    'CHOL_RX':'chol_rx',
    'HTN_RX':'htn_rx',
    'ANGINA_RX':'angina_rx',
    'ANGINA':'angina',
    'AGE_CAT':'age_cat',
    'PIR_CAT':'pir_cat',
    'EDU2':'edu2',
    'METS_HDL':'MetS_hdl',
    'METS_TRIGLYCERIDES':'MetS_triglycerides',
    'METS_BP':'MetS_bp',
    'METS_WC':'MetS_wc',
    'METS_FPG':'MetS_fpg',
    'METS_COUNT':'MetS_count',
    'ROSEQ':'roseQ',
    'NO_NA':'no_na',
    'LUNG_DISEASE':'lung_disease',
    'BP_PRI':'bp_pri',
    'GLUCOSE_PRI':'glucose_pri',
    'LIPID_PRI':'lipid_pri',
    'ADIPOSITY_PRI':'adiposity_pri',
    'CVD_PRI':'cvd_pri',
    'BP_SEC':'bp_sec',
    'GLUCOSE_SEC':'glucose_sec',
    'LIPID_SEC':'lipid_sec',
    'ADIPOSITY_SEC':'adiposity_sec',
    'CVD_SEC':'cvd_sec',
    # Common admin/weight vars:
    'WTMEC2YR':'WTMEC2YR',
    'SDDSRVYR':'SDDSRVYR',
    'SDMVPSU':'SDMVPSU',
    'SDMVSTRA':'SDMVSTRA',
}

# Columns we should **never** rename (IDs/keys that already match)
protect_exact = set(['SEQN','SDDSRVYR','SDMVPSU','SDMVSTRA','WTMEC2YR'])

# ---------- Build mapping (your -> lu) ----------
mapping = {}          # final mapping to apply
used_targets = set()  # to avoid collisions (two src -> one dst)

for src in mine_cols:
    if src in protect_exact or src.endswith('_lu'):
        continue

    # 1) If exact Lu name already, keep as-is
    if src in lu_cols:
        continue

    # 2) Synonym override
    if src in synonyms and synonyms[src] in lu_cols and synonyms[src] not in used_targets and synonyms[src] not in mine_cols:
        mapping[src] = synonyms[src]
        used_targets.add(synonyms[src])
        continue

    # 3) Case-insensitive exact
    ci = next((dst for dst in lu_cols if isinstance(dst, str) and dst.lower() == src.lower()), None)
    if ci and ci not in used_targets and ci not in mine_cols:
        mapping[src] = ci
        used_targets.add(ci)
        continue

    # 4) Normalized name match
    nsrc = norm(src)
    if nsrc in lu_norm_to_name:
        dst = lu_norm_to_name[nsrc]
        if dst not in used_targets and dst not in mine_cols:
            mapping[src] = dst
            used_targets.add(dst)
            continue

    # 5) Fuzzy match for stragglers (safe threshold)
    # Only attempt for alphas; ignore obviously different admin columns you don't want changed
    candidates = difflib.get_close_matches(src, lu_cols, n=1, cutoff=0.92)
    if candidates:
        dst = candidates[0]
        if dst not in used_targets and dst not in mine_cols:
            mapping[src] = dst
            used_targets.add(dst)
            continue

# ---------- Apply rename ----------
mine_renamed = mine.rename(columns=mapping).copy()

# ---------- Reorder to Lu’s order (extras at end; keep *_lu at very end) ----------
ordered = [c for c in lu_cols if c in mine_renamed.columns]
extras  = [c for c in mine_renamed.columns if c not in ordered and not c.endswith('_lu')]
audit   = [c for c in mine_renamed.columns if c.endswith('_lu')]

df_my_cov_aligned = mine_renamed[ordered + extras + audit].copy()

# ---------- Report ----------
renamed_pairs = sorted(mapping.items(), key=lambda x: x[0].lower())
missing_in_mine = [c for c in lu_cols if c not in df_my_cov_aligned.columns]

print(f"Renamed {len(renamed_pairs)} columns automatically.")
print("Examples:", renamed_pairs[:10])
print("Still missing from your data (present in Lu):", missing_in_mine)
print("Final order starts with:", df_my_cov_aligned.columns[:15].tolist())


Renamed 15 columns automatically.
Examples: [('AGE_YR', 'age'), ('BMI', 'bmi'), ('CANCER', 'cancer'), ('DBP', 'dbp'), ('DIABETES', 'diabetes'), ('EDU', 'edu'), ('HDL', 'hdl'), ('LDL', 'ldl'), ('PIR', 'pir'), ('PIR_CAT', 'pir_cat')]
Still missing from your data (present in Lu): ['X', 'WTINT2YR', 'WTSAF2YR', 'wc', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na', 'age_cat', 'edu2', 'lung_disease', 'tchol_hdl', 'angina', 'lipid_pri', 'adiposity_pri', 'bp_pri', 'glucose_pri', 'cvd_pri', 'lipid_sec', 'adiposity_sec', 'bp_sec', 'glucose_sec', 'cvd_sec', 'optimal_pri_count', 'intermediate_pri_count', 'poor_pri_count', 'optimal_sec_count', 'intermediate_sec_count', 'poor_sec_count', 'optimal_all', 'poor_all', 'optimal_all_sec', 'poor_all_sec', 'MetS_hdl', 'MetS_triglycerides', 'MetS_bp', 'MetS_wc', 'MetS_fpg', 'MetS_count', 'MetS']
Final order starts with: ['SEQN', 'SDDSRVYR'

#### Adding missing column merge from lu

In [217]:
import re, pandas as pd

# === 0) Start from the two originals ===
mine = df_my_cov_1999_2023.copy()
lu   = df_lu_cov_1999_2018.copy()

# === 1) Auto-rename YOUR columns to Lu's names (case/underscore-insensitive + synonyms) ===
def norm(s: str) -> str:
    return re.sub(r'[^0-9a-z]+', '', s.lower()) if isinstance(s, str) else s

lu_cols = list(lu.columns)
mine_cols = list(mine.columns)

# First pass: normalized name index for Lu
lu_norm_to_name = {}
for c in lu_cols:
    lu_norm_to_name.setdefault(norm(c), c)

# Synonyms (YOUR -> Lu)
synonyms = {
    'AGE_YR':'age','SEX':'sex','RACE_ETH':'re','EDU':'edu','PIR':'pir',
    'TCHOL':'tchol','HDL':'hdl','LDL':'ldl','TG':'tg',
    'WC':'wc','BMI':'bmi','SBP':'sbp','DBP':'dbp',
    'DIABETES':'diabetes','CVD':'CVD','CANCER':'cancer',
    'DM_RX':'dm_rx','CHOL_RX':'chol_rx','HTN_RX':'htn_rx','ANGINA_RX':'angina_rx','ANGINA':'angina',
    'AGE_CAT':'age_cat','PIR_CAT':'pir_cat','EDU2':'edu2',
    'METS_HDL':'MetS_hdl','METS_TRIGLYCERIDES':'MetS_triglycerides','METS_BP':'MetS_bp',
    'METS_WC':'MetS_wc','METS_FPG':'MetS_fpg','METS_COUNT':'MetS_count',
    'ROSEQ':'roseQ','NO_NA':'no_na','LUNG_DISEASE':'lung_disease',
    'BP_PRI':'bp_pri','GLUCOSE_PRI':'glucose_pri','LIPID_PRI':'lipid_pri','ADIPOSITY_PRI':'adiposity_pri',
    'CVD_PRI':'cvd_pri','BP_SEC':'bp_sec','GLUCOSE_SEC':'glucose_sec','LIPID_SEC':'lipid_sec',
    'ADIPOSITY_SEC':'adiposity_sec','CVD_SEC':'cvd_sec',
    # admin/weights that already match:
    'WTMEC2YR':'WTMEC2YR','SDDSRVYR':'SDDSRVYR','SDMVPSU':'SDMVPSU','SDMVSTRA':'SDMVSTRA'
}

protect_exact = {'SEQN','SDDSRVYR','SDMVPSU','SDMVSTRA','WTMEC2YR'}

mapping = {}
used_targets = set()
for src in mine_cols:
    if src in protect_exact or src.endswith('_lu'):
        continue
    if src in lu_cols:
        continue
    # synonyms first
    if src in synonyms and synonyms[src] in lu_cols and synonyms[src] not in used_targets and synonyms[src] not in mine.columns:
        mapping[src] = synonyms[src]; used_targets.add(synonyms[src]); continue
    # case-insensitive exact
    ci = next((dst for dst in lu_cols if isinstance(dst, str) and dst.lower()==src.lower()), None)
    if ci and ci not in used_targets and ci not in mine.columns:
        mapping[src] = ci; used_targets.add(ci); continue
    # normalized match
    nc = norm(src)
    if nc in lu_norm_to_name:
        dst = lu_norm_to_name[nc]
        if dst not in used_targets and dst not in mine.columns:
            mapping[src] = dst; used_targets.add(dst); continue

mine = mine.rename(columns=mapping)

# === 2) Identify Lu columns you still lack, and merge ONLY those in ===
missing = [c for c in lu_cols if c not in mine.columns]
# keys must exist in both:
for k in ['SEQN','SDDSRVYR']:
    if k not in mine.columns or k not in lu.columns:
        raise KeyError(f"Key {k} missing in one of the frames")

# subset Lu to keys + missing, drop dup keys, then merge
lu_sub = lu[['SEQN','SDDSRVYR'] + missing].copy()
dup_ct = lu_sub.duplicated(['SEQN','SDDSRVYR']).sum()
if dup_ct:
    print(f"[warn] Dropping {dup_ct} duplicate rows on keys in Lu subset")
    lu_sub = lu_sub.drop_duplicates(['SEQN','SDDSRVYR'], keep='first')

# Merge (no suffix needed—these cols are missing in 'mine')
aligned = mine.merge(lu_sub, on=['SEQN','SDDSRVYR'], how='left')

# === 3) Reorder to Lu order first, then any extras ===
order = [c for c in lu_cols if c in aligned.columns]
extras = [c for c in aligned.columns if c not in order]
df_my_cov_aligned = aligned[order + extras].copy()

# === 4) Quick report
print(f"Auto-renamed {len(mapping)} columns to Lu names.")
print("Filled from Lu (newly added):", missing[:20], "..." if len(missing)>20 else "")
still_missing = [c for c in lu_cols if c not in df_my_cov_aligned.columns]  # should be empty
print("Still missing Lu cols:", still_missing)
print("Final starts with:", df_my_cov_aligned.columns[:15].tolist())


Auto-renamed 15 columns to Lu names.
Filled from Lu (newly added): ['X', 'WTINT2YR', 'WTSAF2YR', 'wc', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx'] ...
Still missing Lu cols: []
Final starts with: ['X', 'SEQN', 'SDDSRVYR', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'SDMVPSU', 'SDMVSTRA', 'age', 'sex', 're', 'edu', 'pir', 'tchol', 'hdl']


#### clean and check merged file 

In [218]:
if 'X' in df_my_cov_aligned.columns:
    df_my_cov_aligned = df_my_cov_aligned.drop(columns=['X'])


In [219]:
import pandas as pd

# binary flags
bin_cols = ['dm_self','chf','chd','mi','stroke','emphysema','bronchitis','asthma',
            'copd','dm_rx','chol_rx','angina_rx','htn_rx','angina']
for c in df_my_cov_aligned.columns.intersection(bin_cols):
    df_my_cov_aligned[c] = pd.to_numeric(df_my_cov_aligned[c], errors='coerce').astype('Int8')

# labs/metrics
num_cols = ['wc','hba1c','fpg','tchol_hdl','MetS_hdl','MetS_triglycerides',
            'MetS_bp','MetS_wc','MetS_fpg','MetS_count']
for c in df_my_cov_aligned.columns.intersection(num_cols):
    df_my_cov_aligned[c] = pd.to_numeric(df_my_cov_aligned[c], errors='coerce')


In [220]:
# check 
df_my_cov_aligned.head(10)

Unnamed: 0,SEQN,SDDSRVYR,WTINT2YR,WTMEC2YR,WTSAF2YR,SDMVPSU,SDMVSTRA,age,sex,re,...,FS_HH,FS_ADULT,FS_FINAL,HHFDSEC,ADFDSEC,FS_HH4,FS_ADULT4,FS_SOURCE_HH,FS_SOURCE_FINAL,SNAP_SOURCE
0,1,1.0,9727.078709,10982.898896,75131.2,1.0,5.0,2.0,F,Other Hispanic,...,,,,,,,,,,
1,2,1.0,26678.636376,28325.384898,60586.147294,3.0,1.0,77.0,M,Mexican American,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
2,3,1.0,43621.680548,46192.256945,121969.841152,2.0,7.0,10.0,F,Mexican American,...,,,,,,,,,,
3,4,1.0,10346.119327,10251.26002,4624.687273,1.0,2.0,1.0,M,Other Hispanic,...,,,,,,,,,,
4,5,1.0,91050.84662,99445.065735,234895.20565,2.0,8.0,49.0,M,Mexican American,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
5,6,1.0,36508.250375,39656.600444,13379.8,2.0,2.0,19.0,F,NH Black,...,,,,,,,,,,
6,7,1.0,22352.08862,25525.423409,57661.621988,2.0,4.0,59.0,F,Other Hispanic,...,,,,,,,,,,
7,8,1.0,31600.089655,31510.587866,76026.438279,1.0,6.0,13.0,M,Mexican American,...,,,,,,,,,,
8,9,1.0,7529.435502,7575.870247,14694.924957,2.0,9.0,11.0,F,Other Hispanic,...,,,,,,,,,,
9,10,1.0,21071.164059,22445.808572,60202.416895,1.0,7.0,43.0,M,Other Hispanic,...,,,,,,,,,,


## Keep important column for this analysis 

In [221]:
import pandas as pd

df = df_my_cov_aligned.copy()

needed_core = [
    # final table / missing checks
    "RIDAGEYR","SEX","RACE","household_size",
    "SMK_AVG","SMK","ALCG2","met_hr",
    "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
    "probable_depression","sdoh_score","ahei_total",
]

needed_build = [
    # for sdoh_score
    "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
    # for HYPERTEN
    "BPQ020","BPQ050A","sbp","dbp",
]

needed_survey = ["sdmvpsu","sdmvstra","wt", "wt10"]  # keep wt10 if you still reference it
needed_keys   = ["SEQN","SDDSRVYR"]

needed_optional = [
    "FS","SNAP3","dm_rx","angina","lung_disease","MORTSTAT",
    "hba1c","hdl","ldl","tg"
]

NEEDED = needed_core + needed_build + needed_survey + needed_keys + needed_optional

# Only keep those present; report what's missing
present = [c for c in NEEDED if c in df.columns]
missing = [c for c in NEEDED if c not in df.columns]

print(f"Keeping {len(present)} columns; missing {len(missing)}:")
print("Missing:", missing)

df_desc = df[present].copy()

# (Optional) sanity peek
print("df_desc shape:", df_desc.shape)
print("NA rates (top 10):")
print(df_desc.isna().mean().sort_values(ascending=False).head(10).round(3))


Keeping 19 columns; missing 26:
Missing: ['RIDAGEYR', 'SEX', 'RACE', 'household_size', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr', 'bmic', 'DIABE', 'HYPERTEN', 'probable_depression', 'sdoh_score', 'ahei_total', 'unemployment2', 'EDU', 'sdoh_access', 'ins', 'marriage', 'BPQ020', 'BPQ050A', 'sdmvpsu', 'sdmvstra', 'wt', 'wt10', 'SNAP3']
df_desc shape: (128809, 19)
NA rates (top 10):
SNAP            0.587
HOQ065          0.581
MORTSTAT        0.541
FS              0.518
chol_rx         0.213
dm_rx           0.213
ldl             0.213
hdl             0.213
hba1c           0.213
lung_disease    0.213
dtype: float64


In [222]:
# 1) Plain list (sorted)
cols = (df_my_cov_aligned.columns.tolist())
print(f"{len(cols)} columns:\n", cols)


135 columns:
 ['SEQN', 'SDDSRVYR', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'SDMVPSU', 'SDMVSTRA', 'age', 'sex', 're', 'edu', 'pir', 'tchol', 'hdl', 'ldl', 'tg', 'wc', 'bmi', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'cancer', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'sbp', 'dbp', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na', 'age_cat', 'pir_cat', 'edu2', 'CVD', 'lung_disease', 'diabetes', 'tchol_hdl', 'angina', 'lipid_pri', 'adiposity_pri', 'bp_pri', 'glucose_pri', 'cvd_pri', 'lipid_sec', 'adiposity_sec', 'bp_sec', 'glucose_sec', 'cvd_sec', 'optimal_pri_count', 'intermediate_pri_count', 'poor_pri_count', 'optimal_sec_count', 'intermediate_sec_count', 'poor_sec_count', 'optimal_all', 'poor_all', 'optimal_all_sec', 'poor_all_sec', 'MetS_hdl', 'MetS_triglycerides', 'MetS_bp', 'MetS_wc', 'MetS_fpg', 'MetS_count', 'MetS', 'RIAGENDR', 'FEMALE', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER', 'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'LTPA', 'METSCORE'

#### adjust column name 

In [223]:
import pandas as pd
import numpy as np

df = df_my_cov_aligned.copy()

# ---------- helpers ----------
def ci_pick(*names):
    """case-insensitive column pick: returns first match or None"""
    lowmap = {c.lower(): c for c in df.columns}
    for n in names:
        if n in df.columns: return n
        if n.lower() in lowmap: return lowmap[n.lower()]
    return None

def pick_best(cands):
    cols = []
    for c in cands:
        col = ci_pick(c)
        if col: cols.append(col)
    if not cols: return None
    cov = {c: df[c].notna().mean() for c in cols}
    return max(cov, key=cov.get)  # highest coverage

created = {}

# ---------- 1) alias to canonical (adds BPQ+MM fallbacks, AHEI fallbacks) ----------
aliases = {
    # IDs / survey
    "RIDAGEYR": ["RIDAGEYR","AGE_YR","age"],
    "SEX":      ["SEX","sex","RIAGENDR"],
    "RACE":     ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "sdmvpsu":  ["sdmvpsu","SDMVPSU"],
    "sdmvstra": ["sdmvstra","SDMVSTRA"],
    "wt":       ["wt","WTMEC2YR"],
    "wt10":     ["wt10","WTMEC2YR"],

    # behavior
    "SMK_AVG":  ["SMK_AVG","CIGS_PER_DAY"],
    "SMK":      ["SMK","FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":    ["ALCG2","ALCOHOL_CAT"],
    "met_hr":   ["met_hr","METSCORE","LTPA"],

    # clinical
    "bmic":     ["bmic","BMI_CLAS"],
    "DIABE":    ["DIABE","DIABETES","diabetes"],
    "chol_rx":  ["chol_rx"],
    "CVD":      ["CVD"],
    "cancer":   ["cancer"],

    # outcomes / scores
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],
    "ahei_total": ["ahei_total","AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],

    # building blocks (for possible compute later)
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "pir":           ["pir"],
    "SNAP":          ["SNAP"],
    "EDU":           ["EDU","EDU_CAT","edu","edu2"],
    "sdoh_access":   ["sdoh_access","HUQ_ACCESS","huq_access"],
    "ins":           ["ins","INS"],
    "HOQ065":        ["HOQ065"],
    "marriage":      ["marriage","MARITAL","MARITAL_CAT"],

    # HTN components (case-insensitive)
    "BPQ020":        ["BPQ020","bpq020"],
    "BPQ050A":       ["BPQ050A","bpq050a"],
    "sbp":           ["sbp","SBP"],
    "dbp":           ["dbp","DBP"],

    # SNAP3 label variant
    "SNAP3":         ["SNAP3","SNAP"],
}

for target, cands in aliases.items():
    src = pick_best(cands)
    if src:
        if target != src:
            df[target] = df[src]
            created[target] = src

# Normalize SEX if it came from RIAGENDR (1/2)
if "SEX" in df.columns:
    if pd.api.types.is_numeric_dtype(df["SEX"]):
        df["SEX"] = df["SEX"].map({1: "Male", 2: "Female"}).fillna(df["SEX"])
    else:
        df["SEX"] = df["SEX"].astype(str).str.strip().str.capitalize()

# HYPERTEN compute if missing (your R rule)
if "HYPERTEN" not in df.columns:
    bpq020 = df[ci_pick("BPQ020","bpq020")] if ci_pick("BPQ020","bpq020") else pd.Series(np.nan, index=df.index)
    bpq050a= df[ci_pick("BPQ050A","bpq050a")] if ci_pick("BPQ050A","bpq050a") else pd.Series(np.nan, index=df.index)
    sbp = df[ci_pick("sbp","SBP")] if ci_pick("sbp","SBP") else pd.Series(np.nan, index=df.index)
    dbp = df[ci_pick("dbp","DBP")] if ci_pick("dbp","DBP") else pd.Series(np.nan, index=df.index)
    df["HYPERTEN"] = np.where(((bpq020==1) | (bpq050a==1) | (sbp>=130) | (dbp>=85)), 1,
                              np.where(bpq020.notna() | bpq050a.notna() | sbp.notna() | dbp.notna(), 0, np.nan))

# ---------- 2) keep BOTH canonical + source columns ----------
# Define source groups to retain alongside canonical
source_groups = {
    "SMK_AVG": ["CIGS_PER_DAY"],
    "SMK":     ["FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":   ["ALCOHOL_CAT"],
    "met_hr":  ["METSCORE","LTPA"],
    "bmic":    ["BMI_CLAS","BMI"],        # keep BMI if you like for context
    "DIABE":   ["DIABETES","diabetes"],
    "sbp":     ["SBP"],
    "dbp":     ["DBP"],
    "ahei_total": ["AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],
}

# Core variables your R script needs
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "BPQ020","BPQ050A","sdmvpsu","sdmvstra","wt","wt10","SNAP3",
  "SEQN","SDDSRVYR"
]

retain = set()
# always keep canonicals that exist
retain.update([c for c in needed_core if c in df.columns])
# also keep sources if present
for canon, sources in source_groups.items():
    if canon in df.columns:
        for s in sources:
            s_real = ci_pick(s)
            if s_real: retain.add(s_real)

df_desc = df[list(retain)].copy()

# ---------- 3) report ----------
still_missing = [c for c in needed_core if c not in df_desc.columns]
print("Aliases/derivations created:", created)
print(f"df_desc columns kept: {len(df_desc.columns)}")
print("Still missing (not found in df):", still_missing)
print("Smoking-related kept:",
      [c for c in df_desc.columns if c.upper() in {"SMK_AVG","CIGS_PER_DAY","SMK","FORMER_SMOKER","SMK_STATUS"}])


Aliases/derivations created: {'RIDAGEYR': 'age', 'SEX': 'sex', 'RACE': 're', 'household_size': 'DMDHHSIZ', 'sdmvpsu': 'SDMVPSU', 'sdmvstra': 'SDMVSTRA', 'wt': 'WTMEC2YR', 'wt10': 'WTMEC2YR', 'SMK_AVG': 'CIGS_PER_DAY', 'SMK': 'FORMER_SMOKER', 'ALCG2': 'ALCOHOL_CAT', 'met_hr': 'METSCORE', 'bmic': 'BMI_CLAS', 'DIABE': 'diabetes', 'probable_depression': 'DEP_HARMONIZED', 'unemployment2': 'UNEMPLOYMENT', 'EDU': 'edu', 'ins': 'INS', 'marriage': 'MARITAL', 'SNAP3': 'SNAP'}
df_desc columns kept: 40
Still missing (not found in df): ['sdoh_score', 'ahei_total', 'sdoh_access', 'BPQ020', 'BPQ050A']
Smoking-related kept: ['SMK_STATUS', 'SMK', 'SMK_AVG', 'FORMER_SMOKER', 'CIGS_PER_DAY']


In [224]:
import pandas as pd
import numpy as np

df = df_my_cov_aligned.copy()

# ---------- helpers ----------
def ci_pick(*names):
    lowmap = {c.lower(): c for c in df.columns}
    for n in names:
        if n in df.columns: return n
        if n.lower() in lowmap: return lowmap[n.lower()]
    return None

def pick_best(cands):
    cols = []
    for c in cands:
        col = ci_pick(c)
        if col: cols.append(col)
    if not cols: return None
    cov = {c: df[c].notna().mean() for c in cols}
    return max(cov, key=cov.get)

created = {}

# ---------- 1) alias to canonical (NO coalescing of weights) ----------
aliases = {
    # IDs / survey
    "RIDAGEYR": ["RIDAGEYR","AGE_YR","age"],
    "SEX":      ["SEX","sex","RIAGENDR"],
    "RACE":     ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "sdmvpsu":  ["sdmvpsu","SDMVPSU"],
    "sdmvstra": ["sdmvstra","SDMVSTRA"],
    # keep user's wt/wt10 AS-IS if present
    "wt":       ["wt"],
    "wt10":     ["wt10"],
    # keep ALL NHANES weights separately (standardize casing only)
    "WTINT2YR": ["WTINT2YR","wtint2yr"],
    "WTMEC2YR": ["WTMEC2YR","wtmec2yr"],
    "WTSAF2YR": ["WTSAF2YR","wtsaf2yr"],
    "WTDRD1":   ["WTDRD1","wtdrd1","wtdrd1d"],
    "WTDR2D":   ["WTDR2D","wtdr2d"],

    # behavior
    "SMK_AVG":  ["SMK_AVG","CIGS_PER_DAY"],
    "SMK":      ["SMK","FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":    ["ALCG2","ALCOHOL_CAT"],
    "met_hr":   ["met_hr","METSCORE","LTPA"],

    # clinical
    "bmic":     ["bmic","BMI_CLAS"],
    "DIABE":    ["DIABE","DIABETES","diabetes"],
    "chol_rx":  ["chol_rx"],
    "CVD":      ["CVD"],
    "cancer":   ["cancer"],

    # outcomes / scores
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],
    "ahei_total": ["ahei_total","AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],

    # building blocks
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "pir":           ["pir"],
    "SNAP":          ["SNAP"],
    "EDU":           ["EDU","EDU_CAT","edu","edu2"],
    "sdoh_access":   ["sdoh_access","HUQ_ACCESS","huq_access"],
    "ins":           ["ins","INS"],
    "HOQ065":        ["HOQ065"],
    "marriage":      ["marriage","MARITAL","MARITAL_CAT"],

    # HTN components (case-insensitive)
    "BPQ020":        ["BPQ020","bpq020"],
    "BPQ050A":       ["BPQ050A","bpq050a"],
    "sbp":           ["sbp","SBP"],
    "dbp":           ["dbp","DBP"],

    # SNAP3 alias
    "SNAP3":         ["SNAP3","SNAP"],
}

for target, cands in aliases.items():
    src = pick_best(cands)
    if src:
        if target != src:
            df[target] = df[src]
            created[target] = src

# Normalize SEX if it came from RIAGENDR (1/2)
if "SEX" in df.columns:
    if pd.api.types.is_numeric_dtype(df["SEX"]):
        df["SEX"] = df["SEX"].map({1: "Male", 2: "Female"}).fillna(df["SEX"])
    else:
        df["SEX"] = df["SEX"].astype(str).str.strip().str.capitalize()

# HYPERTEN compute if missing (same rule)
if "HYPERTEN" not in df.columns:
    bpq020 = df[ci_pick("BPQ020","bpq020")] if ci_pick("BPQ020","bpq020") else pd.Series(np.nan, index=df.index)
    bpq050a= df[ci_pick("BPQ050A","bpq050a")] if ci_pick("BPQ050A","bpq050a") else pd.Series(np.nan, index=df.index)
    sbp = df[ci_pick("sbp","SBP")] if ci_pick("sbp","SBP") else pd.Series(np.nan, index=df.index)
    dbp = df[ci_pick("dbp","DBP")] if ci_pick("dbp","DBP") else pd.Series(np.nan, index=df.index)
    df["HYPERTEN"] = np.where(((bpq020==1) | (bpq050a==1) | (sbp>=130) | (dbp>=85)), 1,
                              np.where(bpq020.notna() | bpq050a.notna() | sbp.notna() | dbp.notna(), 0, np.nan))

# ---------- 2) keep BOTH canonical + source columns ----------
source_groups = {
    "SMK_AVG":   ["CIGS_PER_DAY"],
    "SMK":       ["FORMER_SMOKER","SMK_STATUS"],
    "ALCG2":     ["ALCOHOL_CAT"],
    "met_hr":    ["METSCORE","LTPA"],
    "bmic":      ["BMI_CLAS","BMI"],
    "DIABE":     ["DIABETES","diabetes"],
    "sbp":       ["SBP"],
    "dbp":       ["DBP"],
    "ahei_total":["AHEI","ahei","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],
    # weights: keep raw variants too if present
    "WTINT2YR":  ["wtint2yr"],
    "WTMEC2YR":  ["wtmec2yr","wt","wt10"],  # keep user's wt/wt10 alongside
    "WTSAF2YR":  ["wtsaf2yr"],
    "WTDRD1":    ["wtdrd1","wtdrd1d"],
    "WTDR2D":    ["wtdr2d"],
}

# Core variables your R script needs + all distinct weights
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "BPQ020","BPQ050A","sdmvpsu","sdmvstra","SEQN","SDDSRVYR",
  # weights (keep all if present)
  "WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","wt","wt10","WTINT4YR","WTMEC4YR"
]

retain = set()
retain.update([c for c in needed_core if c in df.columns])
for canon, sources in source_groups.items():
    if canon in df.columns:
        for s in sources:
            s_real = ci_pick(s)
            if s_real: retain.add(s_real)

df_desc = df[list(retain)].copy()

# ---------- 3) report ----------
still_missing = [c for c in needed_core if c not in df_desc.columns]
print("Aliases/derivations created:", created)
print(f"df_desc columns kept: {len(df_desc.columns)}")
print("Still missing (not found in df):", still_missing)

# quick peek at which weights you have
weight_cols = [c for c in ["WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","wt","wt10","WTINT4YR","WTMEC4YR"] if c in df_desc.columns]
print("Weights present in df_desc:", weight_cols)


Aliases/derivations created: {'RIDAGEYR': 'age', 'SEX': 'sex', 'RACE': 're', 'household_size': 'DMDHHSIZ', 'sdmvpsu': 'SDMVPSU', 'sdmvstra': 'SDMVSTRA', 'SMK_AVG': 'CIGS_PER_DAY', 'SMK': 'FORMER_SMOKER', 'ALCG2': 'ALCOHOL_CAT', 'met_hr': 'METSCORE', 'bmic': 'BMI_CLAS', 'DIABE': 'diabetes', 'probable_depression': 'DEP_HARMONIZED', 'unemployment2': 'UNEMPLOYMENT', 'EDU': 'edu', 'ins': 'INS', 'marriage': 'MARITAL', 'SNAP3': 'SNAP'}
df_desc columns kept: 40
Still missing (not found in df): ['sdoh_score', 'ahei_total', 'sdoh_access', 'BPQ020', 'BPQ050A', 'WTDRD1', 'WTDR2D', 'wt', 'wt10', 'WTINT4YR', 'WTMEC4YR']
Weights present in df_desc: ['WTINT2YR', 'WTMEC2YR', 'WTSAF2YR']


#### add 2017-2020 special weight

In [225]:
import pandas as pd
import numpy as np

# Base DF
df = df_my_cov_aligned.copy()

# NHANES P_DEMO (2017–Mar 2020 pre-pandemic)
urls = [
    "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DEMO.xpt",
    "https://wwwn.cdc.gov/Nchs/Nhanes/2017-2020/P_DEMO.XPT",
]

demo = None
for u in urls:
    try:
        demo = pd.read_sas(u, format="xport", encoding="utf-8")
        break
    except Exception:
        pass
if demo is None:
    raise RuntimeError("Failed to download P_DEMO.xpt from both URLs.")

# Keep + standardize
keep = ["SEQN", "WTINTPRP", "WTMECPRP", "SDMVPSU", "SDMVSTRA", "SDDSRVYR"]
demo = demo[keep].copy()

# Optional sanity check: SDDSRVYR==66 for this file
# assert demo["SDDSRVYR"].dropna().eq(66).all()

# Map to your 4-year convention
demo = demo.rename(columns={
    "WTINTPRP": "WTINT4YR",
    "WTMECPRP": "WTMEC4YR",
})

# Ensure target cols exist pre-merge (so we can suffix the right-hand side)
for col in ["WTINT4YR", "WTMEC4YR", "SDMVPSU", "SDMVSTRA"]:
    if col not in df.columns:
        df[col] = pd.NA

# Merge (right-hand columns get *_src)
df = df.merge(demo, on=["SEQN"], how="left", suffixes=("", "_src"))

# Fill only missing (mask assignment avoids FutureWarning), then drop *_src
for col in ["WTINT4YR", "WTMEC4YR", "SDMVPSU", "SDMVSTRA"]:
    src = f"{col}_src"
    if src in df.columns:
        mask = df[col].isna() & df[src].notna()
        if mask.any():
            df.loc[mask, col] = df.loc[mask, src]
        df.drop(columns=[src], inplace=True)

# Dtypes: weights as float; PSU/STRATA as nullable int
for col in ["WTINT4YR", "WTMEC4YR", "WTINT2YR", "WTMEC2YR", "WTSAF2YR", "WTDRD1", "WTDR2D"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")  # float

for col in ["SDMVPSU", "SDMVSTRA"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")

# Quick audit
have = [c for c in ["WTINT4YR","WTMEC4YR","SDMVPSU","SDMVSTRA"] if c in df.columns]
print("Added/updated:", have)
print("Non-missing rates:", {c: float(df[c].notna().mean()) for c in have})

df_my_cov_aligned = df



Added/updated: ['WTINT4YR', 'WTMEC4YR', 'SDMVPSU', 'SDMVSTRA']
Non-missing rates: {'WTINT4YR': 0.12079901249136318, 'WTMEC4YR': 0.12079901249136318, 'SDMVPSU': 1.0, 'SDMVSTRA': 1.0}


#### Add ahei score build in R

In [226]:
import pandas as pd
from pathlib import Path

DATA = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
cand = [DATA/"ahei_1999_2018_combined.csv", DATA/"ahei_9904_wjfrt_ssbfix.csv"]

ahei_path = next((p for p in cand if p.exists()), None)
if ahei_path is None:
    raise FileNotFoundError("No AHEI combined file found. Please write ahei_1999_2018_combined.csv in R.")

# --- read header only to discover exact column names (case-insensitive) ---
hdr = pd.read_csv(ahei_path, nrows=0, low_memory=False)
lower_map = {c.lower(): c for c in hdr.columns}

seqn_col = lower_map.get("seqn")
tot_col = next((lower_map[k] for k in ["ahei_all","ahei_all_recomp"] if k in lower_map), None)
if not seqn_col or not tot_col:
    raise ValueError(f"Couldn’t find SEQN or AHEI total in {ahei_path.name}. "
                     f"Columns present include: {list(hdr.columns)[:12]} ...")

# --- now read ONLY what we need, with dtypes set → no DtypeWarning ---
try:
    # use pyarrow if available (fast, no type guessing); falls back if not installed
    ahei = pd.read_csv(ahei_path, usecols=[seqn_col, tot_col],
                       dtype={seqn_col: "Int64"}, engine="pyarrow")
except Exception:
    ahei = pd.read_csv(ahei_path, usecols=[seqn_col, tot_col],
                       dtype={seqn_col: "Int64"}, low_memory=False)

ahei = ahei.rename(columns={seqn_col: "SEQN", tot_col: "ahei_total"})
ahei["ahei_total"] = pd.to_numeric(ahei["ahei_total"], errors="coerce")
ahei = ahei.dropna(subset=["SEQN"]).drop_duplicates("SEQN", keep="last")

# --- merge into your frame ---
df = df_my_cov_aligned.copy()
df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce")
pre = df["ahei_total"].notna().sum() if "ahei_total" in df.columns else 0

df = df.merge(ahei, on="SEQN", how="left", suffixes=("", "_ahei"))
if "ahei_total_ahei" in df.columns:
    df["ahei_total"] = df["ahei_total"].where(df["ahei_total"].notna(), df["ahei_total_ahei"])
    df.drop(columns=["ahei_total_ahei"], inplace=True)

post = df["ahei_total"].notna().sum()
print(f"✓ ahei_total merged from {ahei_path.name}. Non-missing: {post} (added {post-pre}).")

df_my_cov_aligned = df


✓ ahei_total merged from ahei_1999_2018_combined.csv. Non-missing: 46169 (added 46169).


#### check again what is missing 

In [237]:
# Start from your earlier needed_core and trim
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "sdmvpsu","sdmvstra","SEQN","SDDSRVYR",
  # weights (NHANES only)
  "WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","WTINT4YR","WTMEC4YR"
]

# Check what's still missing
present = [c for c in needed_core if c in df_my_cov_aligned.columns]
missing = [c for c in needed_core if c not in df_my_cov_aligned.columns]
weights_present = [c for c in ["WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","WTINT4YR","WTMEC4YR"]
                   if c in df_my_cov_aligned.columns]

print("Present:", len(present), "Missing:", len(missing))
print("Still missing:", missing)
print("NHANES weights present:", weights_present)


Present: 14 Missing: 22
Still missing: ['RIDAGEYR', 'SEX', 'RACE', 'household_size', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr', 'bmic', 'DIABE', 'HYPERTEN', 'probable_depression', 'sdoh_score', 'unemployment2', 'EDU', 'sdoh_access', 'ins', 'marriage', 'sdmvpsu', 'sdmvstra', 'WTDRD1', 'WTDR2D']
NHANES weights present: ['WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR']


In [241]:
import textwrap
print(textwrap.fill(", ".join(map(str, df_my_cov_aligned.columns)), width=100))

SEQN, SDDSRVYR, WTINT2YR, WTMEC2YR, WTSAF2YR, SDMVPSU, SDMVSTRA, age, sex, re, edu, pir, tchol, hdl,
ldl, tg, wc, bmi, dm_self, hba1c, fpg, chf, chd, mi, stroke, cancer, emphysema, bronchitis, asthma,
re2, copd, sbp, dbp, dm_rx, chol_rx, angina_rx, htn_rx, roseQ, no_na, age_cat, pir_cat, edu2, CVD,
lung_disease, diabetes, tchol_hdl, angina, lipid_pri, adiposity_pri, bp_pri, glucose_pri, cvd_pri,
lipid_sec, adiposity_sec, bp_sec, glucose_sec, cvd_sec, optimal_pri_count, intermediate_pri_count,
poor_pri_count, optimal_sec_count, intermediate_sec_count, poor_sec_count, optimal_all, poor_all,
optimal_all_sec, poor_all_sec, MetS_hdl, MetS_triglycerides, MetS_bp, MetS_wc, MetS_fpg, MetS_count,
MetS, RIAGENDR, FEMALE, SMK_STATUS, CIGS_PER_DAY, PACK_YEARS, FORMER_SMOKER, DRINKS_PER_DAY,
ALCOHOL_CAT, LTPA, METSCORE, IMP, BMXWT, BMXHT, BMI_CLAS, HTN, HIGH_CHOL, DMDHHSIZ, ELIGSTAT,
MORTSTAT, PERMTH_EXM, PERMTH_INT, UCOD_LEADING, IS_POST2018, IS_ADULT, MORTALITY_COVERED, EVENT,
CENSORED, FU_YRS_EX

#### adjust column name and check missingness 

In [242]:
import pandas as pd

# Canonical set (HTN stays; BPQ020/050A removed per your earlier choice)
needed_core = [
  "RIDAGEYR","SEX","RACE","household_size",
  "SMK_AVG","SMK","ALCG2","met_hr",
  "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
  "probable_depression","sdoh_score","ahei_total",
  "unemployment2","pir","SNAP","EDU","sdoh_access","ins","HOQ065","marriage",
  "sdmvpsu","sdmvstra","SEQN","SDDSRVYR",
  "WTINT2YR","WTMEC2YR","WTSAF2YR","WTDRD1","WTDR2D","WTINT4YR","WTMEC4YR"
]

# Map your existing columns → canonical (no coalescing; just alias-by-best)
aliases = {
    "RIDAGEYR": ["RIDAGEYR","age"],
    "SEX":      ["SEX","RIAGENDR","sex"],
    "RACE":     ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "SMK_AVG":  ["SMK_AVG","CIGS_PER_DAY"],
    "SMK":      ["SMK","SMK_STATUS","FORMER_SMOKER"],
    "ALCG2":    ["ALCG2","ALCOHOL_CAT"],
    "met_hr":   ["met_hr","METSCORE","LTPA"],
    "bmic":     ["bmic","BMI_CLAS","bmi"],
    "DIABE":    ["DIABE","diabetes","DIABETES","dm_self"],
    "HYPERTEN": ["HYPERTEN","HTN"],
    "chol_rx":  ["chol_rx"],
    "CVD":      ["CVD"],
    "cancer":   ["cancer"],
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],  # likely missing
    "ahei_total": ["ahei_total","AHEI","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "pir":      ["pir"],
    "SNAP":     ["SNAP"],
    "EDU":      ["EDU","edu","edu2","EDU_CAT"],
    "sdoh_access": ["sdoh_access","HOD050","HOQ065"],  # if you treat these as access proxies
    "ins":      ["ins","INS"],
    "HOQ065":   ["HOQ065"],
    "marriage": ["marriage","MARITAL","MARITAL_CAT"],
    "sdmvpsu":  ["sdmvpsu","SDMVPSU"],
    "sdmvstra": ["sdmvstra","SDMVSTRA"],
    "SEQN":     ["SEQN"],
    "SDDSRVYR": ["SDDSRVYR"],
    # weights
    "WTINT2YR": ["WTINT2YR"],
    "WTMEC2YR": ["WTMEC2YR"],
    "WTSAF2YR": ["WTSAF2YR","WTSCI2YR"],  # your frame has WTSCI2YR (safety)
    "WTDRD1":   ["WTDRD1"],               # likely missing
    "WTDR2D":   ["WTDR2D"],               # likely missing
    "WTINT4YR": ["WTINT4YR","WTINTPRP"],
    "WTMEC4YR": ["WTMEC4YR","WTMECPRP"],
}

def ci_pick(df, names):
    low = {c.lower(): c for c in df.columns}
    for n in names:
        if n in df.columns: return n
        if n.lower() in low: return low[n.lower()]
    return None

created = {}
for target, cands in aliases.items():
    src = ci_pick(df_my_cov_aligned, cands)
    if src and target not in df_my_cov_aligned.columns:
        df_my_cov_aligned[target] = df_my_cov_aligned[src]
        created[target] = src

present = [c for c in needed_core if c in df_my_cov_aligned.columns]
missing = [c for c in needed_core if c not in df_my_cov_aligned.columns]

print(f"Present: {len(present)}  Missing: {len(missing)}")
print("Aliased (canonical ← source):", {k:v for k,v in created.items()})
print("Still missing:", missing)


Present: 33  Missing: 3
Aliased (canonical ← source): {'RIDAGEYR': 'age', 'SEX': 'sex', 'RACE': 're2', 'household_size': 'DMDHHSIZ', 'SMK_AVG': 'CIGS_PER_DAY', 'SMK': 'SMK_STATUS', 'ALCG2': 'ALCOHOL_CAT', 'met_hr': 'METSCORE', 'bmic': 'BMI_CLAS', 'DIABE': 'diabetes', 'HYPERTEN': 'HTN', 'probable_depression': 'DEP_HARMONIZED', 'unemployment2': 'UNEMPLOYMENT', 'EDU': 'edu', 'sdoh_access': 'HOD050', 'ins': 'INS', 'marriage': 'MARITAL', 'sdmvpsu': 'SDMVPSU', 'sdmvstra': 'SDMVSTRA'}
Still missing: ['sdoh_score', 'WTDRD1', 'WTDR2D']


In [244]:
blocks = {
    "OCQ": ["EMPLOY","UNEMPLOYMENT"],
    "HOQ": ["HOD050","HOQ065"],
    "HIQ": ["INS"],
    "FSQ": ["FSDHH","HHFDSEC","ADFDSEC","FS_HH4","FS_ADULT4","FS_HH","FS_ADULT","FS_FINAL","FS_SOURCE_HH","FS_SOURCE_FINAL","SNAP"],
}

for name, cols in blocks.items():
    present = [c for c in cols if c in df_my_cov_aligned.columns]
    missing = [c for c in cols if c not in df_my_cov_aligned.columns]
    print(f"{name} → present: {present} | missing: {missing}")


OCQ → present: ['EMPLOY', 'UNEMPLOYMENT'] | missing: []
HOQ → present: ['HOD050', 'HOQ065'] | missing: []
HIQ → present: ['INS'] | missing: []
FSQ → present: ['FSDHH', 'HHFDSEC', 'ADFDSEC', 'FS_HH4', 'FS_ADULT4', 'FS_HH', 'FS_ADULT', 'FS_FINAL', 'FS_SOURCE_HH', 'FS_SOURCE_FINAL', 'SNAP'] | missing: []


In [243]:
df_my_cov_aligned.head(10)

Unnamed: 0,SEQN,SDDSRVYR,WTINT2YR,WTMEC2YR,WTSAF2YR,SDMVPSU,SDMVSTRA,age,sex,re,...,DIABE,HYPERTEN,probable_depression,unemployment2,EDU,sdoh_access,ins,marriage,sdmvpsu,sdmvstra
0,1,1.0,9727.078709,10982.898896,75131.2,1,5,2.0,F,Other Hispanic,...,0,0,,,,,,,1,5
1,2,1.0,26678.636376,28325.384898,60586.147294,3,1,77.0,M,Mexican American,...,0,0,,0.0,5.0,2.0,1.0,,3,1
2,3,1.0,43621.680548,46192.256945,121969.841152,2,7,10.0,F,Mexican American,...,0,0,,,3.0,,,,2,7
3,4,1.0,10346.119327,10251.26002,4624.687273,1,2,1.0,M,Other Hispanic,...,0,0,,,,,,,1,2
4,5,1.0,91050.84662,99445.065735,234895.20565,2,8,49.0,M,Mexican American,...,0,1,,0.0,5.0,7.0,1.0,1.0,2,8
5,6,1.0,36508.250375,39656.600444,13379.8,2,2,19.0,F,NH Black,...,0,0,,,15.0,,,5.0,2,2
6,7,1.0,22352.08862,25525.423409,57661.621988,2,4,59.0,F,Other Hispanic,...,0,0,,0.0,2.0,,,1.0,2,4
7,8,1.0,31600.089655,31510.587866,76026.438279,1,6,13.0,M,Mexican American,...,0,0,,,5.0,,,,1,6
8,9,1.0,7529.435502,7575.870247,14694.924957,2,9,11.0,F,Other Hispanic,...,0,0,,,5.0,,,,2,9
9,10,1.0,21071.164059,22445.808572,60202.416895,1,7,43.0,M,Other Hispanic,...,0,1,,0.0,3.0,4.0,2.0,4.0,1,7


#### Keep important column and name it df_my_cov_aligned_short

In [247]:
import pandas as pd
import numpy as np

src = df_my_cov_aligned.copy()

# ---- 1) Canonical columns to keep (core + key derived + requested raw fields) ----
canonical_order = [
    # IDs / survey design
    "SEQN","SDDSRVYR","sdmvpsu","sdmvstra",
    # demographics
    "RIDAGEYR","SEX","RACE","household_size","EDU","pir",
    # behavior (canonical)
    "SMK_AVG","SMK","ALCG2","met_hr",
    # behavior (raw fields to keep)
    "SMK_STATUS","CIGS_PER_DAY","PACK_YEARS","FORMER_SMOKER","DRINKS_PER_DAY","ALCOHOL_CAT",
    # clinical
    "bmic","DIABE","HYPERTEN","chol_rx","CVD","cancer",
    # scores/outcomes
    "probable_depression","sdoh_score","ahei_total",
    # SDOH / access / insurance / marital / SNAP / FS
    "unemployment2","sdoh_access","ins","HOQ065","marriage","SNAP","FS",
    # weights (interview/exam + pre-pandemic 4-yr + safety)
    "WTINT2YR","WTMEC2YR","WTSAF2YR","WTINT4YR","WTMEC4YR",
]

# ---- 2) Alias map: canonical -> possible sources in your frame ----
aliases = {
    "SEQN": ["SEQN"],
    "SDDSRVYR": ["SDDSRVYR","SDDSRVYR_src"],

    "sdmvpsu": ["sdmvpsu","SDMVPSU"],
    "sdmvstra":["sdmvstra","SDMVSTRA"],

    "RIDAGEYR": ["RIDAGEYR","age"],
    "SEX": ["SEX","RIAGENDR","sex"],
    "RACE": ["RACE","re2","re"],
    "household_size": ["household_size","DMDHHSIZ"],
    "EDU": ["EDU","edu","edu2","EDU_CAT"],
    "pir": ["pir"],

    # canonical behavior
    "SMK_AVG": ["SMK_AVG","CIGS_PER_DAY"],
    "SMK": ["SMK","SMK_STATUS","FORMER_SMOKER"],
    "ALCG2": ["ALCG2","ALCOHOL_CAT"],
    "met_hr": ["met_hr","METSCORE","LTPA"],

    # raw behavior fields (keep as-is)
    "SMK_STATUS": ["SMK_STATUS"],
    "CIGS_PER_DAY": ["CIGS_PER_DAY"],
    "PACK_YEARS": ["PACK_YEARS"],
    "FORMER_SMOKER": ["FORMER_SMOKER"],
    "DRINKS_PER_DAY": ["DRINKS_PER_DAY"],
    "ALCOHOL_CAT": ["ALCOHOL_CAT"],

    # clinical
    "bmic": ["bmic","BMI_CLAS","bmi"],
    "DIABE": ["DIABE","diabetes","DIABETES","dm_self"],
    "HYPERTEN": ["HYPERTEN","HTN"],
    "chol_rx": ["chol_rx"],
    "CVD": ["CVD"],
    "cancer": ["cancer"],

    # outcomes/scores
    "probable_depression": ["probable_depression","DEP_HARMONIZED","PHQ9_GE10","DEP_IMP"],
    "sdoh_score": ["sdoh_score"],
    "ahei_total": ["ahei_total","AHEI","HEI2015_TOTAL_SCORE","HEI2015_TOTAL"],

    # SDOH etc.
    "unemployment2": ["unemployment2","UNEMPLOYMENT","EMPLOY"],
    "sdoh_access": ["sdoh_access"],
    "ins": ["ins","INS"],
    "HOQ065": ["HOQ065"],
    "marriage": ["marriage","MARITAL","MARITAL_CAT"],

    "SNAP": ["SNAP"],
    "FS": ["FS_FINAL","FS"],  # prefer your final FS binary

    # weights
    "WTINT2YR": ["WTINT2YR"],
    "WTMEC2YR": ["WTMEC2YR"],
    "WTSAF2YR": ["WTSAF2YR","WTSCI2YR"],
    "WTINT4YR": ["WTINT4YR","WTINTPRP"],
    "WTMEC4YR": ["WTMEC4YR","WTMECPRP"],
}

def ci_pick(df, names):
    low = {c.lower(): c for c in df.columns}
    for n in names:
        if n in df.columns: return n
        if n.lower() in low: return low[n.lower()]
    return None

# ---- 3) Build df_short with canonical names, copying from the best source ----
short_cols = {}
created_from = {}
for canon, cands in aliases.items():
    src_col = ci_pick(src, cands)
    if src_col is not None:
        short_cols[canon] = src[src_col]
        created_from[canon] = src_col

df_my_cov_aligned_short = pd.DataFrame(short_cols)

# ---- 4) Light harmonization / typing ----
# SEX
if "SEX" in df_my_cov_aligned_short.columns:
    s = df_my_cov_aligned_short["SEX"]
    if pd.api.types.is_numeric_dtype(s):
        df_my_cov_aligned_short["SEX"] = s.map({1:"Male", 2:"Female"}).astype("string")
    else:
        df_my_cov_aligned_short["SEX"] = s.astype(str).str.strip().str.capitalize()

# Binary-ish ints
for col_bin in ["DIABE","HYPERTEN","CVD","cancer","SNAP","FS","unemployment2","FORMER_SMOKER"]:
    if col_bin in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col_bin] = pd.to_numeric(df_my_cov_aligned_short[col_bin], errors="coerce").astype("Int64")

# Continuous behavior
for col_num in ["CIGS_PER_DAY","PACK_YEARS","DRINKS_PER_DAY","SMK_AVG","met_hr","pir"]:
    if col_num in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col_num] = pd.to_numeric(df_my_cov_aligned_short[col_num], errors="coerce")

# PSU/STRATA → nullable ints; weights → float
for col in ["sdmvpsu","sdmvstra"]:
    if col in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col] = pd.to_numeric(df_my_cov_aligned_short[col], errors="coerce").astype("Int64")

for col in ["WTINT2YR","WTMEC2YR","WTSAF2YR","WTINT4YR","WTMEC4YR"]:
    if col in df_my_cov_aligned_short.columns:
        df_my_cov_aligned_short[col] = pd.to_numeric(df_my_cov_aligned_short[col], errors="coerce")

# Keep final column order (only those that exist)
cols_final = [c for c in canonical_order if c in df_my_cov_aligned_short.columns]
df_my_cov_aligned_short = df_my_cov_aligned_short[cols_final].copy()

# ---- 5) Report coverage ----
missing_after = [c for c in canonical_order if c not in df_my_cov_aligned_short.columns]
print("df_my_cov_aligned_short shape:", df_my_cov_aligned_short.shape)
print("Created (canonical ← source):", created_from)
print("Still missing after aliasing:", missing_after)


df_my_cov_aligned_short shape: (128809, 40)
Created (canonical ← source): {'SEQN': 'SEQN', 'SDDSRVYR': 'SDDSRVYR', 'sdmvpsu': 'sdmvpsu', 'sdmvstra': 'sdmvstra', 'RIDAGEYR': 'RIDAGEYR', 'SEX': 'SEX', 'RACE': 'RACE', 'household_size': 'household_size', 'EDU': 'EDU', 'pir': 'pir', 'SMK_AVG': 'SMK_AVG', 'SMK': 'SMK', 'ALCG2': 'ALCG2', 'met_hr': 'met_hr', 'SMK_STATUS': 'SMK_STATUS', 'CIGS_PER_DAY': 'CIGS_PER_DAY', 'PACK_YEARS': 'PACK_YEARS', 'FORMER_SMOKER': 'FORMER_SMOKER', 'DRINKS_PER_DAY': 'DRINKS_PER_DAY', 'ALCOHOL_CAT': 'ALCOHOL_CAT', 'bmic': 'bmic', 'DIABE': 'DIABE', 'HYPERTEN': 'HYPERTEN', 'chol_rx': 'chol_rx', 'CVD': 'CVD', 'cancer': 'cancer', 'probable_depression': 'probable_depression', 'ahei_total': 'ahei_total', 'unemployment2': 'unemployment2', 'sdoh_access': 'sdoh_access', 'ins': 'ins', 'HOQ065': 'HOQ065', 'marriage': 'marriage', 'SNAP': 'SNAP', 'FS': 'FS_FINAL', 'WTINT2YR': 'WTINT2YR', 'WTMEC2YR': 'WTMEC2YR', 'WTSAF2YR': 'WTSAF2YR', 'WTINT4YR': 'WTINT4YR', 'WTMEC4YR': 'WTMEC4

In [None]:
#### HERE !!!! KEEP working on this!!!!

In [227]:
#### Checked why SMK_AVG etc missing 
## Pre-2018 cycles have ~10–13% non-missing (expected: only current smokers report cigs/day)


## Check what column missing post 2018

In [195]:


df_lu_cov_1999_2018[[  'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR']].head(10) 

# 'SEQN','SDDSRVYR', 'SDMVPSU', 'SDMVSTRA'

Unnamed: 0,WTINT2YR,WTMEC2YR,WTSAF2YR
0,9727.078709,10982.898896,75131.2
1,26678.636376,28325.384898,60586.147294
2,43621.680548,46192.256945,121969.841152
3,10346.119327,10251.26002,4624.687273
4,91050.84662,99445.065735,234895.20565
5,36508.250375,39656.600444,13379.8
6,22352.08862,25525.423409,57661.621988
7,31600.089655,31510.587866,76026.438279
8,7529.435502,7575.870247,14694.924957
9,21071.164059,22445.808572,60202.416895


In [236]:


df_my_cov_aligned.loc[df_my_cov_aligned["SDDSRVYR"].eq(66), ["DMDHHSIZ","WTINT4YR","WTMEC4YR"]].tail(9000)


df_my_cov_aligned[["DMDHHSIZ",'WTINT4YR', 'WTMEC4YR']].tail(20)

Unnamed: 0,DMDHHSIZ,WTINT4YR,WTMEC4YR
128789,4.0,,
128790,7.0,,
128791,4.0,,
128792,2.0,,
128793,1.0,,
128794,4.0,,
128795,1.0,,
128796,3.0,,
128797,2.0,,
128798,5.0,,


In [197]:
df_my_cov_aligned[["DMDHHSIZ", "household_size", "HOQ065"]].tail(10) 

Unnamed: 0,DMDHHSIZ,household_size,HOQ065
128799,1.0,1.0,
128800,2.0,2.0,
128801,2.0,2.0,
128802,5.0,5.0,
128803,4.0,4.0,
128804,2.0,2.0,
128805,5.0,5.0,
128806,3.0,3.0,
128807,5.0,5.0,
128808,2.0,2.0,


## Fill columns missing post 2018   