## set up 

In [70]:
from pathlib import Path
import pandas as pd

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
OUT  = ROOT / "output"

MY_PATH = OUT / "demo_mt_cov_dp_sdoh.parquet"
LU_PATH = ROOT / "data/cov/nhanes_primary_anal_full_singleimputation_v2.csv"

df_my_cov_1999_2023 = pd.read_parquet(MY_PATH)
df_lu_cov_1999_2018 = pd.read_csv(LU_PATH)

print("mine:", df_my_cov_1999_2023.shape, "| lu:", df_lu_cov_1999_2018.shape)
print("Loaded:", MY_PATH)
print("Loaded:", LU_PATH)



mine: (128809, 82) | lu: (101316, 75)
Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/output/demo_mt_cov_dp_sdoh.parquet
Loaded: /Users/dengshuyue/Desktop/SDOH/analysis/data/cov/nhanes_primary_anal_full_singleimputation_v2.csv


In [71]:
df_my_cov_1999_2023.head(10)

Unnamed: 0,SEQN,SDDSRVYR,SDMVPSU,SDMVSTRA,WTMEC2YR,AGE_YR,RIAGENDR,SEX,FEMALE,SMK_STATUS,...,FS_HH,FS_ADULT,FS_FINAL,HHFDSEC,ADFDSEC,FS_HH4,FS_ADULT4,FS_SOURCE_HH,FS_SOURCE_FINAL,SNAP_SOURCE
0,1,1.0,1.0,5.0,10982.898896,2.0,2,F,1,,...,,,,,,,,,,
1,2,1.0,3.0,1.0,28325.384898,77.0,1,M,0,NEVER,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
2,3,1.0,2.0,7.0,46192.256945,10.0,2,F,1,,...,,,,,,,,,,
3,4,1.0,1.0,2.0,10251.26002,1.0,1,M,0,,...,,,,,,,,,,
4,5,1.0,2.0,8.0,99445.065735,49.0,1,M,0,FORMER,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
5,6,1.0,2.0,2.0,39656.600444,19.0,2,F,1,,...,,,,,,,,,,
6,7,1.0,2.0,4.0,25525.423409,59.0,2,F,1,FORMER,...,,,,,,,,,,
7,8,1.0,1.0,6.0,31510.587866,13.0,1,M,0,,...,,,,,,,,,,
8,9,1.0,2.0,9.0,7575.870247,11.0,2,F,1,,...,,,,,,,,,,
9,10,1.0,1.0,7.0,22445.808572,43.0,1,M,0,CURRENT,...,,,,,,,,,,


#### 1) Helpers (used later)

In [65]:
# 1) HELPERS (robust + imports np)
import numpy as np
import pandas as pd

def _norm_str_col(series: pd.Series) -> pd.Series:
    """Lowercase + strip + turn 'nan' into actual NaN."""
    s = series.astype(str).str.strip().str.lower()
    return s.replace({"nan": np.nan})

def _num_summary(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Lightweight numeric summary for a set of columns."""
    rows = []
    for c in cols:
        s = pd.to_numeric(df[c], errors="coerce")
        rows.append({
            "column": c,
            "n": len(s),
            "na_rate": float(s.isna().mean()),
            "min": np.nanmin(s.values),
            "p25": np.nanpercentile(s.values, 25),
            "median": np.nanmedian(s.values),
            "p75": np.nanpercentile(s.values, 75),
            "max": np.nanmax(s.values),
            "mean": np.nanmean(s.values),
            "std": np.nanstd(s.values),
            "unique_non_na": int(s.nunique(dropna=True)),
        })
    return pd.DataFrame(rows)

def _binary_sig(series: pd.Series) -> str | None:
    """Detect common binary encodings."""
    vals = set(_norm_str_col(series).dropna().unique())
    if vals <= {"0","1"}: return "0/1"
    if vals <= {"yes","no"}: return "yes/no"
    if vals <= {"true","false"}: return "true/false"
    if vals <= {"male","female"}: return "male/female"
    return None


#### 2) Column set differences (a)

In [80]:
# 2) COLUMN SET DIFFERENCES (a)

cols_my = set(df_my_cov_1999_2023.columns)
cols_lu = set(df_lu_cov_1999_2018.columns)

audit_only_in_lu = sorted(cols_lu - cols_my)
audit_only_in_my = sorted(cols_my - cols_lu)
audit_in_both    = sorted(cols_my & cols_lu)

print(f"[Columns] only_in_lu: {len(audit_only_in_lu)} | only_in_my: {len(audit_only_in_my)} | in_both: {len(audit_in_both)}")
print("• only_in_lu (first 20):", audit_only_in_lu[:20])
print("• only_in_my (first 20):", audit_only_in_my[:20])



[Columns] only_in_lu: 69 | only_in_my: 76 | in_both: 6
• only_in_lu (first 20): ['MetS', 'MetS_bp', 'MetS_count', 'MetS_fpg', 'MetS_hdl', 'MetS_triglycerides', 'MetS_wc', 'WTINT2YR', 'WTSAF2YR', 'X', 'adiposity_pri', 'adiposity_sec', 'age', 'age_cat', 'angina', 'angina_rx', 'asthma', 'bmi', 'bp_pri', 'bp_sec']
• only_in_my (first 20): ['ADFDSEC', 'AGE_YR', 'ALCOHOL_CAT', 'BMI', 'BMI_CLAS', 'BMXHT', 'BMXWT', 'CANCER', 'CENSORED', 'CIDI_12M_MDE', 'CIDI_SCORE_RAW', 'CIGS_PER_DAY', 'DBP', 'DEP_HARMONIZED', 'DEP_IMP', 'DEP_SOURCE', 'DIABETES', 'DMDHHSIZ', 'DPQ_CAT', 'DRINKS_PER_DAY']


In [81]:
# mine
print(len(df_my_cov_1999_2023.columns), "columns")
print(df_my_cov_1999_2023.columns.tolist())

# lu
print(len(df_lu_cov_1999_2018.columns), "columns")
print(df_lu_cov_1999_2018.columns.tolist())

82 columns
['SEQN', 'SDDSRVYR', 'SDMVPSU', 'SDMVSTRA', 'WTMEC2YR', 'AGE_YR', 'RIAGENDR', 'SEX', 'FEMALE', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER', 'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'LTPA', 'METSCORE', 'IMP', 'BMXWT', 'BMXHT', 'BMI', 'BMI_CLAS', 'DIABETES', 'HTN', 'HIGH_CHOL', 'CVD', 'CANCER', 'SBP', 'DBP', 'TCHOL', 'HDL', 'LDL', 'TG', 'DMDHHSIZ', 'ELIGSTAT', 'MORTSTAT', 'PERMTH_EXM', 'PERMTH_INT', 'UCOD_LEADING', 'IS_POST2018', 'IS_ADULT', 'MORTALITY_COVERED', 'EVENT', 'CENSORED', 'FU_YRS_EXM', 'FU_YRS_INT', 'UCOD_LABEL', 'PHQ9', 'PHQ9_GE10', 'DPQ_CAT', 'DEP_IMP', 'CIDI_SCORE_RAW', 'CIDI_12M_MDE', 'WTSCI2YR', 'DEP_HARMONIZED', 'DEP_SOURCE', 'PIR', 'PIR_CAT', 'INDFMINC', 'EDU', 'EDU_CAT', 'RACE_ETH', 'MARITAL', 'MARITAL_CAT', 'EMPLOY', 'UNEMPLOYMENT', 'HOD050', 'HOQ065', 'INS', 'SNAP', 'FSDHH', 'FS', 'FS_HH', 'FS_ADULT', 'FS_FINAL', 'HHFDSEC', 'ADFDSEC', 'FS_HH4', 'FS_ADULT4', 'FS_SOURCE_HH', 'FS_SOURCE_FINAL', 'SNAP_SOURCE']
75 columns
['X', 'SEQN', 'SDDSRVYR', 'WTIN

In [78]:
# %whos DataFrame


Variable              Type         Data/Info
--------------------------------------------
df_lu_cov_1999_2018   DataFrame    Shape: (101316, 75)
df_my_cov_1999_2023   DataFrame    Shape: (128809, 82)


In [37]:
# df_my_cov_1999_2023[['MORTALITY_COVERED', 'EVENT', "UCOD_LABEL", "CANCER"]].head(20)

## Align column same as lu 

In [82]:
import re, difflib
import pandas as pd

# === Inputs (your two originals) ===
mine = df_my_cov_1999_2023.copy()
lu   = df_lu_cov_1999_2018.copy()

lu_cols = list(lu.columns)
mine_cols = list(mine.columns)

# ---------- Helpers ----------
def norm(s: str) -> str:
    # Lower, drop non-alnum (so AGE_YR -> ageyr; RACE_ETH -> raceeth)
    return re.sub(r'[^0-9a-z]+', '', s.lower()) if isinstance(s, str) else s

lu_norm_to_name = {}
for c in lu_cols:
    lu_norm_to_name.setdefault(norm(c), c)  # keep first occurrence

# Minimal synonyms (expand as needed if you see mismatches)
# YOUR column name -> Lu column name
synonyms = {
    'AGE_YR':'age',
    'SEX':'sex',
    'RACE_ETH':'re',
    'EDU':'edu',
    'PIR':'pir',
    'TCHOL':'tchol',
    'HDL':'hdl',
    'LDL':'ldl',
    'TG':'tg',
    'WC':'wc',
    'BMI':'bmi',
    'SBP':'sbp',
    'DBP':'dbp',
    'DIABETES':'diabetes',
    'CVD':'CVD',          # Lu uses uppercase "CVD" in your list
    'CANCER':'cancer',
    'DM_RX':'dm_rx',
    'CHOL_RX':'chol_rx',
    'HTN_RX':'htn_rx',
    'ANGINA_RX':'angina_rx',
    'ANGINA':'angina',
    'AGE_CAT':'age_cat',
    'PIR_CAT':'pir_cat',
    'EDU2':'edu2',
    'METS_HDL':'MetS_hdl',
    'METS_TRIGLYCERIDES':'MetS_triglycerides',
    'METS_BP':'MetS_bp',
    'METS_WC':'MetS_wc',
    'METS_FPG':'MetS_fpg',
    'METS_COUNT':'MetS_count',
    'ROSEQ':'roseQ',
    'NO_NA':'no_na',
    'LUNG_DISEASE':'lung_disease',
    'BP_PRI':'bp_pri',
    'GLUCOSE_PRI':'glucose_pri',
    'LIPID_PRI':'lipid_pri',
    'ADIPOSITY_PRI':'adiposity_pri',
    'CVD_PRI':'cvd_pri',
    'BP_SEC':'bp_sec',
    'GLUCOSE_SEC':'glucose_sec',
    'LIPID_SEC':'lipid_sec',
    'ADIPOSITY_SEC':'adiposity_sec',
    'CVD_SEC':'cvd_sec',
    # Common admin/weight vars:
    'WTMEC2YR':'WTMEC2YR',
    'SDDSRVYR':'SDDSRVYR',
    'SDMVPSU':'SDMVPSU',
    'SDMVSTRA':'SDMVSTRA',
}

# Columns we should **never** rename (IDs/keys that already match)
protect_exact = set(['SEQN','SDDSRVYR','SDMVPSU','SDMVSTRA','WTMEC2YR'])

# ---------- Build mapping (your -> lu) ----------
mapping = {}          # final mapping to apply
used_targets = set()  # to avoid collisions (two src -> one dst)

for src in mine_cols:
    if src in protect_exact or src.endswith('_lu'):
        continue

    # 1) If exact Lu name already, keep as-is
    if src in lu_cols:
        continue

    # 2) Synonym override
    if src in synonyms and synonyms[src] in lu_cols and synonyms[src] not in used_targets and synonyms[src] not in mine_cols:
        mapping[src] = synonyms[src]
        used_targets.add(synonyms[src])
        continue

    # 3) Case-insensitive exact
    ci = next((dst for dst in lu_cols if isinstance(dst, str) and dst.lower() == src.lower()), None)
    if ci and ci not in used_targets and ci not in mine_cols:
        mapping[src] = ci
        used_targets.add(ci)
        continue

    # 4) Normalized name match
    nsrc = norm(src)
    if nsrc in lu_norm_to_name:
        dst = lu_norm_to_name[nsrc]
        if dst not in used_targets and dst not in mine_cols:
            mapping[src] = dst
            used_targets.add(dst)
            continue

    # 5) Fuzzy match for stragglers (safe threshold)
    # Only attempt for alphas; ignore obviously different admin columns you don't want changed
    candidates = difflib.get_close_matches(src, lu_cols, n=1, cutoff=0.92)
    if candidates:
        dst = candidates[0]
        if dst not in used_targets and dst not in mine_cols:
            mapping[src] = dst
            used_targets.add(dst)
            continue

# ---------- Apply rename ----------
mine_renamed = mine.rename(columns=mapping).copy()

# ---------- Reorder to Lu’s order (extras at end; keep *_lu at very end) ----------
ordered = [c for c in lu_cols if c in mine_renamed.columns]
extras  = [c for c in mine_renamed.columns if c not in ordered and not c.endswith('_lu')]
audit   = [c for c in mine_renamed.columns if c.endswith('_lu')]

df_my_cov_aligned = mine_renamed[ordered + extras + audit].copy()

# ---------- Report ----------
renamed_pairs = sorted(mapping.items(), key=lambda x: x[0].lower())
missing_in_mine = [c for c in lu_cols if c not in df_my_cov_aligned.columns]

print(f"Renamed {len(renamed_pairs)} columns automatically.")
print("Examples:", renamed_pairs[:10])
print("Still missing from your data (present in Lu):", missing_in_mine)
print("Final order starts with:", df_my_cov_aligned.columns[:15].tolist())


Renamed 15 columns automatically.
Examples: [('AGE_YR', 'age'), ('BMI', 'bmi'), ('CANCER', 'cancer'), ('DBP', 'dbp'), ('DIABETES', 'diabetes'), ('EDU', 'edu'), ('HDL', 'hdl'), ('LDL', 'ldl'), ('PIR', 'pir'), ('PIR_CAT', 'pir_cat')]
Still missing from your data (present in Lu): ['X', 'WTINT2YR', 'WTSAF2YR', 'wc', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na', 'age_cat', 'edu2', 'lung_disease', 'tchol_hdl', 'angina', 'lipid_pri', 'adiposity_pri', 'bp_pri', 'glucose_pri', 'cvd_pri', 'lipid_sec', 'adiposity_sec', 'bp_sec', 'glucose_sec', 'cvd_sec', 'optimal_pri_count', 'intermediate_pri_count', 'poor_pri_count', 'optimal_sec_count', 'intermediate_sec_count', 'poor_sec_count', 'optimal_all', 'poor_all', 'optimal_all_sec', 'poor_all_sec', 'MetS_hdl', 'MetS_triglycerides', 'MetS_bp', 'MetS_wc', 'MetS_fpg', 'MetS_count', 'MetS']
Final order starts with: ['SEQN', 'SDDSRVYR'

#### Adding missing column merge from lu

In [83]:
import re, pandas as pd

# === 0) Start from the two originals ===
mine = df_my_cov_1999_2023.copy()
lu   = df_lu_cov_1999_2018.copy()

# === 1) Auto-rename YOUR columns to Lu's names (case/underscore-insensitive + synonyms) ===
def norm(s: str) -> str:
    return re.sub(r'[^0-9a-z]+', '', s.lower()) if isinstance(s, str) else s

lu_cols = list(lu.columns)
mine_cols = list(mine.columns)

# First pass: normalized name index for Lu
lu_norm_to_name = {}
for c in lu_cols:
    lu_norm_to_name.setdefault(norm(c), c)

# Synonyms (YOUR -> Lu)
synonyms = {
    'AGE_YR':'age','SEX':'sex','RACE_ETH':'re','EDU':'edu','PIR':'pir',
    'TCHOL':'tchol','HDL':'hdl','LDL':'ldl','TG':'tg',
    'WC':'wc','BMI':'bmi','SBP':'sbp','DBP':'dbp',
    'DIABETES':'diabetes','CVD':'CVD','CANCER':'cancer',
    'DM_RX':'dm_rx','CHOL_RX':'chol_rx','HTN_RX':'htn_rx','ANGINA_RX':'angina_rx','ANGINA':'angina',
    'AGE_CAT':'age_cat','PIR_CAT':'pir_cat','EDU2':'edu2',
    'METS_HDL':'MetS_hdl','METS_TRIGLYCERIDES':'MetS_triglycerides','METS_BP':'MetS_bp',
    'METS_WC':'MetS_wc','METS_FPG':'MetS_fpg','METS_COUNT':'MetS_count',
    'ROSEQ':'roseQ','NO_NA':'no_na','LUNG_DISEASE':'lung_disease',
    'BP_PRI':'bp_pri','GLUCOSE_PRI':'glucose_pri','LIPID_PRI':'lipid_pri','ADIPOSITY_PRI':'adiposity_pri',
    'CVD_PRI':'cvd_pri','BP_SEC':'bp_sec','GLUCOSE_SEC':'glucose_sec','LIPID_SEC':'lipid_sec',
    'ADIPOSITY_SEC':'adiposity_sec','CVD_SEC':'cvd_sec',
    # admin/weights that already match:
    'WTMEC2YR':'WTMEC2YR','SDDSRVYR':'SDDSRVYR','SDMVPSU':'SDMVPSU','SDMVSTRA':'SDMVSTRA'
}

protect_exact = {'SEQN','SDDSRVYR','SDMVPSU','SDMVSTRA','WTMEC2YR'}

mapping = {}
used_targets = set()
for src in mine_cols:
    if src in protect_exact or src.endswith('_lu'):
        continue
    if src in lu_cols:
        continue
    # synonyms first
    if src in synonyms and synonyms[src] in lu_cols and synonyms[src] not in used_targets and synonyms[src] not in mine.columns:
        mapping[src] = synonyms[src]; used_targets.add(synonyms[src]); continue
    # case-insensitive exact
    ci = next((dst for dst in lu_cols if isinstance(dst, str) and dst.lower()==src.lower()), None)
    if ci and ci not in used_targets and ci not in mine.columns:
        mapping[src] = ci; used_targets.add(ci); continue
    # normalized match
    nc = norm(src)
    if nc in lu_norm_to_name:
        dst = lu_norm_to_name[nc]
        if dst not in used_targets and dst not in mine.columns:
            mapping[src] = dst; used_targets.add(dst); continue

mine = mine.rename(columns=mapping)

# === 2) Identify Lu columns you still lack, and merge ONLY those in ===
missing = [c for c in lu_cols if c not in mine.columns]
# keys must exist in both:
for k in ['SEQN','SDDSRVYR']:
    if k not in mine.columns or k not in lu.columns:
        raise KeyError(f"Key {k} missing in one of the frames")

# subset Lu to keys + missing, drop dup keys, then merge
lu_sub = lu[['SEQN','SDDSRVYR'] + missing].copy()
dup_ct = lu_sub.duplicated(['SEQN','SDDSRVYR']).sum()
if dup_ct:
    print(f"[warn] Dropping {dup_ct} duplicate rows on keys in Lu subset")
    lu_sub = lu_sub.drop_duplicates(['SEQN','SDDSRVYR'], keep='first')

# Merge (no suffix needed—these cols are missing in 'mine')
aligned = mine.merge(lu_sub, on=['SEQN','SDDSRVYR'], how='left')

# === 3) Reorder to Lu order first, then any extras ===
order = [c for c in lu_cols if c in aligned.columns]
extras = [c for c in aligned.columns if c not in order]
df_my_cov_aligned = aligned[order + extras].copy()

# === 4) Quick report
print(f"Auto-renamed {len(mapping)} columns to Lu names.")
print("Filled from Lu (newly added):", missing[:20], "..." if len(missing)>20 else "")
still_missing = [c for c in lu_cols if c not in df_my_cov_aligned.columns]  # should be empty
print("Still missing Lu cols:", still_missing)
print("Final starts with:", df_my_cov_aligned.columns[:15].tolist())


Auto-renamed 15 columns to Lu names.
Filled from Lu (newly added): ['X', 'WTINT2YR', 'WTSAF2YR', 'wc', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx'] ...
Still missing Lu cols: []
Final starts with: ['X', 'SEQN', 'SDDSRVYR', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'SDMVPSU', 'SDMVSTRA', 'age', 'sex', 're', 'edu', 'pir', 'tchol', 'hdl']


#### clean and check merged file 

In [85]:
if 'X' in df_my_cov_aligned.columns:
    df_my_cov_aligned = df_my_cov_aligned.drop(columns=['X'])


In [88]:
import pandas as pd

# binary flags
bin_cols = ['dm_self','chf','chd','mi','stroke','emphysema','bronchitis','asthma',
            'copd','dm_rx','chol_rx','angina_rx','htn_rx','angina']
for c in df_my_cov_aligned.columns.intersection(bin_cols):
    df_my_cov_aligned[c] = pd.to_numeric(df_my_cov_aligned[c], errors='coerce').astype('Int8')

# labs/metrics
num_cols = ['wc','hba1c','fpg','tchol_hdl','MetS_hdl','MetS_triglycerides',
            'MetS_bp','MetS_wc','MetS_fpg','MetS_count']
for c in df_my_cov_aligned.columns.intersection(num_cols):
    df_my_cov_aligned[c] = pd.to_numeric(df_my_cov_aligned[c], errors='coerce')


In [90]:
# check 
df_my_cov_aligned.head(10)

Unnamed: 0,SEQN,SDDSRVYR,WTINT2YR,WTMEC2YR,WTSAF2YR,SDMVPSU,SDMVSTRA,age,sex,re,...,FS_HH,FS_ADULT,FS_FINAL,HHFDSEC,ADFDSEC,FS_HH4,FS_ADULT4,FS_SOURCE_HH,FS_SOURCE_FINAL,SNAP_SOURCE
0,1,1.0,9727.078709,10982.898896,75131.2,1.0,5.0,2.0,F,Other Hispanic,...,,,,,,,,,,
1,2,1.0,26678.636376,28325.384898,60586.147294,3.0,1.0,77.0,M,Mexican American,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
2,3,1.0,43621.680548,46192.256945,121969.841152,2.0,7.0,10.0,F,Mexican American,...,,,,,,,,,,
3,4,1.0,10346.119327,10251.26002,4624.687273,1.0,2.0,1.0,M,Other Hispanic,...,,,,,,,,,,
4,5,1.0,91050.84662,99445.065735,234895.20565,2.0,8.0,49.0,M,Mexican American,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,HHFDSEC,household,
5,6,1.0,36508.250375,39656.600444,13379.8,2.0,2.0,19.0,F,NH Black,...,,,,,,,,,,
6,7,1.0,22352.08862,25525.423409,57661.621988,2.0,4.0,59.0,F,Other Hispanic,...,,,,,,,,,,
7,8,1.0,31600.089655,31510.587866,76026.438279,1.0,6.0,13.0,M,Mexican American,...,,,,,,,,,,
8,9,1.0,7529.435502,7575.870247,14694.924957,2.0,9.0,11.0,F,Other Hispanic,...,,,,,,,,,,
9,10,1.0,21071.164059,22445.808572,60202.416895,1.0,7.0,43.0,M,Other Hispanic,...,,,,,,,,,,


In [91]:
cols = ['chd','mi','dm_rx','MetS_count']
(
    df_my_cov_aligned
    .groupby('SDDSRVYR')[cols]
    .apply(lambda g: g.notna().mean())  # fraction non-missing per cycle
    .round(3)
    .sort_index()
)


Unnamed: 0_level_0,chd,mi,dm_rx,MetS_count
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,1.0,1.0,1.0,1.0
2.0,1.0,1.0,1.0,1.0
3.0,1.0,1.0,1.0,1.0
4.0,1.0,1.0,1.0,1.0
5.0,1.0,1.0,1.0,1.0
6.0,1.0,1.0,1.0,1.0
7.0,1.0,1.0,1.0,1.0
8.0,1.0,1.0,1.0,1.0
9.0,1.0,1.0,1.0,1.0
10.0,1.0,1.0,1.0,1.0


#### check what column missing -23 

In [95]:
import pandas as pd

df = df_my_cov_aligned  # your aligned frame

# --- choose post-2018 cycles you care about ---
target_cycles = [12.0, 66.0]

# keep only rows from those cycles that actually exist
present_cycles = sorted([c for c in target_cycles if c in set(df['SDDSRVYR'].unique())])
if not present_cycles:
    raise ValueError("None of the target cycles are present in SDDSRVYR.")

post = df[df['SDDSRVYR'].isin(present_cycles)].copy()

# columns except the grouping key
cols_no_group = [c for c in post.columns if c != 'SDDSRVYR']

# Fraction non-missing for EVERY column, per cycle (variables as rows)
cov_all = (
    post.groupby('SDDSRVYR')[cols_no_group]
        .apply(lambda g: g.notna().mean(numeric_only=False))
        .T
        .round(3)
    # columns are cycles (e.g., 12.0, 66.0)
)

# pretty display if available
try:
    from IPython.display import display
    print("Fraction non-missing by cycle (all columns):")
    display(cov_all)
except Exception:
    print("Fraction non-missing by cycle (all columns):\n", cov_all)

# Ensure both target cycle columns exist for the zero-coverage check
cov_for_check = cov_all.reindex(columns=target_cycles, fill_value=0.0)

# Columns with 0.0 coverage in both cycles
zero_both_mask = (cov_for_check.iloc[:, 0] == 0) & (cov_for_check.iloc[:, 1] == 0)
zero_both = cov_for_check.index[zero_both_mask].tolist()

print(f"Vars with 0.0 coverage in both cycles ({len(zero_both)}):",
      zero_both[:30], "..." if len(zero_both) > 30 else "")


Fraction non-missing by cycle (all columns):


SDDSRVYR,12.0,66.0
SEQN,1.0,1.000
WTINT2YR,0.0,0.000
WTMEC2YR,1.0,0.000
WTSAF2YR,0.0,0.000
SDMVPSU,1.0,1.000
...,...,...
FS_HH4,0.0,0.549
FS_ADULT4,0.0,0.000
FS_SOURCE_HH,0.0,0.549
FS_SOURCE_FINAL,0.0,0.549


Vars with 0.0 coverage in both cycles (94): ['WTINT2YR', 'WTSAF2YR', 'tchol', 'hdl', 'ldl', 'tg', 'wc', 'bmi', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'cancer', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'sbp', 'dbp', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na', 'age_cat'] ...


In [97]:

import pandas as pd

df = df_my_cov_aligned  # your aligned frame

# --- Define post-2018 cycles ---
post_cycles = [12.0, 66.0]

# Keep only those rows
post = df[df['SDDSRVYR'].isin(post_cycles)].copy()

# --- Compute fraction non-missing for each column by cycle ---
cov_all = (
    post.groupby('SDDSRVYR')
        .apply(lambda g: g.notna().mean(numeric_only=False), include_groups=False)
        .T.round(3)
)

# --- Identify columns missing in either or both cycles ---
missing_any = cov_all.index[(cov_all == 0).any(axis=1)].tolist()
missing_both = cov_all.index[(cov_all == 0).all(axis=1)].tolist()

print("🔎 Missing in at least one post-2018 cycle:", len(missing_any))
print(missing_any[:30], "..." if len(missing_any) > 30 else "")

print("\n❌ Missing in BOTH 12.0 and 66.0:", len(missing_both))
print(missing_both[:30], "..." if len(missing_both) > 30 else "")

# Optional: show a nice table of just the missing vars
missing_table = cov_all.loc[missing_any]
try:
    from IPython.display import display
    display(missing_table)
except:
    print(missing_table)


🔎 Missing in at least one post-2018 cycle: 106
['WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'tchol', 'hdl', 'ldl', 'tg', 'wc', 'bmi', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'cancer', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'sbp', 'dbp', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na'] ...

❌ Missing in BOTH 12.0 and 66.0: 94
['WTINT2YR', 'WTSAF2YR', 'tchol', 'hdl', 'ldl', 'tg', 'wc', 'bmi', 'dm_self', 'hba1c', 'fpg', 'chf', 'chd', 'mi', 'stroke', 'cancer', 'emphysema', 'bronchitis', 'asthma', 're2', 'copd', 'sbp', 'dbp', 'dm_rx', 'chol_rx', 'angina_rx', 'htn_rx', 'roseQ', 'no_na', 'age_cat'] ...


SDDSRVYR,12.0,66.0
WTINT2YR,0.0,0.000
WTMEC2YR,1.0,0.000
WTSAF2YR,0.0,0.000
tchol,0.0,0.000
hdl,0.0,0.000
...,...,...
FS_HH4,0.0,0.549
FS_ADULT4,0.0,0.000
FS_SOURCE_HH,0.0,0.549
FS_SOURCE_FINAL,0.0,0.549


## Fill columns missing post 2018   

In [106]:
import pandas as pd

df = df_my_cov_aligned.copy()
post_mask = df['SDDSRVYR'].isin([12.0, 66.0])

# ---------- 1) Inspect what's in LABS and MCQ ----------
def head_info(name, obj):
    print(f"\n=== {name} ===")
    if isinstance(obj, pd.DataFrame) and not obj.empty:
        print("shape:", obj.shape)
        print("cols:", list(obj.columns[:20]))
        # overlap in SEQN with your post-2018 rows
        inter = set(df.loc[post_mask, 'SEQN']).intersection(set(obj['SEQN'])) if 'SEQN' in obj.columns else set()
        print("SEQN overlap with post-2018:", len(inter))
    else:
        print("(empty)")

if 'LABS' in globals(): head_info("LABS", LABS)
if 'MCQ'  in globals(): head_info("MCQ", MCQ)

# Make sure SEQN dtypes agree (prevents silent non-matches)
if 'SEQN' in df.columns:
    df['SEQN'] = pd.to_numeric(df['SEQN'], errors='coerce')

for tbl in ['LABS','MCQ']:
    if tbl in globals() and isinstance(globals()[tbl], pd.DataFrame) and 'SEQN' in globals()[tbl].columns:
        globals()[tbl]['SEQN'] = pd.to_numeric(globals()[tbl]['SEQN'], errors='coerce')

# ---------- 2) Retry LABS fill (robust) ----------
if 'LABS' in globals() and isinstance(LABS, pd.DataFrame) and not LABS.empty:
    # Which analytes do we actually have?
    pairs_all = {'tchol':'LBXTC','hdl':'LBDHDD','tg':'LBXTR','fpg':'LBXGLU','hba1c':'LBXGH'}
    have = {dst:src for dst,src in pairs_all.items() if src in LABS.columns}
    if have:
        # Pre-create dest so right-side gets _labs suffix
        for dst in have:
            if dst not in df.columns:
                df[dst] = pd.NA

        # Merge ONLY post-2018 persons to reduce memory
        post_seqn = df.loc[post_mask, ['SEQN']].drop_duplicates()
        labs_sub = post_seqn.merge(LABS[['SEQN'] + list(have.values())], on='SEQN', how='left')

        df = df.merge(labs_sub, on='SEQN', how='left', suffixes=('','_labs'))

        # Fill where NA and right side present
        for dst, src in have.items():
            rhs = f'{src}_labs'
            if rhs in df.columns:
                m = post_mask & df[dst].isna() & df[rhs].notna()
                df.loc[m, dst] = df.loc[m, rhs]

        # ratio if both present
        if all(c in df.columns for c in ['tchol','hdl','tchol_hdl']):
            m = post_mask & df['tchol_hdl'].isna() & df['tchol'].notna() & df['hdl'].notna() & (df['hdl']!=0)
            df.loc[m, 'tchol_hdl'] = df.loc[m, 'tchol'] / df.loc[m, 'hdl']

        # clean up
        drop_cols = [f'{src}_labs' for src in have.values() if f'{src}_labs' in df.columns]
        df.drop(columns=drop_cols, inplace=True, errors='ignore')

        # Quick audit
        print("\nFilled LABS (post-2018) non-missing share:")
        cols = [c for c in ['tchol','hdl','tg','fpg','hba1c','tchol_hdl'] if c in df.columns]
        if cols:
            print(df.loc[post_mask, cols].notna().mean().round(3))

# ---------- 3) MCQ (CVD history) — corrected & case-insensitive ----------
if 'MCQ' in globals() and isinstance(MCQ, pd.DataFrame) and not MCQ.empty:
    # only bring MCQ rows for post-2018 persons
    post_seqn = df.loc[post_mask, ['SEQN','SDDSRVYR']].drop_duplicates()
    mcq_sub = post_seqn.merge(MCQ, on=['SEQN','SDDSRVYR'], how='left')

    # merge (no suffixing; we'll drop MCQ160* afterward)
    df = df.merge(mcq_sub, on=['SEQN','SDDSRVYR'], how='left', suffixes=('',''))

    # helper: pick first existing column from candidates (exact or case-insensitive)
    def pick_any(frame, names):
        for n in names:
            if n in frame.columns:
                return n
        lower_map = {c.lower(): c for c in frame.columns}
        for n in names:
            if n.lower() in lower_map:
                return lower_map[n.lower()]
        return None

    # Correct mapping (stroke is f/F), support J vs L and any casing
    mcq_map_fixed = {
        'chf':    ['MCQ160b','MCQ160B'],
        'chd':    ['MCQ160c','MCQ160C'],
        'angina': ['MCQ160d','MCQ160D'],
        'mi':     ['MCQ160e','MCQ160E'],
        'stroke': ['MCQ160f','MCQ160F'],
    }

    for out, candidates in mcq_map_fixed.items():
        raw = pick_any(df, candidates)
        if raw and out in df.columns:
            m = post_mask & df[out].isna() & df[raw].notna()
            df.loc[m, out] = (df.loc[m, raw] == 1).astype('Int8')

    # drop raw MCQ columns to keep your schema clean
    drop_mcq = [c for c in df.columns if c.startswith('MCQ160')]
    if drop_mcq:
        df.drop(columns=drop_mcq, inplace=True, errors='ignore')

    # quick coverage report
    cols = [c for c in ['chf','chd','angina','mi','stroke'] if c in df.columns]
    if cols:
        print("\nFilled MCQ (post-2018) non-missing share:")
        print(df.loc[post_mask, cols].notna().mean().round(3))

# ---------- 4) Recompute MetS flags ONLY where inputs exist ----------
sex = df.get('sex', pd.Series(index=df.index, dtype='object')).astype(str).str.lower()

def set_flag(col, mask_bool):
    if col in df.columns:
        m = post_mask & df[col].isna() & mask_bool.notna()
        df.loc[m, col] = mask_bool[m].astype('Int8')

if {'MetS_hdl','hdl'}.issubset(df.columns):
    present = df['hdl'].notna() & sex.isin(['male','female'])
    hdl_abn = pd.Series(pd.NA, index=df.index)
    hdl_abn[present & (sex=='male')]   = (df.loc[present & (sex=='male'), 'hdl'] < 40)
    hdl_abn[present & (sex=='female')] = (df.loc[present & (sex=='female'), 'hdl'] < 50)
    set_flag('MetS_hdl', hdl_abn)

if {'MetS_triglycerides','tg'}.issubset(df.columns):
    present = df['tg'].notna()
    set_flag('MetS_triglycerides', (df['tg'] >= 150).where(present))

if 'MetS_fpg' in df.columns and ('fpg' in df.columns or 'hba1c' in df.columns):
    present = df.get('fpg', pd.Series(index=df.index)).notna() | df.get('hba1c', pd.Series(index=df.index)).notna()
    flag = ((df.get('fpg', 0) >= 100) | (df.get('hba1c', 0) >= 6.5)).where(present)
    set_flag('MetS_fpg', flag)

# Waist/BP MetS flags are already set earlier if their inputs exist

# ---------- 5) Save back and show coverage ----------
df_my_cov_aligned = df

check = [c for c in ['BMXWT','BMXHT','BMI','wc','SBP','DBP','tchol','hdl','tg','fpg','hba1c','tchol_hdl','chd','mi','chf','stroke','MetS_count'] if c in df.columns]
print("\nPost-2018 non-missing share (key fields):")
if check:
    print(df.loc[post_mask, check].notna().mean().round(3).sort_values(ascending=False))
else:
    print("(no key fields to report)")



=== LABS ===
shape: (20266, 7)
cols: ['SEQN', 'SDDSRVYR', 'LBXTC', 'LBDHDD', 'LBXTR', 'LBXGLU', 'LBXGH']
SEQN overlap with post-2018: 20266

=== MCQ ===
shape: (26730, 11)
cols: ['SEQN', 'SDDSRVYR', 'MCQ160A', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F', 'MCQ160M', 'MCQ160P', 'MCQ160L']
SEQN overlap with post-2018: 26730

Filled LABS (post-2018) non-missing share:
tchol        0.644
hdl          0.644
tg           0.169
fpg          0.306
hba1c        0.598
tchol_hdl    0.644
dtype: float64

Filled MCQ (post-2018) non-missing share:
chf       0.284
chd       0.284
angina    0.284
mi        0.284
stroke    0.284
dtype: float64

Post-2018 non-missing share (key fields):
MetS_count    1.000
BMXWT         0.830
BMXHT         0.788
BMI           0.786
wc            0.755
tchol         0.644
hdl           0.644
tchol_hdl     0.644
hba1c         0.598
fpg           0.306
chd           0.284
mi            0.284
chf           0.284
stroke        0.284
tg            0.169
dtype: float

In [120]:
 df_my_cov_aligned[['MetS_hdl']].tail(100)

Unnamed: 0,MetS_hdl
128709,0.0
128710,0.0
128711,0.0
128712,0.0
128713,0.0
...,...
128804,0.0
128805,0.0
128806,0.0
128807,0.0


In [107]:
import pandas as pd

df = df_my_cov_aligned.copy()
POST = [12.0, 66.0]
post_mask = df['SDDSRVYR'].isin(POST)
filled = []

def maybe_merge(left, right, cols, suffix):
    """Merge if right exists and has the needed cols."""
    if isinstance(right, pd.DataFrame) and all(c in right.columns for c in cols):
        return left.merge(right[cols], on='SEQN', how='left', suffixes=('', suffix))
    return left

def fill_na(dst, src):
    if dst in df.columns and src in df.columns:
        m = post_mask & df[dst].isna() & df[src].notna()
        if m.any():
            df.loc[m, dst] = df.loc[m, src]
            filled.append(dst)

# ---------- BMX ----------
if 'bmx_2019_2023' in globals():
    df = maybe_merge(df, bmx_2019_2023, ['SEQN','BMXWT','BMXHT','BMXWAIST'], '_bmx')
    for dst, src in [('BMXWT','BMXWT_bmx'), ('BMXHT','BMXHT_bmx')]:
        fill_na(dst, src)
    # waist -> wc
    if 'BMXWAIST_bmx' in df.columns:
        fill_na('wc', 'BMXWAIST_bmx')
    # BMI
    m = post_mask & df['BMI'].isna() & df['BMXWT'].notna() & df['BMXHT'].notna()
    df.loc[m, 'BMI'] = df.loc[m,'BMXWT'] / (df.loc[m,'BMXHT']/100.0)**2
    if m.any(): filled.append('BMI')
    # BMI class
    if 'BMI_CLAS' in df.columns:
        bins = [0,18.5,25,30,float('inf')]; labels = ['under','normal','over','obese']
        m = post_mask & df['BMI_CLAS'].isna() & df['BMI'].notna()
        df.loc[m, 'BMI_CLAS'] = pd.cut(df.loc[m,'BMI'], bins=bins, labels=labels)
        if m.any(): filled.append('BMI_CLAS')
    df.drop(columns=[c for c in df.columns if c.endswith('_bmx')], inplace=True, errors='ignore')

# ---------- BPX ----------
if 'bpx_2019_2023' in globals():
    keep = [c for c in ['SEQN','BPXSY1','BPXSY2','BPXSY3','BPXDI1','BPXDI2','BPXDI3'] if c in bpx_2019_2023.columns]
    if len(keep) >= 2:
        tmp = bpx_2019_2023[keep].copy()
        tmp['SBP_mean'] = tmp[[c for c in ['BPXSY1','BPXSY2','BPXSY3'] if c in tmp]].mean(axis=1)
        tmp['DBP_mean'] = tmp[[c for c in ['BPXDI1','BPXDI2','BPXDI3'] if c in tmp]].mean(axis=1)
        df = df.merge(tmp[['SEQN','SBP_mean','DBP_mean']], on='SEQN', how='left')
        fill_na('SBP', 'SBP_mean')
        fill_na('DBP', 'DBP_mean')
        df.drop(columns=['SBP_mean','DBP_mean'], inplace=True, errors='ignore')

# ---------- LAB ----------
if 'lab_2019_2023' in globals():
    poss = ['SEQN','LBXTC','LBDHDD','LBDLDL','LBXTR','LBXGLU','LBXGH']
    keep = [c for c in poss if c in lab_2019_2023.columns]
    if len(keep) > 1:
        df = maybe_merge(df, lab_2019_2023, keep, '_lab')
        pairs = {'tchol':'LBXTC_lab','hdl':'LBDHDD_lab','ldl':'LBDLDL_lab',
                 'tg':'LBXTR_lab','fpg':'LBXGLU_lab','hba1c':'LBXGH_lab'}
        for dst, src in pairs.items():
            fill_na(dst, src)
        # tchol/hdl ratio
        if all(c in df.columns for c in ['tchol','hdl','tchol_hdl']):
            m = post_mask & df['tchol_hdl'].isna() & df['tchol'].notna() & df['hdl'].notna() & (df['hdl']!=0)
            df.loc[m, 'tchol_hdl'] = df.loc[m,'tchol']/df.loc[m,'hdl']
            if m.any(): filled.append('tchol_hdl')
        df.drop(columns=[c for c in df.columns if c.endswith('_lab')], inplace=True, errors='ignore')

# ---------- MCQ (CVD history) ----------
if 'mcq_2019_2023' in globals():
    keep = [c for c in ['SEQN','MCQ160B','MCQ160E','MCQ160F','MCQ160G'] if c in mcq_2019_2023.columns]
    if len(keep) > 1:
        df = df.merge(mcq_2019_2023[keep], on='SEQN', how='left')
        for out, raw in [('chd','MCQ160E'),('mi','MCQ160F'),('chf','MCQ160B'),('stroke','MCQ160G')]:
            if out in df.columns and raw in df.columns:
                m = post_mask & df[out].isna()
                df.loc[m, out] = (df.loc[m, raw] == 1).astype('Int8')
                if m.any(): filled.append(out)

# ---------- RXQ (Medications) ----------
if 'rxq_2019_2023' in globals() and 'SEQN' in rxq_2019_2023.columns:
    # Replace this logic with your ATC/RxClass classification
    class_col = 'DRUG_CLASS'
    if class_col in rxq_2019_2023.columns:
        rx = rxq_2019_2023[['SEQN', class_col]].copy()
        rx['dm_f']   = rx[class_col].str.contains('antidiabet', case=False, na=False)
        rx['htn_f']  = rx[class_col].str.contains('antihypertens', case=False, na=False)
        rx['chol_f'] = rx[class_col].str.contains('statin|lipid', case=False, na=False)
        rx['ang_f']  = rx[class_col].str.contains('nitrate|antiangina|ranolazine', case=False, na=False)
        flags = (rx.groupby('SEQN')[['dm_f','htn_f','chol_f','ang_f']].any()
                   .rename(columns={'dm_f':'dm_rx','htn_f':'htn_rx','chol_f':'chol_rx','ang_f':'angina_rx'})
                   .reset_index())
        df = df.merge(flags, on='SEQN', how='left', suffixes=('','_new'))
        for col in ['dm_rx','htn_rx','chol_rx','angina_rx']:
            if col in df.columns and f'{col}_new' in df.columns:
                m = post_mask & df[col].isna()
                df.loc[m, col] = df.loc[m, f'{col}_new'].fillna(False).astype('Int8')
                df.drop(columns=[f'{col}_new'], inplace=True)

# ---------- MetS components & count ----------
if 'sex' in df.columns:
    sex = df['sex'].astype(str).str.lower()
    def set_bin(col, mask_bool):
        if col in df.columns:
            m = post_mask & df[col].isna()
            df.loc[m, col] = mask_bool[m].astype('Int8')
            if m.any(): filled.append(col)

    # components
    if 'BMXWAIST' in df.columns:  # for wc criterion
        wc_abn = ((sex=='male') & (df['BMXWAIST']>=102)) | ((sex=='female') & (df['BMXWAIST']>=88))
        set_bin('MetS_wc', wc_abn)

    if all(c in df.columns for c in ['SBP','DBP']):
        bp_abn = (df['SBP']>=130) | (df['DBP']>=85) | (df.get('htn_rx', 0)==1)
        set_bin('MetS_bp', bp_abn)

    if 'tg' in df.columns:
        set_bin('MetS_triglycerides', df['tg']>=150)

    if 'hdl' in df.columns:
        hdl_abn = ((sex=='male') & (df['hdl']<40)) | ((sex=='female') & (df['hdl']<50))
        set_bin('MetS_hdl', hdl_abn)

    set_bin('MetS_fpg', (df.get('fpg', pd.Series(index=df.index))>=100) | (df.get('hba1c', pd.Series(index=df.index))>=6.5))

    # count
    if 'MetS_count' in df.columns:
        m = post_mask & df['MetS_count'].isna()
        comps = [c for c in ['MetS_wc','MetS_bp','MetS_triglycerides','MetS_hdl','MetS_fpg'] if c in df.columns]
        if comps:
            comp_sum = df[comps].replace({pd.NA:0}).astype('Int8').sum(axis=1, min_count=1)
            df.loc[m, 'MetS_count'] = comp_sum[m]
            if m.any(): filled.append('MetS_count')

# ---------- Food security ----------
if 'fs_2019_2023' in globals():
    fs_cols = [c for c in ['SEQN','FS','FS_FINAL','FS_HH','FS_ADULT','HHFDSEC','ADFDSEC','FS_HH4','FS_ADULT4','FS_SOURCE_HH','FS_SOURCE_FINAL','SNAP_SOURCE'] if c in fs_2019_2023.columns]
    if fs_cols:
        df = df.merge(fs_2019_2023[fs_cols], on='SEQN', how='left', suffixes=('','_fsnew'))
        for base in [c for c in fs_cols if c!='SEQN']:
            newc = f'{base}_fsnew'
            if base in df.columns and newc in df.columns:
                m = post_mask & df[base].isna()
                df.loc[m, base] = df.loc[m, newc]
        df.drop(columns=[c for c in df.columns if c.endswith('_fsnew')], inplace=True, errors='ignore')

# ---------- Summary ----------
print("Filled (post-2018) columns:", sorted(set(filled)))

# quick coverage re-check for post cycles
post_cov = (
    df[df['SDDSRVYR'].isin(POST)]
      .groupby('SDDSRVYR')
      .apply(lambda g: g.notna().mean(numeric_only=False), include_groups=False)
      .T.round(3)
)
try:
    from IPython.display import display
    print("Fraction non-missing by cycle (post cycles):")
    display(post_cov)
except Exception:
    print(post_cov)

df_my_cov_aligned = df


Filled (post-2018) columns: ['MetS_wc']
Fraction non-missing by cycle (post cycles):


SDDSRVYR,12.0,66.0
SEQN,1.000,1.000
WTINT2YR,0.000,0.000
WTMEC2YR,1.000,0.000
WTSAF2YR,0.000,0.000
SDMVPSU,1.000,1.000
...,...,...
LBXTC,0.577,0.696
LBDHDD,0.577,0.696
LBXTR,0.000,0.299
LBXGLU,0.308,0.305


#### check after fetch and merge 

In [112]:
import pandas as pd

df = df_my_cov_aligned  # assumes it exists

# 1) Just the column names (alphabetical)
cols = sorted(df.columns.tolist())
print(f"{len(cols)} columns:\n", cols)

# 2) Detailed summary per column
summary = (
    pd.DataFrame({
        "col": df.columns,
        "dtype": df.dtypes.astype(str),
        "n_nonnull": df.notna().sum().values,
        "n_unique": df.nunique(dropna=True).values,
        "na_rate": df.isna().mean().round(3).values,
    })
    .sort_values("col")
    .reset_index(drop=True)
)
summary





143 columns:
 ['ADFDSEC', 'ALCOHOL_CAT', 'BMI', 'BMI_CLAS', 'BMXHT', 'BMXWAIST', 'BMXWT', 'CENSORED', 'CIDI_12M_MDE', 'CIDI_SCORE_RAW', 'CIGS_PER_DAY', 'CVD', 'DEP_HARMONIZED', 'DEP_IMP', 'DEP_SOURCE', 'DMDHHSIZ', 'DPQ_CAT', 'DRINKS_PER_DAY', 'EDU_CAT', 'ELIGSTAT', 'EMPLOY', 'EVENT', 'FEMALE', 'FORMER_SMOKER', 'FS', 'FSDHH', 'FS_ADULT', 'FS_ADULT4', 'FS_FINAL', 'FS_HH', 'FS_HH4', 'FS_SOURCE_FINAL', 'FS_SOURCE_HH', 'FU_YRS_EXM', 'FU_YRS_INT', 'HHFDSEC', 'HIGH_CHOL', 'HOD050', 'HOQ065', 'HTN', 'IMP', 'INDFMINC', 'INS', 'IS_ADULT', 'IS_POST2018', 'LBDHDD', 'LBXGH', 'LBXGLU', 'LBXTC', 'LBXTR', 'LTPA', 'MARITAL', 'MARITAL_CAT', 'METSCORE', 'MORTALITY_COVERED', 'MORTSTAT', 'MetS', 'MetS_bp', 'MetS_count', 'MetS_fpg', 'MetS_hdl', 'MetS_triglycerides', 'MetS_wc', 'PACK_YEARS', 'PERMTH_EXM', 'PERMTH_INT', 'PHQ9', 'PHQ9_GE10', 'RIAGENDR', 'SDDSRVYR', 'SDDSRVYR_src', 'SDMVPSU', 'SDMVSTRA', 'SEQN', 'SMK_STATUS', 'SNAP', 'SNAP_SOURCE', 'UCOD_LABEL', 'UCOD_LEADING', 'UNEMPLOYMENT', 'WTINT2YR', 'WTME

Unnamed: 0,col,dtype,n_nonnull,n_unique,na_rate
0,ADFDSEC,Int64,9798,4,0.924
1,ALCOHOL_CAT,category,68861,3,0.465
2,BMI,object,21608,20855,0.832
3,BMI_CLAS,string,87799,4,0.318
4,BMXHT,float64,109895,1225,0.147
...,...,...,...,...,...
138,stroke,Int8,109122,3,0.153
139,tchol,float64,119034,366,0.076
140,tchol_hdl,float64,119034,21058,0.076
141,tg,float64,105966,697,0.177


In [119]:
 df_my_cov_aligned[['MetS_hdl']].tail(40)

Unnamed: 0,MetS_hdl
128769,0.0
128770,0.0
128771,0.0
128772,0.0
128773,0.0
128774,0.0
128775,0.0
128776,0.0
128777,0.0
128778,0.0


#### remove duplicate columns 

In [114]:
import pandas as pd

df = df_my_cov_aligned.copy()

# -------- 1) Coverage table --------
cov = (
    pd.DataFrame({
        "col": df.columns,
        "nonnull": df.notna().sum().values,
        "na_rate": df.isna().mean().values
    })
    .assign(pct=lambda x: (x["nonnull"] / len(df)).round(3))
    .sort_values(["pct","col"], ascending=[False,True])
    .reset_index(drop=True)
)
print(f"{len(df.columns)} columns; top coverage preview:")
print(cov.head(20))

# -------- 2) Alias groups you likely have duplicates for --------
# Winner is chosen by highest non-missing. If tie, the first name in each group wins.
ALIASES = {
    # case / spelling duplicates
    "bmi":        ["BMI", "bmi"],
    "sbp":        ["SBP", "sbp"],
    "dbp":        ["DBP", "dbp"],
    "tchol":      ["tchol", "LBXTC"],
    "hdl":        ["hdl", "LBDHDD"],
    "tg":         ["tg", "LBXTR"],
    "fpg":        ["fpg", "LBXGLU"],
    "hba1c":      ["hba1c", "LBXGH"],
    "wc":         ["wc", "BMXWAIST"],
    "bmxht":      ["BMXHT", "bmxht"],
    "bmxwt":      ["BMXWT", "bmxwt"],
    "tchol_hdl":  ["tchol_hdl"],
    "sddsrvyr":   ["SDDSRVYR", "SDDSRVYR_src"],

    # common harmonized vs raw pairs you showed:
    "chd":        ["chd"],
    "mi":         ["mi"],
    "chf":        ["chf"],
    "stroke":     ["stroke"],
    "angina":     ["angina"],

    # lowercase versions of PRI/SEC sets (kept as-is; not merged):
    # 'sbp/dbp' above already mapped; these stay independent scores:
    # adiposity_pri/sec, bp_pri/sec, lipid_pri/sec, glucose_pri/sec, cvd_pri/sec ...
}

# Only consider aliases that actually exist in your df
ALIASES = {k:[c for c in v if c in df.columns] for k,v in ALIASES.items()}
ALIASES = {k:v for k,v in ALIASES.items() if len(v) >= 2}

# -------- 3) Decide winners by coverage --------
plan = []
for canon, cols in ALIASES.items():
    sub = cov[cov["col"].isin(cols)].sort_values(["pct", "col"], ascending=[False, True])
    winner = sub.iloc[0]["col"]
    losers = [c for c in cols if c != winner]
    plan.append((canon, winner, losers, sub[["col","pct","nonnull","na_rate"]].reset_index(drop=True)))

print("\nConsolidation plan (preview):")
for canon, winner, losers, stats in plan:
    print(f"\n[{canon}] winner -> {winner}; drop -> {losers}")
    print(stats)

# -------- 4) Apply coalesce-and-drop (safe by default) --------
EXECUTE = False  # <<< set True to modify df_my_cov_aligned

if EXECUTE:
    for canon, winner, losers, _ in plan:
        # fill winner with any non-missing from losers (in order)
        for lo in losers:
            df[winner] = df[winner].where(df[winner].notna(), df[lo])
        # drop losers
        df.drop(columns=losers, inplace=True, errors="ignore")

    # Optional: enforce canonical names (rename raw to tidy lowercase where desired)
    RENAME = {
        "LBXTC":"tchol", "LBDHDD":"hdl", "LBXTR":"tg", "LBXGLU":"fpg", "LBXGH":"hba1c",
        "BMXHT":"BMXHT", "BMXWT":"BMXWT",  # keep CDC BMI sources uppercase (your choice)
        "SDDSRVYR":"SDDSRVYR",  # keep canonical survey cycle name
    }
    # Only rename if present and not already the desired name
    df.rename(columns={k:v for k,v in RENAME.items() if k in df.columns and k != v}, inplace=True)

    # Write back
    df_my_cov_aligned = df
    print("\n✓ Applied consolidation. New column count:", df.shape[1])

# -------- 5) Quick report of resolved duplicates --------
resolved = {canon:(winner, losers) for canon, winner, losers, _ in plan}
print("\nResolved duplicates summary:")
for k,(w,losers) in resolved.items():
    print(f"- {k}: kept {w}; dropped {losers if losers else '[]'}")


143 columns; top coverage preview:
                   col  nonnull   na_rate    pct
0               FEMALE   128809  0.000000  1.000
1             IS_ADULT   128809  0.000000  1.000
2          IS_POST2018   128809  0.000000  1.000
3    MORTALITY_COVERED   128809  0.000000  1.000
4           MetS_count   128809  0.000000  1.000
5             MetS_fpg   128809  0.000000  1.000
6             MetS_hdl   128809  0.000000  1.000
7   MetS_triglycerides   128809  0.000000  1.000
8              MetS_wc   128809  0.000000  1.000
9             RIAGENDR   128809  0.000000  1.000
10            SDDSRVYR   128809  0.000000  1.000
11             SDMVPSU   128809  0.000000  1.000
12            SDMVSTRA   128809  0.000000  1.000
13                SEQN   128809  0.000000  1.000
14                 age   128809  0.000000  1.000
15                  re   128809  0.000000  1.000
16                 sex   128809  0.000000  1.000
17                  wc   122080  0.052240  0.948
18                 hdl   119034  0

#### try fetch from nhance web