# Backfill missing covariates

In [None]:
## Check before starting work

In [159]:
# Close any open plots and free memory
import gc
try:
    # Best effort: if matplotlib cannot be imported (or closing figures fails)
    # there is nothing to clean up, so the error is deliberately ignored.
    import matplotlib.pyplot as plt
    plt.close('all')
except Exception:
    pass
gc.collect()

# Clear the interactive output area
from IPython.display import clear_output
clear_output(wait=True)

# Wipe every user-defined name from the interactive namespace without prompting.
# NOTE: this also removes the names bound by the imports above (gc, plt,
# clear_output) -- the `%who` check right after reports an empty namespace --
# so every later cell must re-import what it uses.
%reset -f


In [160]:
# Sanity check: list the remaining interactive variables (expected: none after the reset).
%who  

Interactive namespace is empty.


## Load previous cov file

In [161]:
import pandas as pd
from pathlib import Path

# NOTE(review): absolute, machine-specific path -- the notebook only runs on this
# workstation as written; consider a configurable project-relative directory.
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
src = OUT / "cov_concise_99_23.parquet"

# Load the covariate table and normalise the two key columns:
#   SEQN     -> nullable Int64 (unparseable values become <NA>)
#   SDDSRVYR -> numeric (unparseable values become NaN)
df_my_cov_aligned_short = pd.read_parquet(src)
df_my_cov_aligned_short["SEQN"] = pd.to_numeric(df_my_cov_aligned_short["SEQN"], errors="coerce").astype("Int64")
df_my_cov_aligned_short["SDDSRVYR"] = pd.to_numeric(df_my_cov_aligned_short["SDDSRVYR"], errors="coerce")
# Guarantee a SNAP column exists (all-missing, nullable Int64) so later cells can fill it.
if "SNAP" not in df_my_cov_aligned_short.columns:
    df_my_cov_aligned_short["SNAP"] = pd.Series(pd.NA, index=df_my_cov_aligned_short.index, dtype="Int64")

print("✓ Loaded cov_concise_99_23:", df_my_cov_aligned_short.shape)


✓ Loaded cov_concise_99_23: (128809, 51)


In [162]:
# Spot-check the BMI category column (first 20 rows) to confirm the load looks sane.
df_my_cov_aligned_short["bmic"].head(20)

0      UNDER
1     NORMAL
2      UNDER
3       <NA>
4       OVER
5     NORMAL
6       OVER
7      UNDER
8      UNDER
9      OBESE
10    NORMAL
11     OBESE
12      OVER
13      OVER
14      OVER
15    NORMAL
16     UNDER
17      <NA>
18      <NA>
19    NORMAL
Name: bmic, dtype: string

## SNAP pre2003

In [163]:
# 08_snap_backfill: Fill SNAP in 1999–2002 and re-check coverage
# - Load FSQ (1999) / FSQ_B (2001)
# - Hierarchy: FSD200 (current) → FSD180 (past yr) → HH proxy FSD170N (promote if HH size==1)
# - Overwrite only cycles 1 & 2, keep provenance in SNAP_src
# - Build SNAP_bin (0/1/NA) and print audits

import pandas as pd, numpy as np, io, requests
from typing import Optional

# ---------- fetch helper (fix early-cycle year folders) ----------
_YEARFOLDER_FIX = {"1999-2000": "1999", "2001-2002": "2001"}

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60.0) -> pd.DataFrame:
    """
    Download one NHANES XPT file from the CDC site and return it as a DataFrame.

    Parameters
    ----------
    year_folder : folder name in the CDC URL; values found in `_YEARFOLDER_FIX`
        (e.g. "1999-2000") are translated to the folder name used there.
    filebase : file name without the ".xpt" extension (e.g. "FSQ").
    timeout : seconds to wait for the server before giving up. Without a timeout
        `requests` waits indefinitely, which can hang the kernel on a stalled
        connection.

    Returns
    -------
    DataFrame with upper-cased column names and SEQN as nullable Int64.

    Raises
    ------
    requests.HTTPError on a non-2xx response, requests.Timeout when the server
    does not answer in time, KeyError if the file has no SEQN column.
    """
    yf = _YEARFOLDER_FIX.get(year_folder, year_folder)
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{yf}/DataFiles/{filebase}.xpt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    df = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    df.columns = [c.upper() for c in df.columns]
    df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
    return df

# ---------- pull SNAP-relevant columns from FSQ ----------
def load_fsq_for_cycle(year_folder: str, filebase: str) -> pd.DataFrame:
    """
    Fetch one FSQ file and keep only the SNAP-related columns, indexed by SEQN.

    Of FSD200, FSD180, FSD170N and DMDHHSIZ, only those actually present in the
    file are returned; every kept column is coerced to numeric (bad values -> NaN).
    """
    wanted = ("FSD200", "FSD180", "FSD170N", "DMDHHSIZ")
    by_seqn = fetch_xpt(year_folder, filebase).set_index("SEQN")
    present = [name for name in wanted if name in by_seqn.columns]
    subset = by_seqn[present]
    return subset.apply(pd.to_numeric, errors="coerce")

def fill_snap_hierarchy(df: pd.DataFrame, fsq: pd.DataFrame, cycle_code: int) -> pd.DataFrame:
    """
    Backfill SNAP for the rows of one survey cycle from FSQ answers.

    For rows with SDDSRVYR == cycle_code the first source that gives an answer wins:
      1) FSD200: 1→1, 2→0
      2) FSD180: 1→1, 2→0 (fill remaining)
      3) FSD170N (HH proxy): >0→1, ==0→0 (fill remaining); labelled "FSD170N_HH",
         or "FSD170N_HH_singleton" when DMDHHSIZ == 1.

    Rows the hierarchy resolves get SNAP (0/1) and SNAP_src (source label) overwritten.
    Rows it cannot resolve are left untouched, so a SNAP value that was already
    present in `df` is never replaced by NA (a backfill must not lose data).

    Parameters
    ----------
    df : covariate table with at least SEQN and SDDSRVYR; not modified.
    fsq : FSQ subset indexed by SEQN (as returned by `load_fsq_for_cycle`).
    cycle_code : SDDSRVYR value of the cycle to fill.

    Returns
    -------
    A copy of `df`; SNAP / SNAP_src are created if missing (only when the cycle
    has rows). Rows of other cycles are never written.
    """
    out = df.copy()
    m = out["SDDSRVYR"].eq(float(cycle_code))
    if not m.any():
        return out

    def _as_mask(cond: pd.Series):
        # Plain numpy booleans: a missing comparison result counts as "no match".
        return cond.fillna(False).to_numpy(dtype=bool)

    # FSQ rows lined up positionally with the cycle's rows in `out`
    # (participants absent from FSQ become all-NaN rows).
    seqn = out.loc[m, "SEQN"].astype("Int64")
    sub = fsq.reindex(seqn.values)

    snap = pd.Series(pd.NA, index=sub.index, dtype="Int64")
    src  = pd.Series(pd.NA, index=sub.index, dtype="string")

    # 1) Current authorization
    if "FSD200" in sub.columns:
        s = sub["FSD200"]
        yes, no = _as_mask(s.eq(1)), _as_mask(s.eq(2))
        snap.loc[yes] = 1
        snap.loc[no] = 0
        src.loc[yes | no] = "FSD200"

    # 2) Past-year authorization (only rows still unresolved)
    need = snap.isna().to_numpy()
    if "FSD180" in sub.columns:
        s = sub["FSD180"]
        yes, no = need & _as_mask(s.eq(1)), need & _as_mask(s.eq(2))
        snap.loc[yes] = 1
        snap.loc[no] = 0
        src.loc[yes | no] = "FSD180"

    # 3) HH proxy (any authorized in HH), only rows still unresolved
    need = snap.isna().to_numpy()
    if "FSD170N" in sub.columns:
        h = sub["FSD170N"]
        some, none = need & _as_mask(h > 0), need & _as_mask(h == 0)
        snap.loc[some] = 1
        snap.loc[none] = 0
        hh_proxy = some | none
        src.loc[hh_proxy] = "FSD170N_HH"

        # Promote to individual if household size==1. The proxy mask is tracked
        # directly instead of comparing the nullable string labels, which would
        # yield a mask containing NA.
        # NOTE(review): the recorded source counts contain no
        # "FSD170N_HH_singleton" rows, so DMDHHSIZ may not be present in the FSQ
        # files at all -- confirm whether it should come from elsewhere.
        if "DMDHHSIZ" in sub.columns:
            promote = hh_proxy & _as_mask(sub["DMDHHSIZ"].eq(1))
            src.loc[promote] = "FSD170N_HH_singleton"

    # ensure cols exist
    if "SNAP" not in out.columns:
        out["SNAP"] = pd.Series(pd.NA, index=out.index, dtype="Int64")
    if "SNAP_src" not in out.columns:
        out["SNAP_src"] = pd.Series(pd.NA, index=out.index, dtype="string")

    # Write back only the rows the hierarchy resolved; unresolved rows keep
    # whatever they held before instead of being blanked.
    in_cycle = _as_mask(m)
    resolved = snap.notna().to_numpy()
    write = in_cycle.copy()
    write[in_cycle] = resolved
    if resolved.any():
        out.loc[write, "SNAP"] = snap[resolved].to_numpy(dtype="int64")
        out.loc[write, "SNAP_src"] = src[resolved].to_numpy(dtype=object)
    return out

# ---------- PRE audit ----------
def print_coverage(df: pd.DataFrame, col: str, title: str):
    """
    Print, per SDDSRVYR cycle, the percentage of rows where `col` is non-missing.

    Parameters
    ----------
    df : table containing `col` and SDDSRVYR.
    col : column whose coverage is reported.
    title : heading printed above the per-cycle percentages (rounded to 1 decimal).

    The coverage is computed with a plain grouped mean of the non-missing flag.
    This avoids passing `include_groups=` to a Series groupby `apply`, where it
    has no meaning and, on pandas versions that do not know the keyword, gets
    forwarded to the lambda and raises TypeError.
    """
    present = df[col].notna()
    cov = (present.groupby(df["SDDSRVYR"], observed=True)
                  .mean()
                  .mul(100)
                  .round(1))
    print(f"\n{title}\n", cov)

# Baseline coverage before any backfill, for comparison with the AFTER audit below.
print_coverage(df_my_cov_aligned_short, "SNAP", "SNAP coverage by cycle (% non-missing) — BEFORE")

# ---------- apply to cycles 1 & 2 ----------
# Two network downloads (FSQ for 1999-2000, FSQ_B for 2001-2002).
fsq_9900 = load_fsq_for_cycle("1999-2000", "FSQ")
fsq_0102 = load_fsq_for_cycle("2001-2002", "FSQ_B")

# Count non-missing SNAP rows before/after so the net effect of the fill is visible.
# NOTE: this rebinds df_my_cov_aligned_short, so re-running the cell operates on
# the already-filled frame rather than the freshly loaded one.
before1 = df_my_cov_aligned_short["SNAP"].notna().sum()
df_my_cov_aligned_short = fill_snap_hierarchy(df_my_cov_aligned_short, fsq_9900, 1)
df_my_cov_aligned_short = fill_snap_hierarchy(df_my_cov_aligned_short, fsq_0102, 2)
after1  = df_my_cov_aligned_short["SNAP"].notna().sum()
print(f"\nFilled cycles 1–2 | rows with SNAP non-missing: {before1} → {after1}")

# ---------- binarize & audits ----------
# SNAP is already 0/1/NA; keep a dedicated binary column for clarity
df_my_cov_aligned_short["SNAP_bin"] = df_my_cov_aligned_short["SNAP"].astype("Int64")

print_coverage(df_my_cov_aligned_short, "SNAP", "SNAP coverage by cycle (% non-missing) — AFTER")

# source breakdown for early cycles
# (groupby drops rows whose SNAP_src is missing, so only filled rows are counted)
src_counts = (df_my_cov_aligned_short
    .loc[df_my_cov_aligned_short["SDDSRVYR"].isin([1.0, 2.0])]
    .groupby(["SDDSRVYR", "SNAP_src"], observed=True)["SEQN"]
    .count()
    .sort_index())
print("\nSNAP source counts (cycles 1–2):\n", src_counts)



SNAP coverage by cycle (% non-missing) — BEFORE
 SDDSRVYR
1.0      4.8
2.0      4.2
3.0     48.9
4.0     47.5
5.0     57.9
6.0     58.3
7.0     56.5
8.0     55.9
9.0     55.0
10.0    56.2
12.0     0.0
66.0    54.5
Name: SNAP, dtype: float64

Filled cycles 1–2 | rows with SNAP non-missing: 53193 → 55371

SNAP coverage by cycle (% non-missing) — AFTER
 SDDSRVYR
1.0     15.1
2.0     14.6
3.0     48.9
4.0     47.5
5.0     57.9
6.0     58.3
7.0     56.5
8.0     55.9
9.0     55.0
10.0    56.2
12.0     0.0
66.0    54.5
Name: SNAP, dtype: float64

SNAP source counts (cycles 1–2):
 SDDSRVYR  SNAP_src  
1.0       FSD170N_HH      8
          FSD180        948
          FSD200        548
2.0       FSD180        893
          FSD200        716
Name: SEQN, dtype: Int64


#### check 

In [164]:
# --- Guardrails: ensure ONLY cycles 1–2 changed ---
# NOTE(review): this checks provenance only (SNAP_src is set outside cycles 1–2);
# it does not compare SNAP values against the pre-fill frame.
_changed = (
    df_my_cov_aligned_short["SNAP_src"].notna()
    & ~df_my_cov_aligned_short["SDDSRVYR"].isin([1.0, 2.0])
)
assert not _changed.any(), "SNAP was modified outside cycles 1–2 unexpectedly."

# --- Provenance-aware flags (nullable Int64-safe) ---
# Higher rank = more direct evidence; sources missing from the map yield <NA>.
src_rank = {
    "FSD200": 3,                    # current, individual
    "FSD180": 2,                    # past 12m, individual
    "FSD170N_HH_singleton": 2,      # HH proxy but singleton household
    "FSD170N_HH": 1                 # HH proxy (multi-person)
}
df_my_cov_aligned_short["SNAP_src_rank"] = (
    df_my_cov_aligned_short["SNAP_src"].map(src_rank).astype("Int64")
)

# --- Sensitivity variants (build as Pandas Series, not NumPy arrays) ---
# Rows without a SNAP_src (every cycle other than 1–2) are in neither mask, so
# both variants stay <NA> there.
mask_indiv = df_my_cov_aligned_short["SNAP_src"].isin(["FSD200", "FSD180"])
mask_indiv_or_single = df_my_cov_aligned_short["SNAP_src"].isin(
    ["FSD200", "FSD180", "FSD170N_HH_singleton"]
)

# Start as all NA (nullable Int64), then fill by mask
df_my_cov_aligned_short["SNAP_indiv_only"] = pd.Series(
    pd.NA, index=df_my_cov_aligned_short.index, dtype="Int64"
)
df_my_cov_aligned_short.loc[mask_indiv, "SNAP_indiv_only"] = (
    pd.to_numeric(df_my_cov_aligned_short.loc[mask_indiv, "SNAP"], errors="coerce")
      .astype("Int64")
)

df_my_cov_aligned_short["SNAP_indiv_plus_singleton"] = pd.Series(
    pd.NA, index=df_my_cov_aligned_short.index, dtype="Int64"
)
df_my_cov_aligned_short.loc[mask_indiv_or_single, "SNAP_indiv_plus_singleton"] = (
    pd.to_numeric(df_my_cov_aligned_short.loc[mask_indiv_or_single, "SNAP"], errors="coerce")
      .astype("Int64")
)

# Ensure the main binary is tidy nullable Int64 too
df_my_cov_aligned_short["SNAP_bin"] = pd.to_numeric(
    df_my_cov_aligned_short["SNAP"], errors="coerce"
).astype("Int64")

# --- Quick QC: adult participation rates by cycle (no save) ---
# Rows with missing age are excluded (the comparison is not True for them).
adults = df_my_cov_aligned_short.loc[df_my_cov_aligned_short["RIDAGEYR"] >= 18]

def _rate(s: pd.Series) -> float:
    s = pd.to_numeric(s, errors="coerce")
    denom = s.notna().sum()
    return float((s.eq(1)).sum() / denom * 100) if denom else float("nan")

# Per-cycle adult participation rate for the main flag and both sensitivity
# variants; the variants are only populated where SNAP_src is set (cycles 1–2).
# NOTE(review): in the recorded run the cycle 1–2 rates (~49–55%) are far above
# later cycles (~11–26%); worth confirming that the FSQ items are not restricted
# to a pre-screened subsample before pooling across cycles.
qc = pd.DataFrame({
    "SNAP_bin_rate": adults.groupby("SDDSRVYR", observed=True)["SNAP_bin"].apply(_rate),
    "indiv_only_rate": adults.groupby("SDDSRVYR", observed=True)["SNAP_indiv_only"].apply(_rate),
    "indiv+singleton_rate": adults.groupby("SDDSRVYR", observed=True)["SNAP_indiv_plus_singleton"].apply(_rate),
}).round(2)

print("\nAdult SNAP participation rates by cycle (% among non-missing):\n", qc)

# Optional: free memory if not reusing FSQ pulls
# del fsq_9900, fsq_0102



Adult SNAP participation rates by cycle (% among non-missing):
           SNAP_bin_rate  indiv_only_rate  indiv+singleton_rate
SDDSRVYR                                                      
1.0               48.73            48.17                 48.17
2.0               54.66            54.66                 54.66
3.0               12.72              NaN                   NaN
4.0               11.10              NaN                   NaN
5.0               15.68              NaN                   NaN
6.0               19.82              NaN                   NaN
7.0               23.63              NaN                   NaN
8.0               22.37              NaN                   NaN
9.0               25.86              NaN                   NaN
10.0              23.67              NaN                   NaN
12.0                NaN              NaN                   NaN
66.0              24.46              NaN                   NaN


In [165]:
# Confirm the SNAP helper columns were appended to the covariate table.
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       'household_size', 'EDU', 'pir', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER',
       'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'bmic', 'DIABE', 'HYPERTEN', 'chol_rx',
       'CVD', 'cancer', 'probable_depression', 'ahei_total', 'unemployment2',
       'sdoh_access', 'ins', 'HOQ065', 'marriage', 'SNAP', 'FS', 'WTINT2YR',
       'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR', 'WTINTPRP', 'WTMECPRP',
       'WTSAFPRP', 'wt_int', 'wt_mec', 'wt_fasting', 'wt_phlebotomy',
       'WTPH2YR', 'marriage_prev', 'marriage_label', 'marriage3', 'SNAP_src',
       'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton'],
      dtype='object')

## check major missingness 

In [6]:
import pandas as pd
import numpy as np

# Alias only (no copy): `df` and df_my_cov_aligned_short are the same object.
df = df_my_cov_aligned_short

# --- keep only SNAP_bin & marriage3; drop helpers ---
snap_drop = {"SNAP","SNAP_prev","SNAP_src","SNAP_src_rank","SNAP_indiv_only","SNAP_indiv_plus_singleton"}
marriage_drop = {"marriage","marriage_prev","marriage_label"}
# `|` binds tighter than `not in`: a column is kept when it is in neither drop set.
cols = [c for c in df.columns if c not in snap_drop | marriage_drop]
# Safety net: re-add the two headline columns if a drop set ever excluded them.
for must in ["SNAP_bin","marriage3"]:
    if must in df.columns and must not in cols:
        cols.append(must)

# -------- eligibility masks --------
def smokers_mask(d):
    """
    Row mask of participants eligible for smoking-intensity variables.

    With SMK_STATUS present: rows whose status is non-missing and not equal to 1.
    Otherwise: rows where FORMER_SMOKER == 1 or SMK == 1, using whichever of the
    two columns exist. When neither exists the mask is all False (previously this
    case crashed, because two plain Python `False` values have no `.fillna`).

    Returns a boolean Series aligned to `d.index`.
    """
    if "SMK_STATUS" in d.columns:        # 1=never, 2=former, 3=current (adjust if different)
        # NOTE(review): if SMK_STATUS holds labels rather than numeric codes,
        # `!= 1` is True for every non-missing row and the mask degenerates to
        # "status is known" -- confirm the coding.
        return d["SMK_STATUS"].notna() & (d["SMK_STATUS"] != 1)
    nobody = pd.Series(False, index=d.index)
    m1 = d["FORMER_SMOKER"].eq(1) if "FORMER_SMOKER" in d.columns else nobody
    m2 = d["SMK"].eq(1) if "SMK" in d.columns else nobody
    return (m1 | m2).fillna(False)

def adults_mask(d):
    """Row mask: RIDAGEYR parses as a number and is at least 18 (missing age -> False)."""
    age = pd.to_numeric(d["RIDAGEYR"], errors="coerce")
    is_adult = age.ge(18)
    return is_adult.fillna(False)

def fasting_mask(d):  # prefer an actual fasting lab variable if present
    """
    Row mask for the fasting subsample: non-missing values of the first column
    found among P_GLU, LBXGLU, fasting_glucose, glucose_fasting, wt_fasting
    (in that order). All False when none of them exists.
    """
    candidates = ("P_GLU", "LBXGLU", "fasting_glucose", "glucose_fasting", "wt_fasting")
    first_hit = next((name for name in candidates if name in d.columns), None)
    if first_hit is None:
        # fallback: very conservative (no rows)
        return pd.Series(False, index=d.index)
    return d[first_hit].notna()

def phleb_mask(d):
    """Row mask for the phlebotomy subsample: wt_phlebotomy is non-missing."""
    if "wt_phlebotomy" not in d.columns:
        # fallback: very conservative
        return pd.Series(False, index=d.index)
    return d["wt_phlebotomy"].notna()

# Per-variable eligibility (row-level denominator)
# Maps a column name to the function that builds its row-level denominator;
# columns not listed here are treated as applicable to every row.
eligibility = {
    "CIGS_PER_DAY": smokers_mask,
    "PACK_YEARS": smokers_mask,
    "SMK_AVG": smokers_mask,
    "probable_depression": adults_mask,
    # weights: compute missingness only where applicable to the sub-sample
    "WTSAFPRP": fasting_mask,            # fasting subsample (e.g., P_GLU universe)
    "WTPH2YR": phleb_mask,               # phlebotomy subsample
}

# Cycle-level applicability helper (exclude cycles where a var basically doesn't exist)
def applicable_cycles(d, col, thresh=0.10):
    """
    Set of SDDSRVYR values in which `col` is non-missing for more than `thresh`
    (a fraction, not a percent) of the cycle's rows.
    """
    share_present = d[col].notna().groupby(d["SDDSRVYR"], observed=True).mean()
    above = share_present.gt(thresh).to_numpy()
    return set(share_present.index[above].tolist())

# Some weights are intended only for specific cycles (override presence heuristic)
# Column -> explicit set of SDDSRVYR codes; takes precedence over the
# presence heuristic in applicable_cycles() for the listed columns.
cycle_overrides = {
    "WTINTPRP": {66.0},   # pre-pandemic combined cycle
    "WTMECPRP": {66.0},
    # Add any others you use with known cycle scopes
    # "WTINT4YR": {9.0, 10.0},  # example if you want to pin 4-year weights
    # "WTMEC4YR": {9.0, 10.0},
}

# -------- compute effective missingness --------
# Result containers: one row per cycle (integer-labelled), one column per audited
# variable, plus a dict with each variable's pooled figure.
cycles = sorted(df["SDDSRVYR"].dropna().unique().tolist())
eff_miss = pd.DataFrame(index=[int(c) for c in cycles], columns=[])
overall_eff = {}

for col in cols:
    # 1) row-level eligibility
    #    (columns without an entry in `eligibility` count every row as eligible)
    elig = eligibility.get(col, lambda d: pd.Series(True, index=d.index))(df)

    # 2) cycle applicability
    if col in cycle_overrides:
        apps = cycle_overrides[col]
    else:
        apps = applicable_cycles(df, col, thresh=0.10)  # empirical presence

    # A column applicable to no cycle gets an all-False mask, hence NaN results below.
    cyc_mask = df["SDDSRVYR"].isin(list(apps)) if apps else pd.Series(False, index=df.index)
    use = elig & cyc_mask

    # per-cycle effective missingness
    # (NaN marks cycles with an empty denominator, i.e. "not applicable")
    per_cyc = []
    for cyc in cycles:
        m = use & df["SDDSRVYR"].eq(cyc)
        denom = m.sum()
        per_cyc.append(np.nan if denom == 0 else round(df.loc[m, col].isna().mean()*100, 1))
    # Positional assignment: per_cyc follows the order of `cycles`, which is also
    # the order the index was built in.
    eff_miss[col] = per_cyc

    # overall effective missingness
    denom_all = use.sum()
    overall_eff[col] = np.nan if denom_all == 0 else round(df.loc[use, col].isna().mean()*100, 1)

eff_miss.index.name = "SDDSRVYR"
eff_miss = eff_miss.sort_index()
overall_eff = pd.Series(overall_eff, name="OVERALL_effective_%miss").sort_values(ascending=False)

print("Top 20 by OVERALL *effective* % missing (weights handled as subsamples):\n",
      overall_eff.head(20))

# `display` is provided by the IPython/Jupyter session (it is not imported in this cell).
top20_cols = overall_eff.head(20).index.tolist()
display(eff_miss[top20_cols])


Top 20 by OVERALL *effective* % missing (weights handled as subsamples):
 CIGS_PER_DAY      79.4
SMK_AVG           79.4
PACK_YEARS        77.9
DRINKS_PER_DAY    71.7
ahei_total        54.4
SNAP_bin          52.6
FS                46.9
HOQ065            46.7
ALCOHOL_CAT       46.5
ALCG2             46.5
sdoh_access       46.1
SMK_STATUS        45.7
SMK               45.7
FORMER_SMOKER     45.6
unemployment2     44.0
met_hr            43.5
marriage3         39.0
ins               38.4
wt_phlebotomy     32.4
wt_fasting        24.1
Name: OVERALL_effective_%miss, dtype: float64


Unnamed: 0_level_0,CIGS_PER_DAY,SMK_AVG,PACK_YEARS,DRINKS_PER_DAY,ahei_total,SNAP_bin,FS,HOQ065,ALCOHOL_CAT,ALCG2,sdoh_access,SMK_STATUS,SMK,FORMER_SMOKER,unemployment2,met_hr,marriage3,ins,wt_phlebotomy,wt_fasting
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,79.5,79.5,76.4,75.7,19.0,84.9,52.4,52.0,55.4,55.4,51.9,51.2,51.2,51.0,51.0,51.0,39.2,52.4,,13.6
2,78.3,78.3,76.9,75.4,18.2,85.4,54.2,51.9,54.5,54.5,51.8,51.1,51.1,51.0,51.0,51.0,34.3,52.3,,11.8
3,77.8,77.8,75.7,75.5,24.5,51.1,52.4,50.8,53.2,53.2,50.8,50.3,50.3,50.2,50.2,50.2,33.2,51.0,,11.8
4,78.3,78.3,77.7,75.4,69.5,52.5,52.4,52.4,53.9,53.9,52.3,51.9,51.9,51.9,51.9,51.9,35.4,54.6,,13.0
5,77.9,77.9,77.3,70.1,68.9,42.1,42.1,42.1,43.8,43.8,42.1,41.6,41.6,41.5,41.5,41.5,41.6,44.5,,13.8
6,78.5,78.5,78.0,68.7,67.1,41.7,41.7,41.4,42.5,42.5,41.3,41.0,41.0,41.0,41.0,41.0,41.0,44.6,,12.3
7,80.3,80.3,79.9,68.7,69.5,43.5,43.3,43.4,42.4,42.4,43.3,43.1,43.1,43.0,43.0,43.0,43.1,46.7,,11.9
8,79.6,79.6,79.4,66.9,70.1,44.1,44.1,44.1,41.8,41.8,44.0,43.3,43.3,43.3,43.3,43.3,43.3,46.9,,12.1
9,81.6,81.6,79.6,68.4,71.3,45.0,44.7,44.8,42.5,42.5,44.6,42.8,42.8,42.6,42.6,42.6,42.7,48.7,,12.9
10,82.2,82.2,77.8,,70.1,43.8,43.3,43.3,40.2,40.2,43.2,39.8,39.8,39.8,39.8,39.8,39.9,44.7,,12.0


## Check: focus on pre-2018 missingness

In [7]:
import pandas as pd
import numpy as np

# Alias only (no copy): `df` and df_my_cov_aligned_short are the same object.
df = df_my_cov_aligned_short

# ---------- config ----------
PRE2018_CYCLES = list(range(1, 11))          # cycles 1..10
THRESH = 20.0                                 # non-missingness (%)
IGNORE_VARS = {"CIGS_PER_DAY","SMK_AVG","PACK_YEARS","DRINKS_PER_DAY"}  # your expected-missing set
SNAP_HELPERS = {"SNAP","SNAP_prev","SNAP_src","SNAP_src_rank","SNAP_indiv_only","SNAP_indiv_plus_singleton"}
MARRIAGE_HELPERS = {"marriage","marriage_prev","marriage_label"}
ID_VARS = {"SEQN","SDDSRVYR"}

# keep only SNAP_bin & marriage3 for those topics
cols = []
for c in df.columns:
    if c in ID_VARS:                         # we won't audit these here
        continue
    if c in SNAP_HELPERS or c in MARRIAGE_HELPERS:
        continue
    if c in IGNORE_VARS:                     # skip your “expected missing” vars
        continue
    # Only upper-case "WT" prefixes are skipped here, so lower-case wt_* columns
    # (e.g. wt_phlebotomy) are still audited and can be flagged.
    if c.startswith("WT"):                   # skip structural weight columns (e.g., WTSAFPRP, WTMECPRP...)
        continue
    cols.append(c)

# ensure main signals present if available
for must in ["SNAP_bin","marriage3"]:
    if must in df.columns and must not in cols:
        cols.append(must)

# subset to pre-2018 cycles
pre_mask = df["SDDSRVYR"].isin(PRE2018_CYCLES)
pre = df.loc[pre_mask, cols + ["SDDSRVYR"]].copy()

# non-missingness (%) by cycle
# NOTE: `include_groups` is only understood by recent pandas releases; on older
# ones this call fails -- confirm the pinned pandas version.
nonmiss = pre.groupby("SDDSRVYR", observed=True)[cols].apply(
    lambda x: x.notna().mean() * 100, include_groups=False
).round(1).sort_index()

# find columns with any cycle < THRESH
flag_mask = nonmiss.lt(THRESH)
flagged_cols = flag_mask.any(axis=0)
flagged = nonmiss.loc[:, flagged_cols]

# summary table
# (sorted so variables failing in the most cycles, then the emptiest, come first)
summary = pd.DataFrame({
    "min_nonmiss_%": flagged.min(axis=0),
    "max_nonmiss_%": flagged.max(axis=0),
    "cycles_below_thresh": flag_mask.loc[:, flagged_cols].sum(axis=0).astype(int),
    "cycles_triggering": flag_mask.loc[:, flagged_cols].apply(lambda s: [int(c) for c in s.index[s].tolist()], axis=0)
}).sort_values(["cycles_below_thresh","min_nonmiss_%"], ascending=[False, True])

print(f"Pre-2018 major-missing check (non-missing < {THRESH}% in any cycle):")
print(f"Flagged variables: {len(summary)}")
display(summary)

print("\nPer-cycle non-missingness (%) for flagged variables (rows = cycles 1..10):")
display(flagged)


Pre-2018 major-missing check (non-missing < 20.0% in any cycle):
Flagged variables: 3


Unnamed: 0,min_nonmiss_%,max_nonmiss_%,cycles_below_thresh,cycles_triggering
wt_phlebotomy,0.0,0.0,10,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
probable_depression,6.8,60.4,3,"[1, 2, 3]"
SNAP_bin,14.6,58.3,2,"[1, 2]"



Per-cycle non-missingness (%) for flagged variables (rows = cycles 1..10):


Unnamed: 0_level_0,probable_depression,wt_phlebotomy,SNAP_bin
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,7.2,0.0,15.1
2.0,7.4,0.0,14.6
3.0,6.8,0.0,48.9
4.0,51.5,0.0,47.5
5.0,59.1,0.0,57.9
6.0,60.4,0.0,58.3
7.0,57.6,0.0,56.5
8.0,58.2,0.0,55.9
9.0,57.5,0.0,55.0
10.0,59.8,0.0,56.2


In [None]:
######  wt_phlebotomy: missingness expected, as only one cycle (21–23) has it
######  probable_depression: missingness expected for cycles 1–3, as only a small subsample was tested (CIDI used)
######  SNAP_bin: missingness expected; the questionnaire items behind it have many missing responses

In [8]:
# Column inventory before moving on to the later cycles.
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       'household_size', 'EDU', 'pir', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER',
       'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'bmic', 'DIABE', 'HYPERTEN', 'chol_rx',
       'CVD', 'cancer', 'probable_depression', 'ahei_total', 'unemployment2',
       'sdoh_access', 'ins', 'HOQ065', 'marriage', 'SNAP', 'FS', 'WTINT2YR',
       'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR', 'WTINTPRP', 'WTMECPRP',
       'WTSAFPRP', 'wt_int', 'wt_mec', 'wt_fasting', 'wt_phlebotomy',
       'WTPH2YR', 'marriage_prev', 'marriage_label', 'marriage3', 'SNAP_src',
       'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton'],
      dtype='object')

## Continue fetching / working on post-2018 missingness

#### first check what is still missing 

In [166]:
import pandas as pd
import numpy as np
import re, io, requests

# Alias only (no copy): `df` and df_my_cov_aligned_short are the same object.
df = df_my_cov_aligned_short

# ---------------- config ----------------
# SDDSRVYR codes audited in this cell.
POST2018_CYCLES = [66.0, 12.0]
IGNORE_EXPECTED = {"CIGS_PER_DAY","SMK_AVG","PACK_YEARS","DRINKS_PER_DAY"}  # your expected-missing vars
# Helper/provenance columns excluded from the audit (SNAP_bin and marriage3 stay in).
SNAP_HELPERS = {"SNAP","SNAP_prev","SNAP_src","SNAP_src_rank","SNAP_indiv_only","SNAP_indiv_plus_singleton"}
MARRIAGE_HELPERS = {"marriage","marriage_prev","marriage_label"}
ID_VARS = {"SEQN","SDDSRVYR"}
# ignore structural weights (both WT* and your wt_* flags)
def is_weight_col(c: str) -> bool:
    """True when the column name begins with "WT" or "wt_" (case-sensitive)."""
    weight_prefixes = ("WT", "wt_")
    return c.startswith(weight_prefixes)

# build column list to audit
# Audit every column that is not an ID, an expected-missing variable, a
# SNAP/marriage helper, or a survey weight.
cols = []
for c in df.columns:
    if c in ID_VARS: continue
    if c in IGNORE_EXPECTED: continue
    if c in SNAP_HELPERS or c in MARRIAGE_HELPERS: continue
    if is_weight_col(c): continue  # structural
    cols.append(c)

# ensure main signals are included
for must in ["SNAP_bin","marriage3"]:
    if must in df.columns and must not in cols:
        cols.append(must)

# subset post-2018
post_mask = df["SDDSRVYR"].isin(POST2018_CYCLES)
post = df.loc[post_mask, cols + ["SDDSRVYR"]].copy()

# missingness by cycle
# NOTE: `include_groups` is only understood by recent pandas releases; on older
# ones this call fails -- confirm the pinned pandas version.
miss = (post.groupby("SDDSRVYR", observed=True)[cols]
            .apply(lambda x: x.isna().mean()*100, include_groups=False)
            .round(1)
            .sort_index())

# classify
# Thresholds are percentages of missing rows within a cycle.
HIGH_T = 50.0
MID_T  = 20.0

# A variable is HIGH if any cycle exceeds HIGH_T; MEDIUM if any cycle exceeds
# MID_T and none exceeds HIGH_T.
high_any = miss.gt(HIGH_T).any(axis=0)
mid_any  = miss.gt(MID_T).any(axis=0) & ~high_any

high_tbl = miss.loc[:, high_any].copy()
mid_tbl  = miss.loc[:, mid_any].copy()

# ordered lists by worst cycle value
def worst_order(tbl: pd.DataFrame):
    """
    Rank the columns of a per-cycle missingness table by their worst cycle.

    Parameters
    ----------
    tbl : one row per cycle, one column per variable, values in percent.

    Returns
    -------
    (names, worst): the column names ordered from highest to lowest column
    maximum, and the Series of those maxima in the same order. An empty table
    yields ([], empty float Series) so callers can always unpack two values
    (a bare list was returned here before, which broke the unpacking).
    """
    if tbl.empty:
        return [], pd.Series(dtype="float64")
    worst = tbl.max(axis=0).sort_values(ascending=False)
    return worst.index.tolist(), worst

# Each call is unpacked into (ordered names, worst-cycle values).
high_vars, high_worst = worst_order(high_tbl)
mid_vars,  mid_worst  = worst_order(mid_tbl)

# `display` is provided by the IPython/Jupyter session (it is not imported in this cell).
print("POST-2018 missingness (% by cycle):")
display(miss)

print("\nHIGH missing (>50% in either 66 or 12):", len(high_vars))
if high_vars:
    display(high_tbl[high_vars])
    print("\nWorst % (either cycle):")
    display(high_worst)

print("\nMEDIUM missing (20–50% in either 66 or 12):", len(mid_vars))
if mid_vars:
    display(mid_tbl[mid_vars])
    print("\nWorst % (either cycle):")
    display(mid_worst)

# ---------------- OPTIONAL: source scanner to help pick modules/cols next ----------------
_YEARFOLDER_FIX = {"1999-2000":"1999","2001-2002":"2001"}  # kept for reuse

def _cycle_to_year_suffix(cyc: int):
    # 66 → ("2017","P_*"), 12 → ("2021","*_L")
    if cyc == 66:
        return "2017", "P", "{mod}"
    elif cyc == 12:
        return "2021", "L", "{mod}_{suf}"
    else:
        raise ValueError("Only cycles 66 and 12 supported here.")

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60.0) -> pd.DataFrame:
    """
    Download one NHANES XPT file from the CDC site and return it as a DataFrame.

    This notebook defines `fetch_xpt` more than once and the latest executed
    definition wins, so this one applies the same `_YEARFOLDER_FIX` translation
    and SEQN normalisation as the others; otherwise helpers defined earlier
    (e.g. `load_fsq_for_cycle`) would silently change behaviour after this cell
    has run.

    Parameters
    ----------
    year_folder : folder name in the CDC URL; values found in `_YEARFOLDER_FIX`
        are translated first.
    filebase : file name without the ".xpt" extension.
    timeout : seconds to wait for the server; without it `requests` can block
        indefinitely on a stalled connection.

    Returns
    -------
    DataFrame with upper-cased column names; SEQN (when present) as nullable Int64.

    Raises
    ------
    requests.HTTPError on a non-2xx response, requests.Timeout on timeout.
    """
    yf = _YEARFOLDER_FIX.get(year_folder, year_folder)
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{yf}/DataFiles/{filebase}.xpt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    df2 = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    df2.columns = [c.upper() for c in df2.columns]
    if "SEQN" in df2.columns:
        df2["SEQN"] = pd.to_numeric(df2["SEQN"], errors="coerce").astype("Int64")
    return df2

def scan_modules_for_patterns(cycle_code: int, modules=("DEMO","HIQ","HUQ","HOQ","INQ","FSQ","OCQ","ALQ","DBQ","DPQ","SMQ","PAQ","MCQ"),
                              patterns=(r".*",)):
    """
    Download each module file of a cycle and list the columns matching any pattern.

    Parameters
    ----------
    cycle_code : 66 or 12 (anything else raises ValueError via
        `_cycle_to_year_suffix`).
    modules : module base names; file names are built as "P_<MOD>" for cycle 66
        and "<MOD>_L" for cycle 12.
    patterns : regular expressions, matched case-insensitively with `search`
        against the upper-cased column names.

    Returns
    -------
    List of (module, filebase, column) tuples, one per (pattern, column) match,
    so a column matching several patterns appears several times.

    Modules whose file cannot be fetched are skipped, and a one-line message is
    printed for each so a wrong file name is not mistaken for "no matching
    columns". Patterns are compiled once up front, so an invalid pattern fails
    before any download.
    """
    year, suf, fmt = _cycle_to_year_suffix(int(cycle_code))
    compiled = [re.compile(pat, flags=re.I) for pat in patterns]
    hits = []
    for mod in modules:
        filebase = (f"P_{fmt.format(mod=mod)}" if suf=="P" else fmt.format(mod=mod, suf=suf)).upper()
        try:
            dfm = fetch_xpt(year, filebase)
        except Exception as e:
            # Best effort: report and move on to the next module.
            print(f"[{mod} {cycle_code}] fetch {year}/{filebase}.xpt failed ({e}) — skipped")
            continue
        for rx in compiled:
            for c in dfm.columns:
                if rx.search(c.upper()):
                    hits.append((mod, filebase, c))
    return hits

# Example: when you're ready to backfill a variable, try scanning like:
# hits66 = scan_modules_for_patterns(66, patterns=[r"^HIQ0*11$", r"HEALTH.?INS", r"INSUR"])  # insurance
# hits12 = scan_modules_for_patterns(12, patterns=[r"^HIQ0*11$", r"HEALTH.?INS", r"INSUR"])
# print("Cycle 66 hits:", hits66[:10]); print("Cycle 12 hits:", hits12[:10])


POST-2018 missingness (% by cycle):


Unnamed: 0_level_0,sdmvpsu,sdmvstra,RIDAGEYR,SEX,RACE,household_size,EDU,pir,SMK,ALCG2,...,cancer,probable_depression,ahei_total,unemployment2,sdoh_access,ins,HOQ065,FS,marriage3,SNAP_bin
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12.0,0.0,0.0,0.0,0.0,100.0,0.0,34.7,17.1,100.0,46.9,...,100.0,46.9,100.0,34.6,41.3,0.5,100.0,100.0,34.8,100.0
66.0,0.0,0.0,0.0,0.0,100.0,100.0,40.7,14.1,100.0,42.4,...,100.0,42.4,100.0,40.7,100.0,0.2,100.0,45.1,40.7,45.5



HIGH missing (>50% in either 66 or 12): 16


Unnamed: 0_level_0,RACE,household_size,SMK,SMK_STATUS,FORMER_SMOKER,bmic,DIABE,HYPERTEN,chol_rx,CVD,cancer,ahei_total,sdoh_access,HOQ065,FS,SNAP_bin
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
12.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,41.3,100.0,100.0,100.0
66.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,45.1,45.5



Worst % (either cycle):


RACE              100.0
household_size    100.0
SMK               100.0
SMK_STATUS        100.0
FORMER_SMOKER     100.0
bmic              100.0
DIABE             100.0
HYPERTEN          100.0
chol_rx           100.0
CVD               100.0
cancer            100.0
ahei_total        100.0
sdoh_access       100.0
HOQ065            100.0
FS                100.0
SNAP_bin          100.0
dtype: float64


MEDIUM missing (20–50% in either 66 or 12): 7


Unnamed: 0_level_0,ALCG2,ALCOHOL_CAT,probable_depression,EDU,unemployment2,marriage3,met_hr
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12.0,46.9,46.9,46.9,34.7,34.6,34.8,32.4
66.0,42.4,42.4,42.4,40.7,40.7,40.7,37.8



Worst % (either cycle):


ALCG2                  46.9
ALCOHOL_CAT            46.9
probable_depression    46.9
EDU                    40.7
unemployment2          40.7
marriage3              40.7
met_hr                 37.8
dtype: float64

#### systematic fetch try for post 2018 

In [143]:
import pandas as pd, numpy as np, io, requests
from pandas.api.types import CategoricalDtype

# =========================
# Fetch & file mapping
# =========================
_YEARFOLDER_FIX = {"1999-2000": "1999", "2001-2002": "2001"}  # early-cycle folder quirk

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60.0) -> pd.DataFrame:
    """
    Download one NHANES XPT file from the CDC site and return it as a DataFrame.

    Parameters
    ----------
    year_folder : folder name in the CDC URL; values found in `_YEARFOLDER_FIX`
        (e.g. "1999-2000") are translated to the folder name used there.
    filebase : file name without the ".xpt" extension.
    timeout : seconds to wait for the server; without it `requests` can block
        indefinitely on a stalled connection.

    Returns
    -------
    DataFrame with upper-cased column names; SEQN (when present) as nullable Int64.

    Raises
    ------
    requests.HTTPError on a non-2xx response, requests.Timeout on timeout.
    """
    yf = _YEARFOLDER_FIX.get(year_folder, year_folder)
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{yf}/DataFiles/{filebase}.xpt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    df = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    df.columns = [c.upper() for c in df.columns]
    if "SEQN" in df.columns:
        df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
    return df

def file_for(module: str, cycle: int) -> tuple[str, str]:
    """
    Returns (year_folder, filebase) for cycles 66 and 12.
    - 66 (2017–Mar 2020): P_<MODULE> under '2017' (EXCEPT HOQ → HOQ_J)
    - 12 (Aug 2021–Aug 2023): <MODULE>_L under '2021' (HOQ → HOQ_L)

    `module` is case-insensitive. Any other cycle raises ValueError; the check
    runs first so the HOQ special case cannot hand back a 2021 file name for an
    unsupported cycle.
    """
    if cycle not in (66, 12):
        raise ValueError("This helper is scoped to cycles 66 and 12 only.")
    m = module.upper()
    if m == "HOQ":
        return ("2017", "HOQ_J") if cycle == 66 else ("2021", "HOQ_L")
    if cycle == 66:
        return ("2017", f"P_{m}")
    return ("2021", f"{m}_L")

# =========================
# Dtype helpers
# =========================
def ensure_col_dtype(df: pd.DataFrame, col: str, desired: str) -> None:
    """
    Make sure df[col] exists with dtype `desired`, modifying `df` in place.

    desired ∈ {'Int64','float64','string','boolean'}

    - Missing column: created all-False for 'boolean', otherwise all-NA.
    - Existing column: cast directly; if that fails, a categorical column is
      first turned into plain objects, then numeric targets go through
      `pd.to_numeric(errors="coerce")`, 'boolean' is cast again, and anything
      else is cast to 'string'.

    NOTE: this coerces the whole column; for smoking columns the slice-wise
    assign helpers are meant to be used instead.
    """
    if col not in df.columns:
        filler = False if desired == "boolean" else pd.NA
        df[col] = pd.Series(filler, index=df.index, dtype=desired)
        return

    try:
        df[col] = df[col].astype(desired)
    except Exception:
        current = df[col]
        if isinstance(current.dtype, CategoricalDtype):
            # drop categories, and keep that change even if the cast below fails
            current = current.astype(object)
            df[col] = current
        if desired in ("Int64", "float64"):
            df[col] = pd.to_numeric(current, errors="coerce").astype(desired)
        else:
            fallback = "boolean" if desired == "boolean" else "string"
            df[col] = current.astype(fallback)

def map_yes_no_codes(s: pd.Series) -> pd.Series:
    """
    Recode yes/no survey codes to a nullable Int64 flag: 1 -> 1, 2 -> 0,
    everything else (other codes, unparseable or missing values) -> <NA>.
    """
    codes = pd.to_numeric(s, errors="coerce")
    flags = pd.Series(pd.NA, index=codes.index, dtype="Int64")
    for code, flag in ((1, 1), (2, 0)):
        flags.loc[codes.eq(code)] = flag
    return flags

# =========================
# Assign helpers (protect older cycles)
# =========================
def _ensure_col_exists(df: pd.DataFrame, col: str, desired: str) -> None:
    """Create df[col] if missing; don't coerce if it already exists."""
    if col not in df.columns:
        if desired == "boolean":
            df[col] = pd.Series(False, index=df.index, dtype="boolean")
        else:
            df[col] = pd.Series(pd.NA, index=df.index, dtype=desired)

def _safe_cycle_assign(df: pd.DataFrame, col: str, idx: pd.Index, values: np.ndarray) -> None:
    """
    Assign only on `idx` without disturbing other rows/dtypes.
    If the column is categorical, temporarily drop categories (to 'object') to avoid mass NA.
    """
    if col not in df.columns:
        # create as object for maximal permissiveness; we only write the slice
        df[col] = pd.Series(pd.NA, index=df.index, dtype="object")
    if isinstance(df[col].dtype, CategoricalDtype):
        df[col] = df[col].astype(object)
    df.loc[idx, col] = values

# =========================
# Utility
# =========================
def _mask_from_map(seq: pd.Series, truthy_indexlike) -> np.ndarray:
    """
    Map SEQN -> boolean, cast to pandas nullable boolean, fill NA=False, then numpy.
    `truthy_indexlike` may be a boolean Series indexed by SEQN, or an Index of True rows.
    """
    if isinstance(truthy_indexlike, (pd.Series, pd.DataFrame)):
        mapped = seq.map(truthy_indexlike).astype("boolean")
    else:
        truth = pd.Series(True, index=pd.Index(truthy_indexlike, name="SEQN"))
        mapped = seq.map(truth).astype("boolean")
    return mapped.fillna(False).to_numpy()

# =========================
# Backfill: generic (with 66 fallback)
# =========================
    """
def backfill_simple(df: pd.DataFrame, target_col: str, cycle: int, module: str, src_col: str,
                    mapper=lambda s: pd.to_numeric(s, errors="coerce")) -> pd.DataFrame:
    y, f = file_for(module, cycle)
    try:
        src = fetch_xpt(y, f).set_index("SEQN")
    except Exception as e:
        print(f"[{module} {cycle}] fetch {y}/{f}.xpt failed ({e}) — skipped")
        return df

    # If unified P_* (cycle 66) is missing the variable, try module_J named file
    if src_col not in src.columns and cycle == 66 and f.startswith("P_"):
        alt_file = f"{module.upper()}_J"
        try:
            alt = fetch_xpt("2017", alt_file).set_index("SEQN")
            if src_col in alt.columns:
                src = alt
                print(f"[{module} {cycle}] {src_col} missing in {f}.xpt, fell back to {alt_file}.xpt")
            else:
                print(f"[{module} {cycle}] {src_col} not in {f}.xpt nor {alt_file}.xpt — skipped")
                return df
        except Exception as e:
            print(f"[{module} {cycle}] fallback 2017/{alt_file}.xpt failed ({e}) — skipped")
            return df
    elif src_col not in src.columns:
        print(f"[{module} {cycle}] {src_col} not in file {f}.xpt — skipped")
        return df

    # Map & align
    s_mapped = mapper(src[src_col])
    s_mapped.name = target_col

    m   = df["SDDSRVYR"].eq(float(cycle))
    idx = df.index[m]
    seq = df.loc[idx, "SEQN"].astype("Int64")

    mapped = seq.map(s_mapped)

    # decide desired dtype from the mapped series
    if pd.api.types.is_integer_dtype(s_mapped.dtype):
        desired_dtype = "Int64"
        mapped = mapped.astype("Int64")
    elif pd.api.types.is_numeric_dtype(s_mapped.dtype):
        desired_dtype = "float64"
        mapped = pd.to_numeric(mapped, errors="coerce").astype("float64")
    else:
        desired_dtype = "string"
        mapped = mapped.astype("string")

    # this is safe for non-smoking columns
    ensure_col_dtype(df, target_col, desired_dtype)
    df.loc[idx, target_col] = mapped.values
    print(f"[{module} {cycle}] filled {target_col} from {y}/{f}:{src_col} (n={s_mapped.notna().sum()})")
    return df
    """

# ------------------------- generic backfill (cycle-scoped, snapshot/restore) -------------------------
def _preserve_outside_cycle(df: pd.DataFrame, col: str, m_cycle: pd.Series):
    """
    Return a Series snapshot of df[col] for rows NOT in the cycle mask (to restore after writes).
    If column doesn't exist yet, return None.
    """
    if col in df.columns:
        return df.loc[~m_cycle, col].copy()
    return None

def backfill_simple(df: pd.DataFrame, target_col: str, cycle: int, module: str, src_col: str,
                    mapper=lambda s: pd.to_numeric(s, errors="coerce")) -> pd.DataFrame:
    """
    Fill missing values of ``target_col`` for one survey cycle from a single NHANES variable.

    Parameters
    ----------
    df : covariate frame containing 'SEQN' and 'SDDSRVYR'; modified in place and returned.
    target_col : column to fill (created all-missing when absent).
    cycle : survey-cycle code compared against ``df['SDDSRVYR']``.
    module : NHANES module name handed to ``file_for`` (e.g. 'DEMO').
    src_col : variable to read from the downloaded file.
    mapper : callable applied to the raw source Series (indexed by SEQN) before alignment.

    Behaviour
    ---------
    * Only rows of ``cycle`` whose ``target_col`` is currently missing are written.
    * Rows outside the cycle are snapshotted before the write and restored afterwards.
    * For cycle 66, when ``src_col`` is absent from the ``P_<MODULE>`` file, ``<MODULE>_J``
      under '2017' is tried instead.
    * A failed download, a missing variable, or an empty cycle prints a message and
      returns ``df`` unchanged.
    """
    y, f = file_for(module, cycle)
    try:
        src = fetch_xpt(y, f).set_index("SEQN")
    except Exception as e:
        print(f"[{module} {cycle}] fetch {y}/{f}.xpt failed ({e}) — skipped")
        return df

    # unified P_* bundle sometimes lacks vars in 66 -> try MODULE_J
    if src_col not in src.columns:
        if not (cycle == 66 and f.startswith("P_")):
            print(f"[{module} {cycle}] {src_col} not in file {f}.xpt — skipped")
            return df
        alt_file = f"{module.upper()}_J"
        try:
            alt = fetch_xpt("2017", alt_file).set_index("SEQN")
        except Exception as e:
            print(f"[{module} {cycle}] fallback 2017/{alt_file}.xpt failed ({e}) — skipped")
            return df
        if src_col not in alt.columns:
            print(f"[{module} {cycle}] {src_col} not in {f}.xpt nor {alt_file}.xpt — skipped")
            return df
        print(f"[{module} {cycle}] {src_col} missing in {f}.xpt, fell back to {alt_file}.xpt")
        # Track the file actually used so the final log line reports true provenance.
        src, f = alt, alt_file

    # map & align
    s_mapped = mapper(src[src_col])
    s_mapped.name = target_col

    m_cycle = df["SDDSRVYR"].eq(float(cycle))
    idx = df.index[m_cycle]
    if len(idx) == 0:
        print(f"[{module} {cycle}] no rows in frame — skipped")
        return df

    seq    = df.loc[idx, "SEQN"].astype("Int64")
    mapped = seq.map(s_mapped)

    # choose a dtype for *new* columns only (existing columns are never coerced)
    if pd.api.types.is_integer_dtype(s_mapped.dtype):
        desired_dtype = "Int64"
        mapped = mapped.astype("Int64")
    elif pd.api.types.is_numeric_dtype(s_mapped.dtype):
        desired_dtype = "float64"
        mapped = pd.to_numeric(mapped, errors="coerce").astype("float64")
    else:
        desired_dtype = "string"
        mapped = mapped.astype("string")

    # snapshot everything outside the cycle to guarantee no collateral changes
    snapshot = _preserve_outside_cycle(df, target_col, m_cycle)

    # create the column if missing; float64 needs NaN because pd.NA cannot be
    # stored in a NumPy float array
    if target_col not in df.columns:
        missing = np.nan if desired_dtype == "float64" else pd.NA
        df[target_col] = pd.Series(missing, index=df.index, dtype=desired_dtype)

    # inside the cycle, fill only where currently NA and a mapped value exists
    cur_slice = df.loc[idx, target_col]
    fill_mask = cur_slice.isna() & mapped.notna()
    df.loc[idx[fill_mask], target_col] = mapped[fill_mask].values

    # restore everything outside the cycle exactly as it was
    if snapshot is not None:
        df.loc[~m_cycle, target_col] = snapshot

    wrote = int(fill_mask.sum())
    print(f"[{module} {cycle}] filled {target_col} from {y}/{f}:{src_col} (n_src_nonnull={s_mapped.notna().sum()}, wrote={wrote})")
    return df


# =========================
# Backfill: Smoking (safe per-cycle assigns)
# =========================
def backfill_smoking(df: pd.DataFrame, cycle: int) -> pd.DataFrame:
    """
    Derive smoking covariates for one survey cycle from the SMQ questionnaire file.

    For rows of ``cycle`` only, writes:
      * SMK            1 = current smoker, 0 = former or never, <NA> = unknown
      * SMK_STATUS     1 = current, 2 = former, 3 = never, <NA> = unknown
      * FORMER_SMOKER  1 = former, 0 = current or never, <NA> = unknown
      * CIGS_PER_DAY   first available of SMD650 / SMQ051 (best effort)
      * PACK_YEARS     (cigs / 20) * first available of SMD641 / SMD030Y / SMQ050Q (best effort)

    The whole cycle slice of each written column is replaced (participants with no
    SMQ match become missing). ``df`` is returned unchanged, with a message, when
    the SMQ file cannot be fetched, lacks SMQ020/SMQ040, or the frame has no rows
    for the cycle.
    """
    y, f = file_for("SMQ", cycle)
    try:
        smq = fetch_xpt(y, f).set_index("SEQN")
    except Exception as e:
        print(f"[SMQ {cycle}] fetch failed ({e}) — skipped")
        return df
    if not {"SMQ020","SMQ040"}.issubset(smq.columns):
        print(f"[SMQ {cycle}] needed vars missing — skipped")
        return df

    ever = pd.to_numeric(smq["SMQ020"], errors="coerce")  # 1 yes, 2 no
    now  = pd.to_numeric(smq["SMQ040"], errors="coerce")  # 1 every day, 2 some days, 3 not at all

    current = (ever.eq(1)) & (now.isin([1,2]))
    former  = (ever.eq(1)) & (now.eq(3))
    never   =  ever.eq(2)

    m   = df["SDDSRVYR"].eq(float(cycle))
    idx = df.index[m]
    if len(idx) == 0:
        print(f"[SMQ {cycle}] no rows in frame — skipped")
        return df

    seq = df.loc[idx, "SEQN"].astype("Int64")

    # Force real bool arrays: `~` on an object array of Python bools is integer
    # inversion (-1/-2), which would corrupt the combined masks below.
    mask_curr  = np.asarray(_mask_from_map(seq, current), dtype=bool)
    mask_form  = np.asarray(_mask_from_map(seq, former), dtype=bool)
    mask_never = np.asarray(_mask_from_map(seq, never), dtype=bool)

    # Create-only (do NOT coerce entire column)
    for col, dtype in [("SMK","Int64"), ("SMK_STATUS","Int64"),
                       ("FORMER_SMOKER","Int64"), ("CIGS_PER_DAY","float64"),
                       ("PACK_YEARS","float64")]:
        _ensure_col_exists(df, col, dtype)

    # Build values for this cycle only
    smk = pd.Series(pd.NA, index=idx, dtype="Int64")
    smk.loc[idx[mask_curr]] = 1
    smk.loc[idx[~mask_curr & (mask_form | mask_never)]] = 0

    status = pd.Series(pd.NA, index=idx, dtype="Int64")
    status.loc[idx[mask_curr]] = 1
    status.loc[idx[~mask_curr & mask_form]] = 2
    status.loc[idx[~mask_curr & ~mask_form & mask_never]] = 3

    fs = pd.Series(pd.NA, index=idx, dtype="Int64")
    fs.loc[idx[mask_form]] = 1
    fs.loc[idx[~mask_form & (mask_curr | mask_never)]] = 0

    # Per-cycle assigns: only rows in `idx` are written
    _safe_cycle_assign(df, "SMK",           idx, smk.values)
    _safe_cycle_assign(df, "SMK_STATUS",    idx, status.values)
    _safe_cycle_assign(df, "FORMER_SMOKER", idx, fs.values)

    # ----- Extras (best effort, may be sparse) -----
    # NOTE(review): the NHANES SMQ codebook appears to use 777/999 as refused /
    # don't-know codes for SMD650, and to define SMD641 as days smoked in the past
    # 30 days rather than years; confirm and clean before using these two columns.
    cigs_vars = [v for v in ["SMD650","SMQ051"] if v in smq.columns]
    yrs_vars  = [v for v in ["SMD641","SMD030Y","SMQ050Q"] if v in smq.columns]
    if cigs_vars:
        # parsed once and reused for both CIGS_PER_DAY and PACK_YEARS
        cigs = pd.to_numeric(smq[cigs_vars[0]], errors="coerce")
        cigs_slice = seq.map(cigs).astype("float64")
        _safe_cycle_assign(df, "CIGS_PER_DAY", idx, cigs_slice.values)

        if yrs_vars:
            yrs = pd.to_numeric(smq[yrs_vars[0]], errors="coerce")
            packs_slice = seq.map((cigs / 20.0) * yrs).astype("float64")
            _safe_cycle_assign(df, "PACK_YEARS", idx, packs_slice.values)

    print(f"[SMQ {cycle}] derived SMK trio + attempted CIGS_PER_DAY, PACK_YEARS")
    return df

# =========================
# Backfill: CVD
# =========================
def backfill_cvd(df: pd.DataFrame, cycle: int) -> pd.DataFrame:
    """
    Derive a binary CVD indicator for one survey cycle from the MCQ160B-F items.

    CVD = 1 when any available item equals 1 (yes), 0 when every available item
    equals 2 (no), and <NA> otherwise (including participants absent from the
    MCQ file). The whole cycle slice of 'CVD' is replaced, and the column is
    coerced to Int64 globally via ``ensure_col_dtype``.

    ``df`` is returned unchanged, with a message, when the MCQ file cannot be
    fetched, none of the items exist, or the frame has no rows for the cycle.
    """
    y, f = file_for("MCQ", cycle)
    try:
        mcq = fetch_xpt(y, f).set_index("SEQN")
    except Exception as e:
        print(f"[MCQ {cycle}] fetch failed ({e}) — skipped")
        return df
    cols = [c for c in ["MCQ160B","MCQ160C","MCQ160D","MCQ160E","MCQ160F"] if c in mcq.columns]
    if not cols:
        print(f"[MCQ {cycle}] CVD items not found — skipped")
        return df

    x = mcq[cols].apply(pd.to_numeric, errors="coerce")
    any_yes = (x == 1).any(axis=1)  # SEQN index
    all_no  = (x == 2).all(axis=1)

    m   = df["SDDSRVYR"].eq(float(cycle))
    idx = df.index[m]
    # Same guard as the other back-fill helpers: nothing to write for this cycle.
    if len(idx) == 0:
        print(f"[MCQ {cycle}] no rows in frame — skipped")
        return df
    seq = df.loc[idx, "SEQN"].astype("Int64")

    # Force real bool arrays so that `~` below is a logical NOT, never integer inversion.
    mask_any_yes = np.asarray(_mask_from_map(seq, any_yes), dtype=bool)
    mask_all_no  = np.asarray(_mask_from_map(seq, all_no), dtype=bool)

    ensure_col_dtype(df, "CVD", "Int64")  # safe globally
    cvd = pd.Series(pd.NA, index=idx, dtype="Int64")
    cvd.loc[idx[mask_any_yes]] = 1
    cvd.loc[idx[mask_all_no & ~mask_any_yes]] = 0

    df.loc[idx, "CVD"] = cvd.values
    print(f"[MCQ {cycle}] derived CVD from {cols}")
    return df

# =========================
# HOQ065 (tenure unavailable 66 & 12)
# =========================
def backfill_hoq65_with_structural(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flag housing tenure (HOQ065) as structurally missing for cycles 66 and 12.

    Per the author's checks, tenure cannot be recovered for these cycles:
      - Cycle 66 (2017–Mar 2020): public HOQ_J does not cover the frame's 66 SEQNs.
      - Cycle 12 (2021–2023): HOQ_L is released but has no HOQ065.

    For both cycles HOQ065 is set to <NA> and 'HOQ065_structural_missing' to True.
    Both columns are created/coerced first (Int64 and boolean respectively).
    """
    for col, dtype in (("HOQ065", "Int64"), ("HOQ065_structural_missing", "boolean")):
        ensure_col_dtype(df, col, dtype)

    survey_cycle = df["SDDSRVYR"]
    for cyc in (66.0, 12.0):
        in_cycle = survey_cycle.eq(cyc)
        df.loc[in_cycle, "HOQ065"] = pd.NA
        df.loc[in_cycle, "HOQ065_structural_missing"] = True
        print(f"[HOQ {int(cyc)}] tenure HOQ065 not available publicly — marked structural missing")
    return df

# =========================
# APPLY: cycles 66 & 12
# =========================
def _bmi_to_label(s):
    """Convert numeric BMI to the label vocabulary stored in 'bmic'.

    Bins: UNDER (<18.5), NORMAL [18.5, 25), OVER [25, 30), OBESE [30, inf).
    NOTE(review): these are adult cut-offs applied to every age — confirm this
    matches how the existing 'bmic' labels were built.
    """
    return pd.cut(
        pd.to_numeric(s, errors="coerce"),
        bins=[-np.inf, 18.5, 25.0, 30.0, np.inf],
        labels=["UNDER", "NORMAL", "OVER", "OBESE"],
        right=False,
    ).astype("string")

for cyc in (66, 12):
    # DEMO
    df_my_cov_aligned_short = backfill_simple(
        df_my_cov_aligned_short, "RACE", cyc, "DEMO", "RIDRETH3",
        mapper=lambda s: pd.to_numeric(s, errors="coerce").astype("Int64")
    )
    df_my_cov_aligned_short = backfill_simple(
        df_my_cov_aligned_short, "household_size", cyc, "DEMO", "DMDHHSIZ",
        mapper=lambda s: pd.to_numeric(s, errors="coerce").astype("Int64")
    )

    # BMX: 'bmic' stores category labels (UNDER/NORMAL/OVER/OBESE), so BMXBMI is
    # converted to a label before filling instead of writing raw numbers into a
    # string column.
    df_my_cov_aligned_short = backfill_simple(
        df_my_cov_aligned_short, "bmic", cyc, "BMX", "BMXBMI",
        mapper=_bmi_to_label
    )

    # HIQ / DIQ / BPQ
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "ins",      cyc, "HIQ", "HIQ011",  mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "DIABE",    cyc, "DIQ", "DIQ010",  mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "HYPERTEN", cyc, "BPQ", "BPQ020",  mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "chol_rx",  cyc, "BPQ", "BPQ101D", mapper=map_yes_no_codes)

    # MCQ (cancer + CVD)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "cancer", cyc, "MCQ", "MCQ220", mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_cvd(df_my_cov_aligned_short, cyc)

    # Smoking (per-cycle assigns; rows of other cycles are not written)
    df_my_cov_aligned_short = backfill_smoking(df_my_cov_aligned_short, cyc)

# HOQ065 (structural)
df_my_cov_aligned_short = backfill_hoq65_with_structural(df_my_cov_aligned_short)

# =========================
# (Optional) quick sanity
# =========================
def _pct_nonmissing(s):
    """Percentage (0-100) of non-missing entries in a Series."""
    return (1 - s.isna().mean())*100

if "SMK" in df_my_cov_aligned_short.columns:
    smk_pct = (df_my_cov_aligned_short.groupby("SDDSRVYR")["SMK"]
               .apply(lambda s: _pct_nonmissing(s).round(1)))
    print("\nSMK non-missing % by cycle:")
    print(smk_pct.to_string())

# Guard on the column actually being reported, and keep its result under its own name.
if "bmic" in df_my_cov_aligned_short.columns:
    bmic_pct = (df_my_cov_aligned_short.groupby("SDDSRVYR")["bmic"]
                .apply(lambda s: _pct_nonmissing(s).round(1)))
    print("\nbmic non-missing % by cycle:")
    print(bmic_pct.to_string())


[DEMO 66] filled RACE from 2017/P_DEMO:RIDRETH3 (n_src_nonnull=15560, wrote=0)
[DEMO 66] DMDHHSIZ missing in P_DEMO.xpt, fell back to DEMO_J.xpt
[DEMO 66] filled household_size from 2017/P_DEMO:DMDHHSIZ (n_src_nonnull=9254, wrote=0)
[BMX 66] filled bmic from 2017/P_BMX:BMXBMI (n_src_nonnull=13137, wrote=0)
[HIQ 66] filled ins from 2017/P_HIQ:HIQ011 (n_src_nonnull=15523, wrote=0)
[DIQ 66] filled DIABE from 2017/P_DIQ:DIQ010 (n_src_nonnull=14694, wrote=0)
[BPQ 66] filled HYPERTEN from 2017/P_BPQ:BPQ020 (n_src_nonnull=10183, wrote=0)
[BPQ 66] BPQ101D not in P_BPQ.xpt nor BPQ_J.xpt — skipped
[MCQ 66] filled cancer from 2017/P_MCQ:MCQ220 (n_src_nonnull=9228, wrote=0)
[MCQ 66] derived CVD from ['MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F']
[SMQ 66] derived SMK trio + attempted CIGS_PER_DAY, PACK_YEARS
[DEMO 12] filled RACE from 2021/DEMO_L:RIDRETH3 (n_src_nonnull=11933, wrote=0)
[DEMO 12] filled household_size from 2021/DEMO_L:DMDHHSIZ (n_src_nonnull=11933, wrote=0)
[BMX 12] filled 

In [167]:
import pandas as pd, numpy as np, io, requests
from pandas.api.types import CategoricalDtype

# =========================
# Fetch & file mapping
# =========================
_YEARFOLDER_FIX = {"1999-2000": "1999", "2001-2002": "2001"}  # early-cycle folder quirk

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60.0) -> pd.DataFrame:
    """
    Download one NHANES XPT file from the CDC public site and return it as a DataFrame.

    Parameters
    ----------
    year_folder : folder name under .../Nhanes/Public/ (translated through
        ``_YEARFOLDER_FIX`` for the two early cycles).
    filebase : file name without the '.xpt' extension (e.g. 'DEMO_L').
    timeout : seconds to wait for the server before giving up; without a timeout
        a stalled connection would block the notebook indefinitely.

    Returns
    -------
    DataFrame with upper-cased column names; 'SEQN', when present, is nullable Int64.

    Raises
    ------
    requests.HTTPError on a non-success status; other ``requests`` exceptions on
    connection problems or timeout.
    """
    yf = _YEARFOLDER_FIX.get(year_folder, year_folder)
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{yf}/DataFiles/{filebase}.xpt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    df = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    df.columns = [c.upper() for c in df.columns]
    if "SEQN" in df.columns:
        df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
    return df

def file_for(module: str, cycle: int) -> tuple[str, str]:
    """
    Resolve the (year_folder, filebase) pair for a module in cycle 66 or 12.

    - 66 (2017–Mar 2020): 'P_<MODULE>' under '2017'
    - 12 (Aug 2021–Aug 2023): '<MODULE>_L' under '2021'
    - HOQ is special-cased: 'HOQ_J' under '2017' for cycle 66, otherwise 'HOQ_L' under '2021'.

    The module name is upper-cased. Any other cycle (for non-HOQ modules) raises ValueError.
    """
    mod = module.upper()
    is_cycle_66 = cycle == 66

    if mod == "HOQ":
        if is_cycle_66:
            return ("2017", "HOQ_J")
        return ("2021", "HOQ_L")

    if is_cycle_66:
        return ("2017", "P_" + mod)
    if cycle == 12:
        return ("2021", mod + "_L")
    raise ValueError("This helper is scoped to cycles 66 and 12 only.")

# =========================
# Dtype helpers
# =========================
def ensure_col_dtype(df: pd.DataFrame, col: str, desired: str) -> None:
    """
    Guarantee that ``df[col]`` exists with dtype ``desired`` (in-place on ``df``).

    ``desired`` is one of {'Int64', 'float64', 'string', 'boolean'}.

    * Column absent: it is created all-missing (NaN for float64, <NA> otherwise),
      or all-False for 'boolean'.
    * Column present: a plain ``astype`` is attempted; on failure, categorical
      columns are first converted to object, numeric targets are re-parsed with
      ``errors="coerce"``, and the remaining targets are cast directly.

    NOTE: the WHOLE column is coerced, so use this only where that is safe.
          Smoking columns should go through the safe-assign helpers instead.
    """
    if col not in df.columns:
        placeholders = {"boolean": False, "float64": np.nan}  # float64 needs NaN, not pd.NA
        df[col] = pd.Series(placeholders.get(desired, pd.NA), index=df.index, dtype=desired)
        return

    try:
        df[col] = df[col].astype(desired)
    except Exception:
        if isinstance(df[col].dtype, CategoricalDtype):
            df[col] = df[col].astype(object)  # drop categories
        if desired in ("Int64", "float64"):
            df[col] = pd.to_numeric(df[col], errors="coerce").astype(desired)
        else:
            fallback = "boolean" if desired == "boolean" else "string"
            df[col] = df[col].astype(fallback)

def map_yes_no_codes(s: pd.Series) -> pd.Series:
    """
    Turn a 1 = yes / 2 = no coded item into a nullable 1/0 flag.

    Input is parsed numerically; any value other than 1 or 2 (including
    unparsable and missing entries) is returned as <NA>. Output is Int64 and
    shares the input's index.
    """
    numeric = pd.to_numeric(s, errors="coerce")
    is_yes = numeric.eq(1)
    is_no = numeric.eq(2)

    recoded = pd.Series(pd.NA, index=numeric.index, dtype="Int64")
    recoded.loc[is_no] = 0
    recoded.loc[is_yes] = 1
    return recoded

# =========================
# Assign helpers (protect older cycles)
# =========================
def _ensure_col_exists(df: pd.DataFrame, col: str, desired: str) -> None:
    """Create df[col] if missing; don't coerce if it already exists."""
    if col not in df.columns:
        if desired == "boolean":
            df[col] = pd.Series(False, index=df.index, dtype="boolean")
        elif desired == "float64":
            df[col] = pd.Series(np.nan, index=df.index, dtype="float64")  # use np.nan for float64
        else:
            df[col] = pd.Series(pd.NA, index=df.index, dtype=desired)

def _safe_cycle_assign(df: pd.DataFrame, col: str, idx: pd.Index, values: np.ndarray) -> None:
    """
    Assign only on `idx` without disturbing other rows/dtypes.
    If the column is categorical, temporarily drop categories (to 'object') to avoid mass NA.
    """
    if col not in df.columns:
        df[col] = pd.Series(pd.NA, index=df.index, dtype="object")
    if isinstance(df[col].dtype, CategoricalDtype):
        df[col] = df[col].astype(object)
    df.loc[idx, col] = values

# =========================
# Utility
# =========================
def _mask_from_map(seq: pd.Series, truthy_indexlike) -> np.ndarray:
    """
    Map SEQN -> boolean, cast to pandas nullable boolean, fill NA=False, then numpy.
    `truthy_indexlike` may be a boolean Series indexed by SEQN, or an Index of True rows.
    """
    if isinstance(truthy_indexlike, (pd.Series, pd.DataFrame)):
        mapped = seq.map(truthy_indexlike).astype("boolean")
    else:
        truth = pd.Series(True, index=pd.Index(truthy_indexlike, name="SEQN"))
        mapped = seq.map(truth).astype("boolean")
    return mapped.fillna(False).to_numpy()

# =========================
# Preserve cat BMI; set up numeric BMI
# =========================
def _is_stringlike_or_categorical(s: pd.Series) -> bool:
    return (pd.api.types.is_string_dtype(s.dtype) or isinstance(s.dtype, CategoricalDtype))

def preserve_bmi_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Separate label-style BMI from numeric BMI before any BMX back-fill.

    * When 'bmic' exists with a string-like or categorical dtype and there is no
      'bmi_cat' column yet, 'bmic' is renamed to 'bmi_cat' (on a renamed copy of
      the frame). If 'bmi_cat' already exists, both columns are left as they are.
    * A float64 'bmi' column (all NaN) is added when absent.

    Returns the resulting frame; callers should use the return value, because the
    rename produces a new DataFrame object.
    """
    has_label_bmic = "bmic" in df.columns and _is_stringlike_or_categorical(df["bmic"])
    if has_label_bmic and "bmi_cat" not in df.columns:
        df = df.rename(columns={"bmic": "bmi_cat"})

    if "bmi" not in df.columns:
        # NaN (not pd.NA) so the column is a genuine NumPy float64
        df["bmi"] = pd.Series(np.nan, index=df.index, dtype="float64")
    return df

def ensure_bmi_cat_from_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill 'bmi_cat' ONLY where it is missing and a numeric 'bmi' value exists.

    Categories: UNDER (<18.5), NORMAL [18.5,25), OVER [25,30), OBESE [30,∞)

    * 'bmi_cat' is created as an all-<NA> string column when absent.
    * Existing 'bmi_cat' labels are never overwritten.
    * When the frame has no 'bmi' column there is nothing to derive from, so the
      frame is returned as is (after creating 'bmi_cat' if needed) instead of
      raising KeyError.

    NOTE(review): these look like adult BMI cut-offs and are applied to every
    row regardless of age — confirm that is intended for child participants.

    Modifies ``df`` in place and returns it.
    """
    if "bmi_cat" not in df.columns:
        df["bmi_cat"] = pd.Series(pd.NA, index=df.index, dtype="string")
    if "bmi" not in df.columns:
        return df

    need = df["bmi_cat"].isna() & df["bmi"].notna()
    if need.any():
        bins   = [-np.inf, 18.5, 25.0, 30.0, np.inf]
        labels = ["UNDER", "NORMAL", "OVER", "OBESE"]
        # right=False makes each bin closed on the left: 18.5 -> NORMAL, 25 -> OVER, 30 -> OBESE
        df.loc[need, "bmi_cat"] = pd.cut(
            df.loc[need, "bmi"], bins=bins, labels=labels, right=False
        ).astype("string").values
    return df

# =========================
# Backfill: generic (with 66 fallback), NA-only within-cycle, preserves outside
# =========================
def _preserve_outside_cycle(df: pd.DataFrame, col: str, m_cycle: pd.Series):
    """
    Return a Series snapshot of df[col] for rows NOT in the cycle mask (to restore after writes).
    If column doesn't exist yet, return None.
    """
    if col in df.columns:
        return df.loc[~m_cycle, col].copy()
    return None

def backfill_simple(df: pd.DataFrame, target_col: str, cycle: int, module: str, src_col: str,
                    mapper=lambda s: pd.to_numeric(s, errors="coerce")) -> pd.DataFrame:
    """
    Fill missing values of ``target_col`` for one survey cycle from a single NHANES variable.

    Parameters
    ----------
    df : covariate frame containing 'SEQN' and 'SDDSRVYR'; modified in place and returned.
    target_col : column to fill (created all-missing when absent).
    cycle : survey-cycle code compared against ``df['SDDSRVYR']``.
    module : NHANES module name handed to ``file_for`` (e.g. 'DEMO').
    src_col : variable to read from the downloaded file.
    mapper : callable applied to the raw source Series (indexed by SEQN) before alignment.

    Behaviour
    ---------
    * Only rows of ``cycle`` whose ``target_col`` is currently missing are written.
    * Rows outside the cycle are snapshotted before the write and restored afterwards.
    * For cycle 66, when ``src_col`` is absent from the ``P_<MODULE>`` file, ``<MODULE>_J``
      under '2017' is tried instead.
    * A failed download, a missing variable, or an empty cycle prints a message and
      returns ``df`` unchanged.
    """
    y, f = file_for(module, cycle)
    try:
        src = fetch_xpt(y, f).set_index("SEQN")
    except Exception as e:
        print(f"[{module} {cycle}] fetch {y}/{f}.xpt failed ({e}) — skipped")
        return df

    # unified P_* bundle sometimes lacks vars in 66 -> try MODULE_J
    if src_col not in src.columns:
        if not (cycle == 66 and f.startswith("P_")):
            print(f"[{module} {cycle}] {src_col} not in file {f}.xpt — skipped")
            return df
        alt_file = f"{module.upper()}_J"
        try:
            alt = fetch_xpt("2017", alt_file).set_index("SEQN")
        except Exception as e:
            print(f"[{module} {cycle}] fallback 2017/{alt_file}.xpt failed ({e}) — skipped")
            return df
        if src_col not in alt.columns:
            print(f"[{module} {cycle}] {src_col} not in {f}.xpt nor {alt_file}.xpt — skipped")
            return df
        print(f"[{module} {cycle}] {src_col} missing in {f}.xpt, fell back to {alt_file}.xpt")
        # Track the file actually used so the final log line reports true provenance.
        src, f = alt, alt_file

    # map & align
    s_mapped = mapper(src[src_col])
    s_mapped.name = target_col

    m_cycle = df["SDDSRVYR"].eq(float(cycle))
    idx = df.index[m_cycle]
    if len(idx) == 0:
        print(f"[{module} {cycle}] no rows in frame — skipped")
        return df

    seq    = df.loc[idx, "SEQN"].astype("Int64")
    mapped = seq.map(s_mapped)

    # choose dtype for new columns only (existing columns are never coerced)
    if pd.api.types.is_integer_dtype(s_mapped.dtype):
        desired_dtype = "Int64"
        mapped = mapped.astype("Int64")
    elif pd.api.types.is_numeric_dtype(s_mapped.dtype):
        desired_dtype = "float64"
        mapped = pd.to_numeric(mapped, errors="coerce").astype("float64")
    else:
        desired_dtype = "string"
        mapped = mapped.astype("string")

    # snapshot outside-cycle values to guarantee no collateral changes
    snapshot = _preserve_outside_cycle(df, target_col, m_cycle)

    # create the column if missing; float64 needs NaN because pd.NA cannot be
    # stored in a NumPy float array
    if target_col not in df.columns:
        missing = np.nan if desired_dtype == "float64" else pd.NA
        df[target_col] = pd.Series(missing, index=df.index, dtype=desired_dtype)

    # inside the cycle, fill only where currently NA and a mapped value exists
    cur_slice = df.loc[idx, target_col]
    fill_mask = cur_slice.isna() & mapped.notna()
    df.loc[idx[fill_mask], target_col] = mapped[fill_mask].values

    # restore everything outside the cycle exactly as it was
    if snapshot is not None:
        df.loc[~m_cycle, target_col] = snapshot

    wrote = int(fill_mask.sum())
    print(f"[{module} {cycle}] filled {target_col} from {y}/{f}:{src_col} (n_src_nonnull={s_mapped.notna().sum()}, wrote={wrote})")
    return df

# =========================
# Backfill: Smoking (safe per-cycle assigns)
# =========================
def backfill_smoking(df: pd.DataFrame, cycle: int) -> pd.DataFrame:
    """
    Derive smoking covariates for one survey cycle from the SMQ questionnaire file.

    For rows of ``cycle`` only, writes:
      * SMK            1 = current smoker, 0 = former or never, <NA> = unknown
      * SMK_STATUS     1 = current, 2 = former, 3 = never, <NA> = unknown
      * FORMER_SMOKER  1 = former, 0 = current or never, <NA> = unknown
      * CIGS_PER_DAY   first available of SMD650 / SMQ051 (best effort)
      * PACK_YEARS     (cigs / 20) * first available of SMD641 / SMD030Y / SMQ050Q (best effort)

    The whole cycle slice of each written column is replaced (participants with no
    SMQ match become missing). ``df`` is returned unchanged, with a message, when
    the SMQ file cannot be fetched, lacks SMQ020/SMQ040, or the frame has no rows
    for the cycle.
    """
    y, f = file_for("SMQ", cycle)
    try:
        smq = fetch_xpt(y, f).set_index("SEQN")
    except Exception as e:
        print(f"[SMQ {cycle}] fetch failed ({e}) — skipped")
        return df
    if not {"SMQ020","SMQ040"}.issubset(smq.columns):
        print(f"[SMQ {cycle}] needed vars missing — skipped")
        return df

    ever = pd.to_numeric(smq["SMQ020"], errors="coerce")  # 1 yes, 2 no
    now  = pd.to_numeric(smq["SMQ040"], errors="coerce")  # 1 every day, 2 some days, 3 not at all

    current = (ever.eq(1)) & (now.isin([1,2]))
    former  = (ever.eq(1)) & (now.eq(3))
    never   =  ever.eq(2)

    m   = df["SDDSRVYR"].eq(float(cycle))
    idx = df.index[m]
    if len(idx) == 0:
        print(f"[SMQ {cycle}] no rows in frame — skipped")
        return df

    seq = df.loc[idx, "SEQN"].astype("Int64")

    # Force real bool arrays: `~` on an object array of Python bools is integer
    # inversion (-1/-2), which would corrupt the combined masks below.
    mask_curr  = np.asarray(_mask_from_map(seq, current), dtype=bool)
    mask_form  = np.asarray(_mask_from_map(seq, former), dtype=bool)
    mask_never = np.asarray(_mask_from_map(seq, never), dtype=bool)

    # Create-only (do NOT coerce entire column)
    for col, dtype in [("SMK","Int64"), ("SMK_STATUS","Int64"),
                       ("FORMER_SMOKER","Int64"), ("CIGS_PER_DAY","float64"),
                       ("PACK_YEARS","float64")]:
        _ensure_col_exists(df, col, dtype)

    # Build values for this cycle only
    smk = pd.Series(pd.NA, index=idx, dtype="Int64")
    smk.loc[idx[mask_curr]] = 1
    smk.loc[idx[~mask_curr & (mask_form | mask_never)]] = 0

    status = pd.Series(pd.NA, index=idx, dtype="Int64")
    status.loc[idx[mask_curr]] = 1
    status.loc[idx[~mask_curr & mask_form]] = 2
    status.loc[idx[~mask_curr & ~mask_form & mask_never]] = 3

    fs = pd.Series(pd.NA, index=idx, dtype="Int64")
    fs.loc[idx[mask_form]] = 1
    fs.loc[idx[~mask_form & (mask_curr | mask_never)]] = 0

    # Per-cycle assigns: only rows in `idx` are written
    _safe_cycle_assign(df, "SMK",           idx, smk.values)
    _safe_cycle_assign(df, "SMK_STATUS",    idx, status.values)
    _safe_cycle_assign(df, "FORMER_SMOKER", idx, fs.values)

    # Extras (best effort)
    # NOTE(review): the NHANES SMQ codebook appears to use 777/999 as refused /
    # don't-know codes for SMD650, and to define SMD641 as days smoked in the past
    # 30 days rather than years; confirm and clean before using these two columns.
    cigs_vars = [v for v in ["SMD650","SMQ051"] if v in smq.columns]
    yrs_vars  = [v for v in ["SMD641","SMD030Y","SMQ050Q"] if v in smq.columns]
    if cigs_vars:
        # parsed once and reused for both CIGS_PER_DAY and PACK_YEARS
        cigs = pd.to_numeric(smq[cigs_vars[0]], errors="coerce")
        cigs_slice = seq.map(cigs).astype("float64")
        _safe_cycle_assign(df, "CIGS_PER_DAY", idx, cigs_slice.values)

        if yrs_vars:
            yrs = pd.to_numeric(smq[yrs_vars[0]], errors="coerce")
            packs_slice = seq.map((cigs / 20.0) * yrs).astype("float64")
            _safe_cycle_assign(df, "PACK_YEARS", idx, packs_slice.values)

    print(f"[SMQ {cycle}] derived SMK trio + attempted CIGS_PER_DAY, PACK_YEARS")
    return df

# =========================
# Backfill: CVD
# =========================
def backfill_cvd(df: pd.DataFrame, cycle: int) -> pd.DataFrame:
    """
    Derive a binary CVD indicator for one survey cycle from the MCQ160B-F items.

    CVD = 1 when any available item equals 1 (yes), 0 when every available item
    equals 2 (no), and <NA> otherwise (including participants absent from the
    MCQ file). The whole cycle slice of 'CVD' is replaced, and the column is
    coerced to Int64 globally via ``ensure_col_dtype``.

    ``df`` is returned unchanged, with a message, when the MCQ file cannot be
    fetched, none of the items exist, or the frame has no rows for the cycle.
    """
    y, f = file_for("MCQ", cycle)
    try:
        mcq = fetch_xpt(y, f).set_index("SEQN")
    except Exception as e:
        print(f"[MCQ {cycle}] fetch failed ({e}) — skipped")
        return df
    cols = [c for c in ["MCQ160B","MCQ160C","MCQ160D","MCQ160E","MCQ160F"] if c in mcq.columns]
    if not cols:
        print(f"[MCQ {cycle}] CVD items not found — skipped")
        return df

    x = mcq[cols].apply(pd.to_numeric, errors="coerce")
    any_yes = (x == 1).any(axis=1)  # SEQN index
    all_no  = (x == 2).all(axis=1)

    m   = df["SDDSRVYR"].eq(float(cycle))
    idx = df.index[m]
    # Same guard as the other back-fill helpers: nothing to write for this cycle.
    if len(idx) == 0:
        print(f"[MCQ {cycle}] no rows in frame — skipped")
        return df
    seq = df.loc[idx, "SEQN"].astype("Int64")

    # Force real bool arrays so that `~` below is a logical NOT, never integer inversion.
    mask_any_yes = np.asarray(_mask_from_map(seq, any_yes), dtype=bool)
    mask_all_no  = np.asarray(_mask_from_map(seq, all_no), dtype=bool)

    ensure_col_dtype(df, "CVD", "Int64")
    cvd = pd.Series(pd.NA, index=idx, dtype="Int64")
    cvd.loc[idx[mask_any_yes]] = 1
    cvd.loc[idx[mask_all_no & ~mask_any_yes]] = 0

    df.loc[idx, "CVD"] = cvd.values
    print(f"[MCQ {cycle}] derived CVD from {cols}")
    return df

# =========================
# HOQ065 (tenure unavailable 66 & 12)
# =========================
def backfill_hoq65_with_structural(df: pd.DataFrame) -> pd.DataFrame:
    """
    Record that housing tenure (HOQ065) is structurally unavailable in cycles 66 and 12.

    Rationale noted by the author:
      - Cycle 66 (2017–Mar 2020): public HOQ_J does not cover the frame's 66 SEQNs.
      - Cycle 12 (2021–2023): HOQ_L is released but carries no HOQ065.

    'HOQ065' (Int64) and 'HOQ065_structural_missing' (boolean) are created/coerced,
    then for each of the two cycles HOQ065 is blanked and the flag is set to True.
    The frame is modified in place and returned.
    """
    ensure_col_dtype(df, "HOQ065", "Int64")
    ensure_col_dtype(df, "HOQ065_structural_missing", "boolean")

    structural_cycles = (66.0, 12.0)
    for cyc in structural_cycles:
        rows = df["SDDSRVYR"].eq(cyc)
        df.loc[rows, ["HOQ065"]] = pd.NA
        df.loc[rows, ["HOQ065_structural_missing"]] = True
        print(f"[HOQ {int(cyc)}] tenure HOQ065 not available publicly — marked structural missing")
    return df

# =========================
# APPLY: cycles 66 & 12
# =========================
# IMPORTANT: call preserve_bmi_columns() once before any BMX writes
df_my_cov_aligned_short = preserve_bmi_columns(df_my_cov_aligned_short)

for cyc in (66, 12):
    # DEMO
    df_my_cov_aligned_short = backfill_simple(
        df_my_cov_aligned_short, "RACE", cyc, "DEMO", "RIDRETH3",
        mapper=lambda s: pd.to_numeric(s, errors="coerce").astype("Int64")
    )
    df_my_cov_aligned_short = backfill_simple(
        df_my_cov_aligned_short, "household_size", cyc, "DEMO", "DMDHHSIZ",
        mapper=lambda s: pd.to_numeric(s, errors="coerce").astype("Int64")
    )

    # BMX → write numeric BMI to 'bmi' (preserves your old categorical bmic → bmi_cat)
    df_my_cov_aligned_short = backfill_simple(
        df_my_cov_aligned_short, "bmi", cyc, "BMX", "BMXBMI",
        mapper=lambda s: pd.to_numeric(s, errors="coerce")
    )

    # HIQ / DIQ / BPQ
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "ins",      cyc, "HIQ", "HIQ011",  mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "DIABE",    cyc, "DIQ", "DIQ010",  mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "HYPERTEN", cyc, "BPQ", "BPQ020",  mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "chol_rx",  cyc, "BPQ", "BPQ101D", mapper=map_yes_no_codes)

    # MCQ (cancer + CVD)
    df_my_cov_aligned_short = backfill_simple(df_my_cov_aligned_short, "cancer", cyc, "MCQ", "MCQ220", mapper=map_yes_no_codes)
    df_my_cov_aligned_short = backfill_cvd(df_my_cov_aligned_short, cyc)

    # Smoking (safe assigns; will not disturb pre-2018 values)
    df_my_cov_aligned_short = backfill_smoking(df_my_cov_aligned_short, cyc)

# HOQ065 (structural)
df_my_cov_aligned_short = backfill_hoq65_with_structural(df_my_cov_aligned_short)

# (Optional) build missing labels from numeric BMI WITHOUT overwriting your originals
df_my_cov_aligned_short = ensure_bmi_cat_from_numeric(df_my_cov_aligned_short)

# =========================
# Quick sanity
# =========================
def _pct_nonmissing(s): return (1 - s.isna().mean())*100

def _print_cycle_coverage(col, title):
    """Print `title`, then the per-cycle non-missing % of `col` (1 decimal); return the Series."""
    per_cycle = df_my_cov_aligned_short.groupby("SDDSRVYR")[col]
    pct = per_cycle.apply(lambda s: _pct_nonmissing(s).round(1))
    print(title)
    print(pct.to_string())
    return pct


# Each report is emitted only when its column exists in the frame.
if "SMK" in df_my_cov_aligned_short.columns:
    smk_pct = _print_cycle_coverage("SMK", "\nSMK non-missing % by cycle:")

if "bmi" in df_my_cov_aligned_short.columns:
    bmi_pct = _print_cycle_coverage("bmi", "\nBMI (numeric) non-missing % by cycle:")

if "bmi_cat" in df_my_cov_aligned_short.columns:
    bmi_cat_pct = _print_cycle_coverage(
        "bmi_cat", "\nBMI (categorical, bmi_cat) non-missing % by cycle:"
    )


[DEMO 66] filled RACE from 2017/P_DEMO:RIDRETH3 (n_src_nonnull=15560, wrote=15560)
[DEMO 66] DMDHHSIZ missing in P_DEMO.xpt, fell back to DEMO_J.xpt
[DEMO 66] filled household_size from 2017/P_DEMO:DMDHHSIZ (n_src_nonnull=9254, wrote=0)
[BMX 66] filled bmi from 2017/P_BMX:BMXBMI (n_src_nonnull=13137, wrote=13137)
[HIQ 66] filled ins from 2017/P_HIQ:HIQ011 (n_src_nonnull=15523, wrote=0)
[DIQ 66] filled DIABE from 2017/P_DIQ:DIQ010 (n_src_nonnull=14694, wrote=14694)
[BPQ 66] filled HYPERTEN from 2017/P_BPQ:BPQ020 (n_src_nonnull=10183, wrote=10183)
[BPQ 66] BPQ101D not in P_BPQ.xpt nor BPQ_J.xpt — skipped
[MCQ 66] filled cancer from 2017/P_MCQ:MCQ220 (n_src_nonnull=9228, wrote=9228)
[MCQ 66] derived CVD from ['MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F']
[SMQ 66] derived SMK trio + attempted CIGS_PER_DAY, PACK_YEARS
[DEMO 12] filled RACE from 2021/DEMO_L:RIDRETH3 (n_src_nonnull=11933, wrote=11933)
[DEMO 12] filled household_size from 2021/DEMO_L:DMDHHSIZ (n_src_nonnull=11933, wro

In [168]:
# Spot-check the categorical BMI labels after the backfill (first 10 rows).
df_my_cov_aligned_short["bmi_cat"].head(10)

0     UNDER
1    NORMAL
2     UNDER
3      <NA>
4      OVER
5    NORMAL
6      OVER
7     UNDER
8     UNDER
9     OBESE
Name: bmi_cat, dtype: string

In [169]:
# Mark structural-missing for items not obtainable in public 2017–Mar 2020 (cycle 66)
def mark_structural_missing_66(df: pd.DataFrame) -> pd.DataFrame:
    """Blank out household_size and chol_rx for cycle 66 and flag them as structural missing.

    For every row with SDDSRVYR == 66.0 the value columns are set to <NA> and
    the matching `*_structural_missing` flags are set to True. Rows of other
    cycles are not assigned here. `df` is modified through `.loc` assignment and
    the same object is returned.

    Relies on `ensure_col_dtype` (defined elsewhere in the notebook); presumably
    it creates/casts the named column in place, since its return value is not
    used here. TODO confirm.
    """
    # (value column, structural-missing flag column)
    #   household_size: DMDHHSIZ not available for your 66 SEQNs (no overlap with DEMO_J; no DEMO_K)
    #   chol_rx:        BPQ101D not present in BPQ_J; no BPQ_K
    targets = (
        ("household_size", "household_size_structural_missing"),
        ("chol_rx", "chol_rx_structural_missing"),
    )

    for value_col, flag_col in targets:
        ensure_col_dtype(df, value_col, "Int64")
        ensure_col_dtype(df, flag_col, "boolean")

    in_cycle_66 = df["SDDSRVYR"].eq(66.0)

    for value_col, flag_col in targets:
        df.loc[in_cycle_66, value_col] = pd.NA
        df.loc[in_cycle_66, flag_col] = True

    print("[structural] Marked household_size and chol_rx as structural missing for cycle 66")
    return df

# call it
df_my_cov_aligned_short = mark_structural_missing_66(df_my_cov_aligned_short)


[structural] Marked household_size and chol_rx as structural missing for cycle 66


#### sanity check 

In [110]:
# df_my_cov_aligned_short[["CIGS_PER_DAY", "PACK_YEARS"]].tail(40)

In [111]:
# 1) Post-2018 audit by cycle (missing %)

# ---------- Post-2018 audit: missing vs structural missing ----------
# Candidate columns. The categorical BMI column is listed under both names it has
# carried in this notebook ("bmic" in the loaded file, "bmi_cat" after the backfill);
# only columns that actually exist are audited, because selecting an absent column
# with .loc raises KeyError and aborts the whole cell.
_audit_candidates = [
    "RACE","household_size","bmic","bmi_cat","ins","DIABE","HYPERTEN","chol_rx",
    "CVD","cancer","SMK","SMK_STATUS","FORMER_SMOKER","CIGS_PER_DAY",
    "PACK_YEARS","HOQ065"
]
audit_cols = [c for c in _audit_candidates if c in df_my_cov_aligned_short.columns]
_audit_skipped = [c for c in _audit_candidates if c not in df_my_cov_aligned_short.columns]
if _audit_skipped:
    print("[audit] columns not present, skipped:", _audit_skipped)

# Map variables -> their structural-missing flags if you maintain them
structural_map = {
    "HOQ065": "HOQ065_structural_missing",
    "household_size": "household_size_structural_missing",
    "chol_rx": "chol_rx_structural_missing",
    "PACK_YEARS": "PACK_YEARS_structural_missing",  # only if you created it
    "SNAP": "SNAP_structural_missing",               # only if you added SNAP later
}

# We’ll show cycles 66 & 12 only
mask = df_my_cov_aligned_short["SDDSRVYR"].isin([66.0, 12.0])
sub  = df_my_cov_aligned_short.loc[mask, audit_cols + ["SDDSRVYR"]].copy()

# 1) Total missing (%) — share of ALL rows in the cycle whose value is NA
miss_long = (
    sub.melt(id_vars=["SDDSRVYR"], var_name="var", value_name="val")
       .assign(missing=lambda d: d["val"].isna())
       .groupby(["SDDSRVYR","var"], as_index=False)
       .agg(pct_missing=("missing","mean"))
)
miss_long["pct_missing"] = (miss_long["pct_missing"]*100).round(1)

# 2) Structural missing (%) — pull from *_structural_missing flags if present.
#    An unset (NA) flag counts as "not structural", so the share is taken over
#    ALL rows of the cycle — the same denominator as pct_missing above. (A plain
#    mean over a nullable boolean skips NA rows and would use a different base.)
cycle_key = df_my_cov_aligned_short.loc[mask, "SDDSRVYR"]
rows = []
for var in audit_cols:
    flag = structural_map.get(var)
    if flag and flag in df_my_cov_aligned_short.columns:
        is_structural = (
            df_my_cov_aligned_short.loc[mask, flag]
            .astype("boolean").fillna(False).astype(float)
        )
        share = is_structural.groupby(cycle_key).mean().mul(100).round(1)
        for cyc, val in share.items():
            rows.append({"SDDSRVYR": cyc, "var": var, "pct_structural": float(val)})

struct_long = pd.DataFrame(rows)
if struct_long.empty:
    struct_long = miss_long[["SDDSRVYR","var"]].assign(pct_structural=0.0)

# 3) Combine & compute non-structural missing = total - structural (clipped at 0)
audit_all = miss_long.merge(struct_long, on=["SDDSRVYR","var"], how="left")
audit_all["pct_structural"] = audit_all["pct_structural"].fillna(0.0)
audit_all["pct_nonstructural_missing"] = (audit_all["pct_missing"] - audit_all["pct_structural"]).clip(lower=0).round(1)

# 4) Pretty pivots
print("\nTOTAL missing (%):")
print(audit_all.pivot(index="var", columns="SDDSRVYR", values="pct_missing").fillna(0).sort_index())

print("\nSTRUCTURAL missing (%):")
print(audit_all.pivot(index="var", columns="SDDSRVYR", values="pct_structural").fillna(0).sort_index())

print("\nNON-STRUCTURAL missing (%):")
print(audit_all.pivot(index="var", columns="SDDSRVYR", values="pct_nonstructural_missing").fillna(0).sort_index())



TOTAL missing (%):
SDDSRVYR         12.0   66.0
var                         
CIGS_PER_DAY     90.1   89.2
CVD              34.9   41.0
DIABE             4.0    5.6
FORMER_SMOKER    31.9   37.7
HOQ065          100.0  100.0
HYPERTEN         28.9   34.6
PACK_YEARS       97.9   89.2
RACE              0.0    0.0
SMK              31.9   37.7
SMK_STATUS       31.9   37.7
bmic             29.0   15.6
cancer           34.6   40.7
chol_rx          29.1  100.0
household_size    0.0  100.0
ins               0.5    0.2

STRUCTURAL missing (%):
SDDSRVYR         12.0   66.0
var                         
CIGS_PER_DAY      0.0    0.0
CVD               0.0    0.0
DIABE             0.0    0.0
FORMER_SMOKER     0.0    0.0
HOQ065          100.0  100.0
HYPERTEN          0.0    0.0
PACK_YEARS        0.0    0.0
RACE              0.0    0.0
SMK               0.0    0.0
SMK_STATUS        0.0    0.0
bmic              0.0    0.0
cancer            0.0    0.0
chol_rx           0.0  100.0
household_size    0.0  100.

In [112]:
# 2) Composition: how many rows in each post-2018 cycle?
_is_post2018 = df_my_cov_aligned_short["SDDSRVYR"].isin([66.0, 12.0])
_post2018_cycles = df_my_cov_aligned_short.loc[_is_post2018, "SDDSRVYR"]
print(_post2018_cycles.value_counts())


SDDSRVYR
66.0    15560
12.0    11933
Name: count, dtype: int64


In [113]:
# 3) HOQ coverage per cycle (share non-missing)
# Mean of a boolean "has a value" indicator within each cycle = share non-missing.
_hoq_has = df_my_cov_aligned_short["HOQ065"].notna().rename("HOQ_has")
_hoq_coverage = (
    _hoq_has.to_frame()
            .assign(SDDSRVYR=df_my_cov_aligned_short["SDDSRVYR"])
            .groupby("SDDSRVYR")[["HOQ_has"]]
            .mean()
            .sort_index()
)
print(_hoq_coverage)


           HOQ_has
SDDSRVYR          
1.0       0.480482
2.0       0.481294
3.0       0.492096
4.0       0.476421
5.0       0.578579
6.0       0.585840
7.0       0.566216
8.0       0.558821
9.0       0.552101
10.0      0.566890
12.0      0.000000
66.0      0.000000


In [114]:
# 4) SEQN overlap check for HOQ 66 (sanity that mapping keys aligned)
# `file_for` / `fetch_xpt` come from earlier cells; file_for's result is unpacked as (year, file).
y, f = file_for("HOQ", 66)
hoq66 = fetch_xpt(y, f).set_index("SEQN")
m66 = df_my_cov_aligned_short["SDDSRVYR"] == 66.0

# Compare as plain ints so both sides hash the same way in the sets.
seq_df = set(df_my_cov_aligned_short.loc[m66, "SEQN"].dropna().astype(int))
seq_hoq = set(hoq66.index.dropna().astype(int))
n_shared = len(seq_df.intersection(seq_hoq))
print("Cycle 66 HOQ overlap:", n_shared, "of", len(seq_df))


Cycle 66 HOQ overlap: 0 of 15560


In [115]:
# Overlap between your cycle-66 SEQNs and HOQ_J
m66 = df_my_cov_aligned_short["SDDSRVYR"].eq(66.0)
seq66 = set(df_my_cov_aligned_short.loc[m66, "SEQN"].dropna().astype(int))

hoq_j = fetch_xpt("2017","HOQ_J")
seq_j = set(hoq_j["SEQN"].dropna().astype(int))

over = len(seq66 & seq_j)
# Guard the ratio: with no cycle-66 rows in the frame the share is undefined (NaN)
# rather than a ZeroDivisionError that would stop the notebook.
share = over / len(seq66) if seq66 else float("nan")
print(f"Cycle 66 SEQNs: {len(seq66)}  |  in HOQ_J: {over}  |  share: {share:.3f}")


Cycle 66 SEQNs: 15560  |  in HOQ_J: 0  |  share: 0.000


In [116]:
# Cycle-10 SEQNs; `seq66` is the cycle-66 SEQN set built in the preceding cell.
m10 = df_my_cov_aligned_short["SDDSRVYR"] == 10.0
_seqn_cycle10 = df_my_cov_aligned_short.loc[m10, "SEQN"].dropna()
seq10 = set(_seqn_cycle10.astype(int))

print("66∩10 SEQN overlap:", len(seq66.intersection(seq10)))  # expect 0


66∩10 SEQN overlap: 0


In [16]:
# 5) Specific variables that used the 66→_J fallback (optional peek)
def _report_has_col(year, filebase, col):
    """Fetch `year/filebase` and print whether it carries `col`, or the fetch error."""
    try:
        src = fetch_xpt(year, filebase)
        print(f"{filebase}.xpt has {col}?", col in src.columns)
    except Exception as e:
        print(f"{filebase}.xpt fetch failed: {e}")


for mod, col in [("DEMO","DMDHHSIZ"), ("BPQ","BPQ101D")]:
    # File registered for cycle 66 first, then the 2017 "_J" file of the same module.
    y, f = file_for(mod, 66)
    _report_has_col(y, f, col)
    _report_has_col("2017", f"{mod}_J", col)


P_DEMO.xpt has DMDHHSIZ? False
DEMO_J.xpt has DMDHHSIZ? True
P_BPQ.xpt has BPQ101D? False
BPQ_J.xpt has BPQ101D? False


#### check DMDHHSIZ fallback

In [59]:
# Re-run the two cycle-66 backfills that depend on the P_* → *_J fallback.
df_my_cov_aligned_short = backfill_simple(
    df_my_cov_aligned_short, "household_size", 66, "DEMO", "DMDHHSIZ",
    mapper=lambda s: pd.to_numeric(s, errors="coerce").astype("Int64")
)
# chol_rx is backfilled with map_yes_no_codes in the main cycle loop; use the same
# mapper here so a re-run can never write raw questionnaire codes into a column
# that every other cycle fills through the yes/no mapper.
df_my_cov_aligned_short = backfill_simple(
    df_my_cov_aligned_short, "chol_rx", 66, "BPQ", "BPQ101D",
    mapper=map_yes_no_codes
)


[DEMO 66] DMDHHSIZ missing in P_DEMO.xpt, fell back to DEMO_J.xpt
[DEMO 66] filled household_size from 2017/P_DEMO:DMDHHSIZ (n=9254)
[BPQ 66] BPQ101D not in P_BPQ.xpt nor BPQ_J.xpt — skipped


In [62]:
# --- helpers ---
def _fetch_or_none(year: str, filebase: str):
    try:
        return fetch_xpt(year, filebase)
    except Exception as e:
        print(f"[info] {year}/{filebase}.xpt not available ({e})")
        return None

def _seqn_set(df):
    if df is None or "SEQN" not in df.columns:
        return set()
    return set(pd.to_numeric(df["SEQN"], errors="coerce").dropna().astype(int))

# Cycle-66 SEQNs from your frame
m66   = df_my_cov_aligned_short["SDDSRVYR"] == 66.0
seq66 = set(df_my_cov_aligned_short.loc[m66, "SEQN"].dropna().astype(int))
print("Cycle 66 SEQNs:", len(seq66))


def _has_col(frame, col):
    """True when `frame` was fetched (not None) and carries column `col`."""
    return (frame is not None) and (col in getattr(frame, "columns", []))


# ---- DEMO: check DMDHHSIZ in DEMO_J / DEMO_K (K may not exist) ----
demo_j = _fetch_or_none("2017", "DEMO_J")
demo_k = _fetch_or_none("2017", "DEMO_K")  # may 404; that's fine

print("DEMO_J has DMDHHSIZ?", _has_col(demo_j, "DMDHHSIZ"))
print("DEMO_K has DMDHHSIZ?", _has_col(demo_k, "DMDHHSIZ"))

seq_demo = _seqn_set(demo_j).union(_seqn_set(demo_k))
print("66 ∩ (DEMO_J∪K) SEQNs:", len(seq66.intersection(seq_demo)), "of", len(seq66))

# ---- BPQ: check whether BPQ101D is present in BPQ_J / BPQ_K (K may not exist) ----
bpq_j = _fetch_or_none("2017", "BPQ_J")
bpq_k = _fetch_or_none("2017", "BPQ_K")    # may 404; that's fine

print("BPQ_J has BPQ101D?", _has_col(bpq_j, "BPQ101D"))
print("BPQ_K has BPQ101D?", _has_col(bpq_k, "BPQ101D"))

seq_bpq = _seqn_set(bpq_j).union(_seqn_set(bpq_k))
print("66 ∩ (BPQ_J∪K) SEQNs:", len(seq66.intersection(seq_bpq)), "of", len(seq66))


Cycle 66 SEQNs: 15560
[info] 2017/DEMO_K.xpt not available (404 Client Error: Not Found for url: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DEMO_K.xpt)
DEMO_J has DMDHHSIZ? True
DEMO_K has DMDHHSIZ? False
66 ∩ (DEMO_J∪K) SEQNs: 0 of 15560
[info] 2017/BPQ_K.xpt not available (404 Client Error: Not Found for url: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/BPQ_K.xpt)
BPQ_J has BPQ101D? False
BPQ_K has BPQ101D? False
66 ∩ (BPQ_J∪K) SEQNs: 0 of 15560


## check missingness after systematic fetch 

In [74]:
# df_my_cov_aligned_short[["marriage3", "marriage_prev"]].tail(10)

In [171]:
import pandas as pd

THRESH = 20.0
ID_COLS = {"SEQN", "SDDSRVYR"}
FLAG_SUFFIX = "_structural_missing"

# Hide cosmetic / redundant columns here
ALWAYS_EXCLUDE = {
    "marriage_label", "marriage_prev",
}

# ✅ Applicability map (your correction applied: 4YR weights → cycle 66)
WEIGHT_CYCLE_MAP = {
    # classic 2-yr weights (pre-2017 cycles)
    "WTINT2YR": {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},
    "WTMEC2YR": {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},

    # pre-pandemic combined (2017–Mar 2020)
    "WTINTPRP": {66.0},
    "WTMECPRP": {66.0},
    "WTSAFPRP": {66.0},
    "WTINT4YR": {66.0},   # ⬅ updated
    "WTMEC4YR": {66.0},   # ⬅ updated

    # fasting / phlebotomy 2-yr (legacy)
    "WTPH2YR":  {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},
    "WTSAF2YR": {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},
}

def _applicable(var: str, cyc: float) -> bool:
    """Whether `var` is expected to carry data in survey cycle `cyc`.

    Excluded columns are never applicable; weight columns only in the cycles
    listed for them in WEIGHT_CYCLE_MAP; everything else always is.
    """
    if var in ALWAYS_EXCLUDE:
        return False
    if var in WEIGHT_CYCLE_MAP:
        return cyc in WEIGHT_CYCLE_MAP[var]
    return True  # default: applicable

df = df_my_cov_aligned_short.copy()

# columns to audit
value_cols = [c for c in df.columns if c not in ID_COLS and not c.endswith(FLAG_SUFFIX)]

# Non-missing % per (cycle, var), computed column-wise: rows = cycle, columns = var.
# This replaces melting the whole frame to long form and calling a Python function
# once per melted row; the numbers are the same, only the route differs.
missing_share = (
    df[value_cols].isna()
      .assign(SDDSRVYR=df["SDDSRVYR"].to_numpy())
      .groupby("SDDSRVYR")
      .mean()
)
nonmissing = ((1 - missing_share) * 100).round(1)

# Applicability is decided once per (cycle, var) cell; non-applicable cells become NaN
# so they neither count towards the threshold nor print as 0.
applicable = pd.DataFrame(
    [[_applicable(var, cyc) for var in nonmissing.columns] for cyc in nonmissing.index],
    index=nonmissing.index, columns=nonmissing.columns, dtype=bool,
)
nonmissing = nonmissing.where(applicable)

# keep vars with < THRESH non-missing in any applicable cycle
worst = nonmissing.min(axis=0, skipna=True)
low_vars = sorted(worst[worst < THRESH].index)

# pivot; leave non-applicable blank (NaN) instead of 0
pivot = (
    nonmissing[low_vars].T
    .rename_axis(index="var", columns="SDDSRVYR")
    .dropna(axis=1, how="all")
)
# Worst coverage first; stable sort keeps ties in alphabetical order (deterministic output).
pivot = pivot.loc[pivot.min(axis=1, skipna=True).sort_values(kind="stable").index]

print(f"\nVars with <{THRESH:.0f}% non-missing in at least one applicable cycle:")
print(pivot.to_string(na_rep=""))  # blanks for not-applicable



Vars with <20% non-missing in at least one applicable cycle:
SDDSRVYR                     1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0   10.0   12.0  66.0
var                                                                                                         
SNAP_indiv_plus_singleton   15.0   14.6    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   0.0
household_size             100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0   0.0
chol_rx                    100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0   70.9   0.0
bmi                          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   71.0  84.4
ahei_total                  81.0   81.8   75.5   30.5   31.1   32.9   30.5   29.9   28.7   29.9    0.0   0.0
WTPH2YR                      0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0             
SNAP_src_rank               15.1   14.6    0.0    0.0    0.0    0.

In [178]:
import re
import pandas as pd

THRESH = 20.0
ID_COLS = {"SEQN", "SDDSRVYR"}
FLAG_SUFFIX = "_structural_missing"

# Columns we always hide (labels, prev flags, etc.)
ALWAYS_EXCLUDE = {
    "marriage_label", "marriage_prev",
    "probable_depression",
}

# If a more complete/derived sibling exists, skip the base
DERIVED_BETTER = {
    "BMI": {"BMI_CLAS", "BMI_CAT", "bmic", "bmi_cat"},
}

# ✅ Regex rules: ignore families where missingness is expected/OK

IGNORE_REGEXES = [
    r"(^|_)SNAP(\b|_)",          # all SNAP
    r"^(FS|FSDHH)\b",            # food security families
    r"^(SMK|CIGS?|PACK)",        # smoking families
    r".*_label$",                # cosmetic
    r".*_prev$",                 # lag flags
    r"phlebotom|phleb",          # any phlebotomy-related names
]

IGNORE_EXACT = {
    # SNAP variants you’ve seen
    "SNAP", "SNAP_bin", "SNAP_indiv_only", "SNAP_indiv_plus_singleton",
    "SNAP_src", "SNAP_src_rank",
    # explicitly ignore these two
    "WTPH2YR", "wt_phlebotomy",
}


# ✅ Applicability map (4YR weights → cycle 66)
WEIGHT_CYCLE_MAP = {
    "WTINT2YR": {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},
    "WTMEC2YR": {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},
    "WTINTPRP": {66.0},
    "WTMECPRP": {66.0},
    "WTSAFPRP": {66.0},
    "WTINT4YR": {66.0},
    "WTMEC4YR": {66.0},
    "WTPH2YR":  {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},
    "WTSAF2YR": {1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0},
}

def _is_ignored(col: str, cols_lower: set[str]) -> bool:
    c0 = col
    if c0 in ALWAYS_EXCLUDE or c0 in IGNORE_EXACT:
        return True
    for pat in IGNORE_REGEXES:
        if re.search(pat, c0, flags=re.IGNORECASE):
            return True
    # numeric vs categorical “better” siblings
    base = c0.upper()
    if base in DERIVED_BETTER:
        alts = DERIVED_BETTER[base]
        if any(alt.lower() in cols_lower for alt in alts):
            return True
    return False

def _applicable(var: str, cyc: float) -> bool:
    v = var.upper()
    if var in ALWAYS_EXCLUDE or var in IGNORE_EXACT:
        return False
    if v in WEIGHT_CYCLE_MAP:
        return cyc in WEIGHT_CYCLE_MAP[v]
    return True

df = df_my_cov_aligned_short.copy()

# Build filtered column list: drop IDs, structural-missing flags and ignored families
cols_lower = {c.lower() for c in df.columns}
value_cols = [
    c for c in df.columns
    if c not in ID_COLS
    and not c.endswith(FLAG_SUFFIX)
    and not _is_ignored(c, cols_lower)
]

# Non-missing % per (cycle, var), computed column-wise: rows = cycle, columns = var.
# This replaces melting the whole frame to long form and calling a Python function
# once per melted row; the numbers are the same, only the route differs.
missing_share = (
    df[value_cols].isna()
      .assign(SDDSRVYR=df["SDDSRVYR"].to_numpy())
      .groupby("SDDSRVYR")
      .mean()
)
nonmissing = ((1 - missing_share) * 100).round(1)

# Applicability is decided once per (cycle, var) cell; non-applicable cells become NaN
# so they neither count towards the threshold nor print as 0.
applicable = pd.DataFrame(
    [[_applicable(var, cyc) for var in nonmissing.columns] for cyc in nonmissing.index],
    index=nonmissing.index, columns=nonmissing.columns, dtype=bool,
)
nonmissing = nonmissing.where(applicable)

# Vars whose worst applicable cycle falls below THRESH
worst = nonmissing.min(axis=0, skipna=True)
low_vars = sorted(worst[worst < THRESH].index)

pivot = (
    nonmissing[low_vars].T
    .rename_axis(index="var", columns="SDDSRVYR")
    .dropna(axis=1, how="all")
)
# Worst coverage first; stable sort keeps ties in alphabetical order (deterministic output).
pivot = pivot.loc[pivot.min(axis=1, skipna=True).sort_values(kind="stable").index]

print(f"\nVars with <{THRESH:.0f}% non-missing in at least one applicable cycle (SNAP removed):")
print(pivot.to_string(na_rep=""))



Vars with <20% non-missing in at least one applicable cycle (SNAP removed):
SDDSRVYR          1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0   10.0   12.0  66.0
var                                                                                              
DRINKS_PER_DAY   24.3   24.6   24.5   24.6   29.9   31.3   31.3   33.1   31.6    0.0    0.0   0.0
HOQ065           48.0   48.1   49.2   47.6   57.9   58.6   56.6   55.9   55.2   56.7    0.0   0.0
ahei_total       81.0   81.8   75.5   30.5   31.1   32.9   30.5   29.9   28.7   29.9    0.0   0.0
chol_rx         100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0   70.9   0.0
household_size  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0  100.0   0.0
sdoh_access      48.1   48.2   49.2   47.7   57.9   58.7   56.7   56.0   55.4   56.8   58.7   0.0


In [173]:
# List the columns currently in the covariate frame (reference for the audits above).
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       'household_size', 'EDU', 'pir', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER',
       'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'bmi_cat', 'DIABE', 'HYPERTEN',
       'chol_rx', 'CVD', 'cancer', 'probable_depression', 'ahei_total',
       'unemployment2', 'sdoh_access', 'ins', 'HOQ065', 'marriage', 'SNAP',
       'FS', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR',
       'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'wt_int', 'wt_mec', 'wt_fasting',
       'wt_phlebotomy', 'WTPH2YR', 'marriage_prev', 'marriage_label',
       'marriage3', 'SNAP_src', 'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton', 'bmi', 'HOQ065_structural_missing',
       'household_size_structural_missing', 'chol_rx_structural_missing'],
      dtype='object')

In [184]:
# Spot-check DIABE on the last 100 rows of the frame.
df_my_cov_aligned_short["DIABE"].tail(100)

128709    0
128710    0
128711    0
128712    0
128713    1
         ..
128804    0
128805    1
128806    0
128807    0
128808    0
Name: DIABE, Length: 100, dtype: Int64

## Save (pre-2018 covariates complete; post-2018 almost complete)

In [187]:
from pathlib import Path

OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
OUT.mkdir(parents=True, exist_ok=True)


def _bytes_to_text(value):
    """Decode bytes/bytearray cells as UTF-8 (undecodable bytes dropped); pass others through."""
    if isinstance(value, (bytes, bytearray)):
        return value.decode("utf-8","ignore")
    return value


# Work on a copy so the in-memory frame keeps its original dtypes.
df = df_my_cov_aligned_short.copy()
# Every object-dtype column is decoded cell by cell and then cast to pandas
# "string" dtype before the frame is written to parquet.
for c in df.select_dtypes(include="object"):
    decoded = df[c].apply(_bytes_to_text)
    df[c] = decoded.astype("string")

handoff = OUT / "cov_addv2_99_23.parquet"
df.to_parquet(handoff, index=False)
print("✓ Saved:", handoff)


✓ Saved: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_addv2_99_23.parquet
