# Backfill missing covariates (cov)

## Load previous cov file

In [5]:
import pandas as pd
from pathlib import Path

# Location of the analysis outputs and the concise covariate file to reload.
OUT = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output")
src = OUT / "cov_concise_99_23.parquet"

# Read the table and coerce the two key columns in a single pass:
# SEQN becomes nullable Int64, SDDSRVYR becomes numeric (unparseable -> NaN).
df_my_cov_aligned_short = pd.read_parquet(src).assign(
    SEQN=lambda d: pd.to_numeric(d["SEQN"], errors="coerce").astype("Int64"),
    SDDSRVYR=lambda d: pd.to_numeric(d["SDDSRVYR"], errors="coerce"),
)

# Guarantee a SNAP column exists (all-missing, nullable Int64) for later cells.
if "SNAP" not in df_my_cov_aligned_short:
    df_my_cov_aligned_short["SNAP"] = pd.array(
        [pd.NA] * len(df_my_cov_aligned_short), dtype="Int64"
    )

print("✓ Loaded cov_concise_99_23:", df_my_cov_aligned_short.shape)


✓ Loaded cov_concise_99_23: (128809, 51)


## SNAP pre2003

In [7]:
# 08_snap_backfill: Fill SNAP in 1999–2002 and re-check coverage
# - Load FSQ (1999) / FSQ_B (2001)
# - Hierarchy: FSD200 (current) → FSD180 (past yr) → HH proxy FSD170N (promote if HH size==1)
# - Overwrite only cycles 1 & 2, keep provenance in SNAP_src
# - Build SNAP_bin (0/1/NA) and print audits

import pandas as pd, numpy as np, io, requests
from typing import Optional

# ---------- fetch helper (fix early-cycle year folders) ----------
# Cycle labels whose URL folder differs from the label itself; any label not
# listed here is used verbatim as the folder name.
_YEARFOLDER_FIX = {"1999-2000": "1999", "2001-2002": "2001"}

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60.0) -> pd.DataFrame:
    """
    Download one XPT file and return it as a DataFrame.

    Parameters
    ----------
    year_folder : cycle label (e.g. "1999-2000"); remapped through
        _YEARFOLDER_FIX when an entry exists.
    filebase : file name without the ".xpt" extension (e.g. "FSQ").
    timeout : seconds passed to requests.get; without a timeout a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    DataFrame with upper-cased column names and SEQN as nullable Int64.

    Raises
    ------
    requests.HTTPError on a non-success status (via raise_for_status);
    requests.Timeout if the server does not answer within `timeout`.
    """
    yf = _YEARFOLDER_FIX.get(year_folder, year_folder)
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{yf}/DataFiles/{filebase}.xpt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    df = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    df.columns = [c.upper() for c in df.columns]
    df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
    return df

# ---------- pull SNAP-relevant columns from FSQ ----------
def load_fsq_for_cycle(year_folder: str, filebase: str) -> pd.DataFrame:
    """
    Fetch one FSQ file and keep only the SNAP-related items.

    The result is indexed by SEQN and holds whichever of
    FSD200, FSD180, FSD170N, DMDHHSIZ the file actually contains,
    each coerced to numeric (unparseable values become NaN).
    """
    wanted = ("FSD200", "FSD180", "FSD170N", "DMDHHSIZ")

    by_seqn = fetch_xpt(year_folder, filebase).set_index("SEQN")
    available = [name for name in wanted if name in by_seqn.columns]

    subset = by_seqn[available]
    return subset.apply(pd.to_numeric, errors="coerce")

def fill_snap_hierarchy(df: pd.DataFrame, fsq: pd.DataFrame, cycle_code: int) -> pd.DataFrame:
    """
    Derive SNAP for one survey cycle from FSQ items and write it into a copy of df.

    For rows with SDDSRVYR == cycle_code, in priority order:
      1) FSD200: 1→1, 2→0
      2) FSD180: 1→1, 2→0 (fill remaining)
      3) FSD170N (HH proxy): >0→1, ==0→0 (fill remaining); mark as HH proxy
         Relabel the HH proxy as "FSD170N_HH_singleton" if DMDHHSIZ==1.
    Any other code, or a SEQN with no FSQ record, leaves SNAP as NA.

    Rows of the selected cycle are overwritten wholesale: a pre-existing SNAP
    value there is replaced, including by NA when no rule applies. Rows of
    other cycles keep their values.

    Parameters
    ----------
    df : table with at least SEQN and SDDSRVYR; not modified in place.
    fsq : FSQ items indexed by SEQN (index must be unique for reindex).
    cycle_code : SDDSRVYR value of the cycle to fill.

    Returns
    -------
    Copy of df that always has SNAP (Int64 0/1/NA) and SNAP_src (string),
    even when no row belongs to cycle_code.

    Raises
    ------
    TypeError / ValueError if an existing SNAP column cannot be cast to Int64.
    """
    out = df.copy()

    # Normalise (or create) the output columns first so dtype and schema are
    # the same on every return path, including the early return below.
    if "SNAP" in out.columns:
        out["SNAP"] = out["SNAP"].astype("Int64")
    else:
        out["SNAP"] = pd.Series(pd.NA, index=out.index, dtype="Int64")
    if "SNAP_src" in out.columns:
        out["SNAP_src"] = out["SNAP_src"].astype("string")
    else:
        out["SNAP_src"] = pd.Series(pd.NA, index=out.index, dtype="string")

    m = out["SDDSRVYR"].eq(float(cycle_code))
    if not m.any():
        return out

    # Line FSQ up with the cycle's rows; only positional order is used from
    # here on, so the SEQN labels are dropped to keep every mask trivially aligned.
    seqn = out.loc[m, "SEQN"].astype("Int64")
    sub = fsq.reindex(seqn.values).reset_index(drop=True)

    snap = pd.Series(pd.NA, index=sub.index, dtype="Int64")
    src = pd.Series(pd.NA, index=sub.index, dtype="string")

    def _assign(mask: pd.Series, value: int, label: str) -> None:
        # Record both the derived value and the item it came from.
        snap.loc[mask] = value
        src.loc[mask] = label

    # 1) current authorization, 2) past-year authorization.
    # Each item only fills rows still missing after the previous one.
    for item in ("FSD200", "FSD180"):
        if item not in sub.columns:
            continue
        codes = sub[item]
        still_missing = snap.isna()
        _assign(still_missing & codes.eq(1), 1, item)
        _assign(still_missing & codes.eq(2), 0, item)

    # 3) HH proxy (any authorized in HH)
    if "FSD170N" in sub.columns:
        n_auth = sub["FSD170N"]
        still_missing = snap.isna()
        # NOTE(review): every positive value counts as authorized; if the
        # codebook uses special codes (e.g. refused / don't know) for this
        # item they would be counted as 1 here — confirm.
        _assign(still_missing & (n_auth > 0), 1, "FSD170N_HH")
        _assign(still_missing & (n_auth == 0), 0, "FSD170N_HH")

        # Relabel the proxy when the household has exactly one member.
        if "DMDHHSIZ" in sub.columns:
            singleton = src.eq("FSD170N_HH") & sub["DMDHHSIZ"].eq(1)
            # src is NA for unfilled rows, which makes the comparison NA;
            # treat those rows as "not a singleton proxy".
            singleton = singleton.fillna(False).astype(bool)
            src.loc[singleton] = "FSD170N_HH_singleton"

    out.loc[m, "SNAP"] = snap.values
    out.loc[m, "SNAP_src"] = src.values
    return out

# ---------- PRE audit ----------
def print_coverage(df: pd.DataFrame, col: str, title: str) -> None:
    """
    Print the percentage of non-missing values of `col` per SDDSRVYR group.

    Percentages are rounded to one decimal. Rows whose SDDSRVYR is missing
    are left out, as groupby drops NA keys by default. Nothing is returned.
    """
    # Mean of a boolean "is present" flag per cycle == share non-missing.
    # Computed without groupby.apply so it does not depend on the
    # `include_groups` keyword, which is not accepted by every pandas version.
    present = df[col].notna()
    cov = present.groupby(df["SDDSRVYR"], observed=True).mean().mul(100).round(1)
    print(f"\n{title}\n", cov)

# Coverage snapshot before any backfill.
print_coverage(df_my_cov_aligned_short, "SNAP", "SNAP coverage by cycle (% non-missing) — BEFORE")

# ---------- apply to cycles 1 & 2 ----------
# FSQ extracts for the two earliest cycles.
fsq_9900 = load_fsq_for_cycle("1999-2000", "FSQ")
fsq_0102 = load_fsq_for_cycle("2001-2002", "FSQ_B")

# Count non-missing SNAP rows, fill cycle 1 then cycle 2, and count again.
before1 = df_my_cov_aligned_short["SNAP"].count()
df_my_cov_aligned_short = fill_snap_hierarchy(
    fill_snap_hierarchy(df_my_cov_aligned_short, fsq_9900, 1),
    fsq_0102,
    2,
)
after1 = df_my_cov_aligned_short["SNAP"].count()
print(f"\nFilled cycles 1–2 | rows with SNAP non-missing: {before1} → {after1}")

# ---------- binarize & audits ----------
# SNAP already holds 0/1/NA; SNAP_bin is an explicitly typed copy of it.
df_my_cov_aligned_short["SNAP_bin"] = df_my_cov_aligned_short["SNAP"].astype("Int64")

print_coverage(df_my_cov_aligned_short, "SNAP", "SNAP coverage by cycle (% non-missing) — AFTER")

# Which FSQ item supplied each filled value, restricted to cycles 1 and 2.
_in_early_cycles = df_my_cov_aligned_short["SDDSRVYR"].isin([1.0, 2.0])
src_counts = (
    df_my_cov_aligned_short[_in_early_cycles]
    .groupby(["SDDSRVYR", "SNAP_src"], observed=True)["SEQN"]
    .count()
    .sort_index()
)
print("\nSNAP source counts (cycles 1–2):\n", src_counts)



SNAP coverage by cycle (% non-missing) — BEFORE
 SDDSRVYR
1.0      4.8
2.0      4.2
3.0     48.9
4.0     47.5
5.0     57.9
6.0     58.3
7.0     56.5
8.0     55.9
9.0     55.0
10.0    56.2
12.0     0.0
66.0    54.5
Name: SNAP, dtype: float64

Filled cycles 1–2 | rows with SNAP non-missing: 53193 → 55371

SNAP coverage by cycle (% non-missing) — AFTER
 SDDSRVYR
1.0     15.1
2.0     14.6
3.0     48.9
4.0     47.5
5.0     57.9
6.0     58.3
7.0     56.5
8.0     55.9
9.0     55.0
10.0    56.2
12.0     0.0
66.0    54.5
Name: SNAP, dtype: float64

SNAP source counts (cycles 1–2):
 SDDSRVYR  SNAP_src  
1.0       FSD170N_HH      8
          FSD180        948
          FSD200        548
2.0       FSD180        893
          FSD200        716
Name: SEQN, dtype: Int64


#### check 

In [11]:
# --- Guardrails: ensure ONLY cycles 1–2 changed ---
# A provenance tag on a row outside cycles 1-2 would mean the backfill leaked.
_outside_early = ~df_my_cov_aligned_short["SDDSRVYR"].isin([1.0, 2.0])
_changed = df_my_cov_aligned_short["SNAP_src"].notna() & _outside_early
assert not _changed.any(), "SNAP was modified outside cycles 1–2 unexpectedly."

# --- Provenance-aware flags (nullable Int64-safe) ---
# Higher rank = closer to an individual-level, current measure.
src_rank = {
    "FSD200": 3,                    # current, individual
    "FSD180": 2,                    # past 12m, individual
    "FSD170N_HH_singleton": 2,      # HH proxy but singleton household
    "FSD170N_HH": 1                 # HH proxy (multi-person)
}
_ranked_src = df_my_cov_aligned_short["SNAP_src"].map(src_rank)
df_my_cov_aligned_short["SNAP_src_rank"] = _ranked_src.astype("Int64")

# --- Sensitivity variants (build as Pandas Series, not NumPy arrays) ---
mask_indiv = df_my_cov_aligned_short["SNAP_src"].isin(["FSD200", "FSD180"])
mask_indiv_or_single = df_my_cov_aligned_short["SNAP_src"].isin(
    ["FSD200", "FSD180", "FSD170N_HH_singleton"]
)

# One nullable-Int64 view of SNAP; each variant keeps it where its mask
# holds and is NA everywhere else.
_snap_int = pd.to_numeric(
    df_my_cov_aligned_short["SNAP"], errors="coerce"
).astype("Int64")

df_my_cov_aligned_short["SNAP_indiv_only"] = _snap_int.where(mask_indiv)
df_my_cov_aligned_short["SNAP_indiv_plus_singleton"] = _snap_int.where(
    mask_indiv_or_single
)

# Ensure the main binary is tidy nullable Int64 too
df_my_cov_aligned_short["SNAP_bin"] = _snap_int

# --- Quick QC: adult participation rates by cycle (no save) ---
adults = df_my_cov_aligned_short[df_my_cov_aligned_short["RIDAGEYR"].ge(18)]

def _rate(s: pd.Series) -> float:
    """Percentage of values equal to 1 among the non-missing ones (NaN if none)."""
    numeric = pd.to_numeric(s, errors="coerce")
    n_valid = numeric.notna().sum()
    if not n_valid:
        # No usable observations: a rate is undefined.
        return float("nan")
    n_yes = numeric.eq(1).sum()
    return float(n_yes / n_valid * 100)

# Output column label -> source column; every entry is summarised per cycle
# among adults with _rate.
_qc_sources = {
    "SNAP_bin_rate": "SNAP_bin",
    "indiv_only_rate": "SNAP_indiv_only",
    "indiv+singleton_rate": "SNAP_indiv_plus_singleton",
}
_adults_by_cycle = adults.groupby("SDDSRVYR", observed=True)

qc = pd.DataFrame(
    {label: _adults_by_cycle[column].apply(_rate) for label, column in _qc_sources.items()}
).round(2)

print("\nAdult SNAP participation rates by cycle (% among non-missing):\n", qc)

# Optional: free memory if not reusing FSQ pulls
# del fsq_9900, fsq_0102



Adult SNAP participation rates by cycle (% among non-missing):
           SNAP_bin_rate  indiv_only_rate  indiv+singleton_rate
SDDSRVYR                                                      
1.0               48.73            48.17                 48.17
2.0               54.66            54.66                 54.66
3.0               12.72              NaN                   NaN
4.0               11.10              NaN                   NaN
5.0               15.68              NaN                   NaN
6.0               19.82              NaN                   NaN
7.0               23.63              NaN                   NaN
8.0               22.37              NaN                   NaN
9.0               25.86              NaN                   NaN
10.0              23.67              NaN                   NaN
12.0                NaN              NaN                   NaN
66.0              24.46              NaN                   NaN


#### check missingness again 

In [12]:
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       'household_size', 'EDU', 'pir', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER',
       'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'bmic', 'DIABE', 'HYPERTEN', 'chol_rx',
       'CVD', 'cancer', 'probable_depression', 'ahei_total', 'unemployment2',
       'sdoh_access', 'ins', 'HOQ065', 'marriage', 'SNAP', 'FS', 'WTINT2YR',
       'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR', 'WTINTPRP', 'WTMECPRP',
       'WTSAFPRP', 'wt_int', 'wt_mec', 'wt_fasting', 'wt_phlebotomy',
       'WTPH2YR', 'marriage_prev', 'marriage_label', 'marriage3', 'SNAP_prev',
       'SNAP_bin', 'SNAP_src', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton'],
      dtype='object')

In [23]:
import pandas as pd
import numpy as np

# --- keep only SNAP_bin & marriage3; drop other helper columns ---
snap_drop = {"SNAP", "SNAP_prev", "SNAP_src", "SNAP_src_rank",
             "SNAP_indiv_only", "SNAP_indiv_plus_singleton"}
marriage_drop = {"marriage", "marriage_prev", "marriage_label"}

base_cols = list(df_my_cov_aligned_short.columns)

# Everything except the helper columns, in original column order.
_helper_cols = snap_drop | marriage_drop
cols_check = [name for name in base_cols if name not in _helper_cols]

# ensure main signals are present
must_keep = [c for c in ["SNAP_bin", "marriage3"] if c in df_my_cov_aligned_short.columns]
cols_check += [name for name in must_keep if name not in cols_check]

# always keep cycle/ID up front
priority = ["SEQN", "SDDSRVYR"]
_leading = [name for name in priority if name in cols_check]
_trailing = [name for name in cols_check if name not in priority]
cols_check = _leading + _trailing

# --- helper: % missing by cycle ---
def pct_missing_by_cycle(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """
    Percentage of missing values per SDDSRVYR group for each column in `cols`.

    Returns a DataFrame with one row per cycle (float index, ascending) and
    one column per entry of `cols`, rounded to one decimal. If SDDSRVYR is
    itself listed in `cols` it is reported too. Rows with a missing
    SDDSRVYR are left out, as groupby drops NA keys by default.
    """
    # Group the boolean "is missing" flags by cycle and average them.
    # This avoids groupby.apply and its `include_groups` keyword, which is
    # not accepted by every pandas version, and runs vectorised.
    is_missing = df[cols].isna()
    miss = is_missing.groupby(df["SDDSRVYR"], observed=True).mean().mul(100)
    miss.index = miss.index.astype(float)
    miss = miss.sort_index()
    return miss.round(1)

miss_by_cycle = pct_missing_by_cycle(df_my_cov_aligned_short, cols_check)

# overall % missing to rank columns
_missing_flags = df_my_cov_aligned_short[cols_check].isna()
overall_missing = (_missing_flags.mean() * 100).round(1).rename("overall_%miss")

# Stack the overall row on top of the per-cycle rows and label the rows
# "OVERALL", then each cycle code as an integer string.
_cycle_labels = [str(code) for code in miss_by_cycle.index.astype(int)]
miss_table = pd.concat([overall_missing.to_frame().T, miss_by_cycle], axis=0)
miss_table.index = ["OVERALL"] + _cycle_labels

# quick view: top 20 most-missing variables overall (sorted once, reused)
_most_missing = overall_missing.sort_values(ascending=False).head(20)
top20 = _most_missing.index.tolist()
print("Top 20 by OVERALL % missing:\n", _most_missing)
print("\nMissingness table (top 20 vars) — rows are cycles, columns are variables:")
display(miss_table[top20])

# If you want the full table (can be wide), uncomment:
# display(miss_table)


Top 20 by OVERALL % missing:
 WTSAFPRP               96.0
WTPH2YR                93.7
wt_phlebotomy          93.7
SMK_AVG                91.2
CIGS_PER_DAY           91.2
PACK_YEARS             90.6
WTMECPRP               87.9
WTINTPRP               87.9
WTMEC4YR               87.9
WTINT4YR               87.9
DRINKS_PER_DAY         79.8
ahei_total             64.2
HOQ065                 58.1
SMK_STATUS             57.3
SMK                    57.3
FORMER_SMOKER          57.2
SNAP_bin               57.0
probable_depression    55.0
sdoh_access            52.6
FS                     51.8
Name: overall_%miss, dtype: float64

Missingness table (top 20 vars) — rows are cycles, columns are variables:


Unnamed: 0,WTSAFPRP,WTPH2YR,wt_phlebotomy,SMK_AVG,CIGS_PER_DAY,PACK_YEARS,WTMECPRP,WTINTPRP,WTMEC4YR,WTINT4YR,DRINKS_PER_DAY,ahei_total,HOQ065,SMK_STATUS,SMK,FORMER_SMOKER,SNAP_bin,probable_depression,sdoh_access,FS
OVERALL,96.0,93.7,93.7,91.2,91.2,90.6,87.9,87.9,87.9,87.9,79.8,64.2,58.1,57.3,57.3,57.2,57.0,55.0,52.6,51.8
1,100.0,100.0,100.0,90.0,90.0,88.5,100.0,100.0,100.0,100.0,75.7,19.0,52.0,51.2,51.2,51.0,84.9,92.8,51.9,52.4
2,100.0,100.0,100.0,89.4,89.4,88.7,100.0,100.0,100.0,100.0,75.4,18.2,51.9,51.1,51.1,51.0,85.4,92.6,51.8,54.2
3,100.0,100.0,100.0,89.0,89.0,87.9,100.0,100.0,100.0,100.0,75.5,24.5,50.8,50.3,50.3,50.2,51.1,93.2,50.8,52.4
4,100.0,100.0,100.0,89.6,89.6,89.3,100.0,100.0,100.0,100.0,75.4,69.5,52.4,51.9,51.9,51.9,52.5,48.5,52.3,52.4
5,100.0,100.0,100.0,87.1,87.1,86.8,100.0,100.0,100.0,100.0,70.1,68.9,42.1,41.6,41.6,41.5,42.1,40.9,42.1,42.1
6,100.0,100.0,100.0,87.3,87.3,87.0,100.0,100.0,100.0,100.0,68.7,67.1,41.4,41.0,41.0,41.0,41.7,39.6,41.3,41.7
7,100.0,100.0,100.0,88.8,88.8,88.6,100.0,100.0,100.0,100.0,68.7,69.5,43.4,43.1,43.1,43.0,43.5,42.4,43.3,43.3
8,100.0,100.0,100.0,88.4,88.4,88.3,100.0,100.0,100.0,100.0,66.9,70.1,44.1,43.3,43.3,43.3,44.1,41.8,44.0,44.1
9,100.0,100.0,100.0,89.4,89.4,88.3,100.0,100.0,100.0,100.0,68.4,71.3,44.8,42.8,42.8,42.6,45.0,42.5,44.6,44.7


In [None]:
#### TODO: resume here next — pick the next column to fetch.