# Backfill missing covariates

## Load previous cov file

In [5]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the analysis outputs. Set the SDOH_OUT_DIR environment variable
# to run on another machine; the default is the original hard-coded location.
OUT = Path(os.environ.get("SDOH_OUT_DIR", "/Users/dengshuyue/Desktop/SDOH/analysis/output"))
src = OUT / "cov_concise_99_23.parquet"

df_my_cov_aligned_short = pd.read_parquet(src)
# Normalise the keys used below: SEQN as nullable integer, SDDSRVYR as numeric.
df_my_cov_aligned_short["SEQN"] = pd.to_numeric(df_my_cov_aligned_short["SEQN"], errors="coerce").astype("Int64")
df_my_cov_aligned_short["SDDSRVYR"] = pd.to_numeric(df_my_cov_aligned_short["SDDSRVYR"], errors="coerce")
# Guarantee a SNAP column (all-NA, nullable integer) when the file does not carry one.
if "SNAP" not in df_my_cov_aligned_short.columns:
    df_my_cov_aligned_short["SNAP"] = pd.Series(pd.NA, index=df_my_cov_aligned_short.index, dtype="Int64")

print("✓ Loaded cov_concise_99_23:", df_my_cov_aligned_short.shape)


✓ Loaded cov_concise_99_23: (128809, 51)


## SNAP backfill for pre-2003 cycles (1999–2002)

In [7]:
# 08_snap_backfill: Fill SNAP in 1999–2002 and re-check coverage
# - Load FSQ (1999) / FSQ_B (2001)
# - Hierarchy: FSD200 (current) → FSD180 (past yr) → HH proxy FSD170N (promote if HH size==1)
# - Overwrite only cycles 1 & 2, keep provenance in SNAP_src
# - Build SNAP_bin (0/1/NA) and print audits

import pandas as pd, numpy as np, io, requests
from typing import Optional

# ---------- fetch helper (fix early-cycle year folders) ----------
# Maps early-cycle labels to the folder name used in the download URL.
_YEARFOLDER_FIX = {"1999-2000": "1999", "2001-2002": "2001"}

def fetch_xpt(year_folder: str, filebase: str, timeout: float = 60.0) -> pd.DataFrame:
    """
    Download one NHANES XPT file and return it as a DataFrame.

    Parameters
    ----------
    year_folder : cycle label; values listed in _YEARFOLDER_FIX are translated,
        anything else is used in the URL as given.
    filebase : file name without the ".xpt" extension (e.g. "FSQ").
    timeout : seconds passed to requests.get so a stalled connection cannot
        block the notebook indefinitely.

    Returns
    -------
    DataFrame with upper-cased column names and SEQN as nullable Int64.

    Raises
    ------
    requests.HTTPError for a non-success status; requests.Timeout on timeout;
    KeyError if the file has no SEQN column.
    """
    yf = _YEARFOLDER_FIX.get(year_folder, year_folder)
    url = f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{yf}/DataFiles/{filebase}.xpt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    df = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    df.columns = [c.upper() for c in df.columns]
    df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
    return df

# ---------- pull SNAP-relevant columns from FSQ ----------
def load_fsq_for_cycle(year_folder: str, filebase: str) -> pd.DataFrame:
    """
    Download one FSQ file and keep only the SNAP-related items.

    The result is indexed by SEQN. Of FSD200, FSD180, FSD170N and DMDHHSIZ only
    the columns present in the file are returned, each coerced to numeric
    (values that cannot be parsed become NaN).
    """
    wanted = ("FSD200", "FSD180", "FSD170N", "DMDHHSIZ")
    indexed = fetch_xpt(year_folder, filebase).set_index("SEQN")
    present = [name for name in wanted if name in indexed.columns]
    subset = indexed[present]
    return subset.apply(pd.to_numeric, errors="coerce")

def fill_snap_hierarchy(df: pd.DataFrame, fsq: pd.DataFrame, cycle_code: int) -> pd.DataFrame:
    """
    Derive SNAP (0/1/NA) for one survey cycle from FSQ items; returns a copy of `df`.

    For rows with SDDSRVYR == cycle_code the first resolving rule wins:
      1) FSD200: 1→1, 2→0
      2) FSD180: 1→1, 2→0 (fill remaining)
      3) FSD170N (HH proxy): >0→1, ==0→0 (fill remaining); SNAP_src is
         "FSD170N_HH", relabelled "FSD170N_HH_singleton" when DMDHHSIZ == 1
         (only possible if `fsq` carries a DMDHHSIZ column).

    Parameters
    ----------
    df : frame with SEQN and SDDSRVYR columns; it is not modified.
    fsq : FSQ subset indexed by SEQN (any of FSD200, FSD180, FSD170N, DMDHHSIZ).
        If a SEQN occurs more than once only its first row is used.
    cycle_code : SDDSRVYR value to fill.

    Returns
    -------
    Copy of `df` in which SNAP (Int64) and SNAP_src (string) always exist, also
    when no row belongs to `cycle_code`. Every row of the target cycle is
    overwritten, so an existing SNAP value there becomes NA when no FSQ item
    resolves it; rows of other cycles keep their (numeric) values.

    NOTE(review): codes other than 1/2 in FSD200/FSD180 stay NA, but step 3 treats
    every positive FSD170N as participation — confirm FSD170N carries no
    refused / don't-know sentinel codes.
    """
    out = df.copy()

    # Create / normalise the output columns before the early return so that callers
    # can rely on them, and so the nullable values written at the end do not depend
    # on whatever dtype the incoming SNAP column happened to have.
    if "SNAP" in out.columns:
        out["SNAP"] = pd.to_numeric(out["SNAP"], errors="coerce").astype("Int64")
    else:
        out["SNAP"] = pd.Series(pd.NA, index=out.index, dtype="Int64")
    if "SNAP_src" in out.columns:
        out["SNAP_src"] = out["SNAP_src"].astype("string")
    else:
        out["SNAP_src"] = pd.Series(pd.NA, index=out.index, dtype="string")

    m = out["SDDSRVYR"].eq(float(cycle_code))
    if not m.any():
        return out

    # reindex() refuses duplicate source labels; keep the first FSQ row per SEQN.
    if not fsq.index.is_unique:
        fsq = fsq.loc[~fsq.index.duplicated(keep="first")]

    # One FSQ row per target row, in target-row order (all-NaN when SEQN is absent).
    seqn = out.loc[m, "SEQN"].astype("Int64")
    sub = fsq.reindex(seqn.values)

    snap = pd.Series(pd.NA, index=sub.index, dtype="Int64")
    src  = pd.Series(pd.NA, index=sub.index, dtype="string")

    # 1) Current authorization
    if "FSD200" in sub.columns:
        s = sub["FSD200"]
        snap.loc[s.eq(1)] = 1; src.loc[s.eq(1)] = "FSD200"
        snap.loc[s.eq(2)] = 0; src.loc[s.eq(2)] = "FSD200"

    # 2) Past-year authorization (only where step 1 left NA)
    need = snap.isna()
    if "FSD180" in sub.columns:
        s = sub["FSD180"]
        snap.loc[need & s.eq(1)] = 1; src.loc[need & s.eq(1)] = "FSD180"
        snap.loc[need & s.eq(2)] = 0; src.loc[need & s.eq(2)] = "FSD180"

    # 3) HH proxy (any authorized in HH), only where steps 1–2 left NA
    need = snap.isna()
    if "FSD170N" in sub.columns:
        h = sub["FSD170N"]
        snap.loc[need & (h > 0)] = 1; src.loc[need & (h > 0)] = "FSD170N_HH"
        snap.loc[need & (h == 0)] = 0; src.loc[need & (h == 0)] = "FSD170N_HH"

        # Promote to individual if household size==1
        if "DMDHHSIZ" in sub.columns:
            # src is a nullable string, so the comparison can yield NA; treat NA as False.
            promote = (src.eq("FSD170N_HH") & sub["DMDHHSIZ"].eq(1)).fillna(False)
            src.loc[promote] = "FSD170N_HH_singleton"

    # Positional write-back: `sub` was built in the order of the rows selected by `m`.
    out.loc[m, "SNAP"] = snap.values
    out.loc[m, "SNAP_src"] = src.values
    return out

# ---------- PRE audit ----------
def print_coverage(df: pd.DataFrame, col: str, title: str) -> None:
    """
    Print, per SDDSRVYR cycle, the percentage of rows in which `col` is non-missing.

    Parameters
    ----------
    df : frame containing SDDSRVYR and `col`.
    col : column whose coverage is reported.
    title : heading printed above the table.
    """
    # A grouped mean of the not-null indicator gives the same numbers as the former
    # GroupBy.apply call without depending on its version-specific `include_groups`
    # keyword.
    cov = (df[col].notna()
             .groupby(df["SDDSRVYR"], observed=True)
             .mean()
             .mul(100)
             .round(1))
    print(f"\n{title}\n", cov)

# Coverage before the backfill, for comparison with the AFTER table below.
print_coverage(df_my_cov_aligned_short, "SNAP", "SNAP coverage by cycle (% non-missing) — BEFORE")

# ---------- backfill cycle 1 (FSQ) and cycle 2 (FSQ_B) ----------
fsq_9900 = load_fsq_for_cycle("1999-2000", "FSQ")
fsq_0102 = load_fsq_for_cycle("2001-2002", "FSQ_B")

before1 = df_my_cov_aligned_short["SNAP"].notna().sum()
for _cycle_code, _fsq in ((1, fsq_9900), (2, fsq_0102)):
    df_my_cov_aligned_short = fill_snap_hierarchy(df_my_cov_aligned_short, _fsq, _cycle_code)
after1 = df_my_cov_aligned_short["SNAP"].notna().sum()
print(f"\nFilled cycles 1–2 | rows with SNAP non-missing: {before1} → {after1}")

# ---------- binary copy & audits ----------
# SNAP_bin is SNAP cast to nullable Int64; it is the column audited downstream.
df_my_cov_aligned_short["SNAP_bin"] = df_my_cov_aligned_short["SNAP"].astype("Int64")

print_coverage(df_my_cov_aligned_short, "SNAP", "SNAP coverage by cycle (% non-missing) — AFTER")

# How many rows each FSQ item contributed in the two backfilled cycles.
_early_rows = df_my_cov_aligned_short["SDDSRVYR"].isin([1.0, 2.0])
_by_source = df_my_cov_aligned_short.loc[_early_rows].groupby(
    ["SDDSRVYR", "SNAP_src"], observed=True
)
src_counts = _by_source["SEQN"].count().sort_index()
print("\nSNAP source counts (cycles 1–2):\n", src_counts)



SNAP coverage by cycle (% non-missing) — BEFORE
 SDDSRVYR
1.0      4.8
2.0      4.2
3.0     48.9
4.0     47.5
5.0     57.9
6.0     58.3
7.0     56.5
8.0     55.9
9.0     55.0
10.0    56.2
12.0     0.0
66.0    54.5
Name: SNAP, dtype: float64

Filled cycles 1–2 | rows with SNAP non-missing: 53193 → 55371

SNAP coverage by cycle (% non-missing) — AFTER
 SDDSRVYR
1.0     15.1
2.0     14.6
3.0     48.9
4.0     47.5
5.0     57.9
6.0     58.3
7.0     56.5
8.0     55.9
9.0     55.0
10.0    56.2
12.0     0.0
66.0    54.5
Name: SNAP, dtype: float64

SNAP source counts (cycles 1–2):
 SDDSRVYR  SNAP_src  
1.0       FSD170N_HH      8
          FSD180        948
          FSD200        548
2.0       FSD180        893
          FSD200        716
Name: SEQN, dtype: Int64


#### check 

In [11]:
# --- Guardrail: provenance (SNAP_src) may only be set inside cycles 1–2 ---
_in_early_cycles = df_my_cov_aligned_short["SDDSRVYR"].isin([1.0, 2.0])
_changed = df_my_cov_aligned_short["SNAP_src"].notna() & ~_in_early_cycles
assert not _changed.any(), "SNAP was modified outside cycles 1–2 unexpectedly."

# --- Rank each provenance label (higher = closer to an individual-level report) ---
src_rank = {
    "FSD200": 3,                    # current, individual
    "FSD180": 2,                    # past 12m, individual
    "FSD170N_HH_singleton": 2,      # HH proxy but singleton household
    "FSD170N_HH": 1                 # HH proxy (multi-person)
}
_ranked = df_my_cov_aligned_short["SNAP_src"].map(src_rank)
df_my_cov_aligned_short["SNAP_src_rank"] = _ranked.astype("Int64")

# --- Sensitivity variants: SNAP restricted to selected provenance labels ---
_individual_sources = ["FSD200", "FSD180"]
mask_indiv = df_my_cov_aligned_short["SNAP_src"].isin(_individual_sources)
mask_indiv_or_single = df_my_cov_aligned_short["SNAP_src"].isin(
    _individual_sources + ["FSD170N_HH_singleton"]
)

for _variant, _variant_mask in (
    ("SNAP_indiv_only", mask_indiv),
    ("SNAP_indiv_plus_singleton", mask_indiv_or_single),
):
    # All-NA nullable column first, then copy SNAP only for the selected rows.
    df_my_cov_aligned_short[_variant] = pd.Series(
        pd.NA, index=df_my_cov_aligned_short.index, dtype="Int64"
    )
    _selected = df_my_cov_aligned_short.loc[_variant_mask, "SNAP"]
    df_my_cov_aligned_short.loc[_variant_mask, _variant] = (
        pd.to_numeric(_selected, errors="coerce").astype("Int64")
    )

# Rebuild the main binary column as nullable Int64 from the current SNAP values.
_snap_numeric = pd.to_numeric(df_my_cov_aligned_short["SNAP"], errors="coerce")
df_my_cov_aligned_short["SNAP_bin"] = _snap_numeric.astype("Int64")

# --- Quick QC input: rows with RIDAGEYR >= 18 (nothing is saved) ---
adults = df_my_cov_aligned_short.loc[df_my_cov_aligned_short["RIDAGEYR"] >= 18]

def _rate(s: pd.Series) -> float:
    """Percentage of non-missing entries equal to 1; NaN when every entry is missing."""
    values = pd.to_numeric(s, errors="coerce")
    n_valid = values.notna().sum()
    if not n_valid:
        return float("nan")
    n_yes = values.eq(1).sum()
    return float(n_yes / n_valid * 100)

# One rate column per SNAP definition, one row per cycle.
_adults_by_cycle = adults.groupby("SDDSRVYR", observed=True)
qc = pd.DataFrame({
    label: _adults_by_cycle[column].apply(_rate)
    for label, column in (
        ("SNAP_bin_rate", "SNAP_bin"),
        ("indiv_only_rate", "SNAP_indiv_only"),
        ("indiv+singleton_rate", "SNAP_indiv_plus_singleton"),
    )
}).round(2)

print("\nAdult SNAP participation rates by cycle (% among non-missing):\n", qc)

# Optional: free memory if not reusing FSQ pulls
# del fsq_9900, fsq_0102



Adult SNAP participation rates by cycle (% among non-missing):
           SNAP_bin_rate  indiv_only_rate  indiv+singleton_rate
SDDSRVYR                                                      
1.0               48.73            48.17                 48.17
2.0               54.66            54.66                 54.66
3.0               12.72              NaN                   NaN
4.0               11.10              NaN                   NaN
5.0               15.68              NaN                   NaN
6.0               19.82              NaN                   NaN
7.0               23.63              NaN                   NaN
8.0               22.37              NaN                   NaN
9.0               25.86              NaN                   NaN
10.0              23.67              NaN                   NaN
12.0                NaN              NaN                   NaN
66.0              24.46              NaN                   NaN


In [12]:
# List the frame's columns after the SNAP backfill (includes the derived SNAP_* columns).
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       'household_size', 'EDU', 'pir', 'SMK_AVG', 'SMK', 'ALCG2', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER',
       'DRINKS_PER_DAY', 'ALCOHOL_CAT', 'bmic', 'DIABE', 'HYPERTEN', 'chol_rx',
       'CVD', 'cancer', 'probable_depression', 'ahei_total', 'unemployment2',
       'sdoh_access', 'ins', 'HOQ065', 'marriage', 'SNAP', 'FS', 'WTINT2YR',
       'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR', 'WTINTPRP', 'WTMECPRP',
       'WTSAFPRP', 'wt_int', 'wt_mec', 'wt_fasting', 'wt_phlebotomy',
       'WTPH2YR', 'marriage_prev', 'marriage_label', 'marriage3', 'SNAP_prev',
       'SNAP_bin', 'SNAP_src', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton'],
      dtype='object')

## check major missingness 

In [24]:
import pandas as pd
import numpy as np

df = df_my_cov_aligned_short

# --- audit every column except the SNAP / marriage helper columns ---
snap_drop = {"SNAP","SNAP_prev","SNAP_src","SNAP_src_rank","SNAP_indiv_only","SNAP_indiv_plus_singleton"}
marriage_drop = {"marriage","marriage_prev","marriage_label"}
_helper_cols = snap_drop | marriage_drop
cols = [c for c in df.columns if c not in _helper_cols]
# SNAP_bin and marriage3 are always audited when the frame has them.
for must in ["SNAP_bin","marriage3"]:
    if must not in df.columns or must in cols:
        continue
    cols.append(must)

# -------- eligibility masks --------
def smokers_mask(d):
    """
    Boolean mask of rows treated as ever-smokers (denominator for smoking-dose items).

    Uses SMK_STATUS when present: non-missing and not equal to 1. Otherwise a row
    is flagged when FORMER_SMOKER == 1 or SMK == 1, using whichever of the two
    columns exist; if neither exists the mask is all False.

    NOTE(review): the SMK_STATUS branch assumes code 1 means "never smoker" —
    confirm against the column's actual coding.
    """
    if "SMK_STATUS" in d.columns:        # 1=never, 2=former, 3=current (adjust if different)
        return d["SMK_STATUS"].notna() & (d["SMK_STATUS"] != 1)
    # Start from an all-False Series: the previous plain-bool defaults had no
    # .fillna and failed when both fallback columns were absent.
    mask = pd.Series(False, index=d.index)
    for flag in ("FORMER_SMOKER", "SMK"):
        if flag in d.columns:
            mask = mask | d[flag].eq(1).fillna(False)
    return mask

def adults_mask(d):
    """Boolean mask of rows whose RIDAGEYR parses to a number >= 18 (unparseable or missing → False)."""
    age = pd.to_numeric(d["RIDAGEYR"], errors="coerce")
    is_adult = age >= 18
    return is_adult.fillna(False)

def fasting_mask(d):  # prefer an actual fasting lab variable if present
    """
    Boolean mask of rows with a non-missing fasting indicator.

    The first of P_GLU, LBXGLU, fasting_glucose, glucose_fasting, wt_fasting that
    exists in `d` is used; when none exists the mask is all False.
    """
    candidates = ("P_GLU", "LBXGLU", "fasting_glucose", "glucose_fasting", "wt_fasting")
    first_present = next((name for name in candidates if name in d.columns), None)
    if first_present is None:
        # fallback: very conservative (no rows)
        return pd.Series(False, index=d.index)
    return d[first_present].notna()

def phleb_mask(d):
    """Boolean mask of rows with a non-missing wt_phlebotomy; all False when the column is absent."""
    if "wt_phlebotomy" not in d.columns:
        # fallback: very conservative
        return pd.Series(False, index=d.index)
    return d["wt_phlebotomy"].notna()

# Per-variable eligibility (row-level denominator): variable name -> function that
# returns the mask of rows counted when computing that variable's missingness.
eligibility = {
    # smoking-dose items: only rows selected by smokers_mask
    **dict.fromkeys(("CIGS_PER_DAY", "PACK_YEARS", "SMK_AVG"), smokers_mask),
    "probable_depression": adults_mask,
    # weights: compute missingness only where applicable to the sub-sample
    "WTSAFPRP": fasting_mask,            # fasting subsample (e.g., P_GLU universe)
    "WTPH2YR": phleb_mask,               # phlebotomy subsample
}

# Cycle-level applicability helper (exclude cycles where a var basically doesn't exist)
def applicable_cycles(d, col, thresh=0.10):
    """Return the set of SDDSRVYR values in which the non-missing share of `col` exceeds `thresh`."""
    present_share = d.groupby("SDDSRVYR", observed=True)[col].apply(
        lambda values: values.notna().mean()
    )
    above = present_share > thresh
    return set(present_share.index[above.to_numpy()].tolist())

# Some weights are intended only for specific cycles (override presence heuristic)
# Weights pinned to known cycles, keyed by column name; these take precedence over
# the empirical presence heuristic. Both entries are limited to SDDSRVYR == 66.0
# (pre-pandemic combined cycle).
# Add any others you use with known cycle scopes, e.g.
#   "WTINT4YR": {9.0, 10.0},  # example if you want to pin 4-year weights
#   "WTMEC4YR": {9.0, 10.0},
cycle_overrides = {weight: {66.0} for weight in ("WTINTPRP", "WTMECPRP")}

# -------- compute effective missingness --------
# "Effective" = % missing among rows that are both eligible for the variable and
# belong to a cycle in which the variable is applicable.
cycles = sorted(df["SDDSRVYR"].dropna().unique().tolist())
_per_cycle_pct = {}
overall_eff = {}

for col in cols:
    # 1) row-level eligibility (every row when the variable has no entry)
    _mask_fn = eligibility.get(col, lambda d: pd.Series(True, index=d.index))
    elig = _mask_fn(df)

    # 2) cycle applicability: explicit override first, else empirical presence
    apps = cycle_overrides[col] if col in cycle_overrides else applicable_cycles(df, col, thresh=0.10)

    if apps:
        cyc_mask = df["SDDSRVYR"].isin(list(apps))
    else:
        cyc_mask = pd.Series(False, index=df.index)
    use = elig & cyc_mask

    # per-cycle effective missingness (NaN when the cycle has no usable row)
    per_cyc = []
    for cyc in cycles:
        m = use & df["SDDSRVYR"].eq(cyc)
        if m.sum() == 0:
            per_cyc.append(np.nan)
        else:
            per_cyc.append(round(df.loc[m, col].isna().mean()*100, 1))
    _per_cycle_pct[col] = per_cyc

    # overall effective missingness
    if use.sum() == 0:
        overall_eff[col] = np.nan
    else:
        overall_eff[col] = round(df.loc[use, col].isna().mean()*100, 1)

eff_miss = pd.DataFrame(_per_cycle_pct, index=[int(c) for c in cycles])
eff_miss.index.name = "SDDSRVYR"
eff_miss = eff_miss.sort_index()
overall_eff = pd.Series(overall_eff, name="OVERALL_effective_%miss").sort_values(ascending=False)

print("Top 20 by OVERALL *effective* % missing (weights handled as subsamples):\n",
      overall_eff.head(20))

top20_cols = overall_eff.head(20).index.tolist()
display(eff_miss[top20_cols])


Top 20 by OVERALL *effective* % missing (weights handled as subsamples):
 CIGS_PER_DAY      79.4
SMK_AVG           79.4
PACK_YEARS        77.9
DRINKS_PER_DAY    71.7
ahei_total        54.4
SNAP_bin          52.6
FS                46.9
HOQ065            46.7
ALCOHOL_CAT       46.5
ALCG2             46.5
sdoh_access       46.1
SMK_STATUS        45.7
SMK               45.7
FORMER_SMOKER     45.6
unemployment2     44.0
met_hr            43.5
marriage3         39.0
ins               38.4
wt_phlebotomy     32.4
wt_fasting        24.1
Name: OVERALL_effective_%miss, dtype: float64


Unnamed: 0_level_0,CIGS_PER_DAY,SMK_AVG,PACK_YEARS,DRINKS_PER_DAY,ahei_total,SNAP_bin,FS,HOQ065,ALCOHOL_CAT,ALCG2,sdoh_access,SMK_STATUS,SMK,FORMER_SMOKER,unemployment2,met_hr,marriage3,ins,wt_phlebotomy,wt_fasting
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,79.5,79.5,76.4,75.7,19.0,84.9,52.4,52.0,55.4,55.4,51.9,51.2,51.2,51.0,51.0,51.0,39.2,52.4,,13.6
2,78.3,78.3,76.9,75.4,18.2,85.4,54.2,51.9,54.5,54.5,51.8,51.1,51.1,51.0,51.0,51.0,34.3,52.3,,11.8
3,77.8,77.8,75.7,75.5,24.5,51.1,52.4,50.8,53.2,53.2,50.8,50.3,50.3,50.2,50.2,50.2,33.2,51.0,,11.8
4,78.3,78.3,77.7,75.4,69.5,52.5,52.4,52.4,53.9,53.9,52.3,51.9,51.9,51.9,51.9,51.9,35.4,54.6,,13.0
5,77.9,77.9,77.3,70.1,68.9,42.1,42.1,42.1,43.8,43.8,42.1,41.6,41.6,41.5,41.5,41.5,41.6,44.5,,13.8
6,78.5,78.5,78.0,68.7,67.1,41.7,41.7,41.4,42.5,42.5,41.3,41.0,41.0,41.0,41.0,41.0,41.0,44.6,,12.3
7,80.3,80.3,79.9,68.7,69.5,43.5,43.3,43.4,42.4,42.4,43.3,43.1,43.1,43.0,43.0,43.0,43.1,46.7,,11.9
8,79.6,79.6,79.4,66.9,70.1,44.1,44.1,44.1,41.8,41.8,44.0,43.3,43.3,43.3,43.3,43.3,43.3,46.9,,12.1
9,81.6,81.6,79.6,68.4,71.3,45.0,44.7,44.8,42.5,42.5,44.6,42.8,42.8,42.6,42.6,42.6,42.7,48.7,,12.9
10,82.2,82.2,77.8,,70.1,43.8,43.3,43.3,40.2,40.2,43.2,39.8,39.8,39.8,39.8,39.8,39.9,44.7,,12.0


## Check: major missingness in pre-2018 cycles (1–10)

In [25]:
import pandas as pd
import numpy as np

df = df_my_cov_aligned_short

# ---------- config ----------
PRE2018_CYCLES = list(range(1, 11))          # cycles 1..10
THRESH = 20.0                                 # non-missingness (%)
IGNORE_VARS = {"CIGS_PER_DAY","SMK_AVG","PACK_YEARS","DRINKS_PER_DAY"}  # your expected-missing set
SNAP_HELPERS = {"SNAP","SNAP_prev","SNAP_src","SNAP_src_rank","SNAP_indiv_only","SNAP_indiv_plus_singleton"}
MARRIAGE_HELPERS = {"marriage","marriage_prev","marriage_label"}
ID_VARS = {"SEQN","SDDSRVYR"}

# keep only SNAP_bin & marriage3 for those topics
cols = []
for c in df.columns:
    if c in ID_VARS:                         # we won't audit these here
        continue
    if c in SNAP_HELPERS or c in MARRIAGE_HELPERS:
        continue
    if c in IGNORE_VARS:                     # skip your “expected missing” vars
        continue
    if c.startswith("WT"):                   # skip structural weight columns (e.g., WTSAFPRP, WTMECPRP...)
        continue
    cols.append(c)

# ensure main signals present if available
for must in ["SNAP_bin","marriage3"]:
    if must in df.columns and must not in cols:
        cols.append(must)

# subset to pre-2018 cycles
pre_mask = df["SDDSRVYR"].isin(PRE2018_CYCLES)
pre = df.loc[pre_mask, cols + ["SDDSRVYR"]].copy()

# non-missingness (%) by cycle: rows = cycles, columns = audited variables.
# A grouped mean of the not-null indicator replaces GroupBy.apply, so the cell no
# longer depends on the version-specific `include_groups` keyword.
nonmiss = (
    pre[cols].notna()
    .groupby(pre["SDDSRVYR"], observed=True)
    .mean()
    .mul(100)
    .round(1)
    .sort_index()
)

# find columns with any cycle < THRESH
flag_mask = nonmiss.lt(THRESH)
flagged_cols = flag_mask.any(axis=0)
flagged = nonmiss.loc[:, flagged_cols]

# Build the per-variable list of triggering cycles explicitly. DataFrame.apply with
# a list-returning function returns a DataFrame instead of a Series of lists when
# all lists have the same length (e.g. a single flagged variable), which would
# break the summary table below.
cycles_triggering = pd.Series(
    {
        col: [int(cyc) for cyc in flag_mask.index[flag_mask[col].to_numpy()]]
        for col in flagged.columns
    },
    dtype=object,
)

# summary table (one row per flagged variable)
summary = pd.DataFrame({
    "min_nonmiss_%": flagged.min(axis=0),
    "max_nonmiss_%": flagged.max(axis=0),
    "cycles_below_thresh": flag_mask.loc[:, flagged_cols].sum(axis=0).astype(int),
    "cycles_triggering": cycles_triggering,
}).sort_values(["cycles_below_thresh","min_nonmiss_%"], ascending=[False, True])

print(f"Pre-2018 major-missing check (non-missing < {THRESH}% in any cycle):")
print(f"Flagged variables: {len(summary)}")
display(summary)

print("\nPer-cycle non-missingness (%) for flagged variables (rows = cycles 1..10):")
display(flagged)


Pre-2018 major-missing check (non-missing < 20.0% in any cycle):
Flagged variables: 3


Unnamed: 0,min_nonmiss_%,max_nonmiss_%,cycles_below_thresh,cycles_triggering
wt_phlebotomy,0.0,0.0,10,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
probable_depression,6.8,60.4,3,"[1, 2, 3]"
SNAP_bin,14.6,58.3,2,"[1, 2]"



Per-cycle non-missingness (%) for flagged variables (rows = cycles 1..10):


Unnamed: 0_level_0,probable_depression,wt_phlebotomy,SNAP_bin
SDDSRVYR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,7.2,0.0,15.1
2.0,7.4,0.0,14.6
3.0,6.8,0.0,48.9
4.0,51.5,0.0,47.5
5.0,59.1,0.0,57.9
6.0,60.4,0.0,58.3
7.0,57.6,0.0,56.5
8.0,58.2,0.0,55.9
9.0,57.5,0.0,55.0
10.0,59.8,0.0,56.2


In [None]:
#### working on notes to mark weights 