## import previous file

In [1]:
import pandas as pd
from pathlib import Path

p = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output/cov_addv4_99_23.parquet")
df_my_cov_aligned_short = pd.read_parquet(p)  # uses pyarrow/fastparquet if available
print(df_my_cov_aligned_short.shape)
df_my_cov_aligned_short.head()


(128809, 68)


Unnamed: 0,SEQN,SDDSRVYR,sdmvpsu,sdmvstra,RIDAGEYR,SEX,RACE,re,household_size,DMDHHSIZ,...,HOQ065_structural_missing,household_size_structural_missing,chol_rx_structural_missing,RIAGENDR,drinking,alcg2,perE_alco,METSCORE_fromPAQ,LTPA_fromPAQ,met_hr_recalc_from
0,1,1.0,1,5,2.0,F,4.0,Other Hispanic,3,3.0,...,False,False,False,2.0,,,0.0,,,
1,2,1.0,3,1,77.0,M,3.0,Mexican American,1,1.0,...,False,False,False,1.0,0.065753,2.0,0.0,,,from_new_PAQ
2,3,1.0,2,7,10.0,F,3.0,Mexican American,4,4.0,...,False,False,False,2.0,,,0.0,,,
3,4,1.0,1,2,1.0,M,4.0,Other Hispanic,7,7.0,...,False,False,False,1.0,,,0.0,,,
4,5,1.0,2,8,49.0,M,3.0,Mexican American,3,3.0,...,False,False,False,1.0,1.714286,2.0,9.101101,,,from_new_PAQ


## show missing 

In [2]:
df = df_my_cov_aligned_short

# use every column except the grouper
# ignore PACK_YEARS, CIGS_PER_DAY, probable_depression and etc as missing naturally 
# ignore dulicate safe saved old column name

exclude = {"SDDSRVYR","CIGS_PER_DAY","PACK_YEARS","probable_depression","wt_phlebotomy", "WTSAFPRP",
           "WTINT2YR", "WTMEC2YR", "WTPH2YR", "WTSAF2YR", "WTMEC4YR", "WTINTPRP", "WTMECPRP", "WTINT4YR",
           "SNAP", "SNAP_src", "SNAP_bin", "SNAP_src_rank", "bmi", "RIAGENDR",
           "SNAP_indiv_only","FS", "ahei_total", "HOQ065", "marriage_label", "marriage_prev",
           "METSCORE_fromPAQ","perE_alco","LTPA_fromPAQ","SNAP_indiv_plus_singleton", 
           "SMK_AVG", "BMI_CLAS", "household_size", "DMDHHSIZ"}
cols_all = [c for c in df.columns if c not in exclude]

# % missing by cycle (split into two lines)
is_na = df[cols_all].isna()
pct_miss = is_na.groupby(df["SDDSRVYR"]).mean().mul(100)

# keep only columns that exceed 80% missing in ANY cycle
pct_miss_gt80 = pct_miss.loc[:, (pct_miss > 80).any(axis=0)].round(1)

print(pct_miss_gt80)


          chol_rx  alcg2
SDDSRVYR                
1.0           0.0   62.9
2.0           0.0   66.5
3.0           0.0   62.9
4.0           0.0   62.7
5.0           0.0   55.4
6.0           0.0   55.4
7.0           0.0   55.1
8.0           0.0   52.2
9.0           0.0   52.8
10.0          0.0   56.0
12.0         29.1   61.4
66.0        100.0   84.3


## fetch chol and code chol_rx

In [6]:
import io, re, requests
import pandas as pd
from pandas.api.types import is_numeric_dtype

# =========================
# Core helpers
# =========================
def _read_xpt(url, cols_upper=True):
    r = requests.get(url); r.raise_for_status()
    df = pd.read_sas(io.BytesIO(r.content), format="xport", encoding="latin1")
    if cols_upper:
        df.columns = [c.upper() for c in df.columns]
    if "SEQN" in df.columns:
        df["SEQN"] = pd.to_numeric(df["SEQN"], errors="coerce").astype("Int64")
    return df

def _try_read_xpt(url, cols_upper=True):
    """Return empty DataFrame on HTTP errors so optional files don't crash."""
    try:
        if not url:
            return pd.DataFrame()
        return _read_xpt(url, cols_upper=cols_upper)
    except requests.HTTPError:
        return pd.DataFrame()

def _flag_yes_no(series, yes=1):
    # NHANES yes/no pattern: 1=Yes, 2=No, 7/9=Missing
    return (series == yes).astype("Int64")

def _hc_med_from_bpq(bpq):
    """
    Cholesterol-med indicator from BPQ:
    - 2017–Mar 2020: BPQ100D (now taking prescribed cholesterol medicine)
    - 2021–2022:     BPQ101D (now taking prescribed cholesterol medicine)
    """
    if "BPQ101D" in bpq.columns:
        return _flag_yes_no(bpq["BPQ101D"])
    if "BPQ100D" in bpq.columns:
        return _flag_yes_no(bpq["BPQ100D"])
    return pd.Series(pd.NA, index=bpq.index, dtype="Int64")

def _collapse_rx_to_person(rx_df, drug_name_cols=("RXDDRUG","RXDBNAME","RXDDC1")):
    """
    Person-level flag 'any_lipid_lowering' from RXQ_RX_* when drug names exist.
    If no usable name columns (e.g., RXQ_RX_L), return empty DF.
    """
    if rx_df is None or rx_df.empty:
        return pd.DataFrame(columns=["SEQN","any_lipid_lowering"]).set_index("SEQN")
    drug_col = next((c for c in drug_name_cols if c in rx_df.columns), None)
    if drug_col is None:
        return pd.DataFrame(columns=["SEQN","any_lipid_lowering"]).set_index("SEQN")

    name = rx_df[drug_col].astype(str).str.lower()

    # Non-capturing alternation to avoid regex group warnings
    statins = ["atorvastatin","simvastatin","rosuvastatin","pravastatin",
               "lovastatin","fluvastatin","pitavastatin"]
    others  = ["ezetimibe","alirocumab","evolocumab","inclisiran",
               "bempedoic","cholestyramine","colestipol","colesevelam",
               "niacin","fenofibrate","gemfibrozil","clofibrate","bezafibrate"]
    pattern = r"(?:%s)" % "|".join(map(re.escape, statins + others))

    any_ll = name.str.contains(pattern, regex=True, na=False)
    person = (
        rx_df.loc[any_ll, ["SEQN"]]
        .dropna()
        .drop_duplicates()
        .assign(any_lipid_lowering=1)
        .set_index("SEQN")
    )
    return person

def _friedewald_ldl(tc, hdl, tg):
    """
    LDL (mg/dL) = TC - HDL - (TG/5) when TG < 400 mg/dL.
    Only compute where all three are present and valid.
    """
    ldl = pd.Series(pd.NA, index=tc.index, dtype="Float64")
    cond = tc.notna() & hdl.notna() & tg.notna() & (tg < 400)
    ldl.loc[cond] = (tc.loc[cond] - hdl.loc[cond] - (tg.loc[cond] / 5.0))
    return ldl

def _find_numeric_col(df, preferred, fallback_regex):
    """
    Return the first existing column among `preferred`, else the first numeric
    column whose name matches `fallback_regex` (case-insensitive), else None.
    """
    if df is None or df.empty:
        return None
    for c in preferred:
        if c in df.columns and is_numeric_dtype(df[c]):
            return c
    pats = re.compile(fallback_regex, flags=re.I)
    for c in df.columns:
        if pats.search(c) and is_numeric_dtype(df[c]):
            return c
    return None

# =========================
# Period-specific sources
# =========================
BASE_2017 = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles"
BASE_2021 = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles"

URLS_P = {
    "bpq":   f"{BASE_2017}/P_BPQ.xpt",      # BPQ080, BPQ100D
    "rxq":   f"{BASE_2017}/P_RXQ_RX.xpt",   # has drug names
    "hdl":   f"{BASE_2017}/P_HDL.xpt",      # HDL (Direct)
    "tchol": f"{BASE_2017}/P_TCHOL.xpt",    # Total chol (+ sometimes direct LDL)
    "trig":  f"{BASE_2017}/P_TRIGLY.xpt",   # TG (exists for this period)
}

URLS_L = {
    "bpq":   f"{BASE_2021}/BPQ_L.xpt",      # BPQ080, BPQ101D
    "rxq":   f"{BASE_2021}/RXQ_RX_L.xpt",   # NO drug names
    "hdl":   f"{BASE_2021}/HDL_L.xpt",      # HDL (Direct)
    "tchol": f"{BASE_2021}/TCHOL_L.xpt",    # Total chol (+ usually direct LDL)
    # Note: TRIGLY_L.xpt not published; omit TG for L cycle
}

# =========================
# Builder for a given period
# =========================
def build_period_df(urls: dict, period_label: str) -> pd.DataFrame:
    # --- BPQ (ever told high cholesterol; on cholesterol meds) ---
    bpq = _read_xpt(urls["bpq"])
    bpq_keep = bpq[["SEQN"]].copy()

    # Ever told cholesterol high (BPQ080 typical)
    qcol = "BPQ080" if "BPQ080" in bpq.columns else None
    if qcol is None:
        cand = [c for c in bpq.columns if re.fullmatch(r"BPQ0?80", c)]
        qcol = cand[0] if cand else None
    bpq_keep["hc_dx"] = _flag_yes_no(bpq[qcol]) if qcol else pd.Series(pd.NA, index=bpq_keep.index, dtype="Int64")

    # Now taking cholesterol meds (BPQ100D or BPQ101D)
    bpq_keep["hc_med_bpq"] = _hc_med_from_bpq(bpq)

    # --- RXQ (optional name-based screen; empty for RXQ_RX_L) ---
    rxq = _try_read_xpt(urls.get("rxq", ""))
    rx_person = _collapse_rx_to_person(rxq)
    if not rx_person.empty:
        rx_person["hc_med_rx"] = rx_person["any_lipid_lowering"].astype("Int64")
        rx_person = rx_person.drop(columns=["any_lipid_lowering"])
    else:
        rx_person = pd.DataFrame(columns=["SEQN","hc_med_rx"]).set_index("SEQN")

    # --- Labs ---
    hdl  = _read_xpt(urls["hdl"])
    tch  = _read_xpt(urls["tchol"])
    trig = _try_read_xpt(urls.get("trig", ""), cols_upper=True) if "trig" in urls else pd.DataFrame()

    # Robustly locate variables (names vary slightly by cycle)
    # HDL (mg/dL): prefer LBDHDD, then LBXHDD; else fallback
    hdl_var = _find_numeric_col(
        hdl,
        preferred=["LBDHDD", "LBXHDD", "LBDHDD_1"],
        fallback_regex=r"\bHDL\b|HDDS?\b|^LB..HD"
    )
    # Total cholesterol (mg/dL): usually LBXTC
    tc_var = _find_numeric_col(
        tch,
        preferred=["LBXTC"],
        fallback_regex=r"\bTOTAL.*CHOL|^LB..TC$|\bTC\b"
    )
    # Direct LDL (mg/dL): often LBDLDL or LBDLDLD (if present)
    ldl_var = _find_numeric_col(
        tch,
        preferred=["LBDLDL", "LBDLDLD", "LBDLDLL"],
        fallback_regex=r"\bLDL\b|^LB.DLDL"
    )
    # Triglycerides (mg/dL): LBXTR when file exists (optional)
    tg_var = _find_numeric_col(
        trig,
        preferred=["LBXTR"],
        fallback_regex=r"\bTRI?G|^LB..TR$"
    ) if not trig.empty else None

    # Keep frames
    hdl_keep = hdl[["SEQN", hdl_var]].rename(columns={hdl_var: "hdl_mgdl"}) if hdl_var else pd.DataFrame(columns=["SEQN","hdl_mgdl"])
    tc_keep  = tch[["SEQN", tc_var ]].rename(columns={tc_var : "total_chol_mgdl"}) if tc_var  else pd.DataFrame(columns=["SEQN","total_chol_mgdl"])
    ldl_keep = tch[["SEQN", ldl_var]].rename(columns={ldl_var: "ldl_mgdl"}) if ldl_var else pd.DataFrame(columns=["SEQN","ldl_mgdl"])
    tg_keep  = trig[["SEQN", tg_var]].rename(columns={tg_var: "trig"}) if tg_var else pd.DataFrame(columns=["SEQN","trig"])

    # Merge person-level
    df = (
        bpq_keep
        .merge(rx_person, how="left", left_on="SEQN", right_index=True)
        .merge(hdl_keep,  how="left", on="SEQN")
        .merge(tc_keep,   how="left", on="SEQN")
        .merge(ldl_keep,  how="left", on="SEQN")
        .merge(tg_keep,   how="left", on="SEQN")
    )

    # Choose med flag: prefer RX names when available, else BPQ
    df["hc_med"] = df["hc_med_rx"].where(df["hc_med_rx"].notna(), df["hc_med_bpq"])
    df["hc_med"] = df["hc_med"].fillna(0).astype("Int64")

    # Lab-based high TC (>200 mg/dL)
    df["hc_lab"] = (df["total_chol_mgdl"] > 200).astype("Int64")

    # Compute LDL if missing and feasible (Friedewald)
    need_calc = df["ldl_mgdl"].isna()
    if need_calc.any() and {"total_chol_mgdl","hdl_mgdl","trig"} <= set(df.columns) and df["trig"].notna().any():
        ldl_est = _friedewald_ldl(df["total_chol_mgdl"], df["hdl_mgdl"], df["trig"])
        df.loc[need_calc, "ldl_mgdl"] = ldl_est.loc[need_calc]

    # Final chol_rx: any of dx / med / lab
    for col in ["hc_dx","hc_med","hc_lab"]:
        if col not in df.columns:
            df[col] = pd.Series(0, index=df.index, dtype="Int64")
        else:
            df[col] = df[col].fillna(0).astype("Int64")
    df["chol_rx"] = ((df[["hc_dx","hc_med","hc_lab"]].sum(axis=1) > 0).astype(int))

    df["period"] = period_label

    # Normalize dtypes for numeric outputs
    for c in ["total_chol_mgdl","hdl_mgdl","ldl_mgdl","trig"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Float64")

    keep_cols = ["SEQN","period","chol_rx","hc_dx","hc_med","hc_lab",
                 "total_chol_mgdl","hdl_mgdl","ldl_mgdl","trig"]
    for c in keep_cols:
        if c not in df.columns:
            df[c] = pd.NA
    return df[keep_cols]

# =========================
# Build both periods & QC
# =========================
df_p = build_period_df(URLS_P, period_label="2017-2020 (P)")
df_l = build_period_df(URLS_L, period_label="2021-2022 (L)")
df_both = pd.concat([df_p, df_l], ignore_index=True)

qc = (
    df_both.groupby("period", dropna=False)
    .agg(n=("SEQN","count"),
         chol_rx_rate=("chol_rx","mean"),
         mean_tc=("total_chol_mgdl","mean"),
         mean_hdl=("hdl_mgdl","mean"),
         mean_ldl=("ldl_mgdl","mean"))
    .round(3)
)
print(qc)

# =========================
# (Optional) mmol/L conversions
# =========================
# df_both["tc_mmol"]  = (df_both["total_chol_mgdl"] * 0.02586).astype("Float64")
# df_both["hdl_mmol"] = (df_both["hdl_mgdl"]        * 0.02586).astype("Float64")
# df_both["ldl_mmol"] = (df_both["ldl_mgdl"]        * 0.02586).astype("Float64")

# =========================
# (Optional) Join into your main covariate DF and save
# =========================
# df_my_cov_aligned_short = df_my_cov_aligned_short.merge(
#     df_both.drop(columns=["period"]), on="SEQN", how="left"
# )
# out_path = "cov_addv4_99_23.parquet"
# df_my_cov_aligned_short.to_parquet(out_path, index=False)
# print("✓ Saved:", out_path)


                   n  chol_rx_rate  mean_tc  mean_hdl  mean_ldl
period                                                         
2017-2020 (P)  10195         0.515  183.056    53.339   107.112
2021-2022 (L)   8501         0.522  185.691    54.314      <NA>


  df["hc_med"] = df["hc_med"].fillna(0).astype("Int64")


#### check why chol_rx rate is high

In [12]:
import pandas as pd
import numpy as np

df = df_my_cov_aligned_short.copy()

# -------- helpers --------
def pick_col(frame, candidates):
    """Return the first existing column from candidates, else None."""
    for c in candidates:
        if c in frame.columns:
            return c
    return None

def to_numeric(series):
    return pd.to_numeric(series, errors="coerce")

def flag_yes1(series):
    """Return 0/1 float: treat numeric==1 as 'yes'; also accept booleans/strings."""
    if series is None:
        return pd.Series(np.nan, index=df.index, dtype="float64")
    s = series.copy()
    # Try numeric first
    s_num = to_numeric(s)
    if s_num.notna().any():
        return (s_num == 1).astype("float64")
    # Fallback for text/booleans
    s_str = s.astype(str).str.lower()
    yes = {"1","yes","y","true","t"}
    out = s_str.isin(yes).astype("float64")
    out[~s_str.isin(yes | {"0","no","n","false","f","nan","none"})] = np.nan
    return out

def prevalence(flag, wt=None):
    f = to_numeric(flag)
    if wt is None:
        f = f.dropna()
        return np.nan if f.empty else float(f.mean())
    w = to_numeric(wt)
    m = f.notna() & w.notna()
    return np.nan if not m.any() else float((f[m]*w[m]).sum() / w[m].sum())

# -------- pick inputs robustly --------
col_dx  = pick_col(df, ["hc_dx", "BPQ080"])   # 1=ever told high cholesterol
col_med = pick_col(df, ["hc_med", "BPQ101D", "BPQ100D"])  # 1=on chol meds now
col_tc  = pick_col(df, ["total_chol_mgdl", "LBXTC"])      # mg/dL

# Build flags
dx_flag  = flag_yes1(df[col_dx])  if col_dx else pd.Series(np.nan, index=df.index, dtype="float64")
med_flag = flag_yes1(df[col_med]) if col_med else pd.Series(np.nan, index=df.index, dtype="float64")
tc_vals  = to_numeric(df[col_tc]) if col_tc else pd.Series(np.nan, index=df.index, dtype="float64")
lab


NameError: name 'lab' is not defined

In [7]:
df = df_my_cov_aligned_short.copy()

# --- make sure chol_rx is a clean 0/1 Int64 and ignore NAs for the rate denom ---
if "chol_rx" not in df.columns:
    raise ValueError("chol_rx not found. Merge the cholesterol builder outputs first.")
chol = pd.to_numeric(df["chol_rx"], errors="coerce").astype("Int64")

# =============== Unweighted prevalence by cycle ===============
unw = (
    pd.DataFrame({"SDDSRVYR": df["SDDSRVYR"], "chol_rx": chol})
    .dropna(subset=["chol_rx"])
    .groupby("SDDSRVYR")
    .agg(n=("chol_rx", "size"),
         cases=("chol_rx", "sum"))
    .assign(chol_rx_rate=lambda d: d["cases"]/d["n"])
    .reset_index()
)

print("Unweighted chol_rx prevalence by cycle:")
print(unw)

# =============== Weighted prevalence by cycle (optional) ===============
# Prefer WTMEC2YR (exam weights). If not present, fall back to WTINT2YR.
wt_col = None
for cand in ["WTMEC2YR", "WTINT2YR", "WTMECPRP", "WTINTPRP"]:
    if cand in df.columns:
        wt_col = cand
        break

if wt_col:
    tmp = (
        pd.DataFrame({
            "SDDSRVYR": df["SDDSRVYR"],
            "chol_rx": chol,
            "wt": pd.to_numeric(df[wt_col], errors="coerce")
        })
        .dropna(subset=["chol_rx", "wt"])
    )
    w = (
        tmp.assign(w_cases=lambda d: d["wt"]*d["chol_rx"])
          .groupby("SDDSRVYR")
          .agg(w_sum=("wt","sum"),
               w_cases=("w_cases","sum"),
               n=("chol_rx","size"))  # n shown for reference
          .assign(chol_rx_rate_wt=lambda d: d["w_cases"]/d["w_sum"])
          .reset_index()
    )
    print(f"\n{wt_col}-weighted chol_rx prevalence by cycle:")
    print(w[["SDDSRVYR","n","chol_rx_rate_wt"]].rename(columns={"chol_rx_rate_wt":"chol_rx_rate"}))
else:
    print("\nNo weight column found; skipped weighted prevalence.")


Unweighted chol_rx prevalence by cycle:
    SDDSRVYR      n  cases  chol_rx_rate
0        1.0   9965    422      0.042348
1        2.0  11039    613       0.05553
2        3.0  10122    735      0.072614
3        4.0  10348    773        0.0747
4        5.0  10149   1194      0.117647
5        6.0  10537   1232      0.116921
6        7.0   9756   1100      0.112751
7        8.0  10175   1226      0.120491
8        9.0   9971   1170       0.11734
9       10.0   9254   1318      0.142425
10      12.0   8465   2126      0.251152

WTMEC2YR-weighted chol_rx prevalence by cycle:
    SDDSRVYR      n  chol_rx_rate
0        1.0   9282      0.055405
1        2.0  10477      0.070017
2        3.0   9643      0.090047
3        4.0   9950      0.103745
4        5.0   9762      0.119686
5        6.0  10253      0.125808
6        7.0   9338      0.135788
7        8.0   9813      0.152367
8        9.0   9544      0.138229
9       10.0   8704      0.145363
10      12.0   8465      0.211396
