## Import the data file

In [82]:
import pandas as pd
from pathlib import Path

# Load the covariate file produced upstream and preview it.
# NOTE(review): this is an absolute, user-specific path; the notebook will not run on
# another machine unless the file is moved to a configurable data directory.
p = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output/cov_addv7_99_23.parquet")
df_my_cov_aligned_short = pd.read_parquet(p)  # uses pyarrow/fastparquet if available
# Print (rows, columns), then show the first rows as the cell's rich output.
print(df_my_cov_aligned_short.shape)
df_my_cov_aligned_short.head()


(128809, 68)


Unnamed: 0,SEQN,SDDSRVYR,sdmvpsu,sdmvstra,RIDAGEYR,SEX,RACE,re,household_size,DMDHHSIZ,...,LTPA_fromPAQ,chol_rx200,SMK_STATUS_STD,CIGS_PER_DAY_CAT,bmi_lu,bmi_cat_new,trouble_sleeping,sleep_hours_usual,CIDGSCOR,anxiety_rx
0,1,1,1,5,2.0,F,4.0,Other Hispanic,3,3.0,...,,,,,14.9,UNDER,,,,
1,2,1,3,1,77.0,M,3.0,Mexican American,1,1.0,...,,1.0,NEVER,,24.9,NORMAL,,,,
2,3,1,2,7,10.0,F,3.0,Mexican American,4,4.0,...,,0.0,,,17.63,UNDER,,,,
3,4,1,1,2,1.0,M,4.0,Other Hispanic,7,7.0,...,,,,,15.2,UNDER,,,,
4,5,1,2,8,49.0,M,3.0,Mexican American,3,3.0,...,,1.0,FORMER,,29.1,OVER,,,,


In [83]:
# List every column so the names used by the table builders below can be checked.
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       're', 'household_size', 'DMDHHSIZ', 'EDU', 'pir', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'FORMER_SMOKER', 'METSCORE', 'LTPA',
       'DIABE', 'HYPERTEN', 'chol_rx', 'CVD', 'cancer', 'probable_depression',
       'ahei_total', 'unemployment2', 'ins', 'HOQ065', 'marriage', 'SNAP',
       'FS', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR',
       'WTPH2YR', 'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'wt_int', 'wt_mec',
       'wt_fasting', 'wt_phlebotomy', 'marriage_prev', 'marriage_label',
       'marriage3', 'SNAP_src', 'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton', 'bmi', 'RIAGENDR', 'drinking', 'alcg2',
       'perE_alco', 'METSCORE_fromPAQ', 'LTPA_fromPAQ', 'chol_rx200',
       'SMK_STATUS_STD', 'CIGS_PER_DAY_CAT', 'bmi_lu', 'bmi_cat_new',
       'trouble_sleeping', 'sleep_hours_usual', 'CIDGSCOR', 'anxiety_rx'],
      dtype='object')

## Try creating a descriptive table

#### 1) Config & label maps

In [84]:
# =========================
# 1) CONFIG & LABEL MAPS
# =========================
import pandas as pd
import numpy as np

# Age restriction applied to RIDAGEYR by build_descriptive_table(); None disables it.
AGE_FILTER = ">20"         # options: None, ">20", ">=18", etc.
# Column holding the weights passed to get_weights(); missing/None -> unit weights.
WEIGHT_COL = "WTINT2YR"    # set to None for unweighted
# Respondent identifier column (not referenced by the helpers visible in this notebook).
ID_COL     = "SEQN"

# Optional label mappings (edit if your vars are coded as numbers)
# An empty dict is falsy, so as_text() then just renders the raw codes as text.
SEX_MAP = {1: "Male", 2: "Female"}
RACE_MAP = {}  # e.g., {1:"Non-Hispanic White", 2:"Non-Hispanic Black", 3:"Hispanic", 4:"Other"}
EDU_MAP  = {}  # e.g., {1:"<High school", 2:"HS/GED", 3:"Some college", 4:"College+"}
SNAP_MAP = {}
FS_MAP   = {}

def bucket_pir(pir):
    """
    Classify a family income-to-poverty ratio into a text bucket.

    Returns "Missing" for NA input, "< 1.3" below 1.3, "1.3–2.99" from 1.3 up to
    (but excluding) 3.0, and "≥ 3" otherwise.
    """
    if pd.isna(pir):
        return "Missing"
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((1.3, "< 1.3"), (3.0, "1.3–2.99")):
        if pir < upper_bound:
            return label
    return "≥ 3"


#### 2) Core helpers (string mapping, weights, mean(SE))

In [85]:
# =========================
# 2) CORE HELPERS
# =========================
def as_text(s, mapping=None):
    """
    Convert a Series to pandas "string" dtype, optionally relabelling values.

    With a non-empty `mapping`, values are passed through `s.map(mapping)`;
    anything the mapping does not cover falls back to the original value
    rendered as text. With no (or an empty) mapping the values are only cast.
    """
    raw_text = s.astype("string")
    if not mapping:
        return raw_text
    relabelled = s.map(mapping).astype("string")
    return relabelled.fillna(raw_text)

def get_weights(df, weight_col):
    """
    Fetch the weight vector for `df`.

    Returns (weights, True) when `weight_col` names a column of `df`: values are
    coerced to numeric, non-numeric/missing entries become 0, and negative
    entries are raised to 0. Otherwise returns (a Series of 1.0 on df.index, False).
    """
    column_available = bool(weight_col) and (weight_col in df.columns)
    if not column_available:
        return pd.Series(1.0, index=df.index), False
    weights = pd.to_numeric(df[weight_col], errors="coerce").fillna(0.0)
    return weights.clip(lower=0), True

def mean_se(x, w=None):
    """
    Mean (SE) with optional weights.

    SE is approximated as sqrt(var_w / n_eff), where var_w is the weighted
    variance around the weighted mean and n_eff is the Kish effective sample
    size (sum(w))^2 / sum(w^2). This is not a design-based (strata/PSU) SE.

    Parameters
    ----------
    x : Series-like; coerced to numeric, non-numeric values count as missing.
    w : Series aligned to x's index, or None for unit weights.

    Returns
    -------
    (cell, n) : "mean (se)" formatted to 2 decimals and the unweighted count of
        non-missing x. The cell is "NA (NA)" when there is no non-missing value
        or when the non-missing values carry no positive total weight.
    """
    x = pd.to_numeric(x, errors="coerce")
    m = x.notna()
    n_obs = int(m.sum())
    if n_obs == 0:
        return "NA (NA)", 0
    if w is None:
        w = pd.Series(1.0, index=x.index)
    ww = w[m]; xx = x[m]
    W = ww.sum()
    # All-zero (or NaN) weights would give 0/0 and print "nan (nan)"; report NA instead.
    if not (W > 0):
        return "NA (NA)", n_obs
    mu = (ww * xx).sum() / W
    var_w = (ww * (xx - mu) ** 2).sum() / W
    sum_sq = (ww ** 2).sum()
    n_eff = (W ** 2) / sum_sq if sum_sq > 0 else len(xx)
    se = np.sqrt(var_w / max(n_eff, 1))
    return f"{mu:.2f} ({se:.2f})", n_obs


#### 3) N(%) helper (unweighted n, weighted %)

In [86]:
# =========================
# 3) N(%) HELPER
# =========================
def n_pct(labels, w):
    """
    Tabulate a categorical variable as 'n (pct%)' cells.

    n is the UNWEIGHTED count of each label; pct is the label's share of the
    summed weights among rows with a non-missing label. Returns
    (Series label -> cell, unweighted number of non-missing labels). When every
    label is missing, returns (empty string-dtype Series, 0).
    """
    text = labels.astype("string")
    observed = text.notna()
    n_observed = int(observed.sum())
    if n_observed == 0:
        return pd.Series(dtype="string"), 0

    cats = text[observed]
    wts = w[observed]
    counts = cats.value_counts()          # unweighted n per label
    weight_sums = wts.groupby(cats).sum()  # weighted numerator per label
    total_weight = wts.sum()

    def _cell(label):
        if total_weight > 0:
            share = 100.0 * weight_sums[label] / total_weight
        else:
            share = np.nan
        return f"{int(counts.get(label, 0))} ({share:.1f}%)"

    cells = {label: _cell(label) for label in weight_sums.index}
    return pd.Series(cells), n_observed

def add_rows(rows, group, var, ser, w, order=None):
    """
    Append one [group, var, category, cell, total_n] row per category of `ser`.

    Cells and the unweighted total come from n_pct(ser, w). When `order` is
    given, only the categories named in it are kept, in that order; categories
    not listed in `order` are dropped.
    """
    cells, total_n = n_pct(ser, w)
    if order:
        present = [label for label in order if label in cells.index]
        cells = cells.reindex(present)
    rows.extend([group, var, label, text, total_n] for label, text in cells.items())


#### 4) Smoking helper (your canonical function)

In [87]:
def smoking_table_all(df, weight_col=None, include_missing_row=True):
    """
    Smoking breakdown with UNWEIGHTED n and WEIGHTED % per row.

    Every percentage shares one denominator: the summed weight of rows whose
    SMK_STATUS_STD is NEVER, FORMER or CURRENT. Current smokers are split by
    CIGS_PER_DAY_CAT codes 1/2/3; current smokers with a missing code get their
    own row when `include_missing_row` is True and at least one exists.

    Parameters
    ----------
    df : DataFrame containing SMK_STATUS_STD and CIGS_PER_DAY_CAT.
    weight_col : name of the weight column; unit weights when falsy or absent.
    include_missing_row : whether to report current smokers lacking intensity.

    Returns
    -------
    DataFrame with columns "Smoking status" and "n (%)".

    Raises
    ------
    KeyError if either required column is absent.
    """
    for needed in ("SMK_STATUS_STD", "CIGS_PER_DAY_CAT"):
        if needed not in df.columns:
            raise KeyError("Run align_smoking_vars(df) first to create SMK_STATUS_STD and CIGS_PER_DAY_CAT.")

    smk = df["SMK_STATUS_STD"].astype("string")
    in_denom = smk.isin(["NEVER","FORMER","CURRENT"])

    # Weights: coerced to numeric with missing -> 0, or all ones when unweighted.
    if weight_col and (weight_col in df.columns):
        wts = pd.to_numeric(df[weight_col], errors="coerce").fillna(0.0)
    else:
        wts = pd.Series(1.0, index=df.index)

    # A zero denominator is replaced by 1.0 so percentages print as 0.0%.
    denom_w = float(wts[in_denom].sum())
    if not denom_w:
        denom_w = 1.0

    def _fmt(mask):
        """Format one cell as 'unweighted n (weighted %)'."""
        share = float("nan")
        if denom_w > 0:
            share = 100.0 * float(wts[mask].sum()) / denom_w
        return f"{int(mask.sum())} ({share:.1f}%)"

    is_current = in_denom & (smk == "CURRENT")
    intensity = df["CIGS_PER_DAY_CAT"]
    labelled_masks = [
        ("Nonsmokers",             in_denom & (smk == "NEVER")),
        ("Former smokers",         in_denom & (smk == "FORMER")),
        ("<15 cigarettes/day",     is_current & (intensity == 1)),
        ("15–24.9 cigarettes/day", is_current & (intensity == 2)),
        ("≥ 25 cigarettes/day",    is_current & (intensity == 3)),
    ]
    table_rows = [(label, _fmt(mask)) for label, mask in labelled_masks]

    missing_intensity = is_current & intensity.isna()
    if include_missing_row and missing_intensity.any():
        table_rows.append(("Current smokers (cigs/day missing)", _fmt(missing_intensity)))

    return pd.DataFrame(table_rows, columns=["Smoking status", "n (%)"])


#### 5) Build the descriptive table

In [88]:
# =========================
# 5) BUILD DESCRIPTIVE TABLE
# =========================
def build_descriptive_table(df):
    """
    Build the descriptive summary table from a covariate DataFrame.

    Reads the module-level AGE_FILTER, WEIGHT_COL and *_MAP settings.
    Categorical rows show unweighted n with weighted %; continuous rows show
    Mean (SE) from mean_se(). "Total N" is the unweighted non-missing count.

    AGE_FILTER may be None/empty (no filter) or a comparison such as ">20",
    ">=18", "<65", applied to RIDAGEYR.

    Returns
    -------
    DataFrame with columns Group, Variable, Category,
    "Primary population, N (%) or Mean (SE)", "Total N".

    Raises
    ------
    ValueError if AGE_FILTER is set but cannot be parsed (previously such
    values were silently ignored and the table was built unfiltered).
    KeyError if RIDAGEYR is absent.
    """
    import operator
    import re

    # Every row carries its group explicitly, so a header cannot be lost or
    # inherited from the previous group when a group's first variable is absent.
    GRP_SOCIO = "Sociodemographics"
    GRP_BEHAV = "Health Behaviors"
    GRP_CLIN  = "Clinical Characteristics"
    GRP_DIET  = "Dietary & Physiologic Measures"

    d = df.copy()

    # Age filter: "<operator><number>" applied to RIDAGEYR; missing ages never match.
    if AGE_FILTER:
        spec = re.fullmatch(r"\s*(>=|<=|==|>|<)\s*(\d+(?:\.\d+)?)\s*", str(AGE_FILTER))
        if spec is None:
            raise ValueError(
                f"Unrecognized AGE_FILTER {AGE_FILTER!r}; expected e.g. '>20' or '>=18'."
            )
        compare = {
            ">": operator.gt, ">=": operator.ge,
            "<": operator.lt, "<=": operator.le,
            "==": operator.eq,
        }[spec.group(1)]
        age = pd.to_numeric(d["RIDAGEYR"], errors="coerce")
        d = d[compare(age, float(spec.group(2)))]

    # Weights (unit weights when WEIGHT_COL is unset or missing)
    w, weighted = get_weights(d, WEIGHT_COL)
    stat_w = w if weighted else None

    rows = []

    # --- Sociodemographics ---
    sex_col = "RIAGENDR" if "RIAGENDR" in d.columns else ("SEX" if "SEX" in d.columns else None)
    if sex_col:
        # SEX_MAP is only applied to the numeric RIAGENDR coding.
        sex = as_text(d[sex_col], mapping=SEX_MAP if sex_col == "RIAGENDR" else None)
        add_rows(rows, GRP_SOCIO, "Sex", sex, w, order=["Male","Female"])

    if "RACE" in d.columns:
        race = as_text(d["RACE"], mapping=RACE_MAP)
        add_rows(rows, GRP_SOCIO, "Race", race, w)

    if "EDU" in d.columns:
        edu = as_text(d["EDU"], mapping=EDU_MAP)
        add_rows(rows, GRP_SOCIO, "Education", edu, w)

    if "pir" in d.columns:
        pir_bucket = d["pir"].apply(bucket_pir)
        add_rows(rows, GRP_SOCIO, "Family income to poverty ratio", pir_bucket, w,
                 order=["< 1.3","1.3–2.99","≥ 3","Missing"])

    if "SNAP" in d.columns:
        snap = as_text(d["SNAP"], mapping=SNAP_MAP)
        add_rows(rows, GRP_SOCIO, "SNAP", snap, w)

    if "FS" in d.columns:
        fs = as_text(d["FS"], mapping=FS_MAP)
        add_rows(rows, GRP_SOCIO, "Food Insecurity", fs, w)

    if "household_size" in d.columns:
        cell, N = mean_se(d["household_size"], stat_w)
        rows.append([GRP_SOCIO, "Household Size", "", cell, N])

    # --- Health Behaviors ---
    # Smoking: one-pass table with a common denominator via smoking_table_all()
    if {"SMK_STATUS_STD", "CIGS_PER_DAY_CAT"}.issubset(d.columns):
        st = smoking_table_all(
            d,
            weight_col=WEIGHT_COL if weighted else None,
            include_missing_row=True  # set False if you don't want the missing row
        )
        # Unweighted total N for the smoking variable denominator
        N_unw = int(d["SMK_STATUS_STD"].isin(["NEVER","FORMER","CURRENT"]).sum())

        for _, r in st.iterrows():
            rows.append([GRP_BEHAV, "Smoking status", r["Smoking status"], r["n (%)"], N_unw])

    # Drinking (from alcg2)
    # NOTE(review): any alcg2 value other than 1 or 2, including missing, is counted
    # as "Nondrinkers" — confirm the alcg2 coding and whether missing belongs there.
    if "alcg2" in d.columns:
        drink_map = {1: "Moderate drinker", 2: "Heavy drinker"}
        drink_status = d["alcg2"].map(drink_map).fillna("Nondrinkers")
        add_rows(rows, GRP_BEHAV, "Drinking status", drink_status, w,
                 order=["Nondrinkers","Moderate drinker","Heavy drinker"])

    # Physical activity
    # NOTE(review): the value is divided by 60 while the label still says met_hr — confirm units.
    if "met_hr" in d.columns:
        cell, N = mean_se(d["met_hr"]/60, stat_w)
        rows.append([GRP_BEHAV, "Physical activity (met_hr)", "", cell, N])

    # --- Clinical Characteristics ---
    # NOTE(review): add_rows drops categories not named in `order`; confirm bmi_cat_new
    # has no level beyond these three.
    if "bmi_cat_new" in d.columns:
        add_rows(rows, GRP_CLIN, "BMI", as_text(d["bmi_cat_new"]), w,
                 order=["UNDER","NORMAL","OVER"])

    bin_vars = [
        ("Diabetes", "DIABE"),
        ("Hypertension", "HYPERTEN"),
        ("CVD", "CVD"),
        ("Cancer", "cancer"),
        ("Depression (probable)", "probable_depression"),
        ("Cholesterol Rx", "chol_rx"),
    ]
    for label, col in bin_vars:
        if col in d.columns:
            add_rows(rows, GRP_CLIN, label, as_text(d[col]), w)

    # --- Dietary & Physiologic Measures ---
    cell, N = mean_se(d["RIDAGEYR"], stat_w)
    rows.append([GRP_DIET, "Age, years", "", cell, N])

    if "ahei_total" in d.columns:
        cell, N = mean_se(d["ahei_total"], stat_w)
        rows.append([GRP_DIET, "AHEI", "", cell, N])

    return pd.DataFrame(
        rows,
        columns=["Group", "Variable", "Category", "Primary population, N (%) or Mean (SE)", "Total N"]
    )


#### 6) Run

In [89]:
# =========================
# 6) RUN
# =========================
# Build the table from the loaded covariate frame and show up to its first 60 rows.
table1 = build_descriptive_table(df_my_cov_aligned_short)
display(table1.head(60))


Unnamed: 0,Group,Variable,Category,"Primary population, N (%) or Mean (SE)",Total N
0,Sociodemographics,Sex,Male,33883 (48.0%),70956
1,Sociodemographics,Sex,Female,37073 (52.0%),70956
2,Sociodemographics,Race,1.0,10958 (8.0%),70956
3,Sociodemographics,Race,2.0,6132 (6.0%),70956
4,Sociodemographics,Race,3.0,31659 (67.3%),70956
5,Sociodemographics,Race,4.0,14691 (11.2%),70956
6,Sociodemographics,Race,6.0,5537 (5.1%),70956
7,Sociodemographics,Race,7.0,1979 (2.5%),70956
8,Sociodemographics,Education,1,7903 (5.9%),70935
9,Sociodemographics,Education,2,9766 (10.9%),70935


In [90]:
import pandas as pd
import numpy as np

# === CONFIG ===
# NOTE(review): this cell repeats the configuration from the earlier CONFIG cell;
# re-running it rebinds the same module-level names.
# Age restriction applied to RIDAGEYR by build_descriptive_table(); None disables it.
AGE_FILTER = ">20"         # options: None, ">20", ">=18", etc.
# Column holding the weights passed to get_weights(); missing/None -> unit weights.
WEIGHT_COL = "WTINT2YR"    # set to None for unweighted
ID_COL     = "SEQN"

# Optional label mappings (edit if your vars are coded as numbers)
# An empty dict is falsy, so as_text() then just renders the raw codes as text.
SEX_MAP = {1: "Male", 2: "Female"}
RACE_MAP = {}  # e.g., {1:"Non-Hispanic White", 2:"Non-Hispanic Black", 3:"Hispanic", 4:"Other"}
EDU_MAP  = {}  # e.g., {1:"<High school", 2:"HS/GED", 3:"Some college", 4:"College+"}
SNAP_MAP = {}
FS_MAP   = {}

# PIR buckets (family income-to-poverty ratio)
def bucket_pir(pir):
    """
    Map a family income-to-poverty ratio to its bucket label.

    NA -> "Missing"; below 1.3 -> "< 1.3"; below 3.0 -> "1.3–2.99"; else "≥ 3".
    """
    if pd.isna(pir):
        return "Missing"
    if pir < 1.3:
        return "< 1.3"
    return "1.3–2.99" if pir < 3.0 else "≥ 3"

# === HELPERS ===
def as_text(s, mapping=None):
    """
    Return `s` as a "string"-dtype Series of labels.

    If `mapping` is non-empty, values are translated with `s.map(mapping)` and
    values missing from the mapping keep their original text form.
    """
    fallback = s.astype("string")
    if mapping:
        mapped = s.map(mapping)
        return mapped.astype("string").fillna(fallback)
    return fallback

def get_weights(df, weight_col):
    """
    Return (weights, is_weighted) for `df`.

    When `weight_col` is a column of `df`, its values are coerced to numeric,
    unparseable/missing values become 0 and negatives are replaced by 0.
    Otherwise a Series of 1.0 on df.index is returned with is_weighted False.
    """
    if not weight_col or weight_col not in df.columns:
        return pd.Series(1.0, index=df.index), False
    numeric = pd.to_numeric(df[weight_col], errors="coerce").fillna(0.0)
    non_negative = numeric.mask(numeric < 0, 0)
    return non_negative, True

def mean_se(x, w=None):
    """
    Mean (SE) with (optional) weights; SE via sqrt(var_w / n_eff) as a simple approximation.

    var_w is the weighted variance around the weighted mean and n_eff the Kish
    effective sample size. `x` is coerced to numeric (non-numeric -> missing);
    `w` must be aligned to x's index, or None for unit weights.

    Returns ("mean (se)" with 2 decimals, unweighted count of non-missing x).
    The cell is "NA (NA)" when nothing is non-missing or when the non-missing
    values carry no positive total weight.
    """
    x = pd.to_numeric(x, errors="coerce")
    m = x.notna()
    if w is None:
        w = pd.Series(1.0, index=x.index)
    ww = w[m]; xx = x[m]
    n_obs = len(xx)
    if n_obs == 0:
        return "NA (NA)", 0
    W = ww.sum()
    # Zero (or NaN) total weight would divide 0 by 0 and print "nan (nan)".
    if not (W > 0):
        return "NA (NA)", n_obs
    mu = (ww * xx).sum() / W
    var_w = (ww * (xx - mu) ** 2).sum() / W
    # Kish effective n:
    sum_sq = (ww ** 2).sum()
    n_eff = (W ** 2) / sum_sq if sum_sq > 0 else n_obs
    se = np.sqrt(var_w / max(n_eff, 1))
    return f"{mu:.2f} ({se:.2f})", n_obs

def n_pct(labels, w):
    """
    Return (Series label -> 'n (pct%)', total weight) over non-missing labels.

    n is the summed weight of each label (a plain integer count when every
    weight equals 1, otherwise rounded to one decimal); pct is the label's
    share of the total weight. Labels are sorted by descending weight.
    Returns (empty string-dtype Series, 0.0) when no label is non-missing.
    """
    lab = labels.astype("string")
    mask = lab.notna()
    ww = w[mask]
    L  = lab[mask]
    if len(ww) == 0:
        return pd.Series(dtype="string"), 0.0
    totals = ww.groupby(L).sum().sort_values(ascending=False)
    W = ww.sum()
    # Decide once whether the counts are weighted instead of re-testing per label.
    is_weighted = bool((w != 1).any())

    def _cell(n):
        pct = 100 * n / W if W > 0 else np.nan
        # One decimal keeps weighted totals readable (raw floats printed ~16 digits).
        shown = f"{n:.1f}" if is_weighted else f"{int(n)}"
        return f"{shown} ({pct:.1f}%)"

    return totals.apply(_cell), W

def add_rows(rows, group, var, ser, w, order=None):
    """
    Append [group, var, category, cell, total] rows for a categorical variable.

    Cells and the total weight come from n_pct(ser, w). With `order`, only the
    listed categories are kept, in that order. The total is shown as an integer
    when every weight equals 1, otherwise rounded to one decimal.
    """
    cells, total_w = n_pct(ser, w)
    if order:
        kept = [name for name in order if name in cells.index]
        cells = cells.reindex(kept).dropna()
    for name, text in cells.items():
        shown_total = int(total_w) if (w == 1).all() else round(total_w, 1)
        rows.append([group, var, name, text, shown_total])

# === BUILD TABLE ===
def build_descriptive_table(df):
    """
    Build the descriptive summary table from a covariate DataFrame.

    Reads the module-level AGE_FILTER, WEIGHT_COL and *_MAP settings and relies
    on smoking_table_all() being defined. Categorical rows come from add_rows();
    continuous rows show Mean (SE) from mean_se().

    AGE_FILTER may be None/empty (no filter) or a comparison such as ">20",
    ">=18", "<65", applied to RIDAGEYR.

    Returns
    -------
    DataFrame with columns Group, Variable, Category,
    "Primary population, N (%) or Mean (SE)", "Total N".

    Raises
    ------
    ValueError if AGE_FILTER is set but cannot be parsed (previously such
    values were silently ignored and the table was built unfiltered).
    KeyError if RIDAGEYR is absent.
    """
    import operator
    import re

    # Every row carries its group explicitly, so a header cannot be lost or
    # inherited from the previous group when a group's first variable is absent.
    GRP_SOCIO = "Sociodemographics"
    GRP_BEHAV = "Health Behaviors"
    GRP_CLIN  = "Clinical Characteristics"
    GRP_DIET  = "Dietary & Physiologic Measures"

    d = df.copy()

    # Age filter: "<operator><number>" applied to RIDAGEYR; missing ages never match.
    if AGE_FILTER:
        spec = re.fullmatch(r"\s*(>=|<=|==|>|<)\s*(\d+(?:\.\d+)?)\s*", str(AGE_FILTER))
        if spec is None:
            raise ValueError(
                f"Unrecognized AGE_FILTER {AGE_FILTER!r}; expected e.g. '>20' or '>=18'."
            )
        compare = {
            ">": operator.gt, ">=": operator.ge,
            "<": operator.lt, "<=": operator.le,
            "==": operator.eq,
        }[spec.group(1)]
        age = pd.to_numeric(d["RIDAGEYR"], errors="coerce")
        d = d[compare(age, float(spec.group(2)))]

    # Weights (unit weights when WEIGHT_COL is unset or missing)
    w, weighted = get_weights(d, WEIGHT_COL)
    stat_w = w if weighted else None

    rows = []

    # --- Sociodemographics ---
    # Skip the Sex rows when neither column exists instead of failing on None.
    sex_col = next((c for c in ("RIAGENDR", "SEX") if c in d.columns), None)
    if sex_col is not None:
        # SEX_MAP is applied whenever RIAGENDR is present (it is then the chosen column).
        sex = as_text(d[sex_col], mapping=SEX_MAP if sex_col == "RIAGENDR" else None)
        add_rows(rows, GRP_SOCIO, "Sex", sex, w, order=["Male","Female"])

    if "RACE" in d.columns:
        race = as_text(d["RACE"], mapping=RACE_MAP)
        add_rows(rows, GRP_SOCIO, "Race", race, w)

    if "EDU" in d.columns:
        edu = as_text(d["EDU"], mapping=EDU_MAP)
        add_rows(rows, GRP_SOCIO, "Education", edu, w)

    if "pir" in d.columns:
        pir_bucket = d["pir"].apply(bucket_pir)
        add_rows(rows, GRP_SOCIO, "Family income to poverty ratio", pir_bucket, w,
                 order=["< 1.3","1.3–2.99","≥ 3","Missing"])

    if "SNAP" in d.columns:
        snap = as_text(d["SNAP"], mapping=SNAP_MAP)
        add_rows(rows, GRP_SOCIO, "SNAP", snap, w)

    if "FS" in d.columns:
        fs = as_text(d["FS"], mapping=FS_MAP)
        add_rows(rows, GRP_SOCIO, "Food Insecurity", fs, w)

    if "household_size" in d.columns:
        cell, N = mean_se(d["household_size"], stat_w)
        rows.append([GRP_SOCIO, "Household Size", "", cell, N])

    # --- Health Behaviors ---
    # Smoking: one-pass table with common denominator via smoking_table_all()
    if {"SMK_STATUS_STD", "CIGS_PER_DAY_CAT"}.issubset(d.columns):
        st = smoking_table_all(
            d,
            weight_col=WEIGHT_COL if weighted else None,
            include_missing_row=True  # set False if you don't want the missing row
        )
        # Unweighted total N for the smoking variable denominator
        N_unw = int(d["SMK_STATUS_STD"].isin(["NEVER","FORMER","CURRENT"]).sum())

        for _, r in st.iterrows():
            rows.append([GRP_BEHAV, "Smoking status", r["Smoking status"], r["n (%)"], N_unw])

    # Drinking from alcg2
    # NOTE(review): any alcg2 value other than 1 or 2, including missing, is counted
    # as "Nondrinkers" — confirm the alcg2 coding and whether missing belongs there.
    if "alcg2" in d.columns:
        drink_map = {1: "Moderate drinker", 2: "Heavy drinker"}
        drink_status = d["alcg2"].map(drink_map).fillna("Nondrinkers")
        add_rows(rows, GRP_BEHAV, "Drinking status", drink_status, w,
                 order=["Nondrinkers","Moderate drinker","Heavy drinker"])

    # Physical activity
    if "met_hr" in d.columns:
        cell, N = mean_se(d["met_hr"], stat_w)
        rows.append([GRP_BEHAV, "Physical activity (met_hr)", "", cell, N])

    # --- Clinical Characteristics ---
    # NOTE(review): add_rows drops categories not named in `order`; confirm bmi_cat_new
    # has no level beyond these three.
    if "bmi_cat_new" in d.columns:
        add_rows(rows, GRP_CLIN, "BMI", as_text(d["bmi_cat_new"]), w,
                 order=["UNDER","NORMAL","OVER"])

    bin_vars = [
        ("Diabetes", "DIABE"),
        ("Hypertension", "HYPERTEN"),
        ("CVD", "CVD"),
        ("Cancer", "cancer"),
        ("Depression (probable)", "probable_depression"),
        ("Cholesterol Rx", "chol_rx"),
    ]
    for label, col in bin_vars:
        if col in d.columns:
            add_rows(rows, GRP_CLIN, label, as_text(d[col]), w)

    # --- Dietary & Physiologic Measures ---
    cell, N = mean_se(d["RIDAGEYR"], stat_w)
    rows.append([GRP_DIET, "Age, years", "", cell, N])

    if "ahei_total" in d.columns:
        cell, N = mean_se(d["ahei_total"], stat_w)
        rows.append([GRP_DIET, "AHEI", "", cell, N])

    return pd.DataFrame(
        rows,
        columns=["Group", "Variable", "Category", "Primary population, N (%) or Mean (SE)", "Total N"]
    )

# === RUN ===
# Rebuild table1 with the helpers defined in this cell (this overwrites the earlier
# table1) and show up to its first 60 rows.
table1 = build_descriptive_table(df_my_cov_aligned_short)
display(table1.head(60))


Unnamed: 0,Group,Variable,Category,"Primary population, N (%) or Mean (SE)",Total N
0,Sociodemographics,Sex,Male,1135873547.3743942 (48.0%),2365383000.0
1,Sociodemographics,Sex,Female,1229508997.2389026 (52.0%),2365383000.0
2,Sociodemographics,Race,3.0,1591458244.6893723 (67.3%),2365383000.0
3,Sociodemographics,Race,4.0,265114577.43691424 (11.2%),2365383000.0
4,Sociodemographics,Race,1.0,188519572.7387189 (8.0%),2365383000.0
5,Sociodemographics,Race,2.0,141886779.2933978 (6.0%),2365383000.0
6,Sociodemographics,Race,6.0,119501350.10689543 (5.1%),2365383000.0
7,Sociodemographics,Race,7.0,58902020.34799794 (2.5%),2365383000.0
8,Sociodemographics,Education,4,714811280.8727111 (30.2%),2364855000.0
9,Sociodemographics,Education,5,682262137.699723 (28.9%),2364855000.0


## USE THIS! Concise way to build the descriptive table

In [91]:
# Re-list the columns for reference before the final table builder.
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       're', 'household_size', 'DMDHHSIZ', 'EDU', 'pir', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'FORMER_SMOKER', 'METSCORE', 'LTPA',
       'DIABE', 'HYPERTEN', 'chol_rx', 'CVD', 'cancer', 'probable_depression',
       'ahei_total', 'unemployment2', 'ins', 'HOQ065', 'marriage', 'SNAP',
       'FS', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR',
       'WTPH2YR', 'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'wt_int', 'wt_mec',
       'wt_fasting', 'wt_phlebotomy', 'marriage_prev', 'marriage_label',
       'marriage3', 'SNAP_src', 'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton', 'bmi', 'RIAGENDR', 'drinking', 'alcg2',
       'perE_alco', 'METSCORE_fromPAQ', 'LTPA_fromPAQ', 'chol_rx200',
       'SMK_STATUS_STD', 'CIGS_PER_DAY_CAT', 'bmi_lu', 'bmi_cat_new',
       'trouble_sleeping', 'sleep_hours_usual', 'CIDGSCOR', 'anxiety_rx'],
      dtype='object')

In [92]:
import pandas as pd
import numpy as np

# =========================
# === CONFIG & LABELS   ===
# =========================
# Age restriction applied to RIDAGEYR by build_descriptive_table(); None disables it.
AGE_FILTER   = ">20"         # options: None, ">20", ">=18", etc.
# Column holding the weights passed to get_weights(); missing/None -> unit weights.
WEIGHT_COL   = "WTINT2YR"    # set to None for unweighted
ID_COL       = "SEQN"
# NOTE(review): the code that consumes DRINKERS_ONLY is not visible in this part of the notebook.
DRINKERS_ONLY = False        # if True: denom = drinkers only (alcg2 in {2,3}); if False: include nondrinkers

# Display options
HIDE_REDUNDANT_COLUMNS_IN_VIEW = True  # hides Primary N (unweighted) and Percent in the styled view

# Labels for the numeric RIAGENDR coding.
SEX_MAP = {1: "Male", 2: "Female"}

# --- Collapsed labels ---
# RACE: 1=MexAm, 2=Other Hisp, 3=NH White, 4=NH Black, 6=NH Asian, 7=Other/multi
# Codes absent from this dict map to missing and are excluded from the Race rows.
RACE_COLLAPSE = {
    1: "Hispanic",
    2: "Hispanic",
    3: "Non-Hispanic White",
    4: "Non-Hispanic Black",
    6: "Other",
    7: "Other",
}

# EDU (DMDEDUC2-like): 1=<9th, 2=9–11th, 3=HS/GED, 4=Some college/AA, 5=College+, 7/9 missing
# 7 and 9 are mapped to pd.NA so they drop out of the Education denominator.
EDU_COLLAPSE = {
    1: "Less than high school",
    2: "Less than high school",
    3: "High school or equivalent",
    4: "Some college",
    5: "College or above",
    7: pd.NA,
    9: pd.NA,
}

# Empty maps: SNAP and FS are then labelled through as_binary_yesno() instead.
SNAP_MAP = {}
FS_MAP   = {}

# Only show the "1" (Yes) row for these variables
# NOTE(review): the code that reads this set is not visible in this part of the notebook.
ONLY_SHOW_1_FOR = {
    "SNAP",
    "Food Insecurity",
    "Diabetes",
    "Hypertension",
    "CVD",
    "Cancer",
    "Depression (probable)",
    "Cholesterol Rx",
    "Trouble sleeping",
    "Anxiety Rx",
}

# PIR buckets (family income-to-poverty ratio)
def bucket_pir(pir):
    """
    Bucket a family income-to-poverty ratio.

    Returns "Missing" for NA, "< 1.3" when below 1.3, "1.3–2.99" when below
    3.0, and "≥ 3" for everything else.
    """
    if pd.isna(pir):
        label = "Missing"
    elif pir < 1.3:
        label = "< 1.3"
    elif pir < 3.0:
        label = "1.3–2.99"
    else:
        label = "≥ 3"
    return label

# =========================
# === HELPER FUNCTIONS  ===
# =========================
def as_text(s, mapping=None):
    """
    Return a "string"-dtype copy of `s`, optionally relabelled via `mapping`.

    With a non-empty mapping, values go through `.map(mapping)`; values the
    mapping does not cover keep their original text form.
    """
    source = s.copy()
    as_string = source.astype("string")
    if not mapping:
        return as_string
    return source.map(mapping).astype("string").fillna(as_string)

def as_binary_yesno(s):
    """
    Label a binary indicator as 'Yes'/'No' text.

    Values that parse as numeric 1 become "Yes" and numeric 0 become "No". The
    literal strings "1"/"0" and "true"/"false" in lower, Title and UPPER case
    are translated as well. Every other value is kept as its text form.
    """
    numeric = pd.to_numeric(s.copy(), errors="coerce")  # non-numeric -> NaN
    labels = s.astype("string")
    labels.loc[numeric == 1] = "Yes"
    labels.loc[numeric == 0] = "No"

    # Textual spellings that numeric coercion cannot recognise.
    text_aliases = {"1": "Yes", "0": "No"}
    for truthy, falsy in (("true", "false"), ("True", "False"), ("TRUE", "FALSE")):
        text_aliases[truthy] = "Yes"
        text_aliases[falsy] = "No"
    return labels.replace(text_aliases)

def get_weights(df, weight_col):
    """
    Return (weights, is_weighted) for `df`.

    If `weight_col` names a column of `df`, the column is coerced to numeric
    with unparseable/missing values set to 0 and negative values set to 0.
    Otherwise unit weights on df.index are returned and is_weighted is False.
    """
    usable = bool(weight_col) and weight_col in df.columns
    if usable:
        weights = pd.to_numeric(df[weight_col], errors="coerce").fillna(0.0)
        return weights.where(weights >= 0, 0), True
    return pd.Series(1.0, index=df.index), False

def mean_se(x, w=None):
    """
    Mean (SE) with optional weights.

    SE is approximated as sqrt(var_w / n_eff): var_w is the weighted variance
    around the weighted mean, n_eff the Kish effective sample size
    (sum(w))^2 / sum(w^2). `x` is coerced to numeric (non-numeric -> missing);
    `w` must be aligned to x's index, or None for unit weights.

    Returns ("mean (se)" with 2 decimals, unweighted count of non-missing x).
    The cell is "NA (NA)" when nothing is non-missing or when the non-missing
    values carry no positive total weight.
    """
    x = pd.to_numeric(x, errors="coerce")
    m = x.notna()
    if w is None:
        w = pd.Series(1.0, index=x.index)
    ww = w[m]; xx = x[m]
    n_obs = int(m.sum())  # unweighted N
    if n_obs == 0:
        return "NA (NA)", 0
    W = ww.sum()
    # Zero (or NaN) total weight would divide 0 by 0 and print "nan (nan)".
    if not (W > 0):
        return "NA (NA)", n_obs
    mu = (ww * xx).sum() / W
    var_w = (ww * (xx - mu) ** 2).sum() / W
    sum_sq = (ww ** 2).sum()
    n_eff = (W ** 2) / sum_sq if sum_sq > 0 else n_obs
    se = np.sqrt(var_w / max(n_eff, 1))
    return f"{mu:.2f} ({se:.2f})", n_obs

def median_iqr(x, w=None):
    """
    Format 'median [Q1–Q3]' for `x` and return it with the unweighted N.

    `x` is coerced to numeric and missing values are ignored. Without weights
    the quartiles come from np.percentile. With weights, values are sorted, the
    cumulative weight share is computed, and each quantile is read off by
    linear interpolation of value against cumulative share.

    Returns ("NA [NA–NA]", 0) when there is no usable value or, in the weighted
    case, when the weights of the usable values do not sum to a positive number.
    """
    na_cell = "NA [NA–NA]"
    numeric = pd.to_numeric(x, errors="coerce")
    valid = numeric.notna()
    n_valid = int(valid.sum())

    if w is None:
        data = numeric[valid].to_numpy()
        if len(data) == 0:
            return na_cell, 0
        lower, middle, upper = np.percentile(data, [25, 50, 75])
    else:
        weights = pd.to_numeric(w, errors="coerce").fillna(0.0)
        data = numeric[valid].to_numpy()
        data_w = weights[valid].to_numpy()
        if len(data) == 0 or data_w.sum() <= 0:
            return na_cell, 0

        rank = np.argsort(data)
        sorted_vals = data[rank]
        sorted_w = data_w[rank]
        cum_share = np.cumsum(sorted_w) / sorted_w.sum()
        lower, middle, upper = (
            np.interp(prob, cum_share, sorted_vals) for prob in (0.25, 0.50, 0.75)
        )

    return f"{middle:.2f} [{lower:.2f}–{upper:.2f}]", n_valid

def n_pct(labels, w):
    """
    Tabulate a categorical variable over its non-missing labels.

    Returns a tuple (table, total_weight, total_unweighted). `table` has one
    row per category, sorted by descending weighted count, with columns:
    category, n_unw (unweighted count), n_w (summed weight), pct (weighted %),
    total_unw (unweighted denominator), total_w (weighted denominator).
    When no label is non-missing, returns (empty table, 0.0, 0).
    """
    column_names = ["category", "n_unw", "n_w", "pct", "total_unw", "total_w"]
    text = labels.astype("string")
    observed = text.notna()
    wts = w[observed]
    cats = text[observed]
    if len(wts) == 0:
        return pd.DataFrame(columns=column_names), 0.0, 0

    denom_unw = int(observed.sum())
    denom_w = wts.sum()

    # Align unweighted counts and weighted sums on the category label.
    table = pd.DataFrame({
        "n_unw": cats.value_counts(),
        "n_w": wts.groupby(cats).sum(),
    })
    table = table.fillna(0.0).sort_values("n_w", ascending=False)
    table = table.rename_axis("category").reset_index()

    table["pct"] = np.where(denom_w > 0, 100 * table["n_w"] / denom_w, np.nan)
    table["total_unw"] = denom_unw
    table["total_w"] = denom_w
    return table, denom_w, denom_unw

def add_rows(rows, group, var, ser, w, order=None, only_show_yes=False):
    """
    Append one 8-item row per category of `ser` to `rows`.

    Row layout: [group, var, category, n_unw, n_w, pct, total_unw, None], where
    n_unw is the unweighted count, n_w and pct are weighted, total_unw is the
    UNWEIGHTED denominator, and the trailing None is the text slot that
    continuous-variable rows fill.

    only_show_yes keeps only categories labelled as a "yes" value (denominators
    are unchanged). `order` keeps only the listed categories, in that order;
    categories not listed are dropped.
    """
    stats, _total_w, denom_unw = n_pct(ser, w)

    if only_show_yes:
        stats = stats[stats["category"].isin({"Yes", "1", 1, True, "TRUE"})]

    if order:
        indexed = stats.set_index("category")
        present = [label for label in order if label in indexed.index]
        stats = indexed.reindex(present).dropna(how="all").reset_index()

    rows.extend(
        [group, var, rec["category"], rec["n_unw"], rec["n_w"], rec["pct"], denom_unw, None]
        for _, rec in stats.iterrows()
    )

def _bin_cigs_per_day(series):
    """
    Bin cigarettes-per-day values into three intensity labels.

    Values are coerced to numeric; below 15 -> "<15 cigarettes/day", from 15 up
    to (excluding) 25 -> "15–24.9 cigarettes/day", 25 or more ->
    "≥ 25 cigarettes/day". Missing or non-numeric values stay <NA>.
    Returns a "string"-dtype Series on the input's index.
    """
    amount = pd.to_numeric(series, errors="coerce")
    known = amount.notna()
    binned = pd.Series(pd.NA, index=series.index, dtype="string")
    rules = (
        (known & (amount < 15), "<15 cigarettes/day"),
        (known & (amount >= 15) & (amount < 25), "15–24.9 cigarettes/day"),
        (known & (amount >= 25), "≥ 25 cigarettes/day"),
    )
    for hit, label in rules:
        binned = binned.mask(hit, label)
    return binned

def add_smoking_rows(rows, d, w):
    """
    Append the smoking-status rows (group "Health Behaviors") to `rows`.

    All rows share ONE denominator: participants whose SMK_STATUS_STD is NEVER,
    FORMER or CURRENT. Current smokers are split by intensity, taken from
    CIGS_PER_DAY_CAT when present (numeric codes 1/2/3, as smoking_table_all in
    this notebook assumes, or values that already are the text labels), else
    binned from CIGS_PER_DAY. Current smokers without a usable intensity go to
    "Current smokers (cigs/day missing)". If neither intensity column exists,
    current smokers are reported as a single "CURRENT" row.

    Does nothing when SMK_STATUS_STD is absent.

    Fixes relative to the previous version:
      * CIGS_PER_DAY_CAT codes were cast to text ("1", "1.0", ...) but filtered
        against the text labels, so no current-smoker row was ever emitted.
      * NEVER/FORMER percentages excluded current smokers from the denominator.
    """
    if "SMK_STATUS_STD" not in d.columns:
        return

    lt15  = "<15 cigarettes/day"
    mid   = "15–24.9 cigarettes/day"
    ge25  = "≥ 25 cigarettes/day"
    cur_missing = "Current smokers (cigs/day missing)"

    status = d["SMK_STATUS_STD"].astype("string")

    def _is(label):
        # Missing status compares as <NA>; treat it as "not this label".
        return status.eq(label).fillna(False).astype(bool)

    combined = pd.Series(pd.NA, index=d.index, dtype="string")
    combined.loc[_is("NEVER")] = "NEVER"
    combined.loc[_is("FORMER")] = "FORMER"

    cur_mask = _is("CURRENT")
    if cur_mask.any():
        if "CIGS_PER_DAY_CAT" in d.columns:
            raw = d.loc[cur_mask, "CIGS_PER_DAY_CAT"]
            codes = pd.to_numeric(raw, errors="coerce")
            intensity = codes.map({1: lt15, 2: mid, 3: ge25}).astype("string")
            # Keep values that are already one of the three text labels.
            raw_text = raw.astype("string")
            already_label = raw_text.isin([lt15, mid, ge25])
            intensity = intensity.where(~already_label, raw_text)
            intensity = intensity.fillna(cur_missing)
        elif "CIGS_PER_DAY" in d.columns:
            intensity = _bin_cigs_per_day(d.loc[cur_mask, "CIGS_PER_DAY"]).fillna(cur_missing)
        else:
            intensity = pd.Series("CURRENT", index=d.index[cur_mask], dtype="string")
        combined.loc[cur_mask] = intensity

    # A single call => a single denominator for every smoking row.
    add_rows(rows, "Health Behaviors", "Smoking status", combined, w,
             order=["NEVER", "FORMER", lt15, mid, ge25, cur_missing, "CURRENT"])

# ---- NEW: helper to add two rows for continuous variables ----
def add_continuous_rows(rows, group, var, values, w, weighted, transform=None, label_suffix=""):
    """
    Append two 8-item rows describing a continuous variable:

      1) '<var> — Mean (SE)'     with the mean_se() cell
      2) '<var> — Median [IQR]'  with the median_iqr() cell

    Both rows use the unweighted non-missing N reported by mean_se(). The three
    numeric slots used by categorical rows are filled with NaN, and the
    formatted text goes in the last slot. `transform`, if given, is applied to
    a copy of `values` first (e.g., lambda s: s/60). Weights are used only
    when `weighted` is truthy. `label_suffix` is appended to both row labels.
    """
    series = values.copy()
    if transform is not None:
        series = transform(series)

    stat_weights = w if weighted else None
    mean_text, n_unweighted = mean_se(series, stat_weights)
    median_text = median_iqr(series, stat_weights)[0]

    for stat_label, text in (("Mean (SE)", mean_text), ("Median [IQR]", median_text)):
        rows.append([
            group, f"{var} — {stat_label}{label_suffix}", "",
            np.nan, np.nan, np.nan, n_unweighted, text,
        ])

# =================================
# === MAIN TABLE BUILDER (T1)   ===
# =================================
def build_descriptive_table(df):
    """
    Build the descriptive table (Table 1) and a styled view of it.

    The function works on a copy of `df`, optionally restricts it by age,
    obtains weights via get_weights, and then appends rows section by section
    (Sociodemographics, Health Behaviors, Sleep & Mental Health, Clinical
    Characteristics, Dietary & Physiologic Measures). Most variables are
    skipped when their column is absent; "RIDAGEYR" is always required.

    Module-level configuration read here: AGE_FILTER, WEIGHT_COL, SEX_MAP,
    RACE_COLLAPSE, EDU_COLLAPSE, SNAP_MAP, FS_MAP, DRINKERS_ONLY, SUMMARY_COL,
    HIDE_REDUNDANT_COLUMNS_IN_VIEW.

    Parameters
    ----------
    df : pandas.DataFrame
        Input covariate table; it is not modified.

    Returns
    -------
    (table, sty) : tuple
        table : pandas.DataFrame with all granular columns (suitable for export).
        sty   : pandas Styler built from a copy of `table`, with the redundant
                count/percent columns dropped when HIDE_REDUNDANT_COLUMNS_IN_VIEW
                is truthy.

    Warns
    -----
    UserWarning
        When AGE_FILTER is set to a value other than '>20' or '>=18'; in that
        case no age filter is applied.
    """
    import warnings  # local import so this block does not depend on the notebook's import cell

    d = df.copy()

    # Age filter
    if AGE_FILTER:
        age = pd.to_numeric(d["RIDAGEYR"], errors="coerce")
        if AGE_FILTER == ">20":
            d = d[age > 20]
        elif AGE_FILTER == ">=18":
            d = d[age >= 18]
        else:
            # Previously an unsupported value silently produced an unfiltered table.
            warnings.warn(
                f"Unrecognized AGE_FILTER {AGE_FILTER!r}; no age filter applied "
                "(supported values: '>20', '>=18').",
                stacklevel=2,
            )

    # Weights
    w, weighted = get_weights(d, WEIGHT_COL)

    rows = []

    # --- Sociodemographics ---
    # Prefer RIAGENDR (mapped through SEX_MAP); otherwise fall back to SEX as-is.
    sex = as_text(d.get("RIAGENDR", d.get("SEX")), mapping=SEX_MAP if "RIAGENDR" in d.columns else None)
    add_rows(rows, "Sociodemographics", "Sex", sex, w, order=["Male","Female"])

    # RACE → collapsed to 4 labels
    if "RACE" in d.columns:
        race_raw = pd.to_numeric(d["RACE"], errors="coerce")
        race4 = race_raw.map(RACE_COLLAPSE).astype("string")
        add_rows(rows, "Sociodemographics", "Race", race4, w,
                 order=["Non-Hispanic White","Non-Hispanic Black","Hispanic","Other"])

    # EDU → collapsed to 4 labels
    if "EDU" in d.columns:
        edu_raw = pd.to_numeric(d["EDU"], errors="coerce")
        edu4 = edu_raw.map(EDU_COLLAPSE).astype("string")
        add_rows(rows, "Sociodemographics", "Education", edu4, w,
                 order=["Less than high school","High school or equivalent","Some college","College or above"])

    if "pir" in d.columns:
        pir_bucket = d["pir"].apply(bucket_pir)
        add_rows(rows, "Sociodemographics", "Family income to poverty ratio", pir_bucket, w,
                 order=["< 1.3","1.3–2.99","≥ 3","Missing"])

    if "SNAP" in d.columns:
        snap = as_binary_yesno(d["SNAP"]) if not SNAP_MAP else as_text(d["SNAP"], mapping=SNAP_MAP)
        add_rows(rows, "Sociodemographics", "SNAP", snap, w, only_show_yes=True)

    if "FS" in d.columns:
        fs = as_binary_yesno(d["FS"]) if not FS_MAP else as_text(d["FS"], mapping=FS_MAP)
        add_rows(rows, "Sociodemographics", "Food Insecurity", fs, w, only_show_yes=True)

    if "household_size" in d.columns:
        add_continuous_rows(rows, "Sociodemographics", "Household Size",
                            d["household_size"], w, weighted)

    # --- Health Behaviors ---
    add_smoking_rows(rows, d, w)

    # Drinking status (coding: 1=Nondrinkers, 2=Moderate, 3=Heavy)
    if "alcg2" in d.columns:
        drink_map = {1: "Nondrinkers", 2: "Moderate drinker", 3: "Heavy drinker"}
        if DRINKERS_ONLY:
            # Only codes 2/3 get a label; everyone else stays missing.
            mask = d["alcg2"].isin([2, 3])
            ser = pd.Series(pd.NA, index=d.index, dtype="string")
            ser.loc[mask] = d.loc[mask, "alcg2"].map(drink_map).astype("string")
            add_rows(rows, "Health Behaviors", "Drinking status", ser, w,
                     order=["Moderate drinker","Heavy drinker"])
        else:
            # NOTE(review): missing or unmapped alcg2 values are counted as
            # "Nondrinkers" here — confirm this imputation is intended.
            ser = d["alcg2"].map(drink_map).astype("string").fillna("Nondrinkers")
            add_rows(rows, "Health Behaviors", "Drinking status", ser, w,
                     order=["Nondrinkers","Moderate drinker","Heavy drinker"])

    # Physical activity: two rows in HOURS (met_hr is treated as minutes → /60)
    # NOTE(review): the column name suggests MET-hours; confirm units before dividing by 60.
    if "met_hr" in d.columns:
        add_continuous_rows(rows, "Health Behaviors", "Physical activity (hours)",
                            d["met_hr"], w, weighted, transform=lambda s: s/60)

    # --- Sleep & Mental Health ---
    if "trouble_sleeping" in d.columns:
        ser = as_binary_yesno(d["trouble_sleeping"])
        add_rows(rows, "Sleep & Mental Health", "Trouble sleeping", ser, w, only_show_yes=True)

    if "sleep_hours_usual" in d.columns:
        add_continuous_rows(rows, "Sleep & Mental Health", "Sleep hours (usual)",
                            d["sleep_hours_usual"], w, weighted)

    if "anxiety_rx" in d.columns:
        ser = as_binary_yesno(d["anxiety_rx"])
        add_rows(rows, "Sleep & Mental Health", "Anxiety Rx", ser, w, only_show_yes=True)

    # --- Clinical Characteristics ---
    # NOTE(review): only UNDER/NORMAL/OVER are listed; confirm bmi_cat_new has
    # no further category (e.g. an obese level) that should appear in `order`.
    if "bmi_cat_new" in d.columns:
        add_rows(rows, "Clinical Characteristics", "BMI", as_text(d["bmi_cat_new"]), w,
                 order=["UNDER","NORMAL","OVER"])

    bin_vars = [
        ("Clinical Characteristics", "Diabetes", "DIABE"),
        ("Clinical Characteristics", "Hypertension", "HYPERTEN"),
        ("Clinical Characteristics", "CVD", "CVD"),
        ("Clinical Characteristics", "Cancer", "cancer"),
        ("Clinical Characteristics", "Depression (probable)", "probable_depression"),
        ("Clinical Characteristics", "Cholesterol Rx", "chol_rx"),
    ]
    for grp, label, col in bin_vars:
        if col in d.columns:
            ser = as_binary_yesno(d[col])
            add_rows(rows, grp, label, ser, w, only_show_yes=True)

    # --- Dietary & Physiologic Measures ---
    add_continuous_rows(rows, "Dietary & Physiologic Measures", "Age, years",
                        d["RIDAGEYR"], w, weighted)

    if "ahei_total" in d.columns:
        add_continuous_rows(rows, "Dietary & Physiologic Measures", "AHEI",
                            d["ahei_total"], w, weighted)

    # Assemble
    table = pd.DataFrame(rows, columns=[
        "Group","Variable","Category",
        "Primary N (unweighted)","Primary N (weighted)","Percent","Total N",
        SUMMARY_COL
    ])
    table["Group"] = table["Group"].replace({"": pd.NA}).ffill()

    # === Presentation formatting ===
    def fmt_int(x):
        return "" if pd.isna(x) else f"{int(round(x,0)):,}"
    def fmt_pct(x):
        return "" if pd.isna(x) else f"{x:.1f}%"
    def fmt_n_pct(n, pct):
        # Avoid malformed cells such as " ()" or "5 ()" when a piece is missing.
        if pd.isna(n):
            return ""
        if pd.isna(pct):
            return fmt_int(n)
        return f"{fmt_int(n)} ({fmt_pct(pct)})"

    # Combined cell like "n (pct%)" for categorical rows; blank for continuous
    # rows (those are the rows that carry a value in SUMMARY_COL).
    has_summary = table[SUMMARY_COL].notna()
    table["Primary population, N (%)"] = [
        "" if is_cont else fmt_n_pct(n, pct)
        for is_cont, n, pct in zip(has_summary,
                                   table["Primary N (unweighted)"],
                                   table["Percent"])
    ]

    # Re-order columns (granular kept for export)
    columns = [
        "Group","Variable","Category",
        "Primary population, N (%)",
        "Primary N (unweighted)","Primary N (weighted)","Percent","Total N",
        SUMMARY_COL
    ]
    table = table[columns]

    # Build a Styler (hide redundant columns in the VIEW, keep them in table for export)
    view = table.copy()
    if HIDE_REDUNDANT_COLUMNS_IN_VIEW:
        view = view.drop(columns=["Primary N (unweighted)", "Percent"])

    sty = (view.style
           .hide(axis="index")
           .format({
               "Primary N (unweighted)": fmt_int,   # only applied if not dropped
               "Primary N (weighted)": fmt_int,
               "Percent": fmt_pct,                  # only applied if not dropped
               "Total N": fmt_int
           }, na_rep="")
           .set_properties(subset=["Group","Variable","Category"], **{"text-align":"left"})
           .set_properties(subset=[c for c in view.columns if c not in {"Group","Variable","Category"}],
                           **{"text-align":"right"})
           .set_table_styles([
               {"selector":"th","props":"font-weight:bold; text-align:left;"},
               {"selector":"tbody tr:nth-child(odd)","props":"background-color:#fafafa;"},
               {"selector":"tbody tr:hover","props":"background-color:#f2f2f2;"}
           ]))

    return table, sty


# =========================
# === RUN & DISPLAY     ===
# =========================
# Expects the covariate DataFrame `df_my_cov_aligned_short` loaded earlier in the notebook.
# `table1` is the full DataFrame (granular columns kept for export);
# `styled` is the Styler view that is rendered below.
table1, styled = build_descriptive_table(df_my_cov_aligned_short)
display(styled)
# Optional export of the full table (uncomment to write the Excel file):
# table1.to_excel("table1_descriptive.xlsx", index=False)


Group,Variable,Category,"Primary population, N (%)",Primary N (weighted),Total N,Mean (SE) / Median [IQR]
Sociodemographics,Sex,Male,"33,883 (48.0%)",1135873547.0,70956,
Sociodemographics,Sex,Female,"37,073 (52.0%)",1229508997.0,70956,
Sociodemographics,Race,Non-Hispanic White,"31,659 (67.3%)",1591458245.0,70956,
Sociodemographics,Race,Non-Hispanic Black,"14,691 (11.2%)",265114577.0,70956,
Sociodemographics,Race,Hispanic,"17,090 (14.0%)",330406352.0,70956,
Sociodemographics,Race,Other,"7,516 (7.5%)",178403370.0,70956,
Sociodemographics,Education,Less than high school,"17,669 (16.7%)",395220304.0,70796,
Sociodemographics,Education,High school or equivalent,"16,319 (24.1%)",569191807.0,70796,
Sociodemographics,Education,Some college,"20,209 (30.3%)",714811281.0,70796,
Sociodemographics,Education,College or above,"16,599 (28.9%)",682262138.0,70796,
