## import file

In [2]:
import pandas as pd
from pathlib import Path

p = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output/cov_addv6_99_23.parquet")
df_my_cov_aligned_short = pd.read_parquet(p)  # uses pyarrow/fastparquet if available
print(df_my_cov_aligned_short.shape)
df_my_cov_aligned_short.head()


(128809, 64)


Unnamed: 0,SEQN,SDDSRVYR,sdmvpsu,sdmvstra,RIDAGEYR,SEX,RACE,re,household_size,DMDHHSIZ,...,drinking,alcg2,perE_alco,METSCORE_fromPAQ,LTPA_fromPAQ,chol_rx200,SMK_STATUS_STD,CIGS_PER_DAY_CAT,bmi_lu,bmi_cat_new
0,1,1.0,1,5,2.0,F,4.0,Other Hispanic,3,3.0,...,,,0.0,,,,,,14.9,UNDER
1,2,1.0,3,1,77.0,M,3.0,Mexican American,1,1.0,...,0.065753,2.0,0.0,,,1.0,NEVER,,24.9,NORMAL
2,3,1.0,2,7,10.0,F,3.0,Mexican American,4,4.0,...,,,0.0,,,0.0,,,17.63,UNDER
3,4,1.0,1,2,1.0,M,4.0,Other Hispanic,7,7.0,...,,,0.0,,,,,,15.2,UNDER
4,5,1.0,2,8,49.0,M,3.0,Mexican American,3,3.0,...,1.714286,2.0,9.101101,,,1.0,FORMER,,29.1,OVER


In [3]:
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       're', 'household_size', 'DMDHHSIZ', 'EDU', 'pir', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'FORMER_SMOKER', 'METSCORE', 'LTPA',
       'DIABE', 'HYPERTEN', 'chol_rx', 'CVD', 'cancer', 'probable_depression',
       'ahei_total', 'unemployment2', 'ins', 'HOQ065', 'marriage', 'SNAP',
       'FS', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR',
       'WTPH2YR', 'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'wt_int', 'wt_mec',
       'wt_fasting', 'wt_phlebotomy', 'marriage_prev', 'marriage_label',
       'marriage3', 'SNAP_src', 'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton', 'bmi', 'RIAGENDR', 'drinking', 'alcg2',
       'perE_alco', 'METSCORE_fromPAQ', 'LTPA_fromPAQ', 'chol_rx200',
       'SMK_STATUS_STD', 'CIGS_PER_DAY_CAT', 'bmi_lu', 'bmi_cat_new'],
      dtype='object')

## try create descriptive table 

In [13]:
df_my_cov_aligned_short[["alcg2"]].head(10)

Unnamed: 0,alcg2
0,
1,2.0
2,
3,
4,2.0
5,
6,1.0
7,
8,
9,1.0


In [14]:
import pandas as pd
import numpy as np

# === CONFIG ===
AGE_FILTER = ">20"         # options: None, ">20", ">=18", etc.
WEIGHT_COL = "WTINT2YR"    # set to None for unweighted
ID_COL     = "SEQN"

# Optional label mappings (edit if your vars are coded as numbers)
SEX_MAP = {1: "Male", 2: "Female"}
RACE_MAP = {}  # e.g., {1:"Non-Hispanic White", 2:"Non-Hispanic Black", 3:"Hispanic", 4:"Other"}
EDU_MAP  = {}  # e.g., {1:"<High school", 2:"HS/GED", 3:"Some college", 4:"College+"}
SNAP_MAP = {}
FS_MAP   = {}

# PIR buckets (family income-to-poverty ratio)
def bucket_pir(pir):
    if pd.isna(pir): return "Missing"
    if pir < 1.3:    return "< 1.3"
    if pir < 3.0:    return "1.3–2.99"
    return "≥ 3"

# === HELPERS ===
def as_text(s, mapping=None):
    """Return series of labels; if mapping provided and values are numeric, map to text."""
    if mapping:
        out = s.map(mapping)
        out = out.astype("string").fillna(s.astype("string"))
        return out
    return s.astype("string")

def get_weights(df, weight_col):
    if weight_col and weight_col in df.columns:
        w = pd.to_numeric(df[weight_col], errors="coerce").fillna(0.0)
        w[w < 0] = 0
        return w, True
    return pd.Series(1.0, index=df.index), False

def mean_se(x, w=None):
    """Mean (SE) with (optional) weights; SE via sqrt(var_w / n_eff) as a simple approximation."""
    x = pd.to_numeric(x, errors="coerce")
    m = x.notna()
    if w is None:
        w = pd.Series(1.0, index=x.index)
    ww = w[m]; xx = x[m]
    if len(xx) == 0:
        return "NA (NA)", 0
    W = ww.sum()
    mu = (ww * xx).sum() / W
    var_w = (ww * (xx - mu) ** 2).sum() / W
    # Kish effective n:
    n_eff = (W ** 2) / (ww ** 2).sum() if (ww**2).sum() > 0 else len(xx)
    se = np.sqrt(var_w / max(n_eff, 1))
    return f"{mu:.2f} ({se:.2f})", len(xx)

def n_pct(labels, w):
    """Return a DataFrame of label -> 'n (pct%)' using weights w on non-missing labels."""
    lab = labels.astype("string")
    mask = lab.notna()
    ww = w[mask]
    L  = lab[mask]
    if len(ww) == 0:
        return pd.Series(dtype="string"), 0.0
    totals = ww.groupby(L).sum().sort_values(ascending=False)
    W = ww.sum()
    out = totals.apply(lambda n: f"{(n if (w!=1).any() else int(n))} ({100*n/W:.1f}%)")
    return out, W

def add_rows(rows, group, var, ser, w, order=None):
    """Append categorical N(%) rows to rows list."""
    counts, W = n_pct(ser, w)
    if order:
        counts = counts.reindex([o for o in order if o in counts.index]).dropna()
    for cat, cell in counts.items():
        rows.append([group, var, cat, cell, int(W) if (w==1).all() else round(W,1)])

# === BUILD TABLE ===
def build_descriptive_table(df):
    d = df.copy()

    # Age filter
    if AGE_FILTER:
        age = pd.to_numeric(d["RIDAGEYR"], errors="coerce")
        if AGE_FILTER == ">20":
            d = d[age > 20]
        elif AGE_FILTER == ">=18":
            d = d[age >= 18]

    # Weights
    w, weighted = get_weights(d, WEIGHT_COL)

    rows = []

    # --- Sociodemographics ---
    sex = as_text(d.get("RIAGENDR", d.get("SEX")), mapping=SEX_MAP if "RIAGENDR" in d.columns else None)
    add_rows(rows, "Sociodemographics", "Sex", sex, w, order=["Male","Female"])

    if "RACE" in d.columns:
        race = as_text(d["RACE"], mapping=RACE_MAP)
        add_rows(rows, "", "Race", race, w)

    if "EDU" in d.columns:
        edu = as_text(d["EDU"], mapping=EDU_MAP)
        add_rows(rows, "", "Education", edu, w)

    if "pir" in d.columns:
        pir_bucket = d["pir"].apply(bucket_pir)
        add_rows(rows, "", "Family income to poverty ratio", pir_bucket, w,
                 order=["< 1.3","1.3–2.99","≥ 3","Missing"])

    if "SNAP" in d.columns:
        snap = as_text(d["SNAP"], mapping=SNAP_MAP)
        add_rows(rows, "", "SNAP", snap, w)

    if "FS" in d.columns:
        fs = as_text(d["FS"], mapping=FS_MAP)
        add_rows(rows, "", "Food Insecurity", fs, w)

    if "household_size" in d.columns:
        cell, N = mean_se(d["household_size"], w if weighted else None)
        rows.append(["", "Household Size", "", cell, N])

    # --- Health Behaviors ---
    # Smoking
    if "SMK_STATUS_STD" in d.columns and "CIGS_PER_DAY_CAT" in d.columns:
        status = d["SMK_STATUS_STD"].astype("string")
        current = (status == "CURRENT")
        add_rows(rows, "Health Behaviors", "Smoking status", status.where(status=="NEVER"), w, order=["NEVER"])
        add_rows(rows, "", "Smoking status", status.where(status=="FORMER"), w, order=["FORMER"])
        label_map = {1:"<15 cigarettes/day", 2:"15–24.9 cigarettes/day", 3:"≥ 25 cigarettes/day"}
        intensity = d["CIGS_PER_DAY_CAT"].where(current).map(label_map)
        add_rows(rows, "", "Smoking status", intensity, w, order=list(label_map.values()))

    # Drinking from alcg2
    if "alcg2" in d.columns:
        drink_map = {1: "Moderate drinker", 2: "Heavy drinker"}
        drink_status = d["alcg2"].map(drink_map)
        drink_status = drink_status.fillna("Nondrinkers")
        add_rows(rows, "", "Drinking status", drink_status, w,
                 order=["Nondrinkers","Moderate drinker","Heavy drinker"])

    # Physical activity
    if "met_hr" in d.columns:
        cell, N = mean_se(d["met_hr"], w if weighted else None)
        rows.append(["", "Physical activity (met_hr)", "", cell, N])

    # --- Clinical Characteristics ---
    if "bmi_cat_new" in d.columns:
        add_rows(rows, "Clinical Characteristics", "BMI", as_text(d["bmi_cat_new"]), w,
                 order=["UNDER","NORMAL","OVER"])

    bin_vars = [
        ("Diabetes", "DIABE"),
        ("Hypertension", "HYPERTEN"),
        ("CVD", "CVD"),
        ("Cancer", "cancer"),
        ("Depression (probable)", "probable_depression"),
        ("Cholesterol Rx", "chol_rx"),
    ]
    for label, col in bin_vars:
        if col in d.columns:
            add_rows(rows, "", label, as_text(d[col]), w)

    # --- Dietary & Physiologic Measures ---
    cell, N = mean_se(d["RIDAGEYR"], w if weighted else None)
    rows.append(["Dietary & Physiologic Measures", "Age, years", "", cell, N])

    if "ahei_total" in d.columns:
        cell, N = mean_se(d["ahei_total"], w if weighted else None)
        rows.append(["", "AHEI", "", cell, N])

    table = pd.DataFrame(rows, columns=["Group", "Variable", "Category", "Primary population, N (%) or Mean (SE)", "Total N"])
    table["Group"] = table["Group"].replace({"": pd.NA}).ffill()
    return table

# === RUN ===
table1 = build_descriptive_table(df_my_cov_aligned_short)
display(table1.head(60))


Unnamed: 0,Group,Variable,Category,"Primary population, N (%) or Mean (SE)",Total N
0,Sociodemographics,Sex,Male,1135873547.3743942 (48.0%),2365383000.0
1,Sociodemographics,Sex,Female,1229508997.2389026 (52.0%),2365383000.0
2,Sociodemographics,Race,3.0,1591458244.6893723 (67.3%),2365383000.0
3,Sociodemographics,Race,4.0,265114577.43691424 (11.2%),2365383000.0
4,Sociodemographics,Race,1.0,188519572.7387189 (8.0%),2365383000.0
5,Sociodemographics,Race,2.0,141886779.2933978 (6.0%),2365383000.0
6,Sociodemographics,Race,6.0,119501350.10689543 (5.1%),2365383000.0
7,Sociodemographics,Race,7.0,58902020.34799794 (2.5%),2365383000.0
8,Sociodemographics,Education,4,714811280.8727111 (30.2%),2364855000.0
9,Sociodemographics,Education,5,682262137.699723 (28.9%),2364855000.0
