## import previous cov file

In [135]:
import pandas as pd
from pathlib import Path

p = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output/cov_addv5_99_23.parquet")
df_my_cov_aligned_short = pd.read_parquet(p)  # uses pyarrow/fastparquet if available
print(df_my_cov_aligned_short.shape)
df_my_cov_aligned_short.head()


(128809, 69)


Unnamed: 0,SEQN,SDDSRVYR,sdmvpsu,sdmvstra,RIDAGEYR,SEX,RACE,re,household_size,DMDHHSIZ,...,household_size_structural_missing,chol_rx_structural_missing,RIAGENDR,drinking,alcg2,perE_alco,METSCORE_fromPAQ,LTPA_fromPAQ,met_hr_recalc_from,chol_rx200
0,1,1.0,1,5,2.0,F,4.0,Other Hispanic,3,3.0,...,False,False,2.0,,,0.0,,,,
1,2,1.0,3,1,77.0,M,3.0,Mexican American,1,1.0,...,False,False,1.0,0.065753,2.0,0.0,,,from_new_PAQ,1.0
2,3,1.0,2,7,10.0,F,3.0,Mexican American,4,4.0,...,False,False,2.0,,,0.0,,,,0.0
3,4,1.0,1,2,1.0,M,4.0,Other Hispanic,7,7.0,...,False,False,1.0,,,0.0,,,,
4,5,1.0,2,8,49.0,M,3.0,Mexican American,3,3.0,...,False,False,1.0,1.714286,2.0,9.101101,,,from_new_PAQ,1.0


## check missing 

In [136]:
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       're', 'household_size', 'DMDHHSIZ', 'EDU', 'pir', 'SMK_AVG', 'SMK',
       'met_hr', 'SMK_STATUS', 'CIGS_PER_DAY', 'PACK_YEARS', 'FORMER_SMOKER',
       'METSCORE', 'LTPA', 'bmi_cat', 'BMI_CLAS', 'DIABE', 'HYPERTEN',
       'chol_rx', 'CVD', 'cancer', 'probable_depression', 'ahei_total',
       'unemployment2', 'ins', 'HOQ065', 'marriage', 'SNAP', 'FS', 'WTINT2YR',
       'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR', 'WTPH2YR', 'WTINTPRP',
       'WTMECPRP', 'WTSAFPRP', 'wt_int', 'wt_mec', 'wt_fasting',
       'wt_phlebotomy', 'marriage_prev', 'marriage_label', 'marriage3',
       'SNAP_src', 'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton', 'bmi', 'HOQ065_structural_missing',
       'household_size_structural_missing', 'chol_rx_structural_missing',
       'RIAGENDR', 'drinking', 'alcg2', 'perE_alco', 'METSCORE_fromPAQ',
       'LTPA_fromPAQ', 'met_hr_recalc_from', 'c

In [137]:
df = df_my_cov_aligned_short

# use every column except the grouper
# ignore PACK_YEARS, CIGS_PER_DAY, probable_depression and etc as missing naturally 
# ignore dulicate safe saved old column name

exclude = {"SDDSRVYR","CIGS_PER_DAY","PACK_YEARS","probable_depression","wt_phlebotomy", "WTSAFPRP",
           "WTINT2YR", "WTMEC2YR", "WTPH2YR", "WTSAF2YR", "WTMEC4YR", "WTINTPRP", "WTMECPRP", "WTINT4YR",
           "SNAP", "SNAP_src", "SNAP_bin", "SNAP_src_rank", "bmi", "RIAGENDR",
           "SNAP_indiv_only","FS", "ahei_total", "HOQ065", "marriage_label", "marriage_prev",
           "METSCORE_fromPAQ","perE_alco","LTPA_fromPAQ","SNAP_indiv_plus_singleton", 
           "SMK_AVG", "BMI_CLAS", "household_size", "DMDHHSIZ", "chol_rx", "chol_rx_structural_missing"}
cols_all = [c for c in df.columns if c not in exclude]

# % missing by cycle (split into two lines)
is_na = df[cols_all].isna()
pct_miss = is_na.groupby(df["SDDSRVYR"]).mean().mul(100)

# keep only columns that exceed 80% missing in ANY cycle
pct_miss_gt80 = pct_miss.loc[:, (pct_miss > 80).any(axis=0)].round(1)

print(pct_miss_gt80)


Empty DataFrame
Columns: []
Index: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 66.0]


## fix SMK coding 

In [138]:
import pandas as pd
import numpy as np

def smoking_table_9918(df, weight_col=None, verbose=True):
    """
    1999–2018 only (SDDSRVYR 1..10).
    Denominator = text-only SMK_STATUS (NEVER/FORMER/CURRENT) to match earlier tables.
    Intensity bins:
      - If CIGS_PER_DAY is category-coded {1,2,3}, map:
            1 -> <15, 2 -> 15–24.9, 3 -> ≥25
      - Else (numeric), use cutpoints: <15, 15–24.9, ≥25.
    """
    # ---- filter 1999–2018 ----
    if "SDDSRVYR" not in df.columns:
        raise KeyError("SDDSRVYR missing; required to filter 1999–2018.")
    d = df[df["SDDSRVYR"].isin(range(1, 11))].copy()

    # ---- status (text-only, earlier behavior) ----
    norm = lambda s: s.astype("string").str.strip().str.upper().replace({"<NA>": pd.NA})
    status = norm(d["SMK_STATUS"]).where(lambda s: s.isin(["NEVER","FORMER","CURRENT"]))
    denom = status.notna()
    if denom.sum() == 0:
        raise ValueError("No rows with text SMK_STATUS in 1999–2018.")

    # ---- weights ----
    if weight_col and (weight_col in d.columns):
        w = d[weight_col].astype(float)
        weighted = True
    else:
        w = pd.Series(1.0, index=d.index)
        weighted = False
        weight_col = None
    W = float(w[denom].sum()) or 1.0
    fmt = lambda n: f"{int(n) if not weighted else round(n,1)} ({round(n/W*100,1)}%)"

    # ---- cigs/day best source ----
    num = lambda s: pd.to_numeric(s, errors="coerce")
    cigs = num(d.get("CIGS_PER_DAY")).fillna(num(d.get("SMK_AVG")))

    # ---- auto-detect category-coded 1/2/3 ----
    vals = pd.Series(sorted(cigs.dropna().unique()))
    is_cat = len(vals) > 0 and set(cigs.dropna().unique()).issubset({1.0, 2.0, 3.0})
    if verbose:
        print("CIGS_PER_DAY unique (1999–2018):", vals.tolist()[:12], "..." if len(vals) > 12 else "")
        print("Detected category-coded:", is_cat)

    # ---- build masks ----
    nonsmokers = denom & (status == "NEVER")
    former     = denom & (status == "FORMER")
    current    = (status == "CURRENT")

    if is_cat:
        # Map category codes to bins
        cat = cigs.round().astype("Int64")
        lt15   = denom & current & (cat == 1)
        m15_25 = denom & current & (cat == 2)
        ge25   = denom & current & (cat == 3)
        cur_m  = denom & current & cat.isna()
    else:
        lt15   = denom & current & cigs.notna() & (cigs < 15)
        m15_25 = denom & current & cigs.notna() & (cigs >= 15) & (cigs < 25)
        ge25   = denom & current & cigs.notna() & (cigs >= 25)
        cur_m  = denom & current & cigs.isna()

    # ---- table ----
    rows = [
        ("Nonsmokers",             fmt(float(w[nonsmokers].sum()))),
        ("Former smokers",         fmt(float(w[former].sum()))),
        ("<15 cigarettes/day",     fmt(float(w[lt15].sum()))),
        ("15–24.9 cigarettes/day", fmt(float(w[m15_25].sum()))),
        ("≥ 25 cigarettes/day",    fmt(float(w[ge25].sum()))),
    ]
    # Optional: show current smokers with missing cigs/day
    if cur_m.any():
        rows.append(("Current smokers (cigs/day missing)", fmt(float(w[cur_m].sum()))))

    return pd.DataFrame(rows, columns=["Smoking status", "n (%)"])

# ---- Run (unweighted & weighted if available) ----
tbl_9918_unw = smoking_table_9918(df, weight_col=None, verbose=True)
from IPython.display import display
display(tbl_9918_unw)

if "WTINT2YR" in df.columns:
    tbl_9918_w = smoking_table_9918(df, weight_col="WTINT2YR", verbose=False)
    display(tbl_9918_w)


CIGS_PER_DAY unique (1999–2018): [1.0, 2.0, 3.0] 
Detected category-coded: True


Unnamed: 0,Smoking status,n (%)
0,Nonsmokers,29985 (54.5%)
1,Former smokers,13598 (24.7%)
2,<15 cigarettes/day,6930 (12.6%)
3,15–24.9 cigarettes/day,3268 (5.9%)
4,≥ 25 cigarettes/day,1128 (2.1%)
5,Current smokers (cigs/day missing),105 (0.2%)


Unnamed: 0,Smoking status,n (%)
0,Nonsmokers,1166855257.6 (53.9%)
1,Former smokers,529356851.4 (24.4%)
2,<15 cigarettes/day,253974450.0 (11.7%)
3,15–24.9 cigarettes/day,153443828.2 (7.1%)
4,≥ 25 cigarettes/day,58825513.7 (2.7%)
5,Current smokers (cigs/day missing),3121633.4 (0.1%)


#### Align variables (create standardized columns)

In [139]:
import pandas as pd
import numpy as np

def align_smoking_vars(df):
    """
    - For SDDSRVYR in {12, 66} (post-2018):
        * SMK_STATUS_STD: map 1/2/3 (or "1"/"2"/"3") → CURRENT/FORMER/NEVER
        * CIGS_PER_DAY_CAT: bin numeric CIGS_PER_DAY (or SMK_AVG) into 1/2/3 using:
              1: <15,  2: 15–24.9,  3: ≥25
    - For 1999–2018 (SDDSRVYR 1..10):
        * SMK_STATUS_STD: keep text if already one of {NEVER, FORMER, CURRENT}
                          (if coded 1/2/3 as digits/strings, map them as well—harmless)
        * CIGS_PER_DAY_CAT: keep if already {1,2,3}; else derive from numeric if present.
    Produces:
        - df["SMK_STATUS_STD"]  (string; NEVER/FORMER/CURRENT or <NA>)
        - df["CIGS_PER_DAY_CAT"] (Int64; 1/2/3 or <NA>)
    """
    d = df.copy()

    # helpers
    norm = lambda s: s.astype("string").str.strip().str.upper().replace({"<NA>": pd.NA})
    num  = lambda s: pd.to_numeric(s, errors="coerce")

    # cycle flags
    sdd = d.get("SDDSRVYR")
    if sdd is None:
        raise KeyError("SDDSRVYR is required to align by cycle.")

    is_9918 = sdd.isin(range(1, 11))   # 1999–2018
    is_2019p = sdd.isin([12, 66])      # 2019–20 and 2021–22 style

    # ----- SMK_STATUS_STD (text) -----
    st_txt = norm(d.get("SMK_STATUS"))
    st_num = num(d.get("SMK_STATUS"))

    # start with text values if already proper
    smk_std = st_txt.where(st_txt.isin(["NEVER","FORMER","CURRENT"]))

    # map numeric and digit-strings 1/2/3 everywhere (harmless for early cycles)
    smk_std = smk_std.fillna(st_num.map({1:"CURRENT", 2:"FORMER", 3:"NEVER"}))
    smk_std = smk_std.fillna(st_txt.map({"1":"CURRENT", "2":"FORMER", "3":"NEVER"}))

    # OPTIONAL fallback via SMK + PACK_YEARS if you want to fill a few stragglers
    smk_txt = norm(d.get("SMK"))
    smk_bin = num(d.get("SMK"))
    pack    = num(d.get("PACK_YEARS"))
    smk_std = smk_std.fillna(smk_txt.where(smk_txt.isin(["NEVER","FORMER","CURRENT"])))
    need = smk_std.isna() & smk_bin.notna()
    smk_std = smk_std.mask(need & (smk_bin == 1), "CURRENT")
    smk_std = smk_std.mask(need & (smk_bin == 0) & (pack > 0), "FORMER")
    smk_std = smk_std.mask(need & (smk_bin == 0) & ~(pack > 0), "NEVER")

    d["SMK_STATUS_STD"] = smk_std

    # ----- CIGS_PER_DAY_CAT (1/2/3 categories) -----
    # best numeric source
    cigs_num = num(d.get("CIGS_PER_DAY")).fillna(num(d.get("SMK_AVG")))

    # if an early-cycle row already has categorical 1/2/3, keep it
    cigs_cat_existing = num(d.get("CIGS_PER_DAY"))
    keep_existing_cat = cigs_cat_existing.isin([1,2,3])

    # derive categories for numeric values (esp. post-2018)
    cat_from_numeric = pd.Series(pd.NA, index=d.index, dtype="Int64")
    cat_from_numeric = cat_from_numeric.mask(cigs_num.notna() & (cigs_num < 15), 1)
    cat_from_numeric = cat_from_numeric.mask(cigs_num.notna() & (cigs_num >= 15) & (cigs_num < 25), 2)
    cat_from_numeric = cat_from_numeric.mask(cigs_num.notna() & (cigs_num >= 25), 3)

    # combine: prefer existing 1/2/3 when clearly categorical; else use derived
    cigs_cat = pd.Series(pd.NA, index=d.index, dtype="Int64")
    cigs_cat = cigs_cat.where(~keep_existing_cat, cigs_cat_existing.astype("Int64"))
    cigs_cat = cigs_cat.fillna(cat_from_numeric)

    d["CIGS_PER_DAY_CAT"] = cigs_cat

    return d


#### Build the table using the standardized columns for all cycles

In [140]:
def smoking_table_all(df, weight_col=None, include_missing_row=True):
    if "SMK_STATUS_STD" not in df.columns or "CIGS_PER_DAY_CAT" not in df.columns:
        raise KeyError("Run align_smoking_vars(df) first to create SMK_STATUS_STD and CIGS_PER_DAY_CAT.")

    status = df["SMK_STATUS_STD"]
    denom  = status.isin(["NEVER","FORMER","CURRENT"])

    # weights
    if weight_col and (weight_col in df.columns):
        w = df[weight_col].astype(float)
        weighted = True
    else:
        w = pd.Series(1.0, index=df.index)
        weighted = False
    W = float(w[denom].sum()) or 1.0
    fmt = lambda n: f"{int(n) if not weighted else round(n,1)} ({round(n/W*100,1)}%)"

    # masks
    nonsmokers = denom & (status == "NEVER")
    former     = denom & (status == "FORMER")
    current    = (status == "CURRENT")

    cat = df["CIGS_PER_DAY_CAT"]
    lt15   = denom & current & (cat == 1)
    m15_25 = denom & current & (cat == 2)
    ge25   = denom & current & (cat == 3)
    cur_m  = denom & current & cat.isna()

    rows = [
        ("Nonsmokers",             fmt(float(w[nonsmokers].sum()))),
        ("Former smokers",         fmt(float(w[former].sum()))),
        ("<15 cigarettes/day",     fmt(float(w[lt15].sum()))),
        ("15–24.9 cigarettes/day", fmt(float(w[m15_25].sum()))),
        ("≥ 25 cigarettes/day",    fmt(float(w[ge25].sum()))),
    ]
    if include_missing_row and cur_m.any():
        rows.append(("Current smokers (cigs/day missing)", fmt(float(w[cur_m].sum()))))

    return pd.DataFrame(rows, columns=["Smoking status", "n (%)"])


#### Run it

In [141]:
# 1) standardize
df_std = align_smoking_vars(df)

# 2) unweighted (all cycles)
from IPython.display import display
display(smoking_table_all(df_std))

# 3) weighted (if you want survey-weighted shares)
if "WTINT2YR" in df_std.columns:
    display(smoking_table_all(df_std, weight_col="WTINT2YR"))


Unnamed: 0,Smoking status,n (%)
0,Nonsmokers,40662 (55.8%)
1,Former smokers,17856 (24.5%)
2,<15 cigarettes/day,8490 (11.7%)
3,15–24.9 cigarettes/day,4198 (5.8%)
4,≥ 25 cigarettes/day,1478 (2.0%)
5,Current smokers (cigs/day missing),139 (0.2%)


Unnamed: 0,Smoking status,n (%)
0,Nonsmokers,1325190532.0 (54.8%)
1,Former smokers,587226559.3 (24.3%)
2,<15 cigarettes/day,273391549.1 (11.3%)
3,15–24.9 cigarettes/day,165271479.4 (6.8%)
4,≥ 25 cigarettes/day,63187298.7 (2.6%)
5,Current smokers (cigs/day missing),3638111.3 (0.2%)


In [142]:
# Check that post-2018 rows got categories
print("Post-2018 CIGS_PER_DAY_CAT value counts:")
print(df_std.loc[df_std["SDDSRVYR"].isin([12,66]), "CIGS_PER_DAY_CAT"].value_counts(dropna=False))

# Confirm CURRENT recognition across all cycles after standardization
print("\nCURRENT total (all cycles):", int((df_std["SMK_STATUS_STD"]=="CURRENT").sum()))


Post-2018 CIGS_PER_DAY_CAT value counts:
CIGS_PER_DAY_CAT
<NA>    24621
1        1579
2         938
3         355
Name: count, dtype: Int64

CURRENT total (all cycles): 14305


#### add new standardize var back

In [143]:
# 1) run the function on THIS dataframe
df_my_cov_aligned_short = align_smoking_vars(df_my_cov_aligned_short)

# 2) verify columns exist
print({"has_SMK_STATUS_STD": "SMK_STATUS_STD" in df_my_cov_aligned_short.columns,
       "has_CIGS_PER_DAY_CAT": "CIGS_PER_DAY_CAT" in df_my_cov_aligned_short.columns})

# 3) peek
df_my_cov_aligned_short[["SMK_STATUS_STD","CIGS_PER_DAY_CAT","CIGS_PER_DAY"]].tail(25)


{'has_SMK_STATUS_STD': True, 'has_CIGS_PER_DAY_CAT': True}


Unnamed: 0,SMK_STATUS_STD,CIGS_PER_DAY_CAT,CIGS_PER_DAY
128784,CURRENT,2.0,15.0
128785,NEVER,,
128786,NEVER,,
128787,NEVER,,
128788,CURRENT,1.0,10.0
128789,,,
128790,,,
128791,NEVER,,
128792,NEVER,,
128793,NEVER,,


## drop intermediate/unrelated column 

In [144]:
# Drop in place, ignore if a column isn't present
df_my_cov_aligned_short.drop(columns=["SMK", "SMK_AVG", "PACK_YEARS"], errors="ignore", inplace=True)

# (Optional) verify
[c for c in ["SMK","SMK_AVG","PACK_YEARS"] if c in df_my_cov_aligned_short.columns]


[]

## Fix BMI  

In [145]:
df = df_my_cov_aligned_short

# use every column except the grouper
# ignore PACK_YEARS, CIGS_PER_DAY, probable_depression and etc as missing naturally 
# ignore dulicate safe saved old column name

exclude = {"SDDSRVYR", "CIGS_PER_DAY_CAT","CIGS_PER_DAY","PACK_YEARS","probable_depression","wt_phlebotomy", "WTSAFPRP",
           "WTINT2YR", "WTMEC2YR", "WTPH2YR", "WTSAF2YR", "WTMEC4YR", "WTINTPRP", "WTMECPRP", "WTINT4YR",
           "SNAP", "SNAP_src", "SNAP_bin", "SNAP_src_rank", "bmi", "RIAGENDR",
           "SNAP_indiv_only","FS", "ahei_total", "HOQ065", "marriage_label", "marriage_prev",
           "METSCORE_fromPAQ","perE_alco","LTPA_fromPAQ","SNAP_indiv_plus_singleton", 
           "SMK_AVG", "BMI_CLAS", "household_size", "DMDHHSIZ", "chol_rx", "chol_rx_structural_missing"}
cols_all = [c for c in df.columns if c not in exclude]

# % missing by cycle (split into two lines)
is_na = df[cols_all].isna()
pct_miss = is_na.groupby(df["SDDSRVYR"]).mean().mul(100)

# keep only columns that exceed 80% missing in ANY cycle
pct_miss_gt80 = pct_miss.loc[:, (pct_miss > 80).any(axis=0)].round(1)

print(pct_miss_gt80)


Empty DataFrame
Columns: []
Index: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 12.0, 66.0]


In [146]:
df_my_cov_aligned_short[["bmi_cat", "bmi"]]

Unnamed: 0,bmi_cat,bmi
0,UNDER,
1,NORMAL,
2,UNDER,
3,,
4,OVER,
...,...,...
128804,UNDER,15.4
128805,,
128806,OVER,26.4
128807,OVER,25.5


#### add raw bmi pre 2018 from lu

In [147]:
import pandas as pd
from pathlib import Path

ROOT = Path("/Users/dengshuyue/Desktop/SDOH/analysis")
LU_PATH = ROOT / "data/cov/nhanes_primary_anal_full_singleimputation_v2.csv"

# 1) read only needed columns
usecols = ["SEQN", "bmi"]
lu = pd.read_csv(LU_PATH, usecols=usecols)

# 2) basic cleaning
lu["SEQN"] = pd.to_numeric(lu["SEQN"], errors="coerce").astype("Int64")
lu["bmi"]  = pd.to_numeric(lu["bmi"], errors="coerce")

# 3) keep one row per SEQN (if duplicates exist)
lu = lu.dropna(subset=["SEQN"]).drop_duplicates(subset=["SEQN"], keep="last")

# 4) rename to avoid touching your existing 'bmi'
lu = lu.rename(columns={"bmi": "bmi_lu"})

# 5) merge WITHOUT overwriting your current bmi
df_my_cov_aligned_short["SEQN"] = pd.to_numeric(df_my_cov_aligned_short["SEQN"], errors="coerce").astype("Int64")
df_my_cov_aligned_short = df_my_cov_aligned_short.merge(lu, on="SEQN", how="left")

# (optional) if you only want to fill gaps in your current bmi:
# df_my_cov_aligned_short["bmi"] = df_my_cov_aligned_short["bmi"].fillna(df_my_cov_aligned_short["bmi_lu"])


In [148]:
df_my_cov_aligned_short[["bmi_cat", "bmi", "bmi_lu"]]

Unnamed: 0,bmi_cat,bmi,bmi_lu
0,UNDER,,14.90
1,NORMAL,,24.90
2,UNDER,,17.63
3,,,15.20
4,OVER,,29.10
...,...,...,...
128804,UNDER,15.4,
128805,,,
128806,OVER,26.4,
128807,OVER,25.5,


#### re define bmi category 

In [149]:
import pandas as pd
import numpy as np

def define_bmi_cat(df, bmi_col="bmi", bmi_lu_col="bmi_lu"):
    # prefer own bmi, fall back to bmi_lu
    bmi_combined = df[bmi_col].combine_first(df[bmi_lu_col])
    bmi_combined = pd.to_numeric(bmi_combined, errors="coerce")

    cat = pd.Series(pd.NA, index=df.index, dtype="string")
    cat = cat.mask(bmi_combined.notna() & (bmi_combined < 18.5), "UNDER")
    cat = cat.mask(bmi_combined.notna() & (bmi_combined >= 18.5) & (bmi_combined < 25), "NORMAL")
    cat = cat.mask(bmi_combined.notna() & (bmi_combined >= 25), "OVER")

    return cat

# apply
df_my_cov_aligned_short["bmi_cat_new"] = define_bmi_cat(df_my_cov_aligned_short)

# quick check
df_my_cov_aligned_short[["bmi","bmi_lu","bmi_cat_new"]].head(10)


Unnamed: 0,bmi,bmi_lu,bmi_cat_new
0,,14.9,UNDER
1,,24.9,NORMAL
2,,17.63,UNDER
3,,15.2,UNDER
4,,29.1,OVER
5,,22.56,NORMAL
6,,29.39,OVER
7,,15.51,UNDER
8,,18.48,UNDER
9,,30.94,OVER


In [150]:
import pandas as pd
import numpy as np

# choose your dataframe
df = df_my_cov_aligned_short  # update if named differently

# 1) filter to age > 20 (RIDAGEYR is NHANES age in years)
df_age = df[pd.to_numeric(df["RIDAGEYR"], errors="coerce") > 20].copy()

# 2) category counts & percentages for bmi_cat_new
cat_counts = df_age["bmi_cat_new"].value_counts(dropna=False)
cat_perc = (cat_counts / cat_counts.sum() * 100).round(1)
cat_table = pd.DataFrame({"n": cat_counts, "%": cat_perc})
print("BMI category counts/% (age > 20):")
print(cat_table)

# 3) numeric BMI summary using combined BMI (prefer 'bmi', fallback to 'bmi_lu')
bmi_combined = (
    pd.to_numeric(df_age["bmi"], errors="coerce")
      .combine_first(pd.to_numeric(df_age["bmi_lu"], errors="coerce"))
)

num_table = bmi_combined.describe(percentiles=[0.25, 0.5, 0.75]).round(2)
print("\nCombined BMI numeric summary (age > 20):")
print(num_table)


BMI category counts/% (age > 20):
                 n     %
bmi_cat_new             
OVER         48307  68.1
NORMAL       18760  26.4
<NA>          2665   3.8
UNDER         1224   1.7

Combined BMI numeric summary (age > 20):
count    68291.00
mean        29.09
std          6.96
min         11.10
25%         24.30
50%         27.98
75%         32.50
max        130.21
Name: bmi, dtype: float64


In [151]:
#### drop old bmi col

In [152]:
cols_to_drop = ["bmi_cat_old", "BMI_CLAS", "bmi_cat"]
df_my_cov_aligned_short.drop(columns=cols_to_drop, errors="ignore", inplace=True)

In [153]:
df_my_cov_aligned_short[["bmi","bmi_lu","bmi_cat_new"]].head(10)

Unnamed: 0,bmi,bmi_lu,bmi_cat_new
0,,14.9,UNDER
1,,24.9,NORMAL
2,,17.63,UNDER
3,,15.2,UNDER
4,,29.1,OVER
5,,22.56,NORMAL
6,,29.39,OVER
7,,15.51,UNDER
8,,18.48,UNDER
9,,30.94,OVER


In [154]:
df_my_cov_aligned_short.columns

Index(['SEQN', 'SDDSRVYR', 'sdmvpsu', 'sdmvstra', 'RIDAGEYR', 'SEX', 'RACE',
       're', 'household_size', 'DMDHHSIZ', 'EDU', 'pir', 'met_hr',
       'SMK_STATUS', 'CIGS_PER_DAY', 'FORMER_SMOKER', 'METSCORE', 'LTPA',
       'DIABE', 'HYPERTEN', 'chol_rx', 'CVD', 'cancer', 'probable_depression',
       'ahei_total', 'unemployment2', 'ins', 'HOQ065', 'marriage', 'SNAP',
       'FS', 'WTINT2YR', 'WTMEC2YR', 'WTSAF2YR', 'WTINT4YR', 'WTMEC4YR',
       'WTPH2YR', 'WTINTPRP', 'WTMECPRP', 'WTSAFPRP', 'wt_int', 'wt_mec',
       'wt_fasting', 'wt_phlebotomy', 'marriage_prev', 'marriage_label',
       'marriage3', 'SNAP_src', 'SNAP_bin', 'SNAP_src_rank', 'SNAP_indiv_only',
       'SNAP_indiv_plus_singleton', 'bmi', 'HOQ065_structural_missing',
       'household_size_structural_missing', 'chol_rx_structural_missing',
       'RIAGENDR', 'drinking', 'alcg2', 'perE_alco', 'METSCORE_fromPAQ',
       'LTPA_fromPAQ', 'met_hr_recalc_from', 'chol_rx200', 'SMK_STATUS_STD',
       'CIGS_PER_DAY_CAT', 'bmi_

## drop other unrelated col

In [155]:
drop_cols = [
    "chol_rx_structural_missing",
    "HOQ065_structural_missing",
    "household_size_structural_missing",
    #"METSCORE_fromPAQ",
    #"LTPA_fromPAQ",
    "met_hr_recalc_from"
]

df_my_cov_aligned_short.drop(columns=drop_cols, errors="ignore", inplace=True)


In [159]:
df_my_cov_aligned_short.shape

(128809, 64)

## Save 

In [160]:
from pathlib import Path

out_path = Path("/Users/dengshuyue/Desktop/SDOH/analysis/output/cov_addv6_99_23.parquet")
out_path.parent.mkdir(parents=True, exist_ok=True)

# Save with snappy compression (good default); requires pyarrow or fastparquet installed
df_my_cov_aligned_short.to_parquet(out_path, index=False, compression="snappy")
print(f"Saved: {out_path}")


Saved: /Users/dengshuyue/Desktop/SDOH/analysis/output/cov_addv6_99_23.parquet
