<a href="https://colab.research.google.com/github/abzzy001/Machine-Learning-Model-for-Pediatric-Type-2-Diabetes-Risk-Prediction/blob/main/01_NHANES_Fairness_PedsT2D_v2_DIQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Equitable AI for Early Detection (NHANES)
## Notebook 1 — Data ingestion, harmonization, and analytic cohort (v2 with DIQ)

**Paper topic:** *Equitable AI for Early Detection: A Fairness-Aware Machine Learning Model for Pediatric Type 2 Diabetes Risk Prediction in Underserved US Populations Using Multi-cycle NHANES Data*

This notebook:
- Loads NHANES `.xpt` files for cycles **H, I, J**
- Merges modules on `SEQN`
- Builds pediatric cohort (10–19)
- Creates lab outcomes (HbA1c, fasting glucose)
- Adds DIQ-derived outcomes (diagnosis, treatment, T2D-leaning phenotype)
- Creates underserved indicators
- Creates 6-year weights
- Exports analytic CSVs for Notebook 2

> **Colab tip:** Upload your `.xpt` files into the Colab session (Files panel), or mount Drive and set `DATA_DIR` accordingly.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


from pathlib import Path

DATA_DIR = Path("/content/drive/MyDrive/nhanes_data")
DATA_DIR
sorted([p.name for p in DATA_DIR.glob("*.xpt")])

import os
from pathlib import Path
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

DATA_DIR = Path("/content/drive/MyDrive/nhanes_data")
CYCLES = ["H", "I", "J"]

FILES = {
    "DEMO": "DEMO_{c}.xpt",
    "BMX":  "BMX_{c}.xpt",
    "BPX":  "BPX_{c}.xpt",
    "DBQ":  "DBQ_{c}.xpt",
    "GHB":  "GHB_{c}.xpt",
    "GLU":  "GLU_{c}.xpt",
    "PAQ":  "PAQ_{c}.xpt",
    "DIQ":  "DIQ_{c}.xpt",
}

def read_xpt(path: Path) -> pd.DataFrame:
    return pd.read_sas(path, format="xport")

def load_cycle(cycle: str) -> dict:
    out = {}
    for mod, patt in FILES.items():
        fp = DATA_DIR / patt.format(c=cycle)
        if not fp.exists():
            raise FileNotFoundError(f"Missing {fp}")
        df = read_xpt(fp)
        df.columns = [str(c).upper() for c in df.columns]
        out[mod] = df
    return out

# File check (NO assertion crash)
missing = []
for c in CYCLES:
    for mod, patt in FILES.items():
        fp = DATA_DIR / patt.format(c=c)
        if not fp.exists():
            missing.append(str(fp))

if missing:
    print("❌ Missing files:")
    for m in missing:
        print(" -", m)
else:
    print("✅ All expected NHANES .xpt files found")


✅ All expected NHANES .xpt files found


In [None]:
import os
from pathlib import Path

# Check if Google Drive is mounted
if os.path.exists('/content/drive'):
    print('Google Drive is mounted successfully.')
else:
    print('Google Drive is NOT mounted. Please run the drive.mount() cell (cell NOFaJE4H51Gn) first.')

# Define DATA_DIR as it is in the notebook
DATA_DIR = Path("/content/drive/My Drive/nhanes_data")

# Check if the DATA_DIR exists
if DATA_DIR.exists():
    print(f'DATA_DIR ({DATA_DIR}) exists. Listing contents:')
    for item in os.listdir(DATA_DIR):
        print(f' - {item}')
else:
    print(f'DATA_DIR ({DATA_DIR}) does NOT exist. Please create this folder in your Google Drive or update DATA_DIR to the correct path.')

# Check for a specific file, e.g., DEMO_H.xpt
demo_h_path = DATA_DIR / "DEMO_H.xpt"
if demo_h_path.exists():
    print(f'DEMO_H.xpt found at {demo_h_path}')
else:
    print(f'DEMO_H.xpt NOT found at {demo_h_path}. Make sure your NHANES .xpt files are in this directory.')

Google Drive is mounted successfully.
DATA_DIR (/content/drive/My Drive/nhanes_data) exists. Listing contents:
 - DEMO_J.xpt
 - BPX_J.xpt
 - BMX_J.xpt
 - PAQ_J.xpt
 - DBQ_J.xpt
 - GHB_J.xpt
 - GLU_J.xpt
 - DEMO_I.xpt
 - BPX_I.xpt
 - BMX_I.xpt
 - DBQ_I.xpt
 - PAQ_I.xpt
 - GHB_I.xpt
 - GLU_I.xpt
 - DEMO_H.xpt
 - BMX_H.xpt
 - BPX_H.xpt
 - DBQ_H.xpt
 - PAQ_H.xpt
 - GLU_H.xpt
 - GHB_H.xpt
 - DIQ_J.xpt
 - DIQ_I.xpt
 - DIQ_H.xpt
 - nhanes_peds_hba1c_analytic_v2.csv
 - nhanes_peds_combo_analytic_v2.csv
DEMO_H.xpt found at /content/drive/My Drive/nhanes_data/DEMO_H.xpt


In [None]:
# --- Load each cycle and merge modules on SEQN ---
cycles = []
for c in CYCLES:
    d = load_cycle(c)

    demo = d["DEMO"].copy()
    demo["NHANES_CYCLE"] = c

    df = demo
    for mod in ["BMX","BPX","DBQ","GHB","GLU","PAQ","DIQ"]:
        df = df.merge(d[mod], on="SEQN", how="left", suffixes=("", f"_{mod}"))

    cycles.append(df)

raw = pd.concat(cycles, ignore_index=True)
print("Merged shape:", raw.shape)
raw.head()


Merged shape: (29400, 309)


Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,DMQMILIZ,DMQADFC,DMDBORN4,DMDCITZN,DMDYRSUS,DMDEDUC3,DMDEDUC2,DMDMARTL,RIDEXPRG,SIALANG,SIAPROXY,SIAINTRP,FIALANG,FIAPROXY,FIAINTRP,MIALANG,MIAPROXY,MIAINTRP,AIALANGA,DMDHHSIZ,DMDFMSIZ,DMDHHSZA,DMDHHSZB,DMDHHSZE,DMDHRGND,DMDHRAGE,DMDHRBR4,DMDHREDU,DMDHRMAR,DMDHSEDU,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDHHIN2,INDFMIN2,INDFMPIR,NHANES_CYCLE,BMDSTATS,BMXWT,BMIWT,BMXRECUM,BMIRECUM,BMXHEAD,BMIHEAD,BMXHT,BMIHT,BMXBMI,BMDBMIC,BMXLEG,BMILEG,BMXARML,BMIARML,BMXARMC,BMIARMC,BMXWAIST,BMIWAIST,BMXSAD1,BMXSAD2,BMXSAD3,BMXSAD4,BMDAVSAD,BMDSADCM,PEASCST1,PEASCTM1,PEASCCT1,BPXCHR,BPAARM,BPACSZ,BPXPLS,BPXPULS,BPXPTY,BPXML1,BPXSY1,BPXDI1,BPAEN1,BPXSY2,BPXDI2,BPAEN2,BPXSY3,BPXDI3,BPAEN3,BPXSY4,BPXDI4,BPAEN4,DBQ010,DBD030,DBD041,DBD050,DBD055,...,PAQ742,PAQ744,PAQ746,PAQ748,PAQ755,PAQ759A,PAQ759B,PAQ759C,PAQ759D,PAQ759E,PAQ759F,PAQ759G,PAQ759H,PAQ759I,PAQ759J,PAQ759K,PAQ759L,PAQ759M,PAQ759N,PAQ759O,PAQ759P,PAQ759Q,PAQ759R,PAQ759S,PAQ759T,PAQ759U,PAQ759V,PAQ762,PAQ764,PAQ766,PAQ679,PAQ750,PAQ770,PAQ772A,PAQ772B,PAQ772C,PAAQUEX,DIQ010,DID040,DIQ160,DIQ170,DIQ172,DIQ175A,DIQ175B,DIQ175C,DIQ175D,DIQ175E,DIQ175F,DIQ175G,DIQ175H,DIQ175I,DIQ175J,DIQ175K,DIQ175L,DIQ175M,DIQ175N,DIQ175O,DIQ175P,DIQ175Q,DIQ175R,DIQ175S,DIQ175T,DIQ175U,DIQ175V,DIQ175W,DIQ175X,DIQ180,DIQ050,DID060,DIQ060U,DIQ070,DIQ230,DIQ240,DID250,DID260,DIQ260U,DIQ275,DIQ280,DIQ291,DIQ300S,DIQ300D,DID310S,DID310D,DID320,DID330,DID341,DID350,DIQ350U,DIQ360,DIQ080,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,BMXHIP,BMIHIP,DBQ930,DBQ935,DBQ940,DBQ945
0,73557.0,8.0,2.0,1.0,69.0,,4.0,4.0,1.0,,1.0,1.0,1.0,1.0,,,3.0,4.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,3.0,3.0,5.397605e-79,5.397605e-79,2.0,1.0,69.0,1.0,3.0,4.0,,13281.237386,13481.042095,1.0,112.0,4.0,4.0,0.84,H,1.0,78.3,,,,,,171.3,,26.7,,39.2,,40.2,,35.3,,100.0,,20.5,20.6,,,20.6,,1.0,620.0,,,1.0,4.0,86.0,1.0,1.0,140.0,122.0,72.0,2.0,114.0,76.0,2.0,102.0,74.0,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,62.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,2.0,1.0,1.0,1.0,2.0,5.397605e-79,,1.0,12.0,99.0,9999.0,9999.0,6666.0,6666.0,5555.0,,2.0,5.397605e-79,,4.0,2.0,,,,,,,,,,
1,73558.0,8.0,2.0,1.0,54.0,,3.0,3.0,1.0,,2.0,,1.0,1.0,,,3.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,4.0,4.0,5.397605e-79,2.0,5.397605e-79,1.0,54.0,1.0,3.0,1.0,1.0,23682.057386,24471.769625,1.0,108.0,7.0,7.0,1.78,H,1.0,89.5,,,,,,176.8,,28.6,,40.0,,41.5,,34.7,,107.6,,24.2,24.5,,,24.4,,1.0,766.0,,,1.0,4.0,74.0,1.0,1.0,170.0,156.0,62.0,2.0,160.0,80.0,2.0,156.0,42.0,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,23.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,2.0,2.0,5.0,1.0,6.0,1.0,1.0,9.0,,,128.0,82.0,9999.0,9999.0,147.0,147.0,6.0,1.0,1.0,1.0,1.0,,,,,,,,,,
2,73559.0,8.0,2.0,1.0,72.0,,3.0,3.0,2.0,,1.0,1.0,1.0,1.0,,,4.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,,2.0,2.0,5.397605e-79,5.397605e-79,2.0,1.0,72.0,1.0,4.0,1.0,3.0,57214.803319,57193.285376,1.0,109.0,10.0,10.0,4.51,H,1.0,88.9,,,,,,175.3,,28.9,,40.0,,41.0,,33.5,,109.2,,25.8,25.4,,,25.6,,1.0,665.0,,,1.0,4.0,68.0,1.0,1.0,160.0,140.0,90.0,2.0,140.0,76.0,2.0,146.0,80.0,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,57.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,16.0,2.0,2.0,2.0,2.0,,3.0,1.0,1.0,14.0,3.0,136.0,89.0,6666.0,6666.0,9999.0,6666.0,2.0,1.0,1.0,2.0,2.0,,,,,,,,,,
3,73560.0,8.0,2.0,1.0,9.0,,3.0,3.0,1.0,119.0,,,1.0,1.0,,3.0,,,,1.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,4.0,4.0,5.397605e-79,2.0,5.397605e-79,1.0,33.0,1.0,3.0,1.0,4.0,55201.178592,55766.512438,2.0,109.0,9.0,9.0,2.52,H,1.0,32.2,,,,,,137.3,,17.1,2.0,33.5,,29.5,,21.0,,61.0,,14.8,15.0,,,14.9,,1.0,803.0,,,1.0,2.0,64.0,1.0,1.0,130.0,108.0,38.0,2.0,102.0,34.0,2.0,104.0,38.0,2.0,,,,,,,,,...,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,1.0,5.0,4.0,,1.0,2.0,,,,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,73561.0,8.0,2.0,2.0,73.0,,3.0,3.0,1.0,,2.0,,1.0,1.0,,,5.0,1.0,,1.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,,2.0,2.0,5.397605e-79,5.397605e-79,2.0,1.0,78.0,1.0,5.0,1.0,5.0,63709.667069,65541.871229,2.0,116.0,15.0,15.0,5.0,H,3.0,52.0,,,,,,162.4,,19.7,,36.3,,37.5,,25.2,,,1.0,,,,,,1.0,1.0,949.0,,,1.0,3.0,92.0,1.0,1.0,170.0,136.0,86.0,2.0,134.0,88.0,1.0,142.0,86.0,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### 1) Pediatric cohort
We use ages **10–19 years** (RIDAGEYR).

In [None]:
df = raw.copy()
df = df[df["RIDAGEYR"].between(10, 19, inclusive="both")]

for c in ["RIDAGEYR","INDFMPIR","BMXBMI","BMXWT","BMXHT","LBXGH","LBXGLU","DIQ160"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

print("Pediatric cohort shape:", df.shape)
df[["RIDAGEYR","RIAGENDR","BMXBMI","LBXGH","LBXGLU"]].describe(include="all")


Pediatric cohort shape: (5201, 309)


Unnamed: 0,RIDAGEYR,RIAGENDR,BMXBMI,LBXGH,LBXGLU
count,5201.0,5201.0,4930.0,3403.0,1574.0
mean,14.193617,1.502019,23.496978,5.257537,96.033672
std,2.882241,0.500044,6.369516,0.361408,12.713409
min,10.0,1.0,12.6,4.0,63.0
25%,12.0,1.0,19.1,5.1,91.0
50%,14.0,2.0,22.0,5.3,96.0
75%,17.0,2.0,26.4,5.4,100.0
max,19.0,2.0,72.6,10.2,369.0


### 2) Blood pressure averages

In [None]:
def row_mean(cols):
    x = df[cols].apply(pd.to_numeric, errors="coerce")
    return x.mean(axis=1, skipna=True)

sys_cols = [c for c in ["BPXSY1","BPXSY2","BPXSY3","BPXSY4"] if c in df.columns]
dia_cols = [c for c in ["BPXDI1","BPXDI2","BPXDI3","BPXDI4"] if c in df.columns]

if sys_cols:
    df["SBP_MEAN"] = row_mean(sys_cols)
if dia_cols:
    df["DBP_MEAN"] = row_mean(dia_cols)

df[["SBP_MEAN","DBP_MEAN"]].describe()


Unnamed: 0,SBP_MEAN,DBP_MEAN
count,4750.0,4750.0
mean,107.660281,57.93396
std,9.852295,13.06787
min,74.0,5.397605e-79
25%,100.666667,52.0
50%,107.333333,59.33333
75%,114.0,66.66667
max,158.666667,97.0


### 3) Lab-defined outcomes

In [None]:
df["dysglycemia_hba1c"] = np.where(df["LBXGH"].notna(), (df["LBXGH"] >= 5.7).astype(int), np.nan)
df["diabetes_hba1c"]    = np.where(df["LBXGH"].notna(), (df["LBXGH"] >= 6.5).astype(int), np.nan)

df["dysglycemia_combo"] = np.where(
    df["LBXGH"].notna() | df["LBXGLU"].notna(),
    ((df["LBXGH"] >= 5.7) | (df["LBXGLU"] >= 100)).astype(int),
    np.nan
)
df["diabetes_combo"] = np.where(
    df["LBXGH"].notna() | df["LBXGLU"].notna(),
    ((df["LBXGH"] >= 6.5) | (df["LBXGLU"] >= 126)).astype(int),
    np.nan
)

df[["LBXGH","LBXGLU","dysglycemia_hba1c","dysglycemia_combo"]].head(12)


Unnamed: 0,LBXGH,LBXGLU,dysglycemia_hba1c,dysglycemia_combo
15,,,,
16,,,,
19,5.1,89.0,0.0,0.0
21,,,,
22,5.5,,0.0,0.0
27,5.4,,0.0,0.0
30,5.7,94.0,1.0,1.0
31,,,,
42,5.2,,0.0,0.0
44,5.3,,0.0,0.0


### 4) DIQ-derived outcomes (NEW)

In [None]:
df["dx_diabetes"] = np.where(df["DIQ010"].notna(), (df["DIQ010"] == 1).astype(int), np.nan)
df["on_insulin"] = np.where(df["DIQ050"].notna(), (df["DIQ050"] == 1).astype(int), np.nan)
df["on_oral_meds"] = np.where(df["DIQ070"].notna(), (df["DIQ070"] == 1).astype(int), np.nan)

df["peds_t2d_like"] = np.where(
    (
        (df["dx_diabetes"] == 1) &
        (df["RIDAGEYR"] >= 10) &
        (df["BMXBMI"] >= 25) &
        ((df["on_insulin"] == 0) | (df["on_insulin"].isna()))
    ),
    1,
    0
)

df[["DIQ010","DIQ050","DIQ070","DIQ160","dx_diabetes","on_insulin","on_oral_meds","peds_t2d_like"]].head(12)


Unnamed: 0,DIQ010,DIQ050,DIQ070,DIQ160,dx_diabetes,on_insulin,on_oral_meds,peds_t2d_like
15,2.0,2.0,,,0.0,0.0,,0
16,2.0,2.0,,,0.0,0.0,,0
19,2.0,2.0,,2.0,0.0,0.0,,0
21,2.0,2.0,,2.0,0.0,0.0,,0
22,2.0,2.0,,2.0,0.0,0.0,,0
27,2.0,2.0,,2.0,0.0,0.0,,0
30,2.0,2.0,,2.0,0.0,0.0,,0
31,2.0,2.0,,,0.0,0.0,,0
42,2.0,2.0,,2.0,0.0,0.0,,0
44,2.0,2.0,,2.0,0.0,0.0,,0


### 5) Equity / underserved indicators

In [None]:
df["sex_female"] = np.where(df["RIAGENDR"].notna(), (df["RIAGENDR"] == 2).astype(int), np.nan)
df["low_income_13"] = np.where(df["INDFMPIR"].notna(), (df["INDFMPIR"] < 1.3).astype(int), np.nan)
df["low_income_20"] = np.where(df["INDFMPIR"].notna(), (df["INDFMPIR"] < 2.0).astype(int), np.nan)

race_col = "RIDRETH3" if "RIDRETH3" in df.columns else "RIDRETH1"
df["race_eth"] = df[race_col]

df[["INDFMPIR","low_income_13","race_eth","sex_female"]].head()


Unnamed: 0,INDFMPIR,low_income_13,race_eth,sex_female
15,0.41,1.0,4.0,1.0
16,1.79,0.0,4.0,0.0
19,1.58,0.0,4.0,0.0
21,0.58,1.0,1.0,0.0
22,2.97,0.0,3.0,1.0


### 6) Multi-cycle weights

In [None]:
if "WTMEC2YR" in df.columns:
    df["WTMEC6YR"] = pd.to_numeric(df["WTMEC2YR"], errors="coerce") / len(CYCLES)
else:
    df["WTMEC6YR"] = np.nan

if "WTSAF2YR" in df.columns:
    df["WTSAF6YR"] = pd.to_numeric(df["WTSAF2YR"], errors="coerce") / len(CYCLES)
else:
    df["WTSAF6YR"] = np.nan

print("Design cols present:", [c for c in ["SDMVPSU","SDMVSTRA"] if c in df.columns])
df[["WTMEC2YR","WTMEC6YR","WTSAF2YR","WTSAF6YR"]].head()


Design cols present: ['SDMVPSU', 'SDMVSTRA']


Unnamed: 0,WTMEC2YR,WTMEC6YR,WTSAF2YR,WTSAF6YR
15,12764.396165,4254.798722,,
16,6460.787962,2153.595987,,
19,12665.77009,4221.923363,33038.13,11012.71
21,11758.631127,3919.543709,5.397605e-79,1.799202e-79
22,70708.03403,23569.344677,,


### 7) Feature selection

In [None]:
base_features = ["RIDAGEYR","sex_female","BMXBMI","SBP_MEAN","DBP_MEAN","INDFMPIR"]

candidate_prefixes = ("PAQ","DBQ")
cand = [c for c in df.columns if c.startswith(candidate_prefixes)]

numeric_cand = []
for c in cand:
    s = pd.to_numeric(df[c], errors="coerce")
    if s.notna().mean() >= 0.70:
        numeric_cand.append(c)

numeric_cand = [c for c in numeric_cand if c not in {"SEQN","NHANES_CYCLE"}]
features = [c for c in (base_features + numeric_cand) if c in df.columns]

print(f"Using {len(features)} features")
print(features[:30], "..." if len(features)>30 else "")


Using 10 features
['RIDAGEYR', 'sex_female', 'BMXBMI', 'SBP_MEAN', 'DBP_MEAN', 'INDFMPIR', 'DBQ197', 'DBQ360', 'DBQ370', 'DBQ400'] 


### 8) Build analytic datasets + export

In [None]:
model_cols = [
    "SEQN","NHANES_CYCLE",
    "race_eth","low_income_13","low_income_20","sex_female",
    "WTMEC6YR","WTSAF6YR","SDMVPSU","SDMVSTRA"
]
outcomes = [
    "LBXGH","LBXGLU",
    "dysglycemia_hba1c","dysglycemia_combo","diabetes_hba1c","diabetes_combo",
    "dx_diabetes","on_insulin","on_oral_meds","peds_t2d_like"
]

keep_cols = [c for c in (model_cols + features + outcomes) if c in df.columns]
model_df = df[keep_cols].copy()

df_hba1c = model_df.dropna(subset=["LBXGH","WTMEC6YR","race_eth","sex_female","INDFMPIR","BMXBMI","RIDAGEYR"], how="any").copy()
df_hba1c["y_lab"] = (df_hba1c["LBXGH"] >= 5.7).astype(int)
df_hba1c["y_dx"]  = df_hba1c["dx_diabetes"].fillna(0).astype(int)
df_hba1c["y_t2d_like"] = df_hba1c["peds_t2d_like"].astype(int)
df_hba1c["sample_weight"] = df_hba1c["WTMEC6YR"].astype(float)

df_combo = model_df.dropna(subset=["race_eth","sex_female","INDFMPIR","BMXBMI","RIDAGEYR"], how="any").copy()
df_combo = df_combo[df_combo["LBXGH"].notna() | df_combo["LBXGLU"].notna()].copy()

df_combo["y_lab"] = ((df_combo["LBXGH"] >= 5.7) | (df_combo["LBXGLU"] >= 100)).astype(int)
df_combo["y_dx"]  = df_combo["dx_diabetes"].fillna(0).astype(int)
df_combo["y_t2d_like"] = df_combo["peds_t2d_like"].astype(int)

if df_combo["WTSAF6YR"].notna().any():
    df_combo["sample_weight"] = df_combo["WTSAF6YR"].fillna(df_combo["WTMEC6YR"]).astype(float)
    weight_note = "WTSAF6YR available: used WTSAF6YR when present; otherwise fell back to WTMEC6YR."
else:
    df_combo["sample_weight"] = df_combo["WTMEC6YR"].astype(float)
    weight_note = "WTSAF6YR not present: used WTMEC6YR throughout (note as limitation if using fasting glucose)."

print(weight_note)
print("HbA1c dataset:", df_hba1c.shape, "y_lab prev:", df_hba1c["y_lab"].mean())
print("Combo dataset:", df_combo.shape, "y_lab prev:", df_combo["y_lab"].mean())


WTSAF6YR available: used WTSAF6YR when present; otherwise fell back to WTMEC6YR.
HbA1c dataset: (3055, 34) y_lab prev: 0.08150572831423895
Combo dataset: (3056, 34) y_lab prev: 0.19306282722513088


In [None]:
OUT_DIR = DATA_DIR
p1 = OUT_DIR / "nhanes_peds_hba1c_analytic_v2.csv"
p2 = OUT_DIR / "nhanes_peds_combo_analytic_v2.csv"

df_hba1c.to_csv(p1, index=False)
df_combo.to_csv(p2, index=False)

print("Saved:")
print(" -", p1)
print(" -", p2)


Saved:
 - /content/drive/My Drive/nhanes_data/nhanes_peds_hba1c_analytic_v2.csv
 - /content/drive/My Drive/nhanes_data/nhanes_peds_combo_analytic_v2.csv


✅ Notebook 1 v2 done.