<a href="https://colab.research.google.com/github/abzzy001/Machine-Learning-Model-for-Pediatric-Type-2-Diabetes-Risk-Prediction/blob/main/03_NHANES_Fairness_PedsT2D_v2_DIQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Equitable AI for Early Detection (NHANES)
## Notebook 3 — Publication pack (v2 with DIQ)


In [19]:
from google.colab import drive
drive.mount("/content/gdrive")



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [20]:
import pandas as pd
import numpy as np
from pathlib import Path

DATA_DIR = Path("/content/drive/MyDrive/nhanes_data/results")

print("Files in DATA_DIR:")
for f in DATA_DIR.iterdir():
    print(" -", f.name)

df = pd.read_csv(DATA_DIR / "nhanes_peds_hba1c_analytic_v2.csv")
print("Data loaded. Shape:", df.shape)
df.head()


Files in DATA_DIR:
 - fairness_summary_race_mitigation.csv
 - income_mitigation_cv5_before_after.csv
 - nhanes_peds_combo_analytic_v2.csv
 - race_mitigation_cv5_before_after.csv
 - nhanes_peds_hba1c_analytic_v2.csv
Data loaded. Shape: (3055, 34)


Unnamed: 0,SEQN,NHANES_CYCLE,race_eth,low_income_13,low_income_20,sex_female,WTMEC6YR,WTSAF6YR,SDMVPSU,SDMVSTRA,...,diabetes_hba1c,diabetes_combo,dx_diabetes,on_insulin,on_oral_meds,peds_t2d_like,y_lab,y_dx,y_t2d_like,sample_weight
0,73576.0,H,4.0,0.0,1.0,0.0,4221.923363,11012.709893,1.0,104.0,...,0.0,0.0,0.0,0.0,,0,0,0,0,4221.923363
1,73579.0,H,3.0,0.0,0.0,1.0,23569.344677,,1.0,110.0,...,0.0,0.0,0.0,0.0,,0,0,0,0,23569.344677
2,73584.0,H,3.0,0.0,0.0,0.0,24060.746515,,1.0,105.0,...,0.0,0.0,0.0,0.0,,0,0,0,0,24060.746515
3,73587.0,H,6.0,0.0,0.0,0.0,5174.364709,12355.377283,2.0,115.0,...,0.0,0.0,0.0,0.0,,0,1,0,0,5174.364709
4,73599.0,H,3.0,0.0,1.0,1.0,9096.060346,,2.0,107.0,...,0.0,0.0,0.0,0.0,,0,0,0,0,9096.060346


In [None]:
methods_template = r'''
### Data source and study design
We conducted a cross-sectional machine learning study using publicly available data from NHANES. We combined three NHANES cycles (2013–2014, 2015–2016, and 2017–2018) and followed NHANES analytic guidance for multi-cycle weighting.

### Study population
We included participants aged 10–19 years with available glycemic measurements and non-missing core covariates.

### Outcomes
Primary outcome: lab-defined dysglycemia using HbA1c (LBXGH) ≥5.7%. Sensitivity outcomes included self-reported diagnosed diabetes (DIQ010), treatment indicators (DIQ050 insulin; DIQ070 oral medications), and a conservative pediatric Type 2 diabetes–leaning phenotype derived from diagnosis, age, BMI proxy, and non-insulin treatment pattern. Because NHANES does not directly encode diabetes type, the T2D-leaning phenotype was treated as a supportive sensitivity label rather than definitive subtype classification.

### Predictors
Predictors included age, sex, race/ethnicity, BMI, blood pressure averages, and poverty-income ratio, plus selected diet/physical activity variables.

### Survey weighting
We constructed 6-year weights by dividing 2-year weights by three. Primary analyses used MEC exam weights for HbA1c; fasting weights were used for fasting glucose when available.

### Model development, fairness, and mitigation
We trained weighted models under cross-validation, evaluated performance (AUC/PR-AUC/sensitivity/specificity), assessed fairness (demographic parity and equalized odds) across race/ethnicity, sex, and low-income status, and applied equalized-odds post-processing using Fairlearn’s ThresholdOptimizer.
'''
print(methods_template)


In [21]:
target_col = "y_lab"  # or y_dx / y_t2d_like
df = df.copy()
df["y"] = df[target_col].astype(int)

def make_table1(df_in):
    rows = []
    for grp, sub in [
        ("Overall", df_in),
        ("y=0", df_in[df_in["y"]==0]),
        ("y=1", df_in[df_in["y"]==1])
    ]:
        rows.append({
            "Group": grp,
            "N": len(sub),
            "Age_mean": sub["RIDAGEYR"].mean(),
            "BMI_mean": sub["BMXBMI"].mean(),
            "PIR_mean": sub["INDFMPIR"].mean(),
            "SBP_mean": sub["SBP_MEAN"].mean() if "SBP_MEAN" in sub.columns else np.nan,
            "DBP_mean": sub["DBP_MEAN"].mean() if "DBP_MEAN" in sub.columns else np.nan,
            "Low_income_<1.3_%": 100*sub["low_income_13"].mean() if "low_income_13" in sub.columns else np.nan,
            "Dx_diabetes_%": 100*sub["y_dx"].mean() if "y_dx" in sub.columns else np.nan,
            "Insulin_use_%": 100*sub["on_insulin"].mean() if "on_insulin" in sub.columns else np.nan,
            "Oral_meds_%": 100*sub["on_oral_meds"].mean() if "on_oral_meds" in sub.columns else np.nan,
        })
    return pd.DataFrame(rows)

t1 = make_table1(df)
t1


Unnamed: 0,Group,N,Age_mean,BMI_mean,PIR_mean,SBP_mean,DBP_mean,Low_income_<1.3_%,Dx_diabetes_%,Insulin_use_%,Oral_meds_%
0,Overall,3055,15.386907,24.467005,2.037394,108.947888,59.43574,41.963993,0.523732,0.261866,5.434783
1,y=0,2806,15.401996,24.151247,2.064501,108.680391,59.483185,41.197434,0.249465,0.035638,2.898551
2,y=1,249,15.216867,28.025301,1.731928,111.952778,58.902778,50.60241,3.614458,2.811245,13.043478


In [22]:
OUT_DIR = DATA_DIR
t1_path = OUT_DIR / "table1_cohort_characteristics_v2.csv"
t1.to_csv(t1_path, index=False)
print("Saved:", t1_path)


Saved: /content/drive/MyDrive/nhanes_data/results/table1_cohort_characteristics_v2.csv
