SOCIOECONOMIC HEALTH DISPARITY

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# NHANES source files, keyed by the table label used in this script.
# The "BPX_L" key is kept unchanged for backward compatibility, but note that
# the file it points at is the oscillometric blood-pressure table (BPXO_L).
files = {
    "DEMO_L": "DEMO_L.XPT",    # Demographics
    "BMX_L": "BMX_L.XPT",      # Body Measures
    "BPX_L": "BPXO_L.XPT",      # Blood Pressure
    "DIQ_L": "DIQ_L.XPT",      # Diabetes
    "MCQ_L": "MCQ_L.XPT",      # Cardiovascular Disease
    "GLU_L": "GLU_L.XPT",      # Glucose
    "INS_L": "INS_L.XPT",      # Insulin
    "HSCRP_L": "HSCRP_L.XPT",  # High-Sensitivity C-Reactive Protein
    "TCHOL_L": "TCHOL_L.XPT",  # Total Cholesterol
    "PAQ_L": "PAQ_L.XPT"       # Physical Activity
}

# Columns carried into the final dataset.
# The blood-pressure names are the oscillometric variables published in
# BPXO_L (BPXOSY1 / BPXODI1); the auscultatory names BPXSY1 / BPXDI1 do not
# exist in that file and were previously dropped without any notice.
selected_columns = [
    "SEQN", "RIDAGEYR", "RIAGENDR", "RIDRETH3", "DMDEDUC2", "INDFMPIR",  # Socioeconomic & Demographics
    "BMXBMI", "BMXWAIST",  # Body Measures
    "BPXOSY1", "BPXODI1",  # Blood Pressure (oscillometric, 1st reading)
    "DIQ010", "MCQ160E",  # Diabetes & CVD
    "LBXGLU", "LBXINS", "LBXHSCRP", "LBXTC",  # Lab results
    "PAD790Q", "PAD790U", "PAD800", "PAD810Q", "PAD810U", "PAD820"  # Physical Activity
]

# Continuous measurements that are standardised for clustering.
numeric_features = [
    "BMXBMI", "BMXWAIST", "BPXOSY1", "BPXODI1",
    "LBXGLU", "LBXINS", "LBXHSCRP", "LBXTC",
]


def _decode_cell(value):
    """Return *value* as stripped text if it is bytes/str, else unchanged.

    Blank text is mapped to NaN so that it is treated as missing.
    """
    if isinstance(value, bytes):
        value = value.decode("utf-8", errors="replace")
    if isinstance(value, str):
        value = value.strip()
        return value if value else float("nan")
    return value


def coerce_and_impute(frame):
    """Return a cleaned copy of *frame*; the input is never modified.

    For each column:

    * Values are coerced to numbers (unparseable entries become NaN) and the
      remaining gaps are filled with that column's median.
    * A column that holds data but contains no parseable number at all is
      treated as a text/code column: bytes are decoded to ``str``, blanks
      become missing, and no median is applied.  Without this, such a column
      would be wiped out entirely by the numeric coercion.

    Args:
        frame: DataFrame to clean.

    Returns:
        A new DataFrame with the same columns in the same order.
    """
    cleaned = frame.copy()
    for col in cleaned.columns:
        series = cleaned[col]
        if not pd.api.types.is_numeric_dtype(series):
            series = series.map(_decode_cell)
        numeric = pd.to_numeric(series, errors="coerce")
        if series.notna().any() and not numeric.notna().any():
            # Pure text column (e.g. unit codes): keep the decoded text.
            cleaned[col] = series
            continue
        if numeric.notna().any():
            numeric = numeric.fillna(numeric.median())
        cleaned[col] = numeric
    return cleaned


# The guard is true both in a notebook cell and when run as a script, so the
# globals below (dataframes, df, numeric_features, scaler) are still created;
# it only keeps the file I/O from running when the helpers are imported.
if __name__ == "__main__":
    # Load NHANES datasets
    dataframes = {
        name: pd.read_sas(filename, format="xport")
        for name, filename in files.items()
    }

    # Merge datasets on 'SEQN'
    # NOTE(review): inner joins keep only participants present in every
    # table - confirm that restricting the sample this way is intended.
    df = dataframes["DEMO_L"]
    for name, df_other in dataframes.items():
        if name != "DEMO_L":
            df = df.merge(df_other, on="SEQN", how="inner")

    # Select relevant columns, reporting (instead of silently dropping) any
    # requested column that is absent from the merged data.
    missing_columns = [col for col in selected_columns if col not in df.columns]
    if missing_columns:
        print(f"Warning: requested columns not found and skipped: {missing_columns}")
    df = df[[col for col in selected_columns if col in df.columns]]

    # Coerce to numeric and fill NaNs with column medians (text code columns
    # are decoded and kept).  Working on a copy avoids chained-assignment
    # problems on the column subset taken above.
    # NOTE(review): survey codes such as "Refused" / "Don't know" are treated
    # as ordinary values here - confirm whether they should be set to missing
    # before imputation.
    df = coerce_and_impute(df)

    # Normalize numerical features for clustering
    numeric_features = [col for col in numeric_features if col in df.columns]
    scaler = StandardScaler()
    if numeric_features:  # fit_transform fails on an empty column selection
        df[numeric_features] = scaler.fit_transform(df[numeric_features])

    # Save final dataset
    df.to_csv("NHANES_SES_HealthClusters.csv", index=False)
    print("Final dataset saved as 'NHANES_SES_HealthClusters.csv'")


Final dataset saved as 'NHANES_SES_HealthClusters.csv'
