In [17]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# NHANES component files, keyed by the name used for each table in `dataframes`.
files = {
    "DEMO_L": "DEMO_L.XPT",     # Demographics
    "HOQ_L": "HOQ_L.XPT",       # Housing
    "INQ_L": "INQ_L.XPT",       # Total Savings
    "BMX_L": "BMX_L.XPT",       # BMI
    "BPXO_L": "BPXO_L.XPT",     # Blood pressure
    "TCHOL_L": "TCHOL_L.XPT",   # Cholesterol
    "GLU_L": "GLU_L.XPT",       # Glucose
    "INS_L": "INS_L.XPT",       # Insulin
    "HSCRP_L": "HSCRP_L.XPT",   # High-sensitivity C-reactive protein
    "GHB_L": "GHB_L.XPT",       # Glycohemoglobin
    "FERTIN_L": "FERTIN_L.XPT", # Ferritin
    "FOLATE": "FOLATE_L.XPT"    # RBC Folate
}

# Columns requested for the final dataset, grouped by the file they are expected in.
selected_columns = [
    "SEQN", "RIDAGEYR", "RIAGENDR", "DMDEDUC2", "INDFMPIR",  # Demographics (DEMO_L)
    "HOD051",                       # Housing (HOQ_L)
    "IND310",                       # Total savings (INQ_L)
    # "OCD150",                     # Occupational (OCQ_L)
    "BMXBMI", "BMXWAIST",           # BMI (BMX_L)
    "BPXOSY1", "BPXODI1",           # Blood pressure (BPXO_L)
    "LBXTC", #"LBXHDL", "LBXLDL",    # Cholesterol (TCHOL_L)
    "LBXGLU",                       # Glucose (GLU_L)
    # NOTE(review): "LBXINS" does not appear in the columns of the saved CSV, so it
    # is not present after the merge. Possibly the INS_L variable has a different
    # name (e.g. "LBXIN") - confirm against the INS_L codebook.
    "LBXINS",                       # Insulin (INS_L)
    "LBXHSCRP",                     # High-sensitivity C-reactive protein (HSCRP_L)
    "LBXGH",                        # Glycohemoglobin (GHB_L)
    "LBXFER",                       # Ferritin (FERTIN_L)
    "LBDRFOSI"                      # RBC Folate (FOLATE_L)
]

# The CSV is written relative to the current working directory.
OUTPUT_CSV = "NHANES_SES_HealthIndicator_RawData.csv"


def load_xpt_tables(paths):
    """Read each SAS XPORT file in ``paths`` into a DataFrame.

    Parameters
    ----------
    paths : dict[str, str]
        Mapping of table name to XPT file path.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One DataFrame per entry, under the same keys and in the same order.
    """
    return {name: pd.read_sas(filename, format="xport") for name, filename in paths.items()}


def merge_on_seqn(tables, base="DEMO_L", drop_columns=("WTPH2YR",)):
    """Inner-join every table onto ``tables[base]`` using the 'SEQN' key.

    Parameters
    ----------
    tables : dict[str, pandas.DataFrame]
        Tables to combine; each must contain a 'SEQN' column.
    base : str
        Key of the table the others are merged onto.
    drop_columns : iterable of str
        Columns removed from every non-base table before merging, so the merge
        does not produce suffixed duplicates such as 'WTPH2YR_x'.

    Returns
    -------
    pandas.DataFrame
        Rows whose SEQN occurs in every table (the join is ``how="inner"``).
    """
    merged = tables[base]
    for name, table in tables.items():
        if name == base:
            continue
        # errors="ignore": not every table carries the columns being dropped.
        table = table.drop(columns=list(drop_columns), errors="ignore")
        merged = merged.merge(table, on="SEQN", how="inner")
    return merged


def select_available_columns(frame, wanted):
    """Return a copy of ``frame`` restricted to the ``wanted`` columns it contains.

    Requested columns that are absent are skipped, but they are reported on stdout
    so that a misspelled or missing variable does not vanish unnoticed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source table.
    wanted : list[str]
        Column names to keep, in the desired order.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so later assignments do not write into a slice of
        ``frame``.
    """
    missing = [col for col in wanted if col not in frame.columns]
    if missing:
        print(f"Warning: requested columns not found and skipped: {missing}")
    present = [col for col in wanted if col in frame.columns]
    return frame[present].copy()


def coerce_and_impute(frame):
    """Coerce every column to numeric, then fill NaNs with the column median.

    Values that cannot be parsed as numbers become NaN before the medians are
    computed. A new DataFrame is returned; ``frame`` is not modified.

    NOTE(review): the median fill is applied to every column passed in, which for
    ``selected_columns`` includes coded fields such as RIAGENDR and DMDEDUC2 -
    confirm that this is intended for those columns.
    """
    numeric = frame.apply(pd.to_numeric, errors="coerce")
    return numeric.fillna(numeric.median())


# Feature scaling (StandardScaler) is not applied here: the CSV holds unscaled values.

# In a notebook kernel (or when run as a script) __name__ is "__main__", so the
# pipeline below executes as before; the guard only lets the helpers above be
# imported and tested without the XPT files being present.
if __name__ == "__main__":
    dataframes = load_xpt_tables(files)

    # Merge datasets on 'SEQN', avoiding the column 'WTPH2YR_x'
    df = merge_on_seqn(dataframes)

    # Select relevant columns
    df = select_available_columns(df, selected_columns)

    # Convert non-numeric values to NaN, then fill NaNs with column medians
    df = coerce_and_impute(df)

    # Save final dataset and report the path that was actually written
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Final dataset saved as '{OUTPUT_CSV}'")

Final dataset saved as 'SES_RAW DATA/health indicators/NHANES_SES_HealthIndicator_RawData.csv'


In [18]:
# Read back the CSV file of the same name that the pipeline cell writes,
# then preview the first five rows.
file = "NHANES_SES_HealthIndicator_RawData.csv"
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,DMDEDUC2,INDFMPIR,HOD051,IND310,BMXBMI,BMXWAIST,BPXOSY1,BPXODI1,LBXTC,LBXGLU,LBXHSCRP,LBXGH,LBXFER,LBDRFOSI
0,130380.0,44.0,2.0,3.0,1.41,6.0,1.0,29.7,93.5,111.0,79.0,187.0,156.0,5.62,6.2,13.3,830.0
1,130395.0,33.0,2.0,3.0,1.1,2.0,1.0,54.6,142.5,112.0,82.0,174.0,100.0,11.83,5.0,113.0,1410.0
2,130402.0,26.0,2.0,5.0,5.0,5.0,1.0,26.4,88.5,107.0,70.0,135.0,73.0,0.49,5.2,60.5,917.0
3,130424.0,19.0,2.0,4.0,2.33,5.0,1.0,26.4,88.5,107.0,70.0,165.0,87.0,1.83,5.0,52.9,778.0
4,130433.0,14.0,2.0,4.0,5.0,11.0,1.0,17.3,65.2,96.0,61.0,169.0,102.0,0.11,5.7,43.5,1040.0


In [19]:
# Display the column labels of the reloaded frame, to check which of the
# requested variables actually made it into the saved CSV.
df.columns

Index(['SEQN', 'RIDAGEYR', 'RIAGENDR', 'DMDEDUC2', 'INDFMPIR', 'HOD051',
       'IND310', 'BMXBMI', 'BMXWAIST', 'BPXOSY1', 'BPXODI1', 'LBXTC', 'LBXGLU',
       'LBXHSCRP', 'LBXGH', 'LBXFER', 'LBDRFOSI'],
      dtype='object')