In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Build one flat table of NHANES socio-economic and health indicators:
#   1. read every component file (SAS transport format) from the working directory,
#   2. merge them onto the demographics table by 'SEQN',
#   3. keep only the columns of interest,
#   4. coerce everything to numeric and fill gaps with the column median,
#   5. write the result to CSV.

# Load NHANES datasets
files = {
    "DEMO_L": "DEMO_L.XPT",     # Demographics
    "HOQ_L": "HOQ_L.XPT",       # Housing
    "INQ_L": "INQ_L.XPT",       # Total Savings
    "BMX_L": "BMX_L.XPT",       # BMI
    "BPXO_L": "BPXO_L.XPT",     # Blood pressure
    "TCHOL_L": "TCHOL_L.XPT",   # Cholesterol
    "GLU_L": "GLU_L.XPT",       # Glucose
    "INS_L": "INS_L.XPT",       # Insulin
    "HSCRP_L": "HSCRP_L.XPT",   # High-sensitivity C-reactive protein
    "GHB_L": "GHB_L.XPT",       # Glycohemoglobin
    "FERTIN_L": "FERTIN_L.XPT", # Ferritin
    "FOLATE": "FOLATE_L.XPT"    # RBC Folate
}

dataframes = {name: pd.read_sas(filename, format="xport") for name, filename in files.items()}

# Merge datasets on 'SEQN'. 'WTPH2YR' is removed from every non-demographic
# table first so the merge never produces suffixed duplicates such as 'WTPH2YR_x'.
df = dataframes["DEMO_L"]
for name, df_other in dataframes.items():
    if name == "DEMO_L":
        continue
    df_other = df_other.drop(columns=["WTPH2YR"], errors="ignore")
    df = df.merge(df_other, on="SEQN", how="outer")

# Select relevant columns
selected_columns = [
    "SEQN", "RIDAGEYR", "RIAGENDR", "DMDEDUC2", "INDFMPIR",  # Demographics (DEMO_L)
    "HOD051",                       # Housing (HOQ_L)
    "IND310",                       # Total savings (INQ_L)
    # "OCD150",                     # Occupational (OCQ_L)
    "BMXBMI", "BMXWAIST",           # BMI (BMX_L)
    "BPXOSY1", "BPXODI1",           # Blood pressure (BPXO_L)
    "LBXTC", #"LBXHDL", "LBXLDL",    # Cholesterol (TCHOL_L)
    "LBXGLU",                       # Glucose (GLU_L)
    # NOTE(review): 'LBXINS' did not appear in the CSV written by an earlier run,
    # i.e. it was not found after the merge. The insulin variable in INS_L may
    # have a different name (possibly 'LBXIN') - TODO confirm against the codebook.
    "LBXINS",                       # Insulin (INS_L)
    "LBXHSCRP",                     # High-sensitivity C-reactive protein (HSCRP_L)
    "LBXGH",                        # Glycohemoglobin (GHB_L)
    "LBXFER",                       # Ferritin (FERTIN_L)
    "LBDRFOSI"                      # RBC Folate (FOLATE)
]

# Report requested columns that are absent instead of dropping them silently,
# so a misspelled or renamed variable is noticed right away.
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
    print(f"Warning: requested columns not found in merged data and skipped: {missing_columns}")

# .copy() detaches the subset from the merged frame, so the column assignments
# below do not raise SettingWithCopyWarning.
df = df[[col for col in selected_columns if col in df.columns]].copy()

# Convert every column to numeric; values that cannot be parsed become NaN
# so they are covered by the median imputation below.
df = df.apply(pd.to_numeric, errors="coerce")

# Handle missing values: fill NaNs with column medians.
# NOTE(review): this is applied to every selected column, including ones that
# look like coded questionnaire answers; a median of codes may not be
# meaningful - confirm this is intended.
df = df.fillna(df.median())

# Optional (disabled): normalize numerical features for clustering.
# The column names match the blood-pressure variables selected above.
# numeric_features = ["BMXBMI", "BMXWAIST", "BPXOSY1", "BPXODI1", "LBXGLU", "LBXINS", "LBXHSCRP", "LBXTC"]
# numeric_features = [col for col in numeric_features if col in df.columns]
# scaler = StandardScaler()
# df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Save final dataset. The same variable feeds to_csv and the message, so the
# reported location is always the location actually written.
output_file = "NHANES_SES_HealthIndicator_RawData.csv"
df.to_csv(output_file, index=False)
print(f"Final dataset saved as '{output_file}'")

Final dataset saved as 'SES_RAW DATA/health indicators/NHANES_SES_HealthIndicator_RawData.csv'


In [14]:
# Read the exported dataset back from disk and preview its first five rows.
file = "NHANES_SES_HealthIndicator_RawData.csv"
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,DMDEDUC2,INDFMPIR,HOD051,IND310,BMXBMI,BMXWAIST,BPXOSY1,BPXODI1,LBXTC,LBXGLU,LBXHSCRP,LBXGH,LBXFER,LBDRFOSI
0,130378.0,43.0,1.0,5.0,5.0,10.0,1.0,27.0,98.3,135.0,98.0,264.0,113.0,1.78,5.6,38.0,698.0
1,130379.0,66.0,1.0,5.0,5.0,9.0,1.0,33.5,114.7,121.0,84.0,214.0,99.0,2.03,5.6,38.0,933.0
2,130380.0,44.0,2.0,3.0,1.41,6.0,1.0,29.7,93.5,111.0,79.0,187.0,156.0,5.62,6.2,13.3,830.0
3,130381.0,5.0,2.0,4.0,1.53,4.0,1.0,23.8,70.4,117.0,72.0,178.0,100.0,1.45,5.5,38.0,1050.0
4,130382.0,2.0,1.0,4.0,3.6,8.0,1.0,26.4,92.7,117.0,72.0,178.0,100.0,1.45,5.5,24.4,1050.0


In [15]:
# List the column labels of the reloaded DataFrame.
df.columns

Index(['SEQN', 'RIDAGEYR', 'RIAGENDR', 'DMDEDUC2', 'INDFMPIR', 'HOD051',
       'IND310', 'BMXBMI', 'BMXWAIST', 'BPXOSY1', 'BPXODI1', 'LBXTC', 'LBXGLU',
       'LBXHSCRP', 'LBXGH', 'LBXFER', 'LBDRFOSI'],
      dtype='object')

In [16]:
# Show the (row count, column count) of the reloaded DataFrame.
df.shape

(11933, 17)