In [1]:
# Data Sources (NHANES Variables)

# Dietary Data:

# Total energy intake (kcal) 
# Macronutrient distribution (carbohydrates, proteins, fats) 
# Micronutrient intake (vitamins, minerals) 
# Food group consumption (fruits, vegetables, dairy, meat, grains)


# Health & Biomarkers:

# BMI, waist circumference
# Blood glucose, HbA1c (diabetes risk)
# Cholesterol levels (LDL, HDL, triglycerides)
# Blood pressure


# Lifestyle & Demographics:

# Physical activity levels
# Smoking and alcohol consumption
# Age, sex, ethnicity, socioeconomic status

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load NHANES datasets
dataframes = {
    "DR1TOT_L": pd.read_sas("DR1TOT_L.xpt", format="xport"),  # Day 1 Total Nutrient Intakes
    "DR2TOT_L": pd.read_sas("DR2TOT_L.xpt", format="xport"),  # Day 2 Total Nutrient Intakes
    "DBQ_L": pd.read_sas("DBQ_L.xpt", format="xport"),  # Diet Behavior & Nutrition
    "DEMO_L": pd.read_sas("DEMO_L.xpt", format="xport"),  # Demographics
    "BMX_L": pd.read_sas("BMX_L.xpt", format="xport"),  # Body Measures (BMI)
    "DIQ_L": pd.read_sas("DIQ_L.xpt", format="xport")  # Diabetes
}

# Convert byte-string columns to standard string format
def decode_bytes(df):
    for col in df.select_dtypes(include=['object']):
        df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
    return df

dataframes = {name: decode_bytes(df) for name, df in dataframes.items()}

# Merge datasets on 'SEQN' (Unique Participant ID)
df = dataframes["DR1TOT_L"]
for name, df_other in dataframes.items():
    if name != "DR1TOT_L":
        df = df.merge(df_other, on="SEQN", how="inner")

# Select relevant features
final_columns = [
    "SEQN",  # Unique ID
    "DR1TKCAL", "DR1TPROT", "DR1TCARB", "DR1TSUGR", "DR1TFIBE", "DR1TTFAT", "DR1TSFAT", "DR1TMFAT", "DR1TPFAT",  # Day 1 Nutrients
    "DR2TKCAL", "DR2TPROT", "DR2TCARB", "DR2TSUGR", "DR2TFIBE", "DR2TTFAT", "DR2TSFAT", "DR2TMFAT", "DR2TPFAT",  # Day 2 Nutrients
    "DBQ700", "DBQ197",  # Dietary Behavior
    "RIDAGEYR", "RIAGENDR", "INDFMPIR",  # Demographics & Socioeconomic
    "BMXBMI",  # BMI
    "DIQ010"  # Diabetes status
]

# Retain only available columns
df = df[[col for col in final_columns if col in df.columns]]

# Convert non-numeric values to NaN before computing median
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Handle missing values: Fill NaNs with column medians
df.fillna(df.median(), inplace=True)

# Normalize numerical features for clustering
numeric_features = [col for col in df.columns if col not in ["SEQN", "RIAGENDR", "DIQ010"]]
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Save final dataset
df.to_csv("NHANES_2021_Dietary_Clustering.csv", index=False)

print("Final dataset saved as 'NHANES_2021_Dietary_Clustering.csv'")


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/DR1TOT_L.xpt'