Physical Activity Clustering – Identify patterns in physical activity levels and their correlation with health outcomes.

In [7]:
# 1. Physical Activity Data
# PAXMIN, PAXMIAF, PAXTMIN, PAXTMAF (Physical Activity Monitor - PAM)
# These files contain accelerometer-based physical activity measurements, which provide objective measures of movement intensity and duration.
# PAQ (Physical Activity Questionnaire)
# Self-reported physical activity levels, including time spent in moderate and vigorous activities.

# 2. Health Outcomes Data
# BMX (Body Measures)
# Contains BMI, waist circumference, and other obesity-related metrics.
# DIQ (Diabetes)
# Includes self-reported diabetes diagnosis and prediabetes indicators.
# MCQ (Medical Conditions)
# Contains self-reported cardiovascular disease (heart attack, stroke, heart failure, etc.) and other health conditions.

# 3. Demographics and Other Relevant Data
# DEMO (Demographics)
# Includes age, gender, race, socioeconomic status, which could be useful for additional analysis of clustering patterns.
# BPX (Blood Pressure & Hypertension Status)
# Important for understanding cardiovascular risks.
# LAB Data (Glucose, Cholesterol, etc.)
# Useful for linking physical activity patterns with metabolic health indicators.

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def convert_to_weekly(frequency, unit):
    """Convert an activity frequency to a times-per-week scale.

    Args:
        frequency: Number of activity sessions reported per ``unit``.
        unit: Reporting period. Accepts the numeric codes 1-4
            (day, week, month, year) and the letter codes "D", "W", "M",
            "Y" (case-insensitive, ``bytes`` accepted).
            NOTE(review): NHANES PAQ files appear to store the unit as a
            letter code; confirm against the codebook for the cycle in use.

    Returns:
        ``frequency`` rescaled to sessions per week. When ``unit`` is missing
        or unrecognised the frequency is returned unchanged, i.e. it is
        treated as already being weekly.
    """
    if isinstance(unit, bytes):
        unit = unit.decode("utf-8", errors="replace")
    if isinstance(unit, str):
        unit = unit.strip().upper()

    # Weeks are the target scale, so each factor is "how many of this period
    # fit in one week": 7 days, 1 week, 7/30 of a month, 7/365 of a year.
    per_week = {"day": 7, "week": 1, "month": 7 / 30, "year": 7 / 365}
    weekly_factors = {
        1: per_week["day"],
        2: per_week["week"],
        3: per_week["month"],
        4: per_week["year"],
        "D": per_week["day"],
        "W": per_week["week"],
        "M": per_week["month"],
        "Y": per_week["year"],
    }
    return frequency * weekly_factors.get(unit, 1)

# Read each NHANES table from its SAS transport (.xpt) file in the working
# directory. The order of this tuple fixes the dict's insertion order.
_dataset_names = (
    "PAQ_L",
    "BMX_L",
    "DIQ_L",
    "MCQ_L",
    "BPXO_L",
    "GLU_L",
    "INS_L",
    "HSCRP_L",
    "TCHOL_L",
    "DEMO_L",
)
dataframes = {
    stem: pd.read_sas(f"{stem}.xpt", format="xport")
    for stem in _dataset_names
}

# Turn byte-string cells into ordinary text
def decode_bytes(df):
    """Decode ``bytes`` values in object-dtype columns to ``str`` (UTF-8).

    The frame is modified in place; the same object is returned so the call
    can be used inside an expression. Values that are not ``bytes`` are left
    untouched.
    """
    def _as_text(value):
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    object_columns = df.select_dtypes(include=['object']).columns
    for column in object_columns:
        df[column] = df[column].apply(_as_text)
    return df

# Decode byte strings in every loaded table
dataframes = {name: decode_bytes(table) for name, table in dataframes.items()}

# Merge datasets on 'SEQN', starting from the activity questionnaire.
# NOTE: inner joins keep only respondents that appear in *every* file, so the
# smallest table determines the final sample size.
df = dataframes["PAQ_L"]
for name, df_other in dataframes.items():
    if name != "PAQ_L":
        df = df.merge(df_other, on="SEQN", how="inner")

# Treat questionnaire "Refused" / "Don't know" codes as missing so they are
# not multiplied into the activity totals as if they were real quantities.
# NOTE(review): 7777 / 9999 are the codes recalled from the NHANES PAQ
# codebook for these fields; confirm for the cycle in use.
paq_sentinel_codes = [7777, 9999]
for col in ("PAD790Q", "PAD800", "PAD810Q", "PAD820", "PAD680"):
    if col in df.columns:
        df[col] = df[col].mask(df[col].isin(paq_sentinel_codes))

# Convert activity frequencies to weekly scale if available
if "PAD790Q" in df.columns and "PAD790U" in df.columns:
    df["ModerateActivityWeekly"] = df.apply(lambda row: convert_to_weekly(row["PAD790Q"], row["PAD790U"]), axis=1)
if "PAD810Q" in df.columns and "PAD810U" in df.columns:
    df["VigorousActivityWeekly"] = df.apply(lambda row: convert_to_weekly(row["PAD810Q"], row["PAD810U"]), axis=1)

# Calculate Total Physical Activity if necessary columns exist
required_activity_columns = ("ModerateActivityWeekly", "PAD800", "VigorousActivityWeekly", "PAD820")
if all(col in df.columns for col in required_activity_columns):
    moderate_minutes = df["ModerateActivityWeekly"] * df["PAD800"]
    vigorous_minutes = df["VigorousActivityWeekly"] * df["PAD820"]
    # Zero sessions per week means zero minutes, even when the duration was
    # left blank; without this, 0 * NaN would mark the respondent as missing.
    moderate_minutes = moderate_minutes.mask(df["ModerateActivityWeekly"] == 0, 0)
    vigorous_minutes = vigorous_minutes.mask(df["VigorousActivityWeekly"] == 0, 0)
    # Vigorous minutes are weighted double relative to moderate minutes.
    df["TotalPhysicalActivity"] = moderate_minutes + (vigorous_minutes * 2)

# Categorize Physical Activity Levels (Based on WHO Guidelines)
def categorize_activity(minutes):
    """Label a weekly activity total as "Low", "Moderate", "High" or "Unknown".

    Args:
        minutes: Weekly activity minutes; may be missing (``None``/NaN).

    Returns:
        "Unknown" when ``minutes`` is missing, "Low" below 150,
        "Moderate" from 150 to 300 inclusive, otherwise "High".
    """
    # NaN fails every comparison, so without this guard a missing value would
    # fall through to the final branch and be reported as "High".
    if pd.isna(minutes):
        return "Unknown"
    if minutes < 150:
        return "Low"
    if minutes <= 300:
        return "Moderate"
    return "High"

# Label each respondent's activity level. Missing totals are labelled
# "Unknown" explicitly instead of being run through the threshold comparison.
if "TotalPhysicalActivity" in df.columns:
    total_activity = df["TotalPhysicalActivity"]
    df["ActivityCategory"] = total_activity.apply(categorize_activity).where(total_activity.notna(), "Unknown")
else:
    df["ActivityCategory"] = "Unknown"

# Create binary health condition flags based on available columns.
# NOTE: a missing measurement compares False, so it yields a 0 flag.
if "BMXBMI" in df.columns:
    df["Obese"] = (df["BMXBMI"] >= 30).astype(int)
if "DIQ010" in df.columns:
    df["Diabetes"] = (df["DIQ010"] == 1).astype(int)
if "MCQ160E" in df.columns:
    # Code 1 is "Yes" (same convention as DIQ010 above). Testing notnull()
    # flagged every respondent who answered at all, including "No".
    df["CVD"] = (df["MCQ160E"] == 1).astype(int)

# Accept both blood-pressure column spellings: BPXSY1/BPXDI1, and
# BPXOSY1/BPXODI1 as expected in the BPXO_L file loaded above.
# NOTE(review): confirm the BPXO_L column names against its codebook.
systolic_col = next((col for col in ("BPXSY1", "BPXOSY1") if col in df.columns), None)
diastolic_col = next((col for col in ("BPXDI1", "BPXODI1") if col in df.columns), None)
if systolic_col is not None and diastolic_col is not None:
    df["Hypertension"] = ((df[systolic_col] >= 130) | (df[diastolic_col] >= 80)).astype(int)

# Select only relevant columns that exist in the dataset
final_columns = [
    "SEQN", "PAD790Q", "PAD790U", "PAD800", "PAD810Q", "PAD810U", "PAD820", "PAD680",
    "TotalPhysicalActivity", "ActivityCategory", "BMXBMI", "BMXWAIST", "Obese",
    "DIQ010", "Diabetes", "MCQ160E", "CVD", "BPXSY1", "BPXDI1", "BPXOSY1", "BPXODI1", "Hypertension",
    "LBXGLU", "LBXINS", "LBXHSCRP", "LBXTC", "RIDAGEYR", "RIAGENDR", "INDFMPIR"
]
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the merged one.
df = df[[col for col in final_columns if col in df.columns]].copy()

# Coerce measurement columns to numeric; label/unit columns stay as text so
# that coercion does not wipe them out to NaN.
text_columns = {"ActivityCategory", "PAD790U", "PAD810U"}
for col in df.columns:
    if col not in text_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Handle missing values: Fill NaNs with column medians (numeric columns only)
df = df.fillna(df.median(numeric_only=True))

# Normalize numerical features for clustering
numeric_features = [
    col
    for col in [
        "TotalPhysicalActivity", "BMXBMI", "BMXWAIST", "BPXSY1", "BPXDI1",
        "BPXOSY1", "BPXODI1", "LBXGLU", "LBXINS", "LBXHSCRP", "LBXTC",
    ]
    if col in df.columns
]
if numeric_features:
    # Skipped when no feature column is present: the scaler cannot be fitted
    # on zero columns.
    scaler = StandardScaler()
    df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Save final dataset
df.to_csv("NHANES_2021_Clustering_Dataset.csv", index=False)

print("Final dataset saved as 'NHANES_2021_Clustering_Dataset.csv'")
