In [None]:
# ============================================================================
# Import Libraries
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings

# Suppress every Python warning for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings from the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")
# Apply the dark-grid Matplotlib style sheet to all subsequent figures.
plt.style.use("seaborn-v0_8-darkgrid")
# Set seaborn's "husl" palette as the default colour cycle. This runs after
# the style sheet is applied, so keep the two calls in this order.
sns.set_palette("husl")

In [None]:
# ============================================================================
# Load and Explore Data
# ============================================================================
# Read the raw dataset; every later cell works from this frame.
df = pd.read_csv("dataset/obesity.csv")

print(f"Dataset shape: {df.shape}")

# Each overview is printed under its own heading, in a fixed order.
for heading, report in (
    ("First 5 rows", df.head()),
    ("Data types", df.dtypes),
    ("Missing values", df.isnull().sum()),
    ("Descriptive statistics", df.describe()),
):
    print(f"\n{heading}:")
    print(report)

In [None]:
# ============================================================================
# Define Variables
# ============================================================================
# Numeric columns (used for PCA, clustering and LDA further down).
quant_vars = "Age Height Weight FCVC NCP CH2O FAF TUE".split()

# Categorical columns (used for the MCA section).
qual_vars = (
    "Gender family_history_with_overweight FAVC CAEC SMOKE CALC SCC MTRANS"
).split()

# Column holding the class label.
target = "NObeyesdad"

# Echo the three definitions, then the class frequencies sorted by label.
for label, value in (
    ("Quantitative variables", quant_vars),
    ("Qualitative variables", qual_vars),
    ("Target variable", target),
):
    print(f"{label}: {value}")

print("\nTarget distribution:")
target_counts = df[target].value_counts().sort_index()
print(target_counts)

In [None]:
# ============================================================================
# Visualize Target Distribution
# ============================================================================
# Class frequencies, ordered by label so the bars appear in a stable order.
level_counts = df[target].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
level_counts.plot.bar(ax=ax, color="steelblue")

ax.set_title("Distribution of Obesity Levels", fontsize=16, fontweight="bold")
ax.set_xlabel("Obesity Level", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
# Tilt the long class names so they do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# MCA on Categorical Variables
# ============================================================================
df_qual = df[qual_vars].copy()

# Complete disjunctive (indicator) table: one 0/1 column per category.
df_dummies = pd.get_dummies(df_qual, drop_first=False)
print(f"Indicator matrix shape: {df_dummies.shape}")

# MCA is the correspondence analysis of the indicator matrix, i.e. a PCA of
# the indicator columns weighted by 1 / sqrt(K * p_j), where K is the number of
# categorical variables and p_j the relative frequency of category j.
# Z-scoring the dummies (dividing by sqrt(p_j * (1 - p_j))) is a different
# weighting and does not yield MCA axes or MCA inertia shares.
indicator = df_dummies.astype(float)
category_freq = indicator.mean(axis=0)
chi2_weights = (len(qual_vars) * category_freq) ** 0.5
weighted_indicator = (indicator / chi2_weights).to_numpy()

# Centre only: the chi-square weights above already fix the column scale.
scaler_mca = StandardScaler(with_std=False)
X_mca_scaled = scaler_mca.fit_transform(weighted_indicator)

# PCA of the weighted, centred indicator matrix; the transformed rows are the
# individuals' coordinates on the MCA dimensions.
n_components_mca = 5
pca_mca = PCA(n_components=n_components_mca)
X_mca_transformed = pca_mca.fit_transform(X_mca_scaled)

print("\nVariance explained by each dimension:")
for i, var in enumerate(pca_mca.explained_variance_ratio_, 1):
    print(f"Dimension {i}: {var*100:.2f}%")
print(f"\nCumulative: {pca_mca.explained_variance_ratio_.sum()*100:.2f}%")

In [None]:
# ============================================================================
# MCA Visualizations
# ============================================================================
# --- Scree plot: variance share of each retained MCA dimension -------------
mca_var_pct = pca_mca.explained_variance_ratio_ * 100
mca_dims = range(1, n_components_mca + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(mca_dims, mca_var_pct, marker="o", linestyle="-", linewidth=2, markersize=8)
ax.set_title("MCA - Scree Plot", fontsize=16, fontweight="bold")
ax.set_xlabel("Dimension", fontsize=12)
ax.set_ylabel("Variance Explained (%)", fontsize=12)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# --- Individuals on the first two dimensions -------------------------------
# Points are coloured by the integer code of their target category.
level_codes = pd.Categorical(df[target]).codes

fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    X_mca_transformed[:, 0],
    X_mca_transformed[:, 1],
    c=level_codes,
    cmap="viridis",
    alpha=0.6,
    s=50,
)
ax.set_title("MCA - Individuals Projection (Dim 1-2)", fontsize=16, fontweight="bold")
ax.set_xlabel(f"Dim 1 ({mca_var_pct[0]:.2f}%)", fontsize=12)
ax.set_ylabel(f"Dim 2 ({mca_var_pct[1]:.2f}%)", fontsize=12)
fig.colorbar(scatter, ax=ax, label="Obesity Level")
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# PCA on Quantitative Variables - Correlation Matrix
# ============================================================================
# Work on a private copy of the numeric columns.
df_quant = df[quant_vars].copy()

print("Quantitative variables summary:")
print(df_quant.describe())

# Pairwise correlations between the numeric columns (pandas default method).
corr_matrix = df_quant.corr()
print("\nCorrelation matrix:")
print(corr_matrix)

# Annotated heatmap, with the diverging colour map centred on zero.
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=1,
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title(
    "Correlation Matrix - Quantitative Variables", fontsize=16, fontweight="bold"
)
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# PCA on Quantitative Variables
# ============================================================================
# Put every numeric column on zero mean / unit variance before PCA.
scaler_pca = StandardScaler().fit(df_quant)
X_quant_scaled = scaler_pca.transform(df_quant)

# Full decomposition (all components) — used for the scree plots below.
pca_full = PCA()
X_pca_full = pca_full.fit_transform(X_quant_scaled)

print("Variance explained by each PC:")
for rank, share in enumerate(pca_full.explained_variance_ratio_, start=1):
    print(f"PC{rank}: {share*100:.2f}%")

# Reduced decomposition kept for the rest of the analysis.
n_components_pca = 3
pca = PCA(n_components=n_components_pca)
X_pca = pca.fit_transform(X_quant_scaled)

retained_share = pca.explained_variance_ratio_.sum()
print(f"\n{n_components_pca} PCs explain {retained_share*100:.2f}% variance")

In [None]:
# ============================================================================
# PCA Scree Plot
# ============================================================================
# Per-component and running-total variance shares from the full PCA.
pc_var_ratio = pca_full.explained_variance_ratio_
cumsum_var = pc_var_ratio.cumsum()
pc_numbers = range(1, len(pc_var_ratio) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: variance carried by each component.
ax1.bar(pc_numbers, pc_var_ratio * 100, alpha=0.7, color="steelblue")
ax1.set_title("PCA - Variance per PC", fontsize=14, fontweight="bold")
ax1.set_xlabel("Principal Component", fontsize=12)
ax1.set_ylabel("Variance Explained (%)", fontsize=12)
ax1.grid(True, alpha=0.3)

# Right panel: cumulative variance with an 80 % guide line.
ax2.plot(
    range(1, len(cumsum_var) + 1),
    cumsum_var * 100,
    marker="o",
    linestyle="-",
    linewidth=2,
    markersize=8,
    color="darkred",
)
ax2.axhline(y=80, color="green", linestyle="--", label="80% threshold")
ax2.set_title("PCA - Cumulative Variance", fontsize=14, fontweight="bold")
ax2.set_xlabel("Number of Components", fontsize=12)
ax2.set_ylabel("Cumulative Variance (%)", fontsize=12)
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# PCA Variable Contributions
# ============================================================================
# Table of component coefficients: one row per variable, one column per PC.
pc_names = [f"PC{k}" for k in range(1, n_components_pca + 1)]
components_df = pd.DataFrame(pca.components_.T, index=quant_vars, columns=pc_names)

print("Variable contributions:")
print(components_df)

# One horizontal bar chart per retained component.
fig, axes = plt.subplots(1, n_components_pca, figsize=(18, 5))
for k, pc_name in enumerate(pc_names):
    panel = axes[k]
    panel.barh(quant_vars, components_df[pc_name], color="coral")
    panel.set_title(f"{pc_name} Contributions", fontweight="bold")
    panel.set_xlabel("Contribution", fontsize=10)
    # Zero reference line so the sign of each coefficient is easy to read.
    panel.axvline(x=0, color="black", linewidth=0.8)
    panel.grid(True, alpha=0.3, axis="x")
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# PCA Correlation Circle
# ============================================================================
fig, ax = plt.subplots(figsize=(10, 10))

# Unit circle: a correlation vector can never extend beyond radius 1.
circle = plt.Circle((0, 0), 1, color="navy", fill=False, linewidth=2)
ax.add_patch(circle)

# A correlation circle shows corr(variable, component score). The raw
# eigenvector coefficients in pca.components_ are not correlations, so compute
# the correlations directly from the standardized data and the PC scores.
# X_quant_scaled columns follow the order of quant_vars.
n_vars = len(quant_vars)
var_pc_corr = np.corrcoef(X_quant_scaled, X_pca[:, :2], rowvar=False)[:n_vars, n_vars:]

# One arrow per variable, from the origin to (corr with PC1, corr with PC2).
for var, (corr_pc1, corr_pc2) in zip(quant_vars, var_pc_corr):
    ax.arrow(
        0,
        0,
        corr_pc1,
        corr_pc2,
        head_width=0.05,
        head_length=0.05,
        fc="red",
        ec="red",
        linewidth=2,
        # Keep the arrow tip exactly at the correlation value.
        length_includes_head=True,
    )
    # Push the label slightly past the arrow tip so it does not cover it.
    ax.text(
        corr_pc1 * 1.15,
        corr_pc2 * 1.15,
        var,
        fontsize=12,
        ha="center",
        va="center",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow", alpha=0.7),
    )

ax.set_xlim(-1.2, 1.2)
ax.set_ylim(-1.2, 1.2)
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}%)", fontsize=12)
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}%)", fontsize=12)
ax.set_title("PCA - Correlation Circle", fontsize=16, fontweight="bold")
ax.axhline(y=0, color="black", linewidth=0.8)
ax.axvline(x=0, color="black", linewidth=0.8)
ax.grid(True, alpha=0.3)
# Equal aspect so the unit circle is drawn as a circle.
ax.set_aspect("equal")
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# PCA Individuals Projection
# ============================================================================
# Colour each individual by the integer code of its target category.
level_codes = pd.Categorical(df[target]).codes
pc1_label = f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}%)"
pc2_label = f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}%)"

fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    X_pca[:, 0], X_pca[:, 1], c=level_codes, cmap="viridis", alpha=0.6, s=50
)
ax.set_title("PCA - Individuals (PC1-PC2)", fontsize=16, fontweight="bold")
ax.set_xlabel(pc1_label, fontsize=12)
ax.set_ylabel(pc2_label, fontsize=12)
# Faint axes through the origin.
ax.axhline(y=0, color="black", linewidth=0.8, alpha=0.3)
ax.axvline(x=0, color="black", linewidth=0.8, alpha=0.3)
ax.grid(True, alpha=0.3)
fig.colorbar(scatter, ax=ax, label="Obesity Level")
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Hierarchical Clustering
# ============================================================================
# Cluster in the reduced PCA space.
X_clustering = X_pca.copy()

# Number of flat clusters to extract from the tree.
n_clusters_hier = 7

# Agglomerative clustering with Ward linkage on Euclidean distances.
linkage_matrix = linkage(X_clustering, method="ward", metric="euclidean")

# Derive the cut height from the tree instead of hard-coding it, so the line
# drawn on the dendrogram matches the partition returned by fcluster below.
# Column 2 of the linkage matrix holds the merge heights in merge order;
# any height between the last merge that is kept and the first merge that is
# undone yields n_clusters_hier clusters (requires 2 <= n_clusters_hier < n).
merge_heights = linkage_matrix[:, 2]
cut_height = (
    merge_heights[-n_clusters_hier] + merge_heights[-(n_clusters_hier - 1)]
) / 2

# Plot dendrogram, colouring the branches below the cut.
fig, ax = plt.subplots(figsize=(15, 8))
dendrogram(linkage_matrix, ax=ax, no_labels=True, color_threshold=cut_height)
ax.set_title("Hierarchical Clustering - Dendrogram", fontsize=16, fontweight="bold")
ax.set_xlabel("Individual Index", fontsize=12)
ax.set_ylabel("Distance (Ward)", fontsize=12)
ax.axhline(
    y=cut_height,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"Cut threshold ({n_clusters_hier} clusters)",
)
ax.legend()
plt.tight_layout()
plt.show()

# Cut the tree into at most n_clusters_hier flat clusters.
clusters_hier = fcluster(linkage_matrix, n_clusters_hier, criterion="maxclust")

print(f"Cluster distribution:")
unique, counts = np.unique(clusters_hier, return_counts=True)
for cluster, count in zip(unique, counts):
    print(f"Cluster {cluster}: {count} ({count/len(clusters_hier)*100:.1f}%)")

In [None]:
# ============================================================================
# Hierarchical Clustering Visualization
# ============================================================================
# Shared inputs for the two side-by-side panels.
pc1_label = f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}%)"
pc2_label = f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}%)"
level_codes = pd.Categorical(df[target]).codes

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

# Left: individuals coloured by hierarchical cluster label.
scatter1 = ax1.scatter(
    X_pca[:, 0], X_pca[:, 1], c=clusters_hier, cmap="tab10", alpha=0.6, s=50
)
ax1.set_title("Hierarchical Clustering (7 clusters)", fontsize=14, fontweight="bold")

# Right: the same individuals coloured by their target category code.
scatter2 = ax2.scatter(
    X_pca[:, 0], X_pca[:, 1], c=level_codes, cmap="viridis", alpha=0.6, s=50
)
ax2.set_title("True Obesity Levels", fontsize=14, fontweight="bold")

# Axis labels and grid are the same on both panels.
for panel in (ax1, ax2):
    panel.set_xlabel(pc1_label, fontsize=12)
    panel.set_ylabel(pc2_label, fontsize=12)
    panel.grid(True, alpha=0.3)

fig.colorbar(scatter1, ax=ax1, label="Cluster")
fig.colorbar(scatter2, ax=ax2, label="Obesity Level")

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# K-Means Elbow Method
# ============================================================================
# Fit one K-means model per candidate K and record its inertia
# (fixed seed and n_init so the curve is reproducible).
K_range = range(2, 11)
inertias = [
    KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_clustering).inertia_
    for k in K_range
]

# Inertia against K: look for the bend in the curve.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(K_range, inertias, marker="o", linestyle="-", linewidth=2, markersize=8)
ax.set_title("K-Means - Elbow Method", fontsize=16, fontweight="bold")
ax.set_xlabel("Number of Clusters (K)", fontsize=12)
ax.set_ylabel("Inertia", fontsize=12)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# K-Means Algorithm
# ============================================================================
# Final K-means model: 7 clusters, fixed seed, 10 initialisations.
n_clusters_kmeans = 7
kmeans = KMeans(n_clusters=n_clusters_kmeans, random_state=42, n_init=10)
kmeans.fit(X_clustering)
clusters_kmeans = kmeans.labels_

print(f"K-means inertia: {kmeans.inertia_:.2f}")

# Size of each cluster, as a count and as a share of all individuals.
print("\nCluster distribution:")
unique_km, counts_km = np.unique(clusters_kmeans, return_counts=True)
n_individuals = len(clusters_kmeans)
for label, size in zip(unique_km, counts_km):
    print(f"Cluster {label}: {size} ({size/n_individuals*100:.1f}%)")

In [None]:
# ============================================================================
# K-Means Visualization
# ============================================================================
# Shared inputs for the two side-by-side panels.
pc1_label = f"PC1 ({pca.explained_variance_ratio_[0]*100:.2f}%)"
pc2_label = f"PC2 ({pca.explained_variance_ratio_[1]*100:.2f}%)"
level_codes = pd.Categorical(df[target]).codes
centers = kmeans.cluster_centers_

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

# Left: individuals coloured by K-means label, centroids overlaid.
scatter1 = ax1.scatter(
    X_pca[:, 0], X_pca[:, 1], c=clusters_kmeans, cmap="tab10", alpha=0.6, s=50
)
# The model was fitted on X_clustering (a copy of X_pca), so the first two
# centroid coordinates live on the same PC1/PC2 axes as the points.
centroid_style = dict(
    c="red", marker="X", s=300, edgecolors="black", linewidths=2, label="Centroids"
)
ax1.scatter(centers[:, 0], centers[:, 1], **centroid_style)
ax1.set_title("K-Means Clustering (7 clusters)", fontsize=14, fontweight="bold")
ax1.set_xlabel(pc1_label, fontsize=12)
ax1.set_ylabel(pc2_label, fontsize=12)
ax1.legend()
ax1.grid(True, alpha=0.3)
fig.colorbar(scatter1, ax=ax1, label="Cluster")

# Right: the same individuals coloured by their target category code.
scatter2 = ax2.scatter(
    X_pca[:, 0], X_pca[:, 1], c=level_codes, cmap="viridis", alpha=0.6, s=50
)
ax2.set_title("True Obesity Levels", fontsize=14, fontweight="bold")
ax2.set_xlabel(pc1_label, fontsize=12)
ax2.set_ylabel(pc2_label, fontsize=12)
ax2.grid(True, alpha=0.3)
fig.colorbar(scatter2, ax=ax2, label="Obesity Level")

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# LDA Preparation
# ============================================================================
# Start from the RAW quantitative variables. Reusing X_quant_scaled here would
# leak test-set statistics into training, because that matrix was standardized
# with a scaler fitted on the whole dataset.
X_lda_raw = df[quant_vars].to_numpy(dtype=float)
y_lda = df[target].copy()

# Encode target variable as integer class ids.
le = LabelEncoder()
y_lda_encoded = le.fit_transform(y_lda)

print("Target classes:")
for i, class_name in enumerate(le.classes_):
    print(f"{i}: {class_name}")

# Stratified 70/30 split on the raw features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_lda_raw, y_lda_encoded, test_size=0.3, random_state=42, stratify=y_lda_encoded
)

# Fit the scaler on the training rows only, then apply it everywhere.
scaler_lda = StandardScaler()
X_train = scaler_lda.fit_transform(X_train_raw)
X_test = scaler_lda.transform(X_test_raw)
# Full matrix standardized with the training statistics (kept for later use).
X_lda = scaler_lda.transform(X_lda_raw)

print(f"\nTraining set: {len(X_train)}")
print(f"Test set: {len(X_test)}")

In [None]:
# ============================================================================
# LDA
# ============================================================================
# Number of discriminant axes: min(number of features, number of classes - 1).
n_components_lda = min(len(quant_vars), len(le.classes_) - 1)

# Fit on the training split, then project that same split onto the axes.
lda = LinearDiscriminantAnalysis(n_components=n_components_lda)
lda.fit(X_train, y_train)
X_lda_transformed = lda.transform(X_train)

print(f"Discriminant axes: {n_components_lda}")
print("\nVariance explained:")
explained_var_ratio = lda.explained_variance_ratio_
for axis_no, share in enumerate(explained_var_ratio, start=1):
    print(f"LD{axis_no}: {share*100:.2f}%")
total_share = explained_var_ratio.sum()
print(f"\nCumulative: {total_share*100:.2f}%")

In [None]:
# ============================================================================
# LDA Projection Visualization
# ============================================================================
# Project the held-out rows with the already-fitted model.
X_test_lda = lda.transform(X_test)

ld1_label = f"LD1 ({explained_var_ratio[0]*100:.2f}%)"
ld2_label = f"LD2 ({explained_var_ratio[1]*100:.2f}%)"
point_style = dict(cmap="viridis", alpha=0.6, s=50)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

# Left: training split on the first two discriminant axes.
scatter1 = ax1.scatter(
    X_lda_transformed[:, 0], X_lda_transformed[:, 1], c=y_train, **point_style
)
ax1.set_title("LDA - Training Data (LD1-LD2)", fontsize=14, fontweight="bold")
ax1.set_xlabel(ld1_label, fontsize=12)
ax1.set_ylabel(ld2_label, fontsize=12)
ax1.grid(True, alpha=0.3)
fig.colorbar(scatter1, ax=ax1, label="Obesity Level")

# Right: test split on the same axes.
scatter2 = ax2.scatter(X_test_lda[:, 0], X_test_lda[:, 1], c=y_test, **point_style)
ax2.set_title("LDA - Test Data (LD1-LD2)", fontsize=14, fontweight="bold")
ax2.set_xlabel(ld1_label, fontsize=12)
ax2.set_ylabel(ld2_label, fontsize=12)
ax2.grid(True, alpha=0.3)
fig.colorbar(scatter2, ax=ax2, label="Obesity Level")

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# LDA Classification Performance
# ============================================================================
# Predict the held-out rows and score them against the true labels.
y_pred = lda.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Per-class precision / recall / F1; undefined ratios are reported as 0.
report_text = classification_report(
    y_test, y_pred, target_names=le.classes_, zero_division=0
)

print(f"Overall Accuracy: {accuracy*100:.2f}%")
print("\nClassification Report:")
print(report_text)

In [None]:
# ============================================================================
# LDA Confusion Matrix
# ============================================================================
# Rows are true classes, columns are predicted classes (test split).
cm = confusion_matrix(y_test, y_pred)
class_labels = le.classes_

fig, ax = plt.subplots(figsize=(12, 10))
heatmap_options = dict(
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
sns.heatmap(cm, ax=ax, **heatmap_options)
ax.set_title("LDA - Confusion Matrix", fontsize=16, fontweight="bold")
ax.set_xlabel("Predicted Label", fontsize=12)
ax.set_ylabel("True Label", fontsize=12)
# Tilt the column labels; keep the row labels horizontal.
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Summary
# ============================================================================
# Final recap. The dimension and cluster counts are taken from the variables
# set in the cells above rather than hard-coded, so the summary cannot go stale
# when those parameters are changed.
# NOTE(review): the "Key Findings" bullets are hand-written statements, not
# computed from the results — re-check them whenever the analysis changes.
print("=" * 80)
print("ANALYSIS SUMMARY")
print("=" * 80)
print(
    f"""
1. MCA - {pca_mca.explained_variance_ratio_.sum()*100:.1f}% variance ({n_components_mca} dims)
2. PCA - {pca.explained_variance_ratio_.sum()*100:.1f}% variance ({n_components_pca} PCs)
3. Hierarchical Clustering - {n_clusters_hier} clusters (Ward method)
4. K-Means - {n_clusters_kmeans} clusters
5. LDA - {accuracy*100:.2f}% classification accuracy

Key Findings:
- Quantitative variables strongly correlate with obesity levels
- Weight, Height, Age are main PC1 contributors
- LDA provides best classification performance
- Clear separation between extreme obesity levels
"""
)