<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaADDS-8515-8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # K-Means Clustering and Discriminant Analysis on the Mall Customers Dataset

This notebook:
 - Loads the Mall Customers dataset from Kaggle (or a public mirror).
 - Performs K-Means clustering on standardized annual income and spending score.
 - Uses the Elbow method and silhouette analysis to select the number of clusters.
 - Visualizes the resulting clusters and centroids.
 - Uses the K-Means clusters as labels for LDA and QDA classification.
 - Evaluates and visualizes LDA and QDA performance and decision boundaries.

All plots are saved into a `figures/` subdirectory for inclusion in the LaTeX report.

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import (
    silhouette_score,
    confusion_matrix,
    classification_report,
    accuracy_score,
)

# Use seaborn's "whitegrid" style for every plot produced below.
sns.set(style="whitegrid")

# All figures in this notebook are written under this directory;
# it is created here if it does not exist yet.
FIG_DIR = Path("figures")
FIG_DIR.mkdir(exist_ok=True)

# 1. Load and inspect the Mall Customers dataset

The dataset is originally from Kaggle:
https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python

We try the following sources, in order, and use the first one that is available:
 1. Kaggle input path (when running on Kaggle).
 2. Local CSV (e.g., after manual download).
 3. A public GitHub mirror of the same CSV.

In [None]:
def load_mall_customers(
    source: str = "auto",
    local_path: str = "Mall_Customers.csv",
) -> pd.DataFrame:
    """Load the Mall Customers dataset.

    Parameters
    ----------
    source : {'auto', 'local', 'url'}
        - 'auto': try the Kaggle input path, then ``local_path``, then the
          public URL.
        - 'local': require Mall_Customers.csv at ``local_path``.
        - 'url': use a GitHub raw mirror of the Kaggle CSV.
    local_path : str
        Path to a local Mall_Customers.csv file.

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    ValueError
        If ``source`` is not one of 'auto', 'local' or 'url'.
    FileNotFoundError
        If ``source='local'`` and ``local_path`` does not exist.
    """
    valid_sources = ("auto", "local", "url")
    if source not in valid_sources:
        # Fail fast on a typo instead of reporting a misleading "file not found".
        raise ValueError(
            f"Unknown source {source!r}; expected one of {valid_sources}."
        )

    # 1. Kaggle standard path
    kaggle_path = Path(
        "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
    )
    if source == "auto" and kaggle_path.exists():
        print(f"Loading dataset from Kaggle path: {kaggle_path}")
        return pd.read_csv(kaggle_path)

    # 2. Local CSV
    local_file = Path(local_path)
    if source in ("auto", "local") and local_file.exists():
        print(f"Loading dataset from local file: {local_file}")
        return pd.read_csv(local_file)

    # 3. Public GitHub mirror (raw CSV)
    if source in ("auto", "url"):
        url = "https://raw.githubusercontent.com/e-vdb/Mall-customers-clustering/main/Mall_Customers.csv"
        print(f"Loading dataset from GitHub raw URL: {url}")
        return pd.read_csv(url)

    # Only reachable for source='local' when the file is missing.
    raise FileNotFoundError(
        f"Could not load Mall_Customers.csv: local file not found at {local_file}"
    )


# Load the data through the automatic fallback chain, then preview the first rows.
df = load_mall_customers(source="auto")
df.head()

# Basic structure and summary statistics.


In [None]:
# Print the frame's structure, then show summary statistics for all columns
# (include="all" covers non-numeric columns as well).
df.info()
df.describe(include="all")

# 2. Feature selection and scaling
#
We use the two numerical behavioral features most directly related to purchasing behavior:
 - `Annual Income (k$)`
 - `Spending Score (1-100)`

`CustomerID` is only an identifier, so it is excluded. `Gender` and `Age` are also left out of the clustering
so that we can work in a 2D space that is easy to visualize and interpret.

In [None]:
# The two behavioural columns used for clustering, copied so later edits
# to X never touch df.
feature_cols = ["Annual Income (k$)", "Spending Score (1-100)"]
X = df.loc[:, feature_cols].copy()

# Exploratory scatter of the raw (unscaled) features, saved to disk.
fig, ax = plt.subplots()
sns.scatterplot(
    data=df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    ax=ax,
)
ax.set_title("Raw data: Annual Income vs Spending Score")
fig.tight_layout()
fig.savefig(FIG_DIR / "raw_income_spend_scatter.png", dpi=300)
plt.close(fig)

# 3. K-Means clustering: Elbow method and silhouette analysis
#
 We standardize the features and run K-Means for K = 2,...,10.
 For each K we record:
 - inertia (within-cluster sum of squares)
 - average silhouette score
#
Then we choose K guided by these diagnostics (often K≈5 for this dataset).

In [None]:
K_range = range(2, 11)
inertias = []
sil_scores = []

# Fit one scaler + K-Means pipeline per candidate K and record both diagnostics.
for K in K_range:
    pipe = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("kmeans", KMeans(n_clusters=K, n_init=10, random_state=42)),
        ]
    ).fit(X)
    fitted_scaler = pipe.named_steps["scaler"]
    fitted_kmeans = pipe.named_steps["kmeans"]

    inertias.append(fitted_kmeans.inertia_)

    # The silhouette is evaluated in the same standardized space K-Means saw.
    X_scaled = fitted_scaler.transform(X)
    labels = fitted_kmeans.labels_
    sil_scores.append(silhouette_score(X_scaled, labels))

# The elbow and silhouette curves share one layout, so both are drawn from
# a small table of (values, y-axis label, title, output file).
k_values = list(K_range)
curve_specs = (
    (
        inertias,
        "Inertia (within-cluster sum of squares)",
        "Elbow method for K-Means",
        "elbow_inertia.png",
    ),
    (
        sil_scores,
        "Average silhouette score",
        "Silhouette score vs K",
        "silhouette_score_by_k.png",
    ),
)
for curve, y_label, heading, out_name in curve_specs:
    plt.figure()
    plt.plot(k_values, curve, marker="o")
    plt.xlabel("Number of clusters K")
    plt.ylabel(y_label)
    plt.title(heading)
    plt.tight_layout()
    plt.savefig(FIG_DIR / out_name, dpi=300)
    plt.close()

# Cell output: (K, inertia, silhouette) triples.
list(zip(K_range, inertias, sil_scores))

Based on the elbow and silhouette diagnostics, many analyses of this dataset settle on K ≈ 5. Alternatively, K can be chosen as the value that maximizes the average silhouette score.

In the next cell we:
 - compute the best K by silhouette score, and
 - assign it to `K_final`, which you can override manually if you want a specific K.

In [None]:
# Position of the highest silhouette score, mapped back onto the candidate Ks.
best_idx = int(np.argmax(sil_scores))
best_K_sil = K_range[best_idx]
print(f"Best K by silhouette: {best_K_sil}")

# Replace the right-hand side to pin K manually (e.g. 5).
K_final = best_K_sil
print(f"Using K_final = {K_final}")

# 4. Fit final K-Means and interpret clusters
#
We fit a final K-Means model with K_final clusters on standardized income and spending score.
 Then:
 - Attach cluster labels to the original DataFrame.
 - Compute centroids in the ORIGINAL feature units.
 - Visualize clusters in the income-spending plane.

In [None]:
# Final model: the same scaler + K-Means recipe, now with the chosen K.
kmeans_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("kmeans", KMeans(n_clusters=K_final, n_init=10, random_state=42)),
    ]
).fit(X)
scaler_step = kmeans_pipe.named_steps["scaler"]
kmeans_step = kmeans_pipe.named_steps["kmeans"]

# Standardized features and per-row cluster assignments.
X_scaled = scaler_step.transform(X)
labels = kmeans_step.labels_
df["cluster"] = labels

# Centroids live in standardized space; undo the scaling so they can be
# read in the original feature units.
centers_scaled = kmeans_step.cluster_centers_
centers_orig = scaler_step.inverse_transform(centers_scaled)
centers_df = pd.DataFrame(centers_orig, columns=feature_cols).assign(
    cluster=range(K_final)
)

centers_df

Scatter plot of clusters in income-spending space with centroids overlaid.

In [None]:
# Customers coloured by cluster, with the back-transformed centroids on top.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x="Annual Income (k$)",
    y="Spending Score (1-100)",
    hue="cluster",
    palette="tab10",
    s=60,
    alpha=0.8,
    ax=ax,
)
ax.scatter(
    centers_df["Annual Income (k$)"],
    centers_df["Spending Score (1-100)"],
    s=200,
    c="black",
    marker="X",
    label="Centroids",
)
ax.set_title(f"K-Means Clusters (K = {K_final})")
ax.legend()
fig.tight_layout()
fig.savefig(FIG_DIR / "kmeans_clusters_income_spend.png", dpi=300)
plt.close(fig)

# Average silhouette of the final clustering, computed on the scaled features.
final_sil = silhouette_score(X_scaled, labels)
print(f"Final average silhouette score (K={K_final}): {final_sil:.3f}")

# 5. Prepare labeled data for discriminant analysis
#
 We now treat the K-Means cluster labels as class labels for supervised classification.
 We'll:
 - Create X and y arrays.
 - Split into train (80%) and test (20%) sets with stratification.

In [None]:
# Features are the clustering inputs; targets are the K-Means assignments.
X_clf = X.to_numpy()
y_clf = df["cluster"].to_numpy()

# 80/20 split, stratified on the cluster label, with a fixed seed.
split_parts = train_test_split(
    X_clf,
    y_clf,
    test_size=0.2,
    stratify=y_clf,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape

# 6. Linear Discriminant Analysis (LDA)
#
 We build an LDA classifier in a pipeline:
 - StandardScaler
 - LinearDiscriminantAnalysis
#
 Then we:
 - Fit on the training set.
 - Evaluate on the test set with accuracy, confusion matrix, and classification report.
 - Save confusion matrix as a PNG.

In [None]:
# Standardize, then fit LDA on the training split only.
lda_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lda", LinearDiscriminantAnalysis()),
    ]
).fit(X_train, y_train)
y_pred_lda = lda_pipe.predict(X_test)

# Held-out metrics.
acc_lda = accuracy_score(y_test, y_pred_lda)
print(f"LDA test accuracy: {acc_lda:.3f}\n")

print("LDA classification report:")
print(classification_report(y_test, y_pred_lda))

cm_lda = confusion_matrix(y_test, y_pred_lda)

# Confusion matrix heatmap, saved for the report.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_lda, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("LDA Confusion Matrix")
fig.tight_layout()
fig.savefig(FIG_DIR / "lda_confusion_matrix.png", dpi=300)
plt.close(fig)

# 7. Quadratic Discriminant Analysis (QDA)
#
Same pipeline structure as LDA, but using QuadraticDiscriminantAnalysis.
We again evaluate on the test set and save the confusion matrix plot.

In [None]:
# Standardize, then fit QDA on the training split only.
qda_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("qda", QuadraticDiscriminantAnalysis()),
    ]
).fit(X_train, y_train)
y_pred_qda = qda_pipe.predict(X_test)

# Held-out metrics.
acc_qda = accuracy_score(y_test, y_pred_qda)
print(f"QDA test accuracy: {acc_qda:.3f}\n")

print("QDA classification report:")
print(classification_report(y_test, y_pred_qda))

cm_qda = confusion_matrix(y_test, y_pred_qda)

# Confusion matrix heatmap, saved for the report.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_qda, annot=True, fmt="d", cmap="Oranges", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("QDA Confusion Matrix")
fig.tight_layout()
fig.savefig(FIG_DIR / "qda_confusion_matrix.png", dpi=300)
plt.close(fig)

# 8. Decision boundary visualization for LDA and QDA
#
 We visualize decision boundaries in the 2D feature space:
 - Annual Income (k$)
 - Spending Score (1-100)
#
 We use the fitted pipelines to predict class labels on a grid and show
 the boundaries along with the true K-Means clusters.

In [None]:
def plot_decision_boundary(model, X, y, feature_names, fname, title):
    """
    Plot decision boundaries for a 2D classifier model.

    The model is evaluated on a 300 x 300 grid spanning the data range
    (padded by 5 units on every side); the predicted regions are drawn as
    filled contours with the observations scattered on top. The figure is
    saved to ``FIG_DIR / fname`` and then closed.

    Parameters
    ----------
    model : sklearn Pipeline
        Must support .predict() on 2D input.
    X : ndarray, shape (n_samples, 2)
        Input data in ORIGINAL feature units.
    y : ndarray, shape (n_samples,)
        Class labels. Assumed to be non-negative integers (0, 1, 2, ...),
        as the contour levels are built from the largest label.
    feature_names : list of str
        Names of the two features [x1_name, x2_name].
    fname : str
        Filename to save under FIG_DIR.
    title : str
        Plot title.
    """
    x_min, x_max = X[:, 0].min() - 5, X[:, 0].max() + 5
    y_min, y_max = X[:, 1].min() - 5, X[:, 1].max() + 5

    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, 300),
        np.linspace(y_min, y_max, 300),
    )
    grid = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(grid).reshape(xx.shape)

    # One contour band per class, with edges halfway between integer labels.
    # Cover labels seen in y as well as those predicted on the grid.
    n_classes = int(max(Z.max(), np.max(y))) + 1
    levels = np.arange(n_classes + 1) - 0.5

    # Use the same colour limits for regions and points. Otherwise contourf
    # scales colours over the level edges while scatter scales over the label
    # range, and a class gets different tab10 colours in the two layers.
    vmin, vmax = levels[0], levels[-1]

    plt.figure(figsize=(8, 6))
    plt.contourf(
        xx, yy, Z, alpha=0.3, levels=levels, cmap="tab10", vmin=vmin, vmax=vmax
    )
    plt.scatter(
        X[:, 0],
        X[:, 1],
        c=y,
        cmap="tab10",
        vmin=vmin,
        vmax=vmax,
        edgecolor="k",
        s=50,
    )
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    plt.title(title)
    plt.tight_layout()
    plt.savefig(FIG_DIR / fname, dpi=300)
    plt.close()


# Axis labels for the boundary plots:
# ["Annual Income (k$)", "Spending Score (1-100)"]
feature_names = feature_cols

# Refit both classifiers on ALL observations so the plotted regions
# reflect the full data rather than only the training split.
lda_pipe_full = Pipeline(
    steps=[("scaler", StandardScaler()), ("lda", LinearDiscriminantAnalysis())]
).fit(X_clf, y_clf)

qda_pipe_full = Pipeline(
    steps=[("scaler", StandardScaler()), ("qda", QuadraticDiscriminantAnalysis())]
).fit(X_clf, y_clf)

# One boundary figure per fitted model: (model, output file, title).
boundary_specs = (
    (
        lda_pipe_full,
        "lda_decision_boundary.png",
        "LDA Decision Boundaries and K-Means Clusters",
    ),
    (
        qda_pipe_full,
        "qda_decision_boundary.png",
        "QDA Decision Boundaries and K-Means Clusters",
    ),
)
for fitted_model, out_name, heading in boundary_specs:
    plot_decision_boundary(
        fitted_model,
        X_clf,
        y_clf,
        feature_names,
        fname=out_name,
        title=heading,
    )