<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaADDS-8515-9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clustering and Discriminant Analysis on the WDBC Dataset
#
This notebook implements the analysis described in the paper:
 - Clustering: k-means, hierarchical clustering, DBSCAN
 - Discriminant analysis: LDA and QDA
 - Evaluation and visualization
#
All figures are saved into the `figures/` subdirectory for inclusion in LaTeX.

In [None]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import (
    adjusted_rand_score,
    classification_report,
    confusion_matrix,
    silhouette_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Global plotting and output configuration

# Every figure produced by this notebook is written below this directory;
# it is created on demand, together with any missing parents.
FIG_DIR = Path("figures")
os.makedirs(FIG_DIR, exist_ok=True)

# Notebook-wide matplotlib defaults.
plt.rcParams.update(
    {
        "figure.figsize": (8, 6),
        "font.size": 12,
    }
)

# 1. Data Loading and Exploratory Analysis

In [None]:
def load_wdbc(as_frame: bool = True):
    """
    Load the Wisconsin Diagnostic Breast Cancer (WDBC) dataset.

    Preference: ucimlrepo (direct from UCI, dataset id 17). Fallback:
    scikit-learn's bundled copy (``load_breast_cancer``), used when the
    ucimlrepo import or fetch raises.

    Both paths return the target with the SAME encoding:
    ``0 = benign`` and ``1 = malignant``. scikit-learn natively ships the
    opposite coding (0 = malignant, 1 = benign), so the fallback flips it;
    otherwise every downstream label ("Benign"/"Malignant") would be swapped
    whenever the fallback is taken.

    Parameters
    ----------
    as_frame : bool, default True
        If True, return ``(DataFrame, Series)``; otherwise return the
        underlying NumPy arrays.

    Returns
    -------
    X, y
        Feature matrix and binary diagnosis labels (0 = benign, 1 = malignant).

    Raises
    ------
    ValueError
        If the UCI target column holds a non-numeric label other than
        "B", "benign", "M" or "malignant".
    """
    try:
        from ucimlrepo import fetch_ucirepo

        bc = fetch_ucirepo(id=17)  # WDBC
        X = bc.data.features.copy()
        y = bc.data.targets.iloc[:, 0].copy()
    except Exception as exc:
        # Best-effort fallback (package missing, no network, ...), but say so:
        # the two sources differ in column names, so the reader should know
        # which one was actually used.
        warnings.warn(
            f"ucimlrepo load failed ({exc!r}); "
            "falling back to scikit-learn's bundled copy.",
            RuntimeWarning,
            stacklevel=2,
        )
        from sklearn.datasets import load_breast_cancer

        data = load_breast_cancer(as_frame=as_frame)
        # scikit-learn: 0 = malignant, 1 = benign -> flip to match the UCI path.
        if as_frame:
            X = data["data"].copy()
            y = (1 - pd.Series(data["target"])).rename("diagnosis")
        else:
            X, y = data["data"], 1 - data["target"]
        return X, y

    # Map string labels to binary {0, 1}. Testing for "not numeric" (rather
    # than ``dtype == object``) also covers pandas' dedicated string dtypes.
    if not pd.api.types.is_numeric_dtype(y):
        raw_labels = y
        y = raw_labels.map({"B": 0, "benign": 0, "M": 1, "malignant": 1})
        unmapped = y.isna()
        if unmapped.any():
            unknown = sorted(set(raw_labels[unmapped].astype(str)))
            raise ValueError(f"Unrecognised diagnosis label(s): {unknown}")
        y = y.astype("int64")
    if not as_frame:
        return X.values, y.values
    return X, y


# %%
X, y = load_wdbc(as_frame=True)
X.head()

# %%
print("Shape:", X.shape)
print("Target distribution:")
print(y.value_counts(normalize=True).rename("proportion"))

# %%
# Basic correlation heatmap (for sanity).
# `sns` comes from the notebook's top import cell, so this and every later
# plotting cell can be run without first executing a mid-notebook import.
corr = X.corr()
fig, ax = plt.subplots()
sns.heatmap(
    corr,
    cmap="coolwarm",
    center=0,
    square=True,
    cbar_kws={"shrink": 0.7},
    ax=ax,
)
ax.set_title("Feature Correlation Matrix (WDBC)")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_corr_heatmap.png", dpi=300)
plt.close(fig)  # the figure is only needed on disk

 # 2. PCA Visualization

In [None]:
# Standardise first so that no large-scale feature dominates the components,
# then project onto the two leading principal components.
X_std = StandardScaler().fit_transform(X)
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_std)

# 2-D coordinates plus the true diagnosis, in plotting-friendly form.
df_pca = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
df_pca["diagnosis"] = y.values

fig, ax = plt.subplots()
sns.scatterplot(
    data=df_pca, x="PC1", y="PC2", hue="diagnosis", palette="Set1", alpha=0.8, ax=ax
)
ax.set_title("WDBC: First Two Principal Components by Diagnosis")
ax.legend(title="Diagnosis", loc="best")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_pca_true_labels.png", dpi=300)
plt.close(fig)

# 3. Clustering Analysis
#
We standardize the features and then apply:
 - k-means for a range of K with elbow and silhouette analysis
 - agglomerative hierarchical clustering (Ward)
 - DBSCAN

In [None]:
# Learn per-feature mean/scale from all samples, then z-score the matrix.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)

# 3.1 k-Means: Elbow and Silhouette

In [None]:
def run_kmeans_elbow_silhouette(X_std, y, k_range=range(2, 11)):
    """
    Fit k-means once per candidate K and collect three quality measures.

    Parameters
    ----------
    X_std : feature matrix passed to ``KMeans.fit_predict`` and
        ``silhouette_score``.
    y : reference labels used only for the adjusted Rand index.
    k_range : iterable of cluster counts to try.

    Returns
    -------
    (inertias, silhouettes, ari_scores)
        Three lists, each with one entry per value in ``k_range``, in order.
    """

    def _evaluate(n_clusters):
        # One fitted model yields all three numbers for this K.
        model = KMeans(n_clusters=n_clusters, n_init=20, random_state=42)
        assignments = model.fit_predict(X_std)
        return (
            model.inertia_,
            silhouette_score(X_std, assignments),
            adjusted_rand_score(y, assignments),
        )

    per_k = [_evaluate(k) for k in k_range]

    inertias = [row[0] for row in per_k]
    silhouettes = [row[1] for row in per_k]
    ari_scores = [row[2] for row in per_k]
    return inertias, silhouettes, ari_scores


k_range = range(2, 11)
inertias, silhouettes, ari_scores = run_kmeans_elbow_silhouette(X_std, y, k_range)
k_values = list(k_range)

# Elbow plot
fig, ax = plt.subplots()
ax.plot(k_values, inertias, marker="o")
ax.set_xlabel("Number of clusters K")
ax.set_ylabel("Inertia (within-cluster sum of squares)")
ax.set_title("k-Means Elbow Plot (WDBC)")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_kmeans_elbow.png", dpi=300)
plt.close(fig)

# Silhouette and ARI vs K (two y-axes sharing the K axis)
fig, ax = plt.subplots()
(sil_line,) = ax.plot(k_values, silhouettes, marker="o", label="Silhouette")
ax.set_xlabel("Number of clusters K")
ax.set_ylabel("Silhouette score")
ax2 = ax.twinx()
(ari_line,) = ax2.plot(
    k_values, ari_scores, marker="s", color="tab:red", label="ARI (vs. labels)"
)
ax2.set_ylabel("Adjusted Rand Index")
# The two curves live on different Axes, so a plain `legend()` on either one
# would list only its own line. Pass both handles explicitly so the saved
# figure actually identifies each curve.
ax2.legend(handles=[sil_line, ari_line], loc="best")
ax2.set_title("k-Means Quality Metrics vs K")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_kmeans_silhouette_ari.png", dpi=300)
plt.close(fig)

best_k = 2  # chosen based on elbow + interpretability


# 3.2 k-Means: Cluster Visualization in PCA Space

In [None]:
# Refit k-means at the chosen K and compare its partition with the diagnosis.
km_best = KMeans(n_clusters=best_k, n_init=20, random_state=42)
cluster_labels = km_best.fit_predict(X_std)
ari_best = adjusted_rand_score(y, cluster_labels)
print(f"k-means with K={best_k}, ARI vs labels: {ari_best:.3f}")

df_pca["cluster"] = cluster_labels

# Centroids live in the standardized feature space; project them with the
# same fitted PCA so they can be overlaid on the PC1/PC2 scatter.
centroids_2d = pca.transform(km_best.cluster_centers_)

fig, ax = plt.subplots()
sns.scatterplot(
    data=df_pca, x="PC1", y="PC2", hue="cluster", palette="Set2", alpha=0.8, ax=ax
)
ax.scatter(
    centroids_2d[:, 0],
    centroids_2d[:, 1],
    marker="X",
    s=120,
    edgecolor="black",
    label="Centroids",
)
ax.set_title(f"WDBC: PCA Projection with k-Means Clusters (K={best_k})")
ax.legend(loc="best")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_kmeans_pca_clusters.png", dpi=300)
plt.close(fig)

# 3.3 Hierarchical Clustering (Ward)
#
For the dendrogram, we subsample points for readability.

In [None]:
# `dendrogram` and `linkage` come from the notebook's top import cell.

# Subsample for dendrogram (e.g., 120 points); seeded so the figure is
# reproducible across runs.
rng = np.random.default_rng(42)
idx_sub = rng.choice(len(X_std), size=min(120, len(X_std)), replace=False)
Z = linkage(X_std[idx_sub], method="ward", metric="euclidean")

fig, ax = plt.subplots(figsize=(10, 5))
dendrogram(Z, truncate_mode="level", p=5, no_labels=True, ax=ax)
ax.set_title("Hierarchical Clustering Dendrogram (Ward, Subsample)")
ax.set_xlabel("Sample index")
ax.set_ylabel("Distance")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_hierarchical_dendrogram.png", dpi=300)
plt.close(fig)

# Cut into two clusters and compute ARI. Unlike the dendrogram, this uses
# every sample, not just the subsample.
agg = AgglomerativeClustering(n_clusters=2, linkage="ward")
hier_labels = agg.fit_predict(X_std)
print("Agglomerative (Ward), ARI vs labels:", adjusted_rand_score(y, hier_labels))

# 3.4 DBSCAN
#
We perform a quick sweep over eps values to see cluster structure.

In [None]:
def run_dbscan_sweep(X_std, y, eps_values=(0.5, 0.7, 0.9), min_samples=5):
    """
    Run DBSCAN once per ``eps`` and tabulate the resulting structure.

    For every ``eps`` the table records the number of clusters (the noise
    label ``-1`` is not counted), the number of noise points, and the adjusted
    Rand index against ``y``. The ARI is NaN unless more than one cluster was
    found; when computed, it is evaluated on the raw DBSCAN labels, noise
    included.

    Returns
    -------
    pandas.DataFrame
        Columns ``eps``, ``n_clusters``, ``n_noise``, ``ARI_vs_labels``; one
        row per entry of ``eps_values``, in order.
    """

    def _summarise(eps):
        assignments = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_std)
        cluster_ids = set(assignments)
        cluster_ids.discard(-1)  # -1 marks noise, not a cluster
        n_clusters = len(cluster_ids)
        n_noise = np.sum(assignments == -1)
        if n_clusters > 1:
            ari = adjusted_rand_score(y, assignments)
        else:
            ari = np.nan
        return (eps, n_clusters, n_noise, ari)

    return pd.DataFrame(
        [_summarise(eps) for eps in eps_values],
        columns=["eps", "n_clusters", "n_noise", "ARI_vs_labels"],
    )


# Tabulate cluster/noise counts for a handful of neighbourhood radii.
db_results = run_dbscan_sweep(X_std, y, eps_values=(0.5, 0.7, 0.9, 1.1))
print(db_results)

# Pick one eps that yields a small number of clusters
chosen_eps = 0.7
db = DBSCAN(eps=chosen_eps, min_samples=5)
db_labels = db.fit_predict(X_std)

df_pca["dbscan_label"] = db_labels

# Colour the PC1/PC2 scatter by DBSCAN label (-1 is noise).
fig, ax = plt.subplots()
sns.scatterplot(
    data=df_pca,
    x="PC1",
    y="PC2",
    hue="dbscan_label",
    palette="tab10",
    alpha=0.8,
    ax=ax,
)
ax.set_title(f"WDBC: DBSCAN Clusters in PCA Space (eps={chosen_eps})")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_dbscan_pca_clusters.png", dpi=300)
plt.close(fig)

# 4. Discriminant Analysis (LDA and QDA)

In [None]:
# Hold out 20% of the samples; stratifying on y keeps the benign/malignant
# ratio the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# 4.1 LDA and QDA Pipelines (Full Feature Space)

In [None]:
# Each classifier is wrapped with its own scaler so that standardization is
# learned from the training rows only.
lda_pipe = Pipeline(
    steps=[("scaler", StandardScaler()), ("lda", LinearDiscriminantAnalysis())]
)
qda_pipe = Pipeline(
    steps=[("scaler", StandardScaler()), ("qda", QuadraticDiscriminantAnalysis())]
)

for model in (lda_pipe, qda_pipe):
    model.fit(X_train, y_train)

y_pred_lda = lda_pipe.predict(X_test)
y_pred_qda = qda_pipe.predict(X_test)

# Held-out accuracy: fraction of test rows predicted correctly.
for caption, y_hat in (
    ("LDA accuracy:", y_pred_lda),
    ("QDA accuracy:", y_pred_qda),
):
    print(caption, (y_hat == y_test).mean())

# Per-class precision / recall / F1 on the held-out rows.
for caption, y_hat in (
    ("\nLDA classification report:", y_pred_lda),
    ("QDA classification report:", y_pred_qda),
):
    print(caption)
    print(classification_report(y_test, y_hat))

# Confusion matrices
def plot_confusion(cm, title, filename):
    """
    Draw a confusion matrix as an annotated heatmap and save it under FIG_DIR.

    Parameters
    ----------
    cm : confusion matrix to draw; cells are annotated with integer format
        ``"d"``. Both axes are labelled "Benign" then "Malignant".
    title : figure title.
    filename : file name, relative to ``FIG_DIR``, for the saved image.
    """
    class_names = ["Benign", "Malignant"]

    fig, ax = plt.subplots()
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(FIG_DIR / filename, dpi=300)
    plt.close(fig)


# Rows are true classes, columns are predicted classes.
cm_lda = confusion_matrix(y_test, y_pred_lda)
cm_qda = confusion_matrix(y_test, y_pred_qda)

for matrix, heading, out_name in (
    (cm_lda, "LDA Confusion Matrix (WDBC)", "wdbc_lda_confusion.png"),
    (cm_qda, "QDA Confusion Matrix (WDBC)", "wdbc_qda_confusion.png"),
):
    plot_confusion(matrix, heading, out_name)

# 4.2 Decision Boundaries in PCA Space
#
For visualization, we reduce features to two principal components and train
LDA and QDA on this 2D representation.

In [None]:
# Standardize and project ALL samples onto two principal components.
# NOTE(review): scaler and PCA are fit before the split, so held-out rows
# influence the projection. That is fine for a picture of the decision
# regions, but this 2-D representation should not be used to report test
# metrics.
X_std_full = StandardScaler().fit_transform(X)
pca2 = PCA(n_components=2, random_state=42)
X_pca2 = pca2.fit_transform(X_std_full)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca2, y, test_size=0.2, random_state=42, stratify=y
)

# Fit both discriminant models on the 2-D training points.
lda_pca = LinearDiscriminantAnalysis()
qda_pca = QuadraticDiscriminantAnalysis()
for clf in (lda_pca, qda_pca):
    clf.fit(X_train_pca, y_train_pca)

# Grid for decision boundaries: 400 x 400 points covering the data range
# with one unit of padding on every side.
pc1_all, pc2_all = X_pca2[:, 0], X_pca2[:, 1]
x_min, x_max = pc1_all.min() - 1, pc1_all.max() + 1
y_min, y_max = pc2_all.min() - 1, pc2_all.max() + 1
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 400),
    np.linspace(y_min, y_max, 400),
)
grid = np.column_stack([xx.ravel(), yy.ravel()])

# Predicted class at every grid point, reshaped back to the mesh.
Z_lda = lda_pca.predict(grid).reshape(xx.shape)
Z_qda = qda_pca.predict(grid).reshape(xx.shape)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (axes[0], Z_lda, "LDA Decision Regions (PC1-PC2)"),
    (axes[1], Z_qda, "QDA Decision Regions (PC1-PC2)"),
)
for ax, Z, title in panels:
    # Shaded regions = predicted class; points = training samples.
    ax.contourf(xx, yy, Z, alpha=0.3, cmap="Pastel1")
    scatter = ax.scatter(
        X_train_pca[:, 0],
        X_train_pca[:, 1],
        c=y_train_pca,
        cmap="Set1",
        edgecolor="k",
        alpha=0.8,
    )
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_title(title)

# One shared legend, built from the last panel's scatter; the automatic
# labels are replaced by the class names.
handles, labels = scatter.legend_elements()
fig.legend(handles, ["Benign", "Malignant"], title="Diagnosis", loc="upper right")
fig.tight_layout()
fig.savefig(FIG_DIR / "wdbc_lda_qda_decision_boundaries.png", dpi=300)
plt.close(fig)