In [1]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, adjusted_rand_score, davies_bouldin_score
import numpy as np

# Load breast cancer dataset
cancer = load_breast_cancer()
X_cancer = cancer.data
y_cancer = cancer.target  # Ground truth labels for ARI

# Scale the data (critical for clustering)
scaler = StandardScaler()
X_cancer_scaled = scaler.fit_transform(X_cancer)

# Function to evaluate clustering
def evaluate_clustering(model, X, y, model_name):
    # Fit model and get labels
    labels = model.fit_predict(X)
    
    # Compute metrics
    # Silhouette score requires at least 2 clusters
    if len(np.unique(labels)) > 1:
        sil_score = silhouette_score(X, labels)
    else:
        sil_score = None
        print(f"Warning: {model_name} produced only one cluster, silhouette score not computed.")
    
    # Adjusted Rand Index (uses ground truth)
    ari_score = adjusted_rand_score(y, labels)
    
    # Davies-Bouldin Index (requires at least 2 clusters)
    if len(np.unique(labels)) > 1:
        db_score = davies_bouldin_score(X, labels)
    else:
        db_score = None
    
    # Print results
    print(f"\n{model_name} Results:")
    print(f"Number of clusters: {len(np.unique(labels))}")
    if sil_score is not None:
        print(f"Silhouette Score: {sil_score:.4f}")
    print(f"Adjusted Rand Index: {ari_score:.4f}")
    if db_score is not None:
        print(f"Davies-Bouldin Index: {db_score:.4f}")
    
    return labels, sil_score, ari_score, db_score

# Task II: Clustering on Breast Cancer Dataset
print("Breast Cancer Clustering Results:")

# (a) K-Means
kmeans = KMeans(n_clusters=2, random_state=42)  # n_clusters=2 to match binary labels
kmeans_labels, kmeans_sil, kmeans_ari, kmeans_db = evaluate_clustering(
    kmeans, X_cancer_scaled, y_cancer, "K-Means"
)

# (b) Agglomerative Hierarchical Clustering
agglo = AgglomerativeClustering(n_clusters=2)
agglo_labels, agglo_sil, agglo_ari, agglo_db = evaluate_clustering(
    agglo, X_cancer_scaled, y_cancer, "Agglomerative Clustering"
)

# (c) DBSCAN
dbscan = DBSCAN(eps=3.0, min_samples=5)  # Parameters tuned for dataset
dbscan_labels, dbscan_sil, dbscan_ari, dbscan_db = evaluate_clustering(
    dbscan, X_cancer_scaled, y_cancer, "DBSCAN"
)

# Sub-task 1: K-Means Random State Comparison
print("\nSub-task 1: K-Means Random State Comparison")
kmeans_no_random = KMeans(n_clusters=2)  # No random state
kmeans_random_42 = KMeans(n_clusters=2, random_state=42)
kmeans_random_100 = KMeans(n_clusters=2, random_state=100)

# Evaluate with different random states
_, sil_no_random, ari_no_random, db_no_random = evaluate_clustering(
    kmeans_no_random, X_cancer_scaled, y_cancer, "K-Means (no random state)"
)
_, sil_random_42, ari_random_42, db_random_42 = evaluate_clustering(
    kmeans_random_42, X_cancer_scaled, y_cancer, "K-Means (random_state=42)"
)
_, sil_random_100, ari_random_100, db_random_100 = evaluate_clustering(
    kmeans_random_100, X_cancer_scaled, y_cancer, "K-Means (random_state=100)"
)

# Compare random state effects
print("\nEffect of random state change:")
print(f"Silhouette difference (no random state vs random_state=42): {abs(sil_no_random - sil_random_42):.4f}")
print(f"ARI difference (no random state vs random_state=42): {abs(ari_no_random - ari_random_42):.4f}")
print(f"Silhouette difference (random_state=42 vs random_state=100): {abs(sil_random_42 - sil_random_100):.4f}")
print(f"ARI difference (random_state=42 vs random_state=100): {abs(ari_random_42 - ari_random_100):.4f}")

# Sub-task 3: Gaussian Mixture Model
print("\nSub-task 3: Gaussian Mixture Model on Breast Cancer")
gmm = GaussianMixture(n_components=2, random_state=42)
gmm_labels, gmm_sil, gmm_ari, gmm_db = evaluate_clustering(
    gmm, X_cancer_scaled, y_cancer, "Gaussian Mixture Model"
)

Breast Cancer Clustering Results:

K-Means Results:
Number of clusters: 2
Silhouette Score: 0.3447
Adjusted Rand Index: 0.6765
Davies-Bouldin Index: 1.3093

Agglomerative Clustering Results:
Number of clusters: 2
Silhouette Score: 0.3394
Adjusted Rand Index: 0.5750
Davies-Bouldin Index: 1.3700

DBSCAN Results:
Number of clusters: 2
Silhouette Score: 0.3125
Adjusted Rand Index: 0.1285
Davies-Bouldin Index: 2.1355

Sub-task 1: K-Means Random State Comparison

K-Means (no random state) Results:
Number of clusters: 2
Silhouette Score: 0.3434
Adjusted Rand Index: 0.6536
Davies-Bouldin Index: 1.3205

K-Means (random_state=42) Results:
Number of clusters: 2
Silhouette Score: 0.3447
Adjusted Rand Index: 0.6765
Davies-Bouldin Index: 1.3093

K-Means (random_state=100) Results:
Number of clusters: 2
Silhouette Score: 0.3434
Adjusted Rand Index: 0.6536
Davies-Bouldin Index: 1.3205

Effect of random state change:
Silhouette difference (no random state vs random_state=42): 0.0014
ARI difference (no 