Prepare base dataset

In [1]:
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from minisom import MiniSom  # pip install minisom if not yet
import pandas as pd
import numpy as np
import os

# Load the normalized gene expression matrix and transpose it so that each
# row is one sample and each column is one feature.
df = pd.read_csv("example/data/GSE/GSE5281_normalized_gene_expression.csv", index_col=0).T

# Placeholder ("fake") class labels for comparison: the first N_AD_SAMPLES rows
# are labelled Alzheimer's Disease and every remaining row Control.
# NOTE(review): this relies on the row order of the CSV -- confirm against the
# real sample metadata before trusting ARI/NMI.
N_AD_SAMPLES = 87
LABEL_CODES = {"Alzheimer's Disease": 1, "Control": 0}

df["Id"] = df.index
df["Species"] = np.where(
    np.arange(len(df)) < N_AD_SAMPLES, "Alzheimer's Disease", "Control"
)

# Feature matrix without the two metadata columns added above.
features_only = df.drop(columns=["Id", "Species"])

# Integer-encode the labels with an explicit mapping. `Series.map` yields an
# integer array directly, whereas `Series.replace` from str to int depends on
# pandas' deprecated silent downcasting of object columns.
true_labels = df["Species"].map(LABEL_CODES).to_numpy()


k-Means + Hierarchical

In [2]:
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import pairwise_distances_argmin_min

# Step 2a: over-partition the samples into 10 k-means clusters.
kmeans = KMeans(n_clusters=10, random_state=0).fit(features_only)
kmeans_labels = kmeans.labels_

# Step 2b: average-linkage hierarchical clustering of the 10 centroids,
# cutting the dendrogram so that at most 2 flat clusters remain.
Z_kmeans = linkage(kmeans.cluster_centers_, method='average')
final_clusters_kmeans = fcluster(Z_kmeans, t=2, criterion='maxclust')

# Every sample inherits the flat-cluster id of the centroid it was assigned to.
predicted_labels_kmeans = list(final_clusters_kmeans[kmeans_labels])




SOM + Hierarchical

In [3]:
# Step 3a: train a 10x10 self-organising map on the sample matrix.
som_input = features_only.to_numpy()
som = MiniSom(x=10, y=10, input_len=som_input.shape[1], sigma=1.0, learning_rate=0.5, random_seed=42)
som.train_random(som_input, 100)

# Grid position of the best-matching unit (BMU) for each sample, one row per sample.
som_coords = np.array(list(map(som.winner, som_input)))
som_coords_df = pd.DataFrame(som_coords, columns=["x", "y"])

# Step 3b: average-linkage hierarchical clustering of the per-sample BMU
# coordinates, cut into at most 2 flat clusters.
Z_som = linkage(som_coords_df.to_numpy(), method='average')
predicted_labels_som = fcluster(Z_som, t=2, criterion='maxclust')


DBSCAN

In [4]:
# Step 4: density-based clustering directly on the full feature matrix.
dbscan = DBSCAN(eps=5, min_samples=5, metric='euclidean').fit(features_only)
predicted_labels_dbscan = dbscan.labels_

# Boolean mask over samples: True where DBSCAN assigned a cluster, False for
# noise points (label -1). Nothing is filtered here; the mask is only built.
valid_idx = ~(predicted_labels_dbscan == -1)


Evaluation Function

In [5]:
def evaluate_clustering(true, pred, X):
    """Score a clustering against reference labels and the feature matrix.

    Parameters
    ----------
    true : array-like of shape (n_samples,)
        Reference labels, compared with ``pred`` for ARI and NMI.
    pred : array-like of shape (n_samples,)
        Predicted cluster labels. Null entries (``None``/``NaN``) are dropped,
        together with the matching rows of ``true`` and ``X``.
    X : array-like of shape (n_samples, n_features)
        Feature matrix used for the silhouette score.

    Returns
    -------
    tuple
        ``(silhouette, ari, nmi)``. All three are ``-1`` ("invalid") when the
        retained predictions do not form between 2 and ``n_samples - 1``
        clusters, because the silhouette score is undefined outside that range.
    """
    # Coerce everything to arrays so boolean-mask indexing also works for
    # plain lists and pandas objects.
    pred = np.asarray(pred)
    true = np.asarray(true)
    X = np.asarray(X)

    mask = ~pd.isnull(pred)
    kept_pred = pred[mask]
    n_samples = kept_pred.shape[0]
    n_clusters = len(set(kept_pred))

    # silhouette_score raises unless 2 <= n_clusters <= n_samples - 1; report
    # the sentinel for those degenerate clusterings instead of crashing.
    if not 1 < n_clusters < n_samples:
        return -1, -1, -1

    kept_true = true[mask]
    sil = silhouette_score(X[mask], kept_pred)
    ari = adjusted_rand_score(kept_true, kept_pred)
    nmi = normalized_mutual_info_score(kept_true, kept_pred)
    return sil, ari, nmi


 Run Evaluation and Save Results

In [9]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# Dense feature matrix shared by all three evaluations.
X_eval = features_only.to_numpy()

# kMeans + Hierarchical: clustering scores plus the cophenetic correlation
# between the centroid dendrogram and the pairwise centroid distances.
kmeans_scores = evaluate_clustering(true_labels, predicted_labels_kmeans, X_eval)
ccc_kmeans, _ = cophenet(Z_kmeans, pdist(kmeans.cluster_centers_))

# SOM + Hierarchical: same scores, with the cophenetic correlation computed
# on the per-sample BMU coordinates.
som_scores = evaluate_clustering(true_labels, predicted_labels_som, X_eval)
ccc_som, _ = cophenet(Z_som, pdist(som_coords_df.to_numpy()))

# DBSCAN: scored on non-noise samples only; there is no dendrogram, so no CCC.
sil, ari, nmi = evaluate_clustering(
    true_labels[valid_idx],
    predicted_labels_dbscan[valid_idx],
    X_eval[valid_idx],
)

# One row per method: [name, silhouette, ARI, NMI, CCC].
results = [
    ["kMeans + Hierarchical", *kmeans_scores, ccc_kmeans],
    ["SOM + Hierarchical", *som_scores, ccc_som],
    ["DBSCAN", sil, ari, nmi, None],
]


In [10]:
# Tabulate one row per method and write the metrics (CCC included) to disk.
metric_columns = ["Method", "Silhouette", "ARI", "NMI", "CCC"]
results_df = pd.DataFrame(results, columns=metric_columns)

# Create the output directory on first run; a no-op when it already exists.
os.makedirs("results", exist_ok=True)
output_csv = "results/clustering_comparison_metrics.csv"
results_df.to_csv(output_csv, index=False)

print("✅ Comparison results with CCC saved to 'results/clustering_comparison_metrics.csv'")


✅ Comparison results with CCC saved to 'results/clustering_comparison_metrics.csv'
