In [58]:
import clustbench # clustering-benchmarks
import os.path, genieclust, sklearn.cluster # we will need these later
import matplotlib.pyplot as plt, numpy as np, pandas as pd
import csv

In [59]:
def generate_embeddings(X, n_components=2, random_state=42):
    """Embed ``X`` with PCA and t-SNE and return every variant in one dict.

    Parameters
    ----------
    X : array-like
        Data matrix; passed unchanged to each model's ``fit_transform``.
    n_components : int, default 2
        Target dimensionality requested from both PCA and t-SNE.
    random_state : int or None, default 42
        Seed handed to both models so that repeated runs produce the same
        embeddings.

    Returns
    -------
    dict
        ``{"Base": X, "PCA": X_pca, "t-SNE": X_tsne}`` where ``"Base"`` is
        the input object itself (not a copy).

    Side effects
    ------------
    Prints the first five rows of each embedding.
    """
    # Kept as function-local imports, as in the original cell.
    from sklearn.manifold import TSNE
    from sklearn.decomposition import PCA

    def apply_embedding(model, data):
        """Fit ``model`` on ``data`` and return the transformed data."""
        return model.fit_transform(data)

    # PCA is seeded too: scikit-learn may select a randomized SVD solver,
    # which is not deterministic unless random_state is fixed.
    pca_model = PCA(n_components=n_components, random_state=random_state)
    X_pca = apply_embedding(pca_model, X)
    print("PCA Result:\n", X_pca[:5])

    tsne_model = TSNE(n_components=n_components, random_state=random_state)
    X_tsne = apply_embedding(tsne_model, X)
    print("t-SNE Result:\n", X_tsne[:5])

    X_embedded_dict = {"Base": X, "PCA": X_pca, "t-SNE": X_tsne}
    return X_embedded_dict

In [60]:
data_url = "https://github.com/gagolews/clustering-data-v1/raw/v1.1.0"

def load_data(collection, dataset):
    """Fetch one benchmark dataset and build its embeddings.

    The dataset is loaded with ``clustbench.load_dataset`` from ``data_url``.
    A one-line summary (row count, column count, number of reference
    labellings) is printed before the embeddings are generated.

    Returns
    -------
    tuple
        ``(X, benchmark, embeddings)`` where ``X`` is ``benchmark.data`` and
        ``embeddings`` is the result of ``generate_embeddings(X)``.
    """
    bench = clustbench.load_dataset(collection, dataset, url=data_url)
    points = bench.data

    n_points = points.shape[0]
    n_dims = points.shape[1]
    n_labelings = len(bench.labels)
    print("Loaded: ", n_points, " | Dimension: ", n_dims, " | Label count: ", n_labelings)

    print("Generating Embeddings...")
    embeddings = generate_embeddings(points)
    return points, bench, embeddings
    

In [61]:
def predict(embedding_technique, X, label, benchmark, plot=False):
    """Cluster ``X`` with Genie and score it against one reference labelling.

    Each dataset can have multiple reference labellings; pick one at a time
    via ``label``. The chosen labelling defines the partition size ``k``.

    Parameters
    ----------
    embedding_technique : str
        Name of the embedding that produced ``X``. Not used by the
        computation; kept so existing callers keep working.
    X : array-like
        Data to cluster (raw or embedded).
    label : int
        Index into ``benchmark.labels``.
    benchmark : object
        Must expose ``labels``, a sequence of integer label vectors.
    plot : bool, default False
        When true, draw the reference and predicted labellings side by side.

    Returns
    -------
    tuple
        ``(cf, nca_score)``. ``cf`` is the dict of partition similarity
        scores returned by ``compare_partitions`` (keys such as ``'ar'``,
        ``'nmi'``, ``'nca'``), not a confusion matrix. ``nca_score`` is the
        normalized clustering accuracy: the averaged percentage of correctly
        classified points in each cluster above the perfectly uniform label
        distribution.
    """
    y_true = np.asarray(benchmark.labels[label])
    k = int(y_true.max())  # number of reference clusters

    g = genieclust.Genie(n_clusters=k)  # using default parameters
    y_pred = np.asarray(g.fit_predict(X)) + 1  # +1 makes cluster IDs 1..k, not 0..(k-1)

    # In the clustering-benchmarks label convention, reference label 0 marks a
    # noise point rather than a cluster. Such points are left out of the
    # scoring so they do not act as an extra reference cluster. When there
    # are no noise points the mask keeps everything.
    scored = y_true > 0
    y_true_scored = y_true[scored]
    y_pred_scored = y_pred[scored]

    cf = genieclust.compare_partitions.compare_partitions(y_true_scored, y_pred_scored)
    print("Partition similarity scores:\n", cf)

    nca_score = genieclust.compare_partitions.normalized_clustering_accuracy(
        y_true_scored, y_pred_scored
    )
    print("Normalized Clustering Accuracy: ", nca_score)

    if plot:
        plt.subplot(1, 2, 1)
        genieclust.plots.plot_scatter(X, labels=y_true-1, axis="equal", title="y_true")
        plt.subplot(1, 2, 2)
        genieclust.plots.plot_scatter(X, labels=y_pred-1, axis="equal", title="y_pred")
        plt.show()

    return cf, nca_score
    
    

In [62]:
# Show the current working directory. The results file is opened with the
# relative path 'framework/results/v1_test.csv', so if this is not the project
# root, change the working directory or update that path before running the
# evaluation cell.
print(os.getcwd())

/Users/aaditya/development/embedding_based_clustering_research


In [63]:
""" 
Run to set the column names for the csv file
"""
# with open('framework/results/v1_test.csv', mode='a', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(["Collection", "Dataset", "Label", "Embedding", "Scores", "NCA Score"])

' \nRun to set the column names for the csv file\n'

In [64]:
# Collections to evaluate, as {collection name: [dataset names]}.
eval_collections = {"wut": ["x2"]}

# Output file (relative to the working directory) and the column names for
# the six fields written per row.
RESULTS_CSV = 'framework/results/v1_test.csv'
RESULTS_HEADER = ["Collection", "Dataset", "Label", "Embedding", "Scores", "NCA Score"]

# Create the output directory if it is missing so open() does not fail on a
# fresh checkout.
os.makedirs(os.path.dirname(RESULTS_CSV), exist_ok=True)

# The file is opened in append mode, so the header is written only when the
# file is new or still empty; re-runs keep adding rows below it.
needs_header = not os.path.exists(RESULTS_CSV) or os.path.getsize(RESULTS_CSV) == 0

with open(RESULTS_CSV, mode='a', newline='') as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(RESULTS_HEADER)
    for collection, datasets in eval_collections.items():
        for dataset in datasets:
            print(f"Collection: {collection}, Dataset: {dataset}")
            X, benchmark, X_embedded_dict = load_data(collection, dataset)
            # One row per (reference labelling, embedding) pair.
            for label in range(len(benchmark.labels)):
                for key, value in X_embedded_dict.items():
                    cf, nca_score = predict(key, value, label, benchmark)
                    writer.writerow([collection, dataset, label, key, cf, nca_score])

Collection: wut, Dataset: x2
Loaded:  120  | Dimension:  2  | Label count:  2
Generating Embeddings...
PCA Result:
 [[-0.29902341  0.0486238 ]
 [-0.47315591  0.20161112]
 [-0.36135349  0.14382503]
 [-0.58381059  0.40180117]
 [-0.50968657 -0.19857204]]
t-SNE Result:
 [[0.9536017  0.11645736]
 [2.7982829  0.5024181 ]
 [2.0110471  0.0515926 ]
 [3.8441398  0.13945578]
 [0.47838864 2.5857112 ]]
Confusion Matrix:
 {'ar': 0.6882872342370341, 'r': 0.8595238095238096, 'fm': 0.7951855144568276, 'afm': 0.6882935462285218, 'mi': 0.806653849621837, 'nmi': 0.7495519545020478, 'ami': 0.7455077928160847, 'npa': 0.8375000000000001, 'psi': 0.7417149159084644, 'spsi': 0.7384863523573202, 'nca': 0.8700000000000001}
Normalized Clustering Accuracy:  0.8700000000000001
Confusion Matrix:
 {'ar': 0.6882872342370341, 'r': 0.8595238095238096, 'fm': 0.7951855144568276, 'afm': 0.6882935462285218, 'mi': 0.806653849621837, 'nmi': 0.7495519545020478, 'ami': 0.7455077928160847, 'npa': 0.8375000000000001, 'psi': 0.7417