In [28]:
import clustbench 
import os.path, genieclust, sklearn.cluster # we will need these later
import matplotlib.pyplot as plt, numpy as np, pandas as pd
import csv
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MeanShift, SpectralClustering, AffinityPropagation, OPTICS, Birch, MiniBatchKMeans, SpectralCoclustering
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from scipy.cluster.hierarchy import linkage, fcluster
import hdbscan
from kmodes.kmodes import KModes
from fcmeans import FCM
from minisom import MiniSom
from sklearn_extra.cluster import KMedoids
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import silhouette_score


In [None]:
def generate_embeddings(X):
    """
    Generate a dictionary of embeddings from a 2D NumPy array of numbers.

    Two families of representations are attempted:

    1. Numerical embeddings computed directly on ``X`` (PCA, t-SNE, UMAP, MDS,
       Isomap, LLE, spectral embedding, kernel PCA, an autoencoder, random
       projection, truncated SVD, FastICA, factor analysis).
    2. Text-based embeddings computed on each row rendered as a
       space-separated string (TF-IDF, counts, character n-gram TF-IDF,
       hashing, SentenceTransformer, DistilBERT, Universal Sentence Encoder,
       NMF on TF-IDF, Doc2Vec, LDA on counts).

    Every embedding runs in isolation: if one fails (missing optional
    dependency, unsuitable input, ...) the error is printed and the remaining
    embeddings are still attempted.

    Parameters:
    -----------
    X : numpy.ndarray
        A 2D array of shape (n_samples, n_features) containing numeric data.

    Returns:
    --------
    dict
        A dictionary where the key "Base" corresponds to the original data and
        additional keys correspond to the embeddings that were computed
        successfully. Failed embeddings are simply absent.
    """
    import numpy as np

    embeddings = {"Base": X}

    def _record(key, compute, done_msg, fail_msg):
        # Run a single embedding; store it on success, report and move on
        # otherwise, so one failure never suppresses the embeddings after it.
        try:
            result = compute()
        except Exception as e:
            print(fail_msg, e)
            return
        embeddings[key] = result
        print(done_msg)

    # ------------------------------------------------
    # 1. Traditional Numerical Embeddings
    #    (each helper imports what it needs so a missing package only
    #    disables the embeddings that depend on it)
    # ------------------------------------------------
    def _pca():
        from sklearn.decomposition import PCA
        # n_components=2 for visualization; no whitening
        return PCA(n_components=2, whiten=False, random_state=42).fit_transform(X)

    def _tsne():
        from sklearn.manifold import TSNE
        # scikit-learn rejects perplexity >= n_samples, so cap the usual
        # default of 30 for very small datasets.
        perplexity = min(30, max(1, X.shape[0] - 1))
        tsne_model = TSNE(
            n_components=2,
            perplexity=perplexity,
            learning_rate='auto',
            max_iter=1000,
            random_state=42,
            init='pca'  # often helps with convergence
        )
        return tsne_model.fit_transform(X)

    def _umap():
        import umap  # requires umap-learn
        umap_model = umap.UMAP(
            n_components=2,
            n_neighbors=15,
            min_dist=0.1,
            metric='euclidean',
            random_state=42
        )
        return umap_model.fit_transform(X)

    def _mds():
        from sklearn.manifold import MDS
        mds_model = MDS(n_components=2, metric=True, random_state=42, n_init=4, max_iter=300)
        return mds_model.fit_transform(X)

    def _isomap():
        from sklearn.manifold import Isomap
        return Isomap(n_components=2, n_neighbors=5).fit_transform(X)

    def _lle():
        from sklearn.manifold import LocallyLinearEmbedding
        lle_model = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)
        return lle_model.fit_transform(X)

    def _spectral():
        from sklearn.manifold import SpectralEmbedding
        spectral_model = SpectralEmbedding(n_components=2, n_neighbors=5, random_state=42)
        return spectral_model.fit_transform(X)

    def _kernel_pca():
        from sklearn.decomposition import KernelPCA
        kpca_model = KernelPCA(n_components=2, kernel='rbf', gamma=None, random_state=42)
        return kpca_model.fit_transform(X)

    def _autoencoder():
        # Architecture: input->(64)->(32)->(2)->(32)->(64)->output
        from tensorflow.keras.layers import Input, Dense
        from tensorflow.keras.models import Model

        input_dim = X.shape[1]
        encoding_dim = 2  # target dimension
        input_layer = Input(shape=(input_dim,))
        encoded = Dense(64, activation='relu')(input_layer)
        encoded = Dense(32, activation='relu')(encoded)
        bottleneck = Dense(encoding_dim, activation='linear')(encoded)
        decoded = Dense(32, activation='relu')(bottleneck)
        decoded = Dense(64, activation='relu')(decoded)
        output_layer = Dense(input_dim, activation='linear')(decoded)

        autoencoder = Model(inputs=input_layer, outputs=output_layer)
        autoencoder.compile(optimizer='adam', loss='mse')
        # Train briefly (increase epochs for better results on real data)
        autoencoder.fit(X, X, epochs=50, batch_size=32, verbose=0)
        # The encoder shares the trained layers up to the bottleneck.
        encoder = Model(inputs=input_layer, outputs=bottleneck)
        return encoder.predict(X)

    def _random_projection():
        from sklearn.random_projection import GaussianRandomProjection
        rp = GaussianRandomProjection(n_components=2, eps=0.1, random_state=42)
        return rp.fit_transform(X)

    def _truncated_svd():
        from sklearn.decomposition import TruncatedSVD
        return TruncatedSVD(n_components=2, random_state=42).fit_transform(X)

    def _fast_ica():
        from sklearn.decomposition import FastICA
        ica = FastICA(n_components=2, whiten=True, max_iter=200, random_state=42)
        return ica.fit_transform(X)

    def _factor_analysis():
        from sklearn.decomposition import FactorAnalysis
        return FactorAnalysis(n_components=2, random_state=42).fit_transform(X)

    # (dictionary key, human-readable label used in log lines, compute function)
    numerical_steps = [
        ("PCA", "PCA", _pca),
        ("t-SNE", "t-SNE", _tsne),
        ("UMAP", "UMAP", _umap),
        ("MDS", "MDS", _mds),
        ("Isomap", "Isomap", _isomap),
        ("LLE", "LLE", _lle),
        ("Spectral", "Spectral", _spectral),
        ("KernelPCA", "Kernel PCA", _kernel_pca),
        ("Autoencoder", "Autoencoder", _autoencoder),
        ("RandomProjection", "Random Projection", _random_projection),
        ("TruncatedSVD", "Truncated SVD", _truncated_svd),
        ("FastICA", "FastICA", _fast_ica),
        ("FactorAnalysis", "Factor Analysis", _factor_analysis),
    ]
    for key, step_label, compute in numerical_steps:
        _record(key, compute, step_label + " embedding done.", step_label + " embedding failed:")

    # ------------------------------------------------
    # 2. Text-Based Embeddings on Numeric Data
    #    (Convert each row to a string, then treat it as text)
    # ------------------------------------------------
    try:
        # Convert each row (sample) to a space-separated string, e.g., "0.12 0.57 0.99 ..."
        X_as_str = [" ".join(map(str, row)) for row in X]
        print("Converted numeric data to strings.")
    except Exception as e:
        print("Error converting numeric data to strings:", e)
        # With no strings, every text embedding below fails and is reported.
        X_as_str = []

    def _tfidf():
        from sklearn.feature_extraction.text import TfidfVectorizer
        return TfidfVectorizer(max_features=500).fit_transform(X_as_str).toarray()

    def _count():
        from sklearn.feature_extraction.text import CountVectorizer
        return CountVectorizer(max_features=500).fit_transform(X_as_str).toarray()

    def _char_tfidf():
        # Imported here as well so this step does not depend on the word-level
        # TF-IDF step having run successfully.
        from sklearn.feature_extraction.text import TfidfVectorizer
        char_vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=(2, 4),
            max_features=500
        )
        return char_vectorizer.fit_transform(X_as_str).toarray()

    def _hashing():
        from sklearn.feature_extraction.text import HashingVectorizer
        return HashingVectorizer(n_features=500).transform(X_as_str).toarray()

    def _sentence_transformer():
        from sentence_transformers import SentenceTransformer
        st_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        return st_model.encode(X_as_str, show_progress_bar=False)

    def _distilbert():
        from transformers import AutoTokenizer, AutoModel
        import torch

        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
        model = AutoModel.from_pretrained("distilbert-base-uncased")

        def get_distilbert_embedding(text):
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = model(**inputs)
            # Mean pooling over token embeddings
            return outputs.last_hidden_state.mean(dim=1).detach().numpy()[0]

        return np.array([get_distilbert_embedding(txt) for txt in X_as_str])

    def _universal_sentence_encoder():
        import tensorflow_hub as hub
        use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        return use_model(X_as_str).numpy()

    def _nmf_on_tfidf():
        from sklearn.decomposition import NMF
        # Reuse the stored TF-IDF matrix; fail with a clear message if that
        # step did not produce one.
        if "TF-IDF_str" not in embeddings:
            raise RuntimeError("TF-IDF embedding is not available")
        nmf_model = NMF(n_components=2, init='nndsvd', random_state=42, max_iter=200)
        return nmf_model.fit_transform(embeddings["TF-IDF_str"])

    def _doc2vec():
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument
        documents = [
            TaggedDocument(words=txt.split(), tags=[str(i)])
            for i, txt in enumerate(X_as_str)
        ]
        doc2vec_model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
        doc2vec_model.build_vocab(documents)
        doc2vec_model.train(
            documents,
            total_examples=doc2vec_model.corpus_count,
            epochs=doc2vec_model.epochs
        )
        return np.array([doc2vec_model.infer_vector(txt.split()) for txt in X_as_str])

    def _lda_on_counts():
        from sklearn.decomposition import LatentDirichletAllocation
        # 5 topics on the count matrix
        lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
        return lda_model.fit_transform(embeddings["Count_str"])

    _record("TF-IDF_str", _tfidf,
            "TF-IDF (from numeric strings) embedding done.",
            "TF-IDF embedding error:")
    _record("Count_str", _count,
            "Count Vectorizer (from numeric strings) embedding done.",
            "Count Vectorizer embedding error:")
    _record("CharTFIDF_str", _char_tfidf,
            "Character-level TF-IDF (from numeric strings) embedding done.",
            "Character-level TF-IDF error:")
    _record("Hashing_str", _hashing,
            "Hashing Vectorizer (from numeric strings) embedding done.",
            "Hashing Vectorizer error:")
    _record("SentenceTransformer_str", _sentence_transformer,
            "SentenceTransformer (from numeric strings) embedding done.",
            "SentenceTransformer embedding error:")
    _record("DistilBERT_str", _distilbert,
            "DistilBERT (from numeric strings) embedding done.",
            "DistilBERT embedding error:")
    _record("USE_str", _universal_sentence_encoder,
            "Universal Sentence Encoder (from numeric strings) embedding done.",
            "Universal Sentence Encoder embedding error:")
    _record("NMF_TFIDF_str", _nmf_on_tfidf,
            "NMF on TF-IDF (from numeric strings) embedding done.",
            "NMF embedding error:")
    _record("Doc2Vec_str", _doc2vec,
            "Doc2Vec (from numeric strings) embedding done.",
            "Doc2Vec embedding error:")
    # LDA is only attempted when the count matrix exists.
    if "Count_str" in embeddings:
        _record("LDA_Count_str", _lda_on_counts,
                "LDA on Count vector (from numeric strings) embedding done.",
                "LDA embedding error:")

    return embeddings


In [31]:
data_url = "https://github.com/gagolews/clustering-data-v1/raw/v1.1.0"

def load_data(collection, dataset):
    """
    Fetch one benchmark dataset and compute its embeddings.

    Calls ``clustbench.load_dataset`` with ``data_url`` as the source, prints
    the row count, column count and number of label vectors, then runs
    ``generate_embeddings`` on the data matrix.

    Returns a tuple ``(X, benchmark, embeddings)``: the data matrix, the
    object returned by clustbench, and the dictionary of embeddings.
    """
    benchmark = clustbench.load_dataset(collection, dataset, url=data_url)
    X = benchmark.data
    n_rows, n_dims = X.shape[0], X.shape[1]
    n_label_sets = len(benchmark.labels)
    print("Loaded: ", n_rows, " | Dimension: ", n_dims, " | Label count: ", n_label_sets)
    print("Generating Embeddings...")
    embeddings = generate_embeddings(X)
    return X, benchmark, embeddings
    
    

In [96]:
"""
Each dataset can have multiple reference label vectors;
pick one at a time, and it determines the number of clusters, k.

Overall, Genie returned a clustering quite similar to the reference one. We may consider 107
(namely, c11 + c22 + c33 ) out of the 120 input points as correctly grouped. In particular, 
all the red and green reference points (the 2nd and the 3rd row) have been properly discovered.

Normalized Clustering Accuracy (NCA) 
NCA is the averaged percentage of correctly classified points in each cluster 
above the perfectly uniform label distribution.
            
"""

# Function to calculate Clustering Fidelity (CF)
def clustering_fidelity(y_true, y_pred):
    """
    Fraction of points that are correctly grouped under the best one-to-one
    matching between reference and predicted cluster IDs.

    Cluster IDs produced by a clustering algorithm are arbitrary, so comparing
    ``y_true == y_pred`` element-wise would score a perfect partition with
    permuted IDs as 0. Instead, the contingency table of the two labelings is
    built and the assignment of predicted to reference IDs that maximizes the
    number of agreeing points is found with the Hungarian algorithm.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label vectors of equal length (flattened before use).

    Returns
    -------
    float
        Value in [0, 1]; 0.0 for empty input.

    Raises
    ------
    ValueError
        If the two label vectors have different lengths.
    """
    import numpy as np
    from scipy.optimize import linear_sum_assignment

    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred differ in length: {y_true.shape[0]} vs {y_pred.shape[0]}"
        )
    n_points = y_true.shape[0]
    if n_points == 0:
        return 0.0

    # Re-index both labelings to 0..n-1 and count co-occurrences.
    true_ids, true_idx = np.unique(y_true, return_inverse=True)
    pred_ids, pred_idx = np.unique(y_pred, return_inverse=True)
    contingency = np.zeros((true_ids.size, pred_ids.size), dtype=np.int64)
    np.add.at(contingency, (true_idx, pred_idx), 1)

    # Best one-to-one matching; rectangular tables are handled by SciPy.
    rows, cols = linear_sum_assignment(contingency, maximize=True)
    return float(contingency[rows, cols].sum() / n_points)

# NCA score stand-in: adjusted mutual information between the two labelings
def nca_score(y_true, y_pred):
    """Return the adjusted mutual information of ``y_true`` and ``y_pred``,
    used in this notebook as a proxy for normalized clustering accuracy."""
    ami = adjusted_mutual_info_score(y_true, y_pred)
    return ami


# Grid size of the self-organizing map; each grid cell is treated as one cluster.
_SOM_GRID_SHAPE = (10, 10)


def _make_clustering_model(method, k, m, n_features):
    """Instantiate the (unfitted) clustering model named by ``method``.

    ``k`` is the requested number of clusters, ``m`` the minimum cluster /
    neighbourhood size for density-based methods and ``n_features`` the input
    dimensionality (needed by MiniSom). Raises ``ValueError`` for an unknown
    method name.
    """
    # Factories are lazy so only the requested model is constructed.
    factories = {
        "genie": lambda: genieclust.Genie(n_clusters=k),  # default parameters
        "kmeans": lambda: KMeans(n_clusters=k, random_state=42, n_init=10),
        "agglomerative": lambda: AgglomerativeClustering(n_clusters=k),
        "dbscan": lambda: DBSCAN(eps=0.2, min_samples=m),
        "meanshift": lambda: MeanShift(),
        "spectral": lambda: SpectralClustering(n_clusters=k, random_state=42),
        "affinitypropagation": lambda: AffinityPropagation(random_state=42),
        "optics": lambda: OPTICS(),
        "gaussianmixture": lambda: GaussianMixture(n_components=k, random_state=42),
        "hdbscan": lambda: hdbscan.HDBSCAN(min_cluster_size=m),
        "kmodes": lambda: KModes(n_clusters=k, random_state=42, init="Huang"),
        "birch": lambda: Birch(n_clusters=k),
        "minibatchkmeans": lambda: MiniBatchKMeans(n_clusters=k, random_state=42),
        "fcm": lambda: FCM(n_clusters=k),
        "minisom": lambda: MiniSom(
            x=_SOM_GRID_SHAPE[0], y=_SOM_GRID_SHAPE[1], input_len=n_features,
            sigma=1.0, learning_rate=0.5, random_seed=42,
        ),
        "kmedoids": lambda: KMedoids(n_clusters=k, random_state=42),
        "latentdirichletallocation": lambda: LatentDirichletAllocation(
            n_components=k, random_state=42
        ),
        "spectralcoclustering": lambda: SpectralCoclustering(n_clusters=k, random_state=42),
        "bayesiangaussianmixture": lambda: BayesianGaussianMixture(
            n_components=k, random_state=42
        ),
    }
    if method not in factories:
        raise ValueError(
            f"Unknown clustering method {method!r}; expected one of {sorted(factories)}"
        )
    return factories[method]()


def _fit_and_label(model, method, X):
    """Fit ``model`` on ``X`` and return one 1-based cluster label per row.

    Models that report noise as -1 therefore yield 0 for noise points.
    """
    if method == "fcm":
        # FCM.fit() returns None, so fit and predict must be separate calls.
        model.fit(X)
        return np.asarray(model.predict(X)) + 1
    if method == "minisom":
        model.train(X, 100)
        # winner() gives (row, col) grid coordinates; collapse them to a single
        # cell index so there is exactly one label per sample.
        cells = [np.ravel_multi_index(model.winner(x), _SOM_GRID_SHAPE) for x in X]
        return np.asarray(cells) + 1
    if method == "latentdirichletallocation":
        model.fit(X)
        # Label each sample with its dominant topic.
        return model.transform(X).argmax(axis=1) + 1
    if method == "spectralcoclustering":
        model.fit(X)
        return model.row_labels_ + 1
    # Everything else (including the Gaussian mixtures) exposes fit_predict.
    return model.fit_predict(X) + 1


def predict(embedding_technique, X, label, benchmark, clustering_method, plot=False):
    """
    Cluster ``X`` with the named method and score the result against one
    reference labeling of ``benchmark``.

    Parameters
    ----------
    embedding_technique : str
        Name of the embedding that produced ``X`` (not used in the computation).
    X : numpy.ndarray
        Data matrix, one row per sample.
    label : int
        Index into ``benchmark.labels`` selecting the reference labeling.
    benchmark : object
        Object with a ``labels`` sequence of label vectors.
    clustering_method : str
        Case-insensitive method name, e.g. "genie", "kmeans", "dbscan".
    plot : bool, default False
        If True, draw reference and predicted labelings side by side with
        ``genieclust.plots.plot_scatter``.

    Returns
    -------
    tuple
        ``(cf, nca)``: clustering fidelity and adjusted mutual information
        (used here as a proxy for normalized clustering accuracy).

    Raises
    ------
    ValueError
        If ``clustering_method`` is unknown, or the model returns a number of
        labels different from the number of reference labels.
    """
    y_true = np.asarray(benchmark.labels[label]).flatten()
    k = int(max(y_true))  # or benchmark.n_clusters[0]
    m = max(int(min(y_true)), 2)
    method = clustering_method.lower()

    if method == "latentdirichletallocation":
        # LDA requires non-negative input.
        X = np.maximum(X, 0)

    model = _make_clustering_model(method, k, m, X.shape[1])
    print("the model: " +  method + " has been trained now getting y_pred")

    y_pred = np.asarray(_fit_and_label(model, method, X)).flatten()
    if len(y_pred) != len(y_true):
        # Scoring misaligned labels would produce meaningless numbers.
        raise ValueError(
            f"{method} returned {len(y_pred)} labels for {len(y_true)} samples"
        )

    cf = clustering_fidelity(y_true, y_pred)
    # Computed directly rather than through the module-level name `nca_score`,
    # which notebook cells may rebind when unpacking this function's result.
    nca = adjusted_mutual_info_score(y_true, y_pred)

    if plot:
        plt.subplot(1, 2, 1)
        genieclust.plots.plot_scatter(X, labels=y_true-1, axis="equal", title="y_true")
        plt.subplot(1, 2, 2)
        genieclust.plots.plot_scatter(X, labels=y_pred-1, axis="equal", title="y_pred")
        plt.show()

    return cf, nca
    
    

TODO: FCM raises a NoneType error that I have not yet been able to diagnose or handle, and I would appreciate help with it. For now, FCM is left out of the list of clustering methods.

In [61]:
import os

# Show the kernel's current working directory so the results file path used
# later in the notebook can be checked against it.
print(os.getcwd()) # run to check current working directory and update file path if needed

/Users/ikshitayadav/dev/embedding_based_clustering_research/framework


In [98]:
# Benchmark collections to evaluate, as {collection name: [dataset names]}.
eval_collections = {"wut": ["x2"], "other": ["iris"]}

# Clustering methods to run on every embedding (names understood by predict()).
clustering_methods = [
    "genie", "kmeans", "agglomerative", "dbscan", "meanshift", "spectral",
    "affinitypropagation", "optics", "gaussianmixture", "hdbscan", "kmodes",
    "birch", "minibatchkmeans", "minisom", "kmedoids",
    "latentdirichletallocation", "spectralcoclustering", "bayesiangaussianmixture",
]

# Results file, relative to the notebook's working directory (the project's
# `framework` folder) so the notebook is not tied to one user's home directory.
result_csv = os.path.join("results", "v1_test.csv")

In [63]:
""" 
Run once to create the results CSV and write its header row.
An existing file is left untouched (including whatever header it already has).
"""
import os
import csv

# One name per value written for each result row; both scores get a column.
RESULT_COLUMNS = [
    "Collection", "Dataset", "Clustering Method", "Label", "Embedding",
    "Clustering Fidelity", "NCA Score",
]

if os.path.exists(result_csv):
    print("File already exists")
else:
    try:
        # Make sure the target folder exists before creating the file.
        parent_dir = os.path.dirname(result_csv)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(result_csv, mode='w', newline='') as file: 
            writer = csv.writer(file)
            writer.writerow(RESULT_COLUMNS)
    except OSError as e:
        print("Error writing to file: ", e)


File already exists


In [42]:
# TODO: maybe create a cache or temporary storage for the embeddings
# TODO: parallelize the embedding and clustering process per dataset? 

In [99]:
# Evaluate every (dataset, label set, embedding, clustering method) combination
# and append one row per combination to the results CSV.
with open(result_csv, mode='a', newline='') as file:
    writer = csv.writer(file)
    for collection, datasets in eval_collections.items():
        for dataset in datasets:
            print(f"Collection: {collection}, Dataset: {dataset}")
            X, benchmark, X_embedded_dict = load_data(collection, dataset)

            for label in range(len(benchmark.labels)):
                for embedding_technique, embedded_data in X_embedded_dict.items():
                    for clustering_method in clustering_methods:
                        # Unpack into `nca`, not `nca_score`, so the
                        # module-level nca_score() function is not overwritten.
                        cf, nca = predict(
                            embedding_technique,
                            embedded_data,
                            label,
                            benchmark,
                            clustering_method
                        )
                        writer.writerow([
                            collection,          # e.g. "wut"
                            dataset,             # e.g. "x2"
                            clustering_method,   # e.g. "genie"
                            label,               # which label set index (0, 1, ...)
                            embedding_technique, # e.g. "PCA", "t-SNE", ...
                            cf,                  # clustering fidelity
                            nca                  # NCA proxy (adjusted mutual information)
                        ])
                        # Push the row to disk so finished results survive an
                        # interrupted run.
                        file.flush()
# AR (Adjusted Rand Index): Measures the similarity between two data clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings, adjusted for chance.

# R (Rand Index): Similar to AR but not adjusted for chance. It measures the percentage of correct decisions made by the clustering algorithm.

# FM (Fowlkes-Mallows Index): Measures the similarity between two clusterings by considering the geometric mean of the precision and recall.

# AFM (Adjusted Fowlkes-Mallows Index): An adjusted version of the Fowlkes-Mallows Index that accounts for chance.

# MI (Mutual Information): Measures the amount of information obtained about one clustering from the other clustering

Collection: wut, Dataset: x2
Loaded:  120  | Dimension:  2  | Label count:  2
Generating Embeddings...
PCA embedding done.
t-SNE embedding done.
UMAP embedding done.
MDS embedding done.
Isomap embedding done.
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred


  warn(


the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now ge



the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getti



the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred




the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the mo



the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred




the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now ge



the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred




the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_p



the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred




the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now ge



the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now ge



the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now ge



the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
Collection: other, Dataset: iris
Loaded:  150  | Dimension:  4  | Label count:  1
Generating Embeddings...
PCA embedding done.
t-SNE embedding done.
UMAP embedding done.


  warn(


MDS embedding done.
Isomap embedding done.
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred


  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred




the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred




the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the model: latentdirichletallocation has been trained now getting y_pred
the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now ge



the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the mo



the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the mo



the model: spectralcoclustering has been trained now getting y_pred
the model: bayesiangaussianmixture has been trained now getting y_pred
the model: genie has been trained now getting y_pred
the model: kmeans has been trained now getting y_pred
the model: agglomerative has been trained now getting y_pred
the model: dbscan has been trained now getting y_pred
the model: meanshift has been trained now getting y_pred
the model: spectral has been trained now getting y_pred
the model: affinitypropagation has been trained now getting y_pred
the model: optics has been trained now getting y_pred
the model: gaussianmixture has been trained now getting y_pred
the model: hdbscan has been trained now getting y_pred
the model: kmodes has been trained now getting y_pred
the model: birch has been trained now getting y_pred
the model: minibatchkmeans has been trained now getting y_pred
the model: minisom has been trained now getting y_pred
the model: kmedoids has been trained now getting y_pred
the mo



In [10]:
import pandas as pd

def filter_and_compare_csv(file_path, score_column=None, base_embedding='Base'):
    """
    Keep only the embeddings that score higher than their group's baseline row.

    The CSV is read with its first line as the header and grouped by
    ('Collection', 'Dataset', 'Clustering Method', 'Label'). Within each group
    the first row whose 'Embedding' equals ``base_embedding`` is the baseline.
    Every non-baseline row whose score is strictly greater than the baseline
    score is kept, preceded by the baseline row itself. Groups without a
    baseline row, or without any improving row, contribute nothing.

    Parameters
    ----------
    file_path : str or path-like
        Location of the results CSV; passed straight to ``pd.read_csv``.
    score_column : str, optional
        Name of the column holding the score to compare. Defaults to the last
        column of the file.
    base_embedding : str, optional
        Value of the 'Embedding' column that marks the baseline row.

    Returns
    -------
    pandas.DataFrame
        The retained rows with the same columns as the CSV, ordered by group,
        baseline first, with exact duplicate rows removed. Empty (columns only)
        when no embedding beats its baseline.
    """
    df = pd.read_csv(file_path)

    # Fall back to the last column so existing callers keep the old behaviour.
    if score_column is None:
        score_column = df.columns[-1]

    group_keys = ['Collection', 'Dataset', 'Clustering Method', 'Label']
    kept_frames = []

    for _, group in df.groupby(group_keys):
        is_base = group['Embedding'] == base_embedding
        if not is_base.any():
            continue

        # Only the first baseline row is used as the reference.
        base_row = group[is_base].iloc[[0]]
        base_value = base_row[score_column].iloc[0]

        # Comparisons against NaN are False, so rows with a missing score
        # (or a missing baseline score) are never selected.
        improved = group[~is_base & (group[score_column] > base_value)]
        if improved.empty:
            continue

        kept_frames.extend([base_row, improved])

    if not kept_frames:
        return pd.DataFrame(columns=df.columns)

    # Renumber rows first so that de-duplication leaves the same index gaps
    # as building the frame from scratch would.
    return pd.concat(kept_frames, ignore_index=True).drop_duplicates()

# Example usage
# The results CSV location can be overridden with the CLUSTERING_RESULTS_CSV
# environment variable so the notebook runs on other machines; when it is
# unset, the original local path is used.
import os

file_path = os.environ.get(
    'CLUSTERING_RESULTS_CSV',
    '/Users/cajoshuapark/Dev/research/embedding_based_clustering_research/framework/results/v1_test.csv',
)
filtered_df = filter_and_compare_csv(file_path)

# Display the filtered DataFrame
filtered_df

Unnamed: 0,Collection,Dataset,Clustering Method,Label,Embedding,Confusion Matrix,NCA Score
0,other,iris,genie,0,Base,"{'ar': 0.8857921001989628, 'r': 0.949530201342...",0.94
1,other,iris,genie,0,Autoencoder,"{'ar': 0.9037675791580496, 'r': 0.957494407158...",0.95
2,wut,x2,genie,0,Base,"{'ar': 0.6882872342370341, 'r': 0.859523809523...",0.87
3,wut,x2,genie,0,KernelPCA,"{'ar': 0.750803955434529, 'r': 0.8880952380952...",0.9
4,wut,x2,genie,1,Base,"{'ar': 0.6860113896866956, 'r': 0.871988795518...",0.379032
5,wut,x2,genie,1,UMAP,"{'ar': 0.649943821612011, 'r': 0.8665266106442...",0.548913
6,wut,x2,genie,1,Isomap,"{'ar': 0.8884037118549749, 'r': 0.955462184873...",0.75
7,wut,x2,genie,1,LLE,"{'ar': 0.6085107451148242, 'r': 0.832352941176...",0.657991
8,wut,x2,genie,1,KernelPCA,"{'ar': 0.6994166307700244, 'r': 0.878151260504...",0.387097
9,wut,x2,genie,1,Autoencoder,"{'ar': 0.799797235568715, 'r': 0.9196078431372...",0.579545
