In [1]:
import os.path
import numpy as np
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor

import clustbench
import genieclust
from sklearn.cluster import (
    KMeans, AgglomerativeClustering, DBSCAN, MeanShift, SpectralClustering,
    AffinityPropagation, OPTICS, Birch, MiniBatchKMeans, SpectralCoclustering
)
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.decomposition import (
    PCA, KernelPCA, TruncatedSVD, FastICA, FactorAnalysis, LatentDirichletAllocation, NMF
)
from sklearn.manifold import (
    TSNE, MDS, Isomap, LocallyLinearEmbedding, SpectralEmbedding
)
from sklearn.random_projection import GaussianRandomProjection
from sklearn.feature_extraction.text import (
    TfidfVectorizer, CountVectorizer, HashingVectorizer
)
from sklearn.metrics import (
    accuracy_score, rand_score, adjusted_rand_score,
    fowlkes_mallows_score, mutual_info_score, adjusted_mutual_info_score, normalized_mutual_info_score
)
import hdbscan
from kmodes.kmodes import KModes
from sklearn_extra.cluster import KMedoids
from sklearn import metrics
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoTokenizer, AutoModel
)
import umap
import torch

2025-03-24 23:14:43.015065: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def preprocess_data(X):
    """Convert each row of X into a string, joined by spaces."""
    return [" ".join(map(str, row)) for row in X]

# -------------------------------------------------------------------
# 2. Text-based Embedding Functions
# -------------------------------------------------------------------
def generate_TFIDF_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    vectorizer = TfidfVectorizer(max_features=500)
    return vectorizer.fit_transform(X_as_str).toarray()

def generate_CountVectorizer_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    vectorizer = CountVectorizer(max_features=500)
    return vectorizer.fit_transform(X_as_str).toarray()

def generate_CharTFIDF_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), max_features=500)
    return vectorizer.fit_transform(X_as_str).toarray()

def generate_Hashing_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    vectorizer = HashingVectorizer(n_features=500)
    return vectorizer.transform(X_as_str).toarray()

def generate_SentenceTransformer_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    st_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    return st_model.encode(X_as_str, show_progress_bar=False)

def generate_DistilBERT_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModel.from_pretrained("distilbert-base-uncased")

    def get_distilbert_embedding(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean pooling over token embeddings
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()[0]

    return np.array([get_distilbert_embedding(txt) for txt in X_as_str])

def generate_Doc2Vec_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    documents = [TaggedDocument(words=txt.split(), tags=[str(i)]) for i, txt in enumerate(X_as_str)]
    doc2vec_model = Doc2Vec(vector_size=50, min_count=1, epochs=40)
    doc2vec_model.build_vocab(documents)
    doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
    return np.array([doc2vec_model.infer_vector(txt.split()) for txt in X_as_str])

def generate_multilingual_e5_large_instruct_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
    return model.encode(X_as_str, show_progress_bar=False)

def generate_KaLM_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    model = SentenceTransformer('HIT-TMG/KaLM-embedding-multilingual-mini-v1')
    return model.encode(X_as_str, show_progress_bar=False)


def generate_mxbai_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
    return model.encode(X_as_str, show_progress_bar=False)

def generate_bge_embedding(X, X_as_str=None):
    if X_as_str is None:
        X_as_str = preprocess_data(X)
    model = SentenceTransformer('BAAI/bge-reranker-large')
    return model.encode(X_as_str, show_progress_bar=False)

# -------------------------------------------------------------------
# 3. Non-text Embedding Functions (unchanged)
# -------------------------------------------------------------------
# def generate_PCA_embedding(X):
#     return PCA(n_components=2, whiten=False, random_state=42).fit_transform(X)

# def generate_TSNE_embedding(X):
#     return TSNE(
#         n_components=2,
#         perplexity=30,
#         learning_rate='auto',
#         max_iter=1000,
#         random_state=42,
#         init='pca'
#     ).fit_transform(X)

# def generate_UMAP_embedding(X):
#     return umap.UMAP(
#         n_components=2,
#         n_neighbors=15,
#         min_dist=0.1,
#         metric='euclidean',
#         random_state=42
#     ).fit_transform(X)

# def generate_MDS_embedding(X):
#     return MDS(n_components=2, metric=True, random_state=42, n_init=4, max_iter=300).fit_transform(X)

# def generate_Isomap_embedding(X):
#     return Isomap(n_components=2, n_neighbors=5).fit_transform(X)

# def generate_LLE_embedding(X):
#     return LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42).fit_transform(X)

# def generate_SpectralEmbedding_embedding(X):
#     return SpectralEmbedding(
#         n_components=2,
#         n_neighbors=5,
#         random_state=42
#     ).fit_transform(X)

# def generate_KernelPCA_embedding(X):
#     return KernelPCA(
#         n_components=2,
#         kernel='rbf',
#         gamma=None,
#         random_state=42
#     ).fit_transform(X)

# def generate_Gaussian_random_projection(X):
#     return GaussianRandomProjection(n_components=2, eps=0.1, random_state=42).fit_transform(X)

# def generate_TruncatedSVD_embedding(X):
#     return TruncatedSVD(n_components=2, random_state=42).fit_transform(X)

# def generate_FastICA_embedding(X):
#     return FastICA(n_components=2, whiten=True, max_iter=200, random_state=42).fit_transform(X)

# def generate_FactorAnalysis_embedding(X):
#     return FactorAnalysis(n_components=2, random_state=42).fit_transform(X)


In [3]:
def generate_embeddings(X):
    # Precompute the text representation once
    X_as_str = preprocess_data(X)
    
    # List your embedding functions as tuples: (name, function, is_text_based)
    embedding_functions = [
        # ("PCA", generate_PCA_embedding, False),
        # ("KernelPCA", generate_KernelPCA_embedding, False),
        # ("TruncatedSVD", generate_TruncatedSVD_embedding, False),
        # ("FactorAnalysis", generate_FactorAnalysis_embedding, False),
        # ("TSNE", generate_TSNE_embedding, False),
        # ("UMAP", generate_UMAP_embedding, False),
        # ("MDS", generate_MDS_embedding, False),
        # ("Isomap", generate_Isomap_embedding, False),
        # ("LLE", generate_LLE_embedding, False),
        # ("SpectralEmbedding", generate_SpectralEmbedding_embedding, False),
        # ("GaussianRP", generate_Gaussian_random_projection, False),
        ("TFIDF", generate_TFIDF_embedding, True),
        ("CountVectorizer", generate_CountVectorizer_embedding, True),
        ("CharTFIDF", generate_CharTFIDF_embedding, True),
        ("Hashing", generate_Hashing_embedding, True),
        ("SentenceTransformer", generate_SentenceTransformer_embedding, True),
        ("DistilBERT", generate_DistilBERT_embedding, True),
        ("Doc2Vec", generate_Doc2Vec_embedding, True),
        ("multilingual_e5_large_instruct", generate_multilingual_e5_large_instruct_embedding, True),
        ("KaLM", generate_KaLM_embedding, True),
        ("mxbai", generate_mxbai_embedding, True),
        ("bge", generate_bge_embedding, True),
    ]
    
    results_dict = {}
    with ThreadPoolExecutor() as executor:
        futures = {}
        for name, func, is_text in embedding_functions:
            # For text-based functions, pass the precomputed X_as_str; otherwise, just pass X.
            if is_text:
                futures[executor.submit(func, X, X_as_str)] = name
            else:
                futures[executor.submit(func, X)] = name

        for future in futures:
            func_name = futures[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"Error in {func_name}: {e}")
                result = None
            results_dict[func_name] = result

    results_dict['Base'] = X
    return results_dict


In [4]:
data_url = "https://github.com/gagolews/clustering-data-v1/raw/v1.1.0"

import pickle

def get_cached_embeddings(collection, dataset, X):
    cache_dir = "embedding_cache_2"
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, f"{collection}_{dataset}_embeddings.pkl")
    
    if os.path.exists(cache_file):
        print("Loading embeddings from cache:", cache_file)
        with open(cache_file, "rb") as f:
            embeddings = pickle.load(f)
    else:
        print("Cache not found. Generating embeddings...")
        embeddings = generate_embeddings(X)
        with open(cache_file, "wb") as f:
            pickle.dump(embeddings, f)
        print("Embeddings cached at:", cache_file)
    return embeddings

def load_data(collection, dataset):
    benchmark = clustbench.load_dataset(collection, dataset, url=data_url)
    X = benchmark.data
    print("Loaded: ", X.shape[0], " | Dimension: ", X.shape[1], " | Label count: ", len(benchmark.labels))
    print("Getting embeddings (with caching)...")
    X_embedded_dict = get_cached_embeddings(collection, dataset, X)
    return X, benchmark, X_embedded_dict

In [5]:
"""
each dataset can have multiple labels, 
pick one at a time and that defines your partition size, aka k

Overall, Genie returned a clustering quite similar to the reference one. We may consider 107
(namely, c11 + c22 + c33 ) out of the 120 input points as correctly grouped. In particular, 
all the red and green reference points (the 2nd and the 3rd row) have been properly discovered.

Normalized Clustering Accuracy (NCA) 
NCA is the averaged percentage of correctly classified points in each cluster 
above the perfectly uniform label distribution.
            
"""


def predict(embedding_technique, X, label, benchmark, clustering_method, plot=False):

    y_true = benchmark.labels[label] 
    (k := max(y_true))  # or benchmark.n_clusters[0]
    m = max(min(y_true),2)
    method = clustering_method.lower()

    # Define the clustering model
    if method == "genie":
        model = genieclust.Genie(n_clusters=k)  # using default parameters
    elif method == "kmeans":
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
    elif method == "agglomerative":
        model = AgglomerativeClustering(n_clusters=k)
    # elif method == "dbscan":
    #     model = DBSCAN(eps=0.2, min_samples=m)
    # elif method == "meanshift":
    #     model = MeanShift()
    elif method == "spectral":
        model = SpectralClustering(n_clusters=k, random_state=42)
    # elif method == "affinitypropagation":
    #     model =  AffinityPropagation(random_state=42, convergence_iter= 5, max_iter= 100)
    # elif method == "optics":
    #     model = OPTICS()
    elif method == "gaussianmixture":
        model = GaussianMixture(n_components=k, random_state=42)
    elif method == "hdbscan":
        model = hdbscan.HDBSCAN(min_cluster_size=m)
    elif method == "kmodes":
        model = KModes(n_clusters=k, random_state=42, init="Huang")
    elif method == "birch":
        model = Birch(n_clusters=k)
    elif method == "minibatchkmeans":
        model = MiniBatchKMeans(n_clusters=k, random_state=42)
    elif method == "kmedoids":
        model = KMedoids(n_clusters=k, random_state=42)
    elif method == "latentdirichletallocation":
        X = np.maximum(X, 0)
        model = LatentDirichletAllocation(n_components=k, random_state=42)
    elif method == "spectralcoclustering":
        model =  SpectralCoclustering(n_clusters=k)
    elif method == "bayesiangaussianmixture":
        model = BayesianGaussianMixture(n_components=k)   

    print("The model: " +  method + " has been trained now getting y_pred") 
   
    # Fit the model and predict the cluster labels
    if method == "gaussianmixture":  # Gaussian uses predict instead of fit_predict
        (y_pred := model.fit(X).predict(X) + 1)
    elif method == "latentdirichletallocation":
        model.fit(X)
        y_pred = model.transform(X).argmax(axis=1) + 1
    elif method == "spectralcoclustering":
        model.fit(X)
        y_pred = y_pred = model.row_labels_ + 1
    # elif method == "optics" or method == "hdbscan" or method == "dbscan":
    elif method == "hdbscan":
        y_pred = model.fit_predict(X)
        unique_labels = np.unique(y_pred)
        if -1 in unique_labels:
            y_pred = np.where(y_pred == -1, max(unique_labels) + 1, y_pred)  # Assign noise to a new cluster
        y_pred += 1
    else:
        (y_pred := model.fit_predict(X) + 1)
        
    assert len(y_true) == len(y_pred), "Length of y_true: " + str(len(y_true)) +  "and y_pred: " + str(len(y_pred)) + "are not the same"
    print("y_true dimensions: " + str(y_true.shape))
    print("y_pred dimensions: " + str(y_pred.shape))
    print("max(y_true): " + str(max(y_true)))
    print("max(y_pred): " + str(max(y_pred)))
    print("min(y_true): " + str(min(y_true)))
    print("min(y_pred): " + str(min(y_pred)))

    nca = clustbench.get_score(y_true, y_pred)
    cf = metrics.confusion_matrix(y_true, y_pred)
    r = rand_score(y_true, y_pred)
    ar = adjusted_rand_score(y_true, y_pred)
    fm = fowlkes_mallows_score(y_true, y_pred)
    mi = mutual_info_score(y_true, y_pred)
    nmi = normalized_mutual_info_score(y_true, y_pred)
    ami = adjusted_mutual_info_score(y_true, y_pred)
    a = accuracy_score(y_true, y_pred)
        


    if plot:
        plt.subplot(1, 2, 1)
        model.plots.plot_scatter(X, labels=y_true-1, axis="equal", title="y_true")
        plt.subplot(1, 2, 2)
        model.plots.plot_scatter(X, labels=y_pred-1, axis="equal", title="y_pred")
        plt.show()

    return cf, nca, r, ar, fm, mi, nmi, ami, a
    
    

In [6]:
import os

print(os.getcwd()) # run to check current working directory and update file path if needed

/Users/cajoshuapark/Dev/research/embedding_based_clustering_research/framework


In [7]:
eval_collections = {
    # "g2mg": [
    #     "g2mg_128_10",
    #     "g2mg_128_20",
    #     "g2mg_128_30",
    #     "g2mg_128_40",
    #     "g2mg_128_50",
    #     "g2mg_128_60",
    #     "g2mg_128_70",
    #     "g2mg_128_80",
    #     "g2mg_128_90",
    #     "g2mg_16_10",
    #     "g2mg_16_20",
    #     "g2mg_16_30",
    #     "g2mg_16_40",
    #     "g2mg_16_50",
    #     "g2mg_16_60",
    #     "g2mg_16_70",
    #     "g2mg_16_80",
    #     "g2mg_16_90",
    #     "g2mg_1_10",
    #     "g2mg_1_20",
    #     "g2mg_1_30",
    #     "g2mg_1_40",
    #     "g2mg_1_50",
    #     "g2mg_1_60",
    #     "g2mg_1_70",
    #     "g2mg_1_80",
    #     "g2mg_1_90",
    #     "g2mg_2_10",
    #     "g2mg_2_20",
    #     "g2mg_2_30",
    #     "g2mg_2_40",
    #     "g2mg_2_50",
    #     "g2mg_2_60",
    #     "g2mg_2_70",
    #     "g2mg_2_80",
    #     "g2mg_2_90",
    #     "g2mg_32_10",
    #     "g2mg_32_20",
    #     "g2mg_32_30",
    #     "g2mg_32_40",
    #     "g2mg_32_50",
    #     "g2mg_32_60",
    #     "g2mg_32_70",
    #     "g2mg_32_80",
    #     "g2mg_32_90",
    #     "g2mg_4_10",
    #     "g2mg_4_20",
    #     "g2mg_4_30",
    #     "g2mg_4_40",
    #     "g2mg_4_50",
    #     "g2mg_4_60",
    #     "g2mg_4_70",
    #     "g2mg_4_80",
    #     "g2mg_4_90",
    #     "g2mg_64_10",
    #     "g2mg_64_20",
    #     "g2mg_64_30",
    #     "g2mg_64_40",
    #     "g2mg_64_50",
    #     "g2mg_64_60",
    #     "g2mg_64_70",
    #     "g2mg_64_80",
    #     "g2mg_64_90",
    #     "g2mg_8_10",
    #     "g2mg_8_20",
    #     "g2mg_8_30",
    #     "g2mg_8_40",
    #     "g2mg_8_50",
    #     "g2mg_8_60",
    #     "g2mg_8_70",
    #     "g2mg_8_80",
    #     "g2mg_8_90"
    # ],
    "other": [
        "chameleon_t4_8k",
        "chameleon_t5_8k",
        "chameleon_t7_10k",
        "chameleon_t8_8k",
        "hdbscan",
        "iris",
        "iris5",
        "square"
    ],
    "graves": [
        "dense",
        "fuzzyx",
        "line",
        "parabolic",
        "ring",
        "ring_noisy",
        "ring_outliers",
        "zigzag",
        "zigzag_noisy",
        "zigzag_outliers"
    ],
    "sipu": [
        "a1",
        "a2",
        "a3",
        "aggregation",
        "birch1",
        "birch2",
        "compound",
        "d31",
        "flame",
        "jain",
        "pathbased",
        "r15",
        "s1",
        "s2",
        "s3",
        "s4",
        "spiral",
        "unbalance",
        "worms_2",
        "worms_64"
    ],
    "mnist": [
        "digits",
        "fashion"
    ],
 
    "wut": [
        "circles",
        "cross",
        "graph",
        "isolation",
        "labirynth",
        "mk1",
        "mk2",
        "mk3",
        "mk4",
        "olympic",
        "smile",
        "stripes",
        "trajectories",
        "trapped_lovers",
        "twosplashes",
        "windows",
        "x1",
        "x2",
        "x3",
        "z1",
        "z2",
        "z3"
    ],
    "fcps": [
        "atom",
        "chainlink",
        "engytime",
        "hepta",
        "lsun",
        "target",
        "tetra",
        "twodiamonds",
        "wingnut"
    ],
    "uci": [
        "ecoli",
        "glass",
        "ionosphere",
        "sonar",
        "statlog",
        "wdbc",
        "wine",
        "yeast"
    ],
    # "h2mg": [
    #     "h2mg_128_10",
    #     "h2mg_128_20",
    #     "h2mg_128_30",
    #     "h2mg_128_40",
    #     "h2mg_128_50",
    #     "h2mg_128_60",
    #     "h2mg_128_70",
    #     "h2mg_128_80",
    #     "h2mg_128_90",
    #     "h2mg_16_10",
    #     "h2mg_16_20",
    #     "h2mg_16_30",
    #     "h2mg_16_40",
    #     "h2mg_16_50",
    #     "h2mg_16_60",
    #     "h2mg_16_70",
    #     "h2mg_16_80",
    #     "h2mg_16_90",
    #     "h2mg_1_10",
    #     "h2mg_1_20",
    #     "h2mg_1_30",
    #     "h2mg_1_40",
    #     "h2mg_1_50",
    #     "h2mg_1_60",
    #     "h2mg_1_70",
    #     "h2mg_1_80",
    #     "h2mg_1_90",
    #     "h2mg_2_10",
    #     "h2mg_2_20",
    #     "h2mg_2_30",
    #     "h2mg_2_40",
    #     "h2mg_2_50",
    #     "h2mg_2_60",
    #     "h2mg_2_70",
    #     "h2mg_2_80",
    #     "h2mg_2_90",
    #     "h2mg_32_10",
    #     "h2mg_32_20",
    #     "h2mg_32_30",
    #     "h2mg_32_40",
    #     "h2mg_32_50",
    #     "h2mg_32_60",
    #     "h2mg_32_70",
    #     "h2mg_32_80",
    #     "h2mg_32_90",
    #     "h2mg_4_10",
    #     "h2mg_4_20",
    #     "h2mg_4_30",
    #     "h2mg_4_40",
    #     "h2mg_4_50",
    #     "h2mg_4_60",
    #     "h2mg_4_70",
    #     "h2mg_4_80",
    #     "h2mg_4_90",
    #     "h2mg_64_10",
    #     "h2mg_64_20",
    #     "h2mg_64_30",
    #     "h2mg_64_40",
    #     "h2mg_64_50",
    #     "h2mg_64_60",
    #     "h2mg_64_70",
    #     "h2mg_64_80",
    #     "h2mg_64_90",
    #     "h2mg_8_10",
    #     "h2mg_8_20",
    #     "h2mg_8_30",
    #     "h2mg_8_40",
    #     "h2mg_8_50",
    #     "h2mg_8_60",
    #     "h2mg_8_70",
    #     "h2mg_8_80",
    #     "h2mg_8_90"
    # ]
}
clustering_methods = ["genie", "kmeans", "agglomerative", "spectral","gaussianmixture", "hdbscan", "kmodes", "birch", "minibatchkmeans" ,"kmedoids", "latentdirichletallocation", "spectralcoclustering", "bayesiangaussianmixture"]
# clustering_methods = ["genie"]
result_csv = "/Users/cajoshuapark/Dev/research/embedding_based_clustering_research/framework/results/josh.csv"

In [8]:
""" 
Run to set the column names for the csv file
"""
import os
import csv

if os.path.exists(result_csv):
    print("File already exists")
else:
    try:
        with open(result_csv, mode='w', newline='') as file: 
            writer = csv.writer(file)
            writer.writerow([
                "Collection", "Dataset", "Clustering Method", "Label", "Embedding",
                "Original Dimensions", "Embedding Dimensions",
                "CF", "NCA Score", "R", "AR", "FM", "MI", "NMI", "AMI", "A"
            ])
    except Exception as e:
        print("Error writing to file: ", e)



File already exists


In [9]:
# TODO: maybe create a cache or temporary storage for the embeddings
# TODO: parallelize the embedding and clustering process per dataset? 

In [None]:
with open(result_csv, mode='a', newline='') as file:
    writer = csv.writer(file)
    for collection, datasets in eval_collections.items():
        for dataset in datasets:
            print(f"Collection: {collection}, Dataset: {dataset}")
            X, benchmark, X_embedded_dict = load_data(collection, dataset)
            
            # Get the original data dimensions (e.g., "150 x 4")
            orig_dim = f"{X.shape[0]} x {X.shape[1]}"
            
            for label in range(0, len(benchmark.labels)):
                for embedding_technique, embedded_data in X_embedded_dict.items():
                    # Determine the embedding dimensions if available
                    if hasattr(embedded_data, "shape"):
                        embed_dim = f"{embedded_data.shape[0]} x {embedded_data.shape[1]}"
                    else:
                        embed_dim = "Unknown"
                    umap_model = umap.UMAP(
                        n_components=2,
                        n_neighbors=15,
                        min_dist=0.1,
                        metric='euclidean',
                        random_state=42
                    )
                    umap_embedded_data = umap_model.fit_transform(embedded_data)
                    
                    for clustering_method in clustering_methods:
                        cf, nca_score, r, ar, fm, mi, nmi, ami, a = predict(
                            embedding_technique, 
                            umap_embedded_data, 
                            label, 
                            benchmark, 
                            clustering_method
                        )
                        print(cf)
                        cf_str = ", ".join(map(str, cf.flatten())) if hasattr(cf, "flatten") else ", ".join(map(str, cf))
    
                        writer.writerow([
                            collection,          # Collection
                            dataset,             # Dataset
                            clustering_method,   # Clustering Method
                            label,               # Label index
                            embedding_technique, # Embedding technique used
                            orig_dim,            # Original data dimensions
                            embed_dim,           # Embedding dimensions
                            f"cf: {cf} ",        # Confusion matrix
                            f"nca: {nca_score} ",# Normalized Clustering Accuracy
                            f"r: {r} ",         # Rand index
                            f"ar: {ar} ",       # Adjusted Rand index
                            f"fm: {fm} ",       # Fowlkes-Mallows index
                            f"mi: {mi} ",       # Mutual Information
                            f"nmi: {nmi} ",     # Normalized Mutual Information
                            f"ami: {ami} ",     # Adjusted Mutual Information
                            f"a: {a} "         # Accuracy Score
                        ])


Collection: other, Dataset: chameleon_t4_8k
Loaded:  8000  | Dimension:  2  | Label count:  1
Getting embeddings (with caching)...
Cache not found. Generating embeddings...


No sentence-transformers model found with name BAAI/bge-reranker-large. Creating a new one with mean pooling.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of XLMRobertaModel were not initialized from the model checkpoint at BAAI/bge-reranker-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
import pandas as pd

def filter_and_compare_csv(file_path):
    # Define column names based on the CSV structure
    col_names = [
        "Collection", "Dataset", "Clustering Method", "Label", "Embedding", "Original Dimensions", "Embedding Dimensions",
        "CF", "NCA", "r", "ar", "fm", "mi", "nmi", "ami", "a"
    ]
    
    # Read CSV without a header, assigning our own column names
    df = pd.read_csv(file_path, header=None, names=col_names)
    
    # Remove extra whitespace from all string cells
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    
    # Function to clean a numeric field with a prefix
    def clean_numeric(value, prefix):
        if isinstance(value, str):
            value = value.replace(prefix, "").strip()
        try:
            return float(value)
        except Exception:
            return None

    # Clean the NCA column by removing the "nca:" prefix and converting to float
    df['NCA'] = df['NCA'].apply(lambda x: clean_numeric(x, 'nca:'))
    
    # Group by the specified columns
    grouped = df.groupby(["Collection", "Dataset", "Clustering Method", "Label"])
    
    filtered_rows = []
    
    # Iterate over each group
    for name, group in grouped:
        # Find the "Base" row in the Embedding column
        base_row = group[group['Embedding'] == 'Base']
        if not base_row.empty:
            base_value = base_row.iloc[0]['NCA']
            base_row_list = base_row.iloc[0].tolist()
            base_added = False
            
            # Compare each row's NCA value to the base_value
            for index, row in group.iterrows():
                if row['Embedding'] != 'Base' and row['NCA'] > base_value:
                    if not base_added:
                        filtered_rows.append(base_row_list)
                        base_added = True
                    filtered_rows.append(row.tolist())
    
    # Create a new DataFrame from the filtered rows and remove duplicates
    filtered_df = pd.DataFrame(filtered_rows, columns=df.columns)
    filtered_df = filtered_df.drop_duplicates()
    
    return filtered_df

# Example usage
file_path = "/Users/cajoshuapark/Dev/research/embedding_based_clustering_research/framework/results/josh.csv"
filtered_df = filter_and_compare_csv(file_path)

# Select only the desired columns for display
display_columns = ["Collection", "Dataset", "Clustering Method", "Label", "NCA"]
print(filtered_df[display_columns])
