## Evaluation of Embeddings via Clustering Distance

In [10]:
from pprint import pprint
import os
import torch
import numpy as np
import json

In [11]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else "cpu"


In [12]:
from huggingface_hub import HfFolder, whoami

# Read the Hugging Face access token from a local file so it is never
# hard-coded in the notebook.
with open("HF_TOKEN.txt", "r", encoding="utf-8") as f:
    # Strip surrounding whitespace: text files commonly end with a newline,
    # which would otherwise be stored as part of the token.
    hf_token = f.read().strip()

# Persist the token where huggingface_hub looks for it on later downloads.
HfFolder.save_token(hf_token)


In [13]:
import json

# Load the documents to evaluate. Reading explicitly as UTF-8 keeps accented
# characters intact regardless of the platform's default encoding.
# NOTE(review): presumably a list of documents, each itself a list of text
# segments (every item is later passed to `model.encode`) — confirm.
with open("text_to_eval_clusters.json", "r", encoding="utf-8") as f:
    list_only_text = json.load(f)


Candidate embedding models, taken from the [MTEB(fra, v1) benchmark](http://mteb-leaderboard.hf.space/?benchmark_name=MTEB%28fra%2C+v1%29):




https://huggingface.co/Salesforce/SFR-Embedding-2_R - highest score on the benchmark's clustering tasks

https://huggingface.co/manu/sentence_croissant_alpha_v0.4 - not the highest-scoring, but probably faster

https://huggingface.co/jinaai/jina-embeddings-v3 - has different types of embedding functions

https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0

https://huggingface.co/flaubert/flaubert_base_uncased

https://huggingface.co/dangvantuan/sentence-camembert-large


In [14]:
# Of all the candidates tried, only these could actually be run locally or on
# Google Colab.
model_candidates = [
    "Snowflake/snowflake-arctic-embed-l-v2.0",
    "manu/sentence_croissant_alpha_v0.4",
]

In [15]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score


def cluster_n(cluster_model, n_clusters, embeddings, scoring_function):
    """Cluster ``embeddings`` with ``cluster_model`` and score the result.

    Args:
        cluster_model: object exposing ``fit_predict(embeddings)``.
        n_clusters: number of clusters the model was configured with; only
            used for the printed report.
        embeddings: data handed to both the model and the scoring function.
        scoring_function: callable invoked as
            ``scoring_function(embeddings, labels)``.

    Returns:
        A ``(labels, score)`` tuple: the output of ``fit_predict`` and the
        value returned by ``scoring_function``.
    """
    labels = cluster_model.fit_predict(embeddings)
    score = scoring_function(embeddings, labels)

    # Report progress for this candidate, followed by a blank separator line.
    for caption, value in (("Number of clusters: ", n_clusters), ("Score: ", score)):
        print(caption, value)
    print()

    return labels, score

def get_optimal_n_clusters(squeezeded_embeddings, max_n_clusters=9):
    """Choose the Gaussian-mixture cluster count with the best silhouette score.

    A ``GaussianMixture`` (``random_state=42``) is fitted for every candidate
    count from 3 up to, but not including, ``max_n_clusters``. Each clustering
    is scored with ``silhouette_score`` and the highest-scoring candidate wins.

    Args:
        squeezeded_embeddings: embeddings to cluster; must support ``len()``
            and is passed unchanged to the mixture model and the scorer.
            Assumed to be a 2-D ``(n_samples, dim)`` array — TODO confirm
            against the caller.
        max_n_clusters: exclusive upper bound on the candidate cluster counts
            (the default of 9 tests 3 through 8).

    Returns:
        ``(optimal_n, final_clusters, silhouette_scores)``: the best cluster
        count, the labels produced for that count, and the list of scores for
        every candidate that was tested (in increasing candidate order).

    Raises:
        ValueError: if no candidate count can be evaluated, either because
            ``max_n_clusters`` is at most 3 or because there are too few
            samples.
    """
    min_n_clusters = 3
    n_samples = len(squeezeded_embeddings)

    # GaussianMixture needs n_components <= n_samples and silhouette_score
    # accepts at most n_samples - 1 distinct labels, so never try a candidate
    # count of n_samples or more (the stop value is exclusive).
    stop = min(max_n_clusters, n_samples)
    range_clusters = np.arange(start=min_n_clusters, stop=stop, step=1)
    if range_clusters.size == 0:
        raise ValueError(
            f"No cluster count to evaluate: need max_n_clusters > {min_n_clusters} "
            f"and more than {min_n_clusters} samples "
            f"(got max_n_clusters={max_n_clusters}, n_samples={n_samples})."
        )

    # Compute the silhouette score for each candidate number of clusters.
    silhouette_scores = []
    clusters_labels = []
    for n_cluster in range_clusters:
        # Gaussian Mixture is used as in the paper references.
        gm = GaussianMixture(n_components=n_cluster, random_state=42)
        clusters, sil_sc = cluster_n(gm, n_cluster, squeezeded_embeddings, silhouette_score)
        silhouette_scores.append(sil_sc)
        # Keep the labels so they need not be recomputed for the winner.
        clusters_labels.append(clusters)

    # Index of the best score; named so it does not shadow the builtin `max`.
    best_idx = int(np.argmax(silhouette_scores))
    optimal_n = range_clusters[best_idx]
    final_clusters = clusters_labels[best_idx]

    return optimal_n, final_clusters, silhouette_scores

In [None]:
import time
from sentence_transformers import SentenceTransformer

model_id = "manu/sentence_croissant_alpha_v0.4"

# `trust_remote_code` is a model-loading option, so it is given to the
# constructor only; `encode` is called without it.
model = SentenceTransformer(model_id, trust_remote_code=True)
print("Evaluating model: ", model_id)

# One list of silhouette scores (one score per tested cluster count) per document.
sil_scores_all_docs = []
# perf_counter is monotonic, which makes it the right clock for measuring a duration.
start = time.perf_counter()
for doc in list_only_text:
    doc_embds = model.encode(doc)
    print("Embedding done.")
    optimal_n, final_clusters, sil_scores = get_optimal_n_clusters(doc_embds, max_n_clusters=9)
    # Convert to built-in floats so the scores can be serialised to JSON.
    sil_float_array = [float(score) for score in sil_scores]
    print(sil_float_array)
    sil_scores_all_docs.append(sil_float_array)
end = time.perf_counter()
print(end-start)

# Summary saved by the next cell: raw scores, per-document mean score and
# total wall time in seconds.
dict_eval = {"model": model_id,
            "sil_scores": sil_scores_all_docs,
            "mean_sils": [float(np.mean(clust)) for clust in sil_scores_all_docs],
            "time": end-start}

model.safetensors:  12%|#1        | 304M/2.56G [00:00<?, ?B/s]

In [None]:

# Build the file name outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
# The "/" of the model id is replaced so the name is a single path component.
safe_model_id = str(model_id).replace("/", "_")
name_json = f"{safe_model_id}_cluster_eval.json"
with open(name_json, "w", encoding="utf-8") as f:
    json.dump(dict_eval, f)