In [5]:
!pip install sentence_transformers --upgrade
!pip install faiss-cpu
!pip install hdbscan
!pip install einops

[0mCollecting hdbscan
  Downloading hdbscan-0.8.38.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: hdbscan
Successfully installed hdbscan-0.8.38.post1
[0mCollecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0
[0m

### The Imports

In [6]:
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
import numpy as np
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler

### Load in the dataset

In [7]:
# Read the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='anime_dataset.csv')

### Do the same data cleaning steps we used in the training script

In [8]:
def data_preprocessor(df):
    """Return the rows of ``df`` that pass the three cleaning filters.

    A row is kept only when all of the following hold:

    * its ``Synopsis`` is not the placeholder
      ``'No description available for this anime.'``;
    * its ``Type`` is neither ``'Music'`` nor ``'UNKNOWN'``;
    * its ``Genres`` does not contain ``Hentai`` or ``Erotica``
      (case-insensitive; a missing ``Genres`` value counts as "does not
      contain").

    Args:
        df: DataFrame with ``Synopsis``, ``Type`` and ``Genres`` columns.

    Returns:
        A filtered DataFrame; the input frame is not modified.
    """
    has_real_synopsis = df['Synopsis'] != 'No description available for this anime.'
    is_wanted_type = ~df['Type'].isin(['Music', 'UNKNOWN'])
    # na=False makes rows without a Genres value evaluate to "no match".
    is_adult = df['Genres'].str.contains('Hentai|Erotica', case=False, na=False)
    return df[has_real_synopsis & is_wanted_type & ~is_adult]

In [9]:
# Apply the cleaning filters, then collect the synopsis texts into a plain
# Python list; this list is what gets embedded by each model.
x = df.pipe(data_preprocessor)
corpus = x['Synopsis'].to_list()


### Define the list of models we will be testing out

In [10]:
# Models to compare, as (path_or_identifier, display_name) pairs.
# The first element is what gets passed to SentenceTransformer(...); the second
# is the label used in printed results and as the key for stored cluster labels.
models = [
    (
        'output/matryoshka_sts_Snowflake-snowflake-arctic-embed-m-2024-08-24_16-44-06',
        'Fine-tuned Model',
    ),
    ('intfloat/multilingual-e5-large-instruct', 'E5'),
    ('BAAI/bge-large-en-v1.5', 'BGE'),
    ('Alibaba-NLP/gte-base-en-v1.5', 'GTE'),
    ('mixedbread-ai/mxbai-embed-large-v1', 'MixedBread'),
    ('Snowflake/snowflake-arctic-embed-m', 'Pretrained Model'),
    ('Snowflake/snowflake-arctic-embed-m-long', 'Snowflake Long'),
]


In [11]:
def run_hdbscan_and_evaluate(embeddings, model_name, min_cluster_size=30, min_samples=8):
    """Cluster embeddings with HDBSCAN and report clustering-quality metrics.

    The embeddings are standardized with ``StandardScaler`` before clustering.
    When more than one distinct label is produced, the Calinski-Harabasz and
    Davies-Bouldin indices are computed on the standardized embeddings and
    printed together with the number of clusters.

    Args:
        embeddings: 2-D array-like of embedding vectors, one row per item.
        model_name: Label used in the printed report.
        min_cluster_size: Forwarded to ``HDBSCAN``; defaults to 30.
        min_samples: Forwarded to ``HDBSCAN``; defaults to 8.

    Returns:
        A ``(cluster_labels, ch_score, db_score)`` tuple. When fewer than two
        distinct labels are produced, both scores are ``nan`` (previously this
        case raised ``UnboundLocalError`` at the ``return`` statement).
    """
    normalized_embeddings = StandardScaler().fit_transform(embeddings)
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
    cluster_labels = clusterer.fit_predict(normalized_embeddings)

    distinct_labels = set(cluster_labels)
    # Defaults for the "not enough labels" branch so the return below is
    # always well-defined.
    ch_score = db_score = float("nan")

    # NOTE(review): the -1 (noise) label counts towards this threshold and is
    # passed to the metrics as if it were a cluster, matching the original
    # behaviour — confirm this is intended.
    if len(distinct_labels) > 1:
        ch_score = calinski_harabasz_score(normalized_embeddings, cluster_labels)
        db_score = davies_bouldin_score(normalized_embeddings, cluster_labels)
        # The -1 label is excluded from the reported cluster count.
        n_clusters = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
        print(f"\nResults for {model_name}:")
        print(f"Number of clusters: {n_clusters}")
        print(f"Calinski-Harabasz Index: {ch_score:.2f}")
        print(f"Davies-Bouldin Index: {db_score:.2f}")
    else:
        print(f"\nResults for {model_name}:")
        print("Insufficient clusters formed for metric calculation")
    return cluster_labels, ch_score, db_score





In [12]:
# Dictionary to store cluster assignments for each model
all_cluster_assignments = {}
# Clustering-quality scores for each model, kept (instead of being discarded
# every iteration) so the models can be compared once the loop has finished.
all_cluster_scores = {}
# Loop through all models
for model_path, model_name in models:
    print(f"\nProcessing {model_name}...")
    try:
        # NOTE(review): some checkpoints may need extra loading options
        # (e.g. trust_remote_code=True) — confirm for each entry in `models`.
        model = SentenceTransformer(model_path)
        embeddings = model.encode(corpus)
        print('finished generating embeddings')
        cluster_labels, ch_score, db_score = run_hdbscan_and_evaluate(embeddings, model_name)
    except Exception as e:
        # Best-effort comparison: report the failure and move on to the next
        # model rather than aborting the whole run.
        print(f"Error processing {model_name}: {e}")
        continue
    all_cluster_assignments[model_name] = cluster_labels
    all_cluster_scores[model_name] = {
        'calinski_harabasz': ch_score,
        'davies_bouldin': db_score,
    }


Processing Fine-tuned Model...
finished generating embeddings

Results for Fine-tuned Model:
Number of clusters: 10
Calinski-Harabasz Index: 73.64
Davies-Bouldin Index: 2.57

Processing E5...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

finished generating embeddings

Results for E5:
Number of clusters: 8
Calinski-Harabasz Index: 82.12
Davies-Bouldin Index: 2.89

Processing BGE...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

finished generating embeddings


KeyboardInterrupt: 