# Embeddings Comparison

This notebook compares the performance of different embedding methods (MiniLM, MPNet, OpenAI, Gemini) for clustering Stack Exchange data. We use Optuna for hyperparameter optimization and evaluate clustering quality using Adjusted Rand Index (ARI).

**Note:** This notebook requires additional dependencies not included in the main requirements.txt:
```bash
pip install optuna optuna-dashboard
```

In [None]:
import pandas as pd
from Latte import Latte
from matplotlib import pyplot as plt
import numpy as np
import warnings
import optuna
from sklearn.metrics import adjusted_rand_score

# Keep cell output readable: drop FutureWarning and Optuna experimental-feature
# warnings, and only let Optuna log at WARNING level or above.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Maps each value of the `project` column to the broad field it belongs to.
project_to_field = {
    # Science
    'biology': 'Science',
    'chemistry': 'Science',
    'physics': 'Science',
    # Language
    'ell': 'Language',
    'english': 'Language',
    'linguistics': 'Language',
    # Design
    'computergraphics': 'Design',
    'graphicdesign': 'Design',
    'photo': 'Design',
    # Tech
    'gamedev': 'Tech',
    'softwareengineering': 'Tech',
}

## Helper Functions

Define functions for hyperparameter optimization and data preprocessing.


In [None]:
def get_score(obj, N, mask, seed=None):
    """Tune the reduce/cluster pipeline of ``obj`` with Optuna and return the best ARI.

    Each trial picks a reduction size, the two clustering parameters and a
    cluster level, runs ``obj.reduce`` -> ``obj.cluster`` ->
    ``obj.get_cluster_labels`` and scores the resulting labels against ``mask``
    with the adjusted Rand index.

    Parameters
    ----------
    obj : object
        Pipeline object exposing ``reduce(n)``,
        ``cluster(min_cluster_size=..., min_samples=...)`` and
        ``get_cluster_labels(level)``. In this notebook it is a ``Latte``
        instance on which ``embed`` has already been called. The return values
        of ``reduce``/``cluster`` are not used, so they presumably update
        ``obj`` in place -- confirm against the Latte implementation.
    N : int
        Number of Optuna trials to run.
    mask : array-like
        Ground-truth label for every document, in the same order as the labels
        returned by ``obj.get_cluster_labels``.
    seed : int, optional
        Seed for Optuna's TPE sampler so the sequence of suggested
        hyperparameters can be reproduced. ``None`` (the default) leaves the
        study on Optuna's default, unseeded sampler. This only seeds the
        hyperparameter search; any randomness inside ``obj.reduce`` or
        ``obj.cluster`` is not controlled here.

    Returns
    -------
    float
        The highest adjusted Rand index reached by any trial.
    """
    def objective(trial):
        # All four parameters are sampled as unordered categorical choices
        # over a fixed set of integers.
        reduce_val = trial.suggest_categorical('reduce_val', range(2, 51))
        min_cluster_size = trial.suggest_categorical('min_cluster_size', range(2, 10))
        min_samples = trial.suggest_categorical('min_samples', range(1, 10))
        cluster_level = trial.suggest_categorical('cluster_level', range(0, 10))

        # Run the pipeline with the sampled configuration.
        obj.reduce(reduce_val)
        obj.cluster(min_cluster_size=min_cluster_size, min_samples=min_samples)
        cluster_labels = obj.get_cluster_labels(cluster_level)

        return adjusted_rand_score(mask, cluster_labels)

    # Only build an explicit sampler when a seed is requested, so the default
    # call path is unchanged.
    sampler = None if seed is None else optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=N)
    return study.best_trial.value

In [33]:
def get_masks(df):
    """Build the integer ground-truth labellings for the rows of ``df``.

    Two columns are added to ``df`` in place: ``field`` (``project`` mapped
    through ``project_to_field``) and ``project_tags`` (``project`` and
    ``tags`` joined with ``_``).

    Returns a dict of three arrays with one entry per row:
      - ``'field'``: 1 = Science, 2 = Language, 3 = Design, 4 = Tech,
        0 when the field is none of these.
      - ``'tag'``: 1-based code for each distinct ``project_tags`` value.
      - ``'project'``: 1-based code for each distinct ``project`` value.
    For the two factorized labellings, missing values (coded -1 by
    ``pd.factorize``) end up as 0.
    """
    df['field'] = df['project'].map(project_to_field)
    df['project_tags'] = df['project'] + '_' + df['tags']

    # Each row matches at most one field, so summing the per-field indicator
    # arrays scaled by their code yields that row's field code.
    field_codes = {'Science': 1, 'Language': 2, 'Design': 3, 'Tech': 4}
    field_mask = sum(
        np.array(df['field'] == field_name) * code
        for field_name, code in field_codes.items()
    )

    tag_mask = pd.factorize(df['project_tags'])[0] + 1
    project_mask = pd.factorize(df['project'])[0] + 1

    return {'field': field_mask, 'tag': tag_mask, 'project': project_mask}

In [28]:
# sample_size selects both the CSV read here and the embeddings files read by
# the evaluation loop further down.
sample_size = 3
df = pd.read_csv(f'data/sample_{sample_size}.csv')
# get_masks also adds the 'field' and 'project_tags' columns to df in place.
masks = get_masks(df)

## Comparison Across Different Sample Sizes

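We'll compare embedding methods across different sample sizes to see how performance scales. For each ground-truth labelling (`field`, `tag`, `project`), the score printed for a model is the best ARI found over 50 Optuna trials.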


### Sample Size: 3 documents per project


In [32]:
# For every ground-truth labelling, print the best ARI each embedding model reaches.
for mask, mask_labels in masks.items():
    print(mask)
    for embeddings in ('minilm', 'mpnet', 'openai', 'gemini'):
        # A new Latte object is built for every (labelling, model) pair and
        # its embeddings are read from the matching file.
        latte = Latte(df, mute=True)
        embeddings_file = f'embeddings/sample_{sample_size}_{embeddings}.pkl'
        latte.embed('file', embeddings_file=embeddings_file)
        score = get_score(latte, 50, mask_labels)  # 50 Optuna trials
        print(f'{embeddings}: {score}')

field
minilm: 0.29958105138884333
mpnet: 0.21993258127185308
openai: 0.47318915177898285
gemini: 0.6903843235861531
tag
minilm: 0.2614492753623188
mpnet: 0.24209160762398962
openai: 0.2620481927710843
gemini: 0.311268620855358
project
minilm: 0.33350710900473934
mpnet: 0.3794954918447979
openai: 0.5260757913936689
gemini: 0.7280052524770204


### Sample Size: 5 documents per project


In [34]:
# Load the 5-documents-per-project sample and rebuild its ground-truth labellings.
sample_size = 5
df = pd.read_csv(f'data/sample_{sample_size}.csv')
masks = get_masks(df)

# For every ground-truth labelling, print the best ARI each embedding model reaches.
for mask, mask_labels in masks.items():
    print(mask)
    for embeddings in ('minilm', 'mpnet', 'openai', 'gemini'):
        # A new Latte object is built for every (labelling, model) pair.
        latte = Latte(df, mute=True)
        embeddings_file = f'embeddings/sample_{sample_size}_{embeddings}.pkl'
        latte.embed('file', embeddings_file=embeddings_file)
        score = get_score(latte, 50, mask_labels)  # 50 Optuna trials
        print(f'{embeddings}: {score}')

field
minilm: 0.4024136619470495
mpnet: 0.34188214937998657
openai: 0.6044783922252733
gemini: 0.6397180021937082
tag
minilm: 0.2460930215210175
mpnet: 0.32268177981430124
openai: 0.3714250217418313
gemini: 0.38263509411050395
project
minilm: 0.39973845257920493
mpnet: 0.44151490950811556
openai: 0.6345054918614583
gemini: 0.6104892777328705


### Sample Size: 10 documents per project


In [35]:
# Load the 10-documents-per-project sample and rebuild its ground-truth labellings.
sample_size = 10
df = pd.read_csv(f'data/sample_{sample_size}.csv')
masks = get_masks(df)

# For every ground-truth labelling, print the best ARI each embedding model reaches.
for mask, mask_labels in masks.items():
    print(mask)
    for embeddings in ('minilm', 'mpnet', 'openai', 'gemini'):
        # A new Latte object is built for every (labelling, model) pair.
        latte = Latte(df, mute=True)
        embeddings_file = f'embeddings/sample_{sample_size}_{embeddings}.pkl'
        latte.embed('file', embeddings_file=embeddings_file)
        score = get_score(latte, 50, mask_labels)  # 50 Optuna trials
        print(f'{embeddings}: {score}')

field
minilm: 0.4914340800299857
mpnet: 0.4633643979046883
openai: 0.6066241848734035
gemini: 0.6983557992368349
tag
minilm: 0.2975292183064978
mpnet: 0.26095570778537663
openai: 0.3540780749070084
gemini: 0.3672245096341985
project
minilm: 0.45116893621902326
mpnet: 0.4010987893924442
openai: 0.6335473326176289
gemini: 0.6187082485413894


### Sample Size: 20 documents per project


In [36]:
# Load the 20-documents-per-project sample and rebuild its ground-truth labellings.
sample_size = 20
df = pd.read_csv(f'data/sample_{sample_size}.csv')
masks = get_masks(df)

# For every ground-truth labelling, print the best ARI each embedding model reaches.
for mask, mask_labels in masks.items():
    print(mask)
    for embeddings in ('minilm', 'mpnet', 'openai', 'gemini'):
        # A new Latte object is built for every (labelling, model) pair.
        latte = Latte(df, mute=True)
        embeddings_file = f'embeddings/sample_{sample_size}_{embeddings}.pkl'
        latte.embed('file', embeddings_file=embeddings_file)
        score = get_score(latte, 50, mask_labels)  # 50 Optuna trials
        print(f'{embeddings}: {score}')

field
minilm: 0.45917746030670925
mpnet: 0.31942026577600713
openai: 0.5437427916348974
gemini: 0.43864853004956267
tag
minilm: 0.3125094033580749
mpnet: 0.2655310331231111
openai: 0.37495720110631847
gemini: 0.4003139988803484
project
minilm: 0.39292344325157164
mpnet: 0.44715785993947493
openai: 0.6091664160495908
gemini: 0.6043374601053005
