In [26]:
import pandas as pd
# Read the sentence-pair evaluation cases from the CSV file (path is relative
# to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='evaluation_cases.csv')

3. Sentence Embeddings + Other Comparisons<br>
Beyond cosine similarity:

- `SentenceTransformers`: The most popular library for generating sentence embeddings using models like `BERT`, `RoBERTa`, etc. It also makes it easy to compute distance and similarity measures between embeddings.

- `Euclidean Distance`: Available via `scipy.spatial.distance.euclidean`.

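- `Manhattan Distance`, `Dot Product`, or `Angular Distance`: Use `scipy` or `sklearn` for these (Manhattan distance is `scipy.spatial.distance.cityblock`). Note that the dot product is a similarity score (higher means more alike), not a distance, and the `angular` value computed below is `1 - angle/π`, which is likewise a similarity.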

In [None]:
from sentence_transformers import SentenceTransformer, util
from scipy.spatial import distance
from math import acos, pi
import torch

# load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Metric columns, in the order they appear in the resulting frame.
metric_columns = ['euclidean', 'manhattan', 'dot_product', 'angular']

# Collect one record per evaluation case and build the frame once at the end,
# so every column exists up front (even when `df` is empty) and is stored as
# float instead of object.
records = []
for _, row in df.iterrows():
    # encode sentences
    emb1 = model.encode(row['sent1'], convert_to_tensor=True)
    emb2 = model.encode(row['sent2'], convert_to_tensor=True)

    # SciPy works on NumPy arrays; convert each embedding only once.
    vec1 = emb1.cpu().numpy()
    vec2 = emb2.cpu().numpy()

    # Floating-point round-off can push the cosine similarity of
    # (near-)identical embeddings marginally outside [-1, 1], where acos
    # raises "ValueError: math domain error", so clamp it first.
    # The value is passed as the first argument to min/max so that a NaN
    # similarity still propagates instead of being silently replaced.
    cos_sim = 1 - distance.cosine(vec1, vec2)
    cos_sim = max(min(cos_sim, 1.0), -1.0)

    records.append({
        'euclidean': distance.euclidean(vec1, vec2),
        'manhattan': distance.cityblock(vec1, vec2),
        'dot_product': torch.dot(emb1, emb2).item(),
        # 1 - angle/pi is 1 for vectors pointing the same way and 0 for
        # opposite vectors, i.e. an angular *similarity* (higher = more
        # alike), unlike the two distances above.
        'angular': 1 - acos(cos_sim) / pi,
    })

# create dataframe to store distances, indexed like the evaluation cases
distances = pd.DataFrame(records, index=df.index, columns=metric_columns)

the cat sat on the mat a feline rested atop a rug
he ran quickly to the store she ran quickly to the store
domestic unrest political instability in the country
turn left at the traffic light photosynthesis occurs in plant cells


In [43]:
# Rescale every metric by its column total, so each "*_normalized" column
# sums to 1 across the evaluation cases (provided the total is non-zero).
for source_col, target_col in (
    ('euclidean', 'euclidean_normalized'),
    ('manhattan', 'manhattan_normalized'),
    ('dot_product', 'dot_product_normalized'),
    ('angular', 'angular_normalized'),
):
    column_total = distances[source_col].sum()
    distances[target_col] = distances[source_col] / column_total

In [45]:
# Display the metrics table: one row per evaluation case, with the raw
# metrics followed by their sum-normalized counterparts.
distances

Unnamed: 0,euclidean,manhattan,dot_product,angular,euclidean_normalized,manhattan_normalized,dot_product_normalized,angular_normalized
0,0.935487,14.680565,0.562432,0.690134,0.248957,0.250257,0.274278,0.252681
1,0.568934,8.884562,0.838157,0.816367,0.151408,0.151454,0.408739,0.298898
2,0.842431,13.078042,0.645155,0.723207,0.224192,0.22294,0.314619,0.26479
3,1.41078,22.018671,0.00485,0.501544,0.375444,0.375349,0.002365,0.183631
