### Reference

https://www.sbert.net/docs/quickstart.html


In [8]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the BGE large English embedding model.
# `device` is deliberately left unset: sentence-transformers then selects an
# available accelerator on its own and otherwise runs on the CPU, so this cell
# no longer fails on machines without CUDA.
model = SentenceTransformer(
    model_name_or_path="BAAI/bge-large-en-v1.5",
)

In [5]:
# Sample passages to embed: three animal descriptions plus one unrelated
# passage about NumPy. Keyed by topic for readability; insertion order of the
# mapping fixes the order of `sentences`.
_passages_by_topic = {
    "pig": "Pigs are stout-bodied, short-legged, omnivorous mammals, with thick skin usually sparsely coated with short bristles",
    "cow": "Cows are four-footed and have a large body. It has two horns, two eyes plus two ears and one nose and a mouth. Cows are herbivorous animals.",
    "chicken": "Chickens are average-sized fowls, characterized by smaller heads, short beaks and wings, and a round body perched on featherless legs.",
    "numpy": "NumPy (Numerical Python) is an open source Python library that's used in almost every field of science and engineering. It's the universal standard for working with numerical data in Python, and it's at the core of the scientific Python and PyData ecosystems.",
}

sentences = list(_passages_by_topic.values())

In [6]:
# Embed the whole list in a single call; normalize_embeddings=True requests
# normalized output vectors from the encoder.
sentence_embeddings = model.encode(sentences=sentences, normalize_embeddings=True)

In [7]:
# Show each sentence next to its embedding, one blank line between entries.
for text, vector in zip(sentences, sentence_embeddings):
    print(f"Sentence: {text}\nEmbedding: {vector}\n")

Sentence: Pigs are stout-bodied, short-legged, omnivorous mammals, with thick skin usually sparsely coated with short bristles
Embedding: [ 0.03743297  0.0098348   0.0515419  ... -0.01589401  0.00090886
 -0.04270858]

Sentence: Cows are four-footed and have a large body. It has two horns, two eyes plus two ears and one nose and a mouth. Cows are herbivorous animals.
Embedding: [ 0.01551647 -0.00368842 -0.00027022 ...  0.00770947 -0.01678138
  0.00631208]

Sentence: Chickens are average-sized fowls, characterized by smaller heads, short beaks and wings, and a round body perched on featherless legs.
Embedding: [ 0.00284758  0.01925585  0.02077549 ... -0.00314734  0.0141667
 -0.03367088]

Sentence: NumPy (Numerical Python) is an open source Python library that's used in almost every field of science and engineering. It's the universal standard for working with numerical data in Python, and it's at the core of the scientific Python and PyData ecosystems.
Embedding: [ 0.01973291  0.00506833

In [9]:
# Compare one reference sentence against a related and an unrelated sentence.
pig_text = "Pigs are stout-bodied, short-legged, omnivorous mammals, with thick skin usually sparsely coated with short bristles"
cow_text = "Cows are four-footed and have a large body. It has two horns, two eyes plus two ears and one nose and a mouth. Cows are herbivorous animals."
potter_text = "Harry Potter went to Hogwarts when he was 11."

# Each text is encoded on its own, in order, with normalized output vectors.
emb1, emb2, emb3 = (
    model.encode(text, normalize_embeddings=True)
    for text in (pig_text, cow_text, potter_text)
)

# First line printed: pig vs. cow. Second line printed: pig vs. Harry Potter.
for other_embedding in (emb2, emb3):
    cos_sim = util.cos_sim(emb1, other_embedding)
    print("Cosine similarity:", cos_sim)

Cosine similarity: tensor([[0.6423]])
Cosine similarity: tensor([[0.3071]])


In [16]:
# Pairwise cosine similarity across a small set of sentences.
#
# NOTE(review): this cell rebinds `model` and `sentences`, which earlier cells
# also assign. Re-running those earlier cells after this one will pick up the
# values set here, so run the notebook top to bottom.

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

sentences = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "Someone in a gorilla costume is playing a set of drums.",
]

# One embedding per sentence
embeddings = model.encode(sentences=sentences)

# Score every sentence against every other sentence (and against itself)
cos_sim = util.cos_sim(embeddings, embeddings)

# The result is a square n x n matrix: entry [i][j] is the similarity of
# sentence i and sentence j. In the printed output the leading diagonal is
# all 1.0 and the matrix is symmetric ([i][j] equals [j][i]).
# Illustrative layout (values are examples only):
#
#            sentence1  sentence2  sentence3 ...
# sentence1        1.0       0.75       0.24 ...
# sentence2       0.75        1.0       0.65 ...
# sentence3       0.24       0.65        1.0 ...
# ...              ...        ...        ...

print(cos_sim)

tensor([[ 1.0000,  0.7553, -0.1050,  0.2474, -0.0704, -0.0333,  0.1707,  0.0476,
          0.0630],
        [ 0.7553,  1.0000, -0.0610,  0.1442, -0.0809, -0.0216,  0.1157,  0.0362,
          0.0216],
        [-0.1050, -0.0610,  1.0000, -0.1088,  0.0217, -0.0413, -0.0928,  0.0231,
          0.0247],
        [ 0.2474,  0.1442, -0.1088,  1.0000, -0.0348,  0.0362,  0.7369,  0.0821,
          0.1389],
        [-0.0704, -0.0809,  0.0217, -0.0348,  1.0000, -0.1654, -0.0592,  0.1961,
          0.2564],
        [-0.0333, -0.0216, -0.0413,  0.0362, -0.1654,  1.0000,  0.0769, -0.0380,
         -0.0895],
        [ 0.1707,  0.1157, -0.0928,  0.7369, -0.0592,  0.0769,  1.0000,  0.0495,
          0.1191],
        [ 0.0476,  0.0362,  0.0231,  0.0821,  0.1961, -0.0380,  0.0495,  1.0000,
          0.6433],
        [ 0.0630,  0.0216,  0.0247,  0.1389,  0.2564, -0.0895,  0.1191,  0.6433,
          1.0000]])


In [18]:
# Collect every unordered sentence pair (i < j) once, as [score, i, j].
num_sentences = len(cos_sim)
all_sentence_combinations = [
    [cos_sim[i][j], i, j]
    for i in range(num_sentences - 1)
    for j in range(i + 1, num_sentences)
]

# Order the pairs from most similar to least similar.
all_sentence_combinations.sort(key=lambda entry: entry[0], reverse=True)

# To report only the best matches, iterate over a slice instead,
# e.g. all_sentence_combinations[:5] for the top five pairs.
print("All cosine similarities:")
for score, i, j in all_sentence_combinations:
    print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], score))

All cosine similarities:
A man is eating food. 	 A man is eating a piece of bread. 	 0.7553
A man is riding a horse. 	 A man is riding a white horse on an enclosed ground. 	 0.7369
A monkey is playing drums. 	 Someone in a gorilla costume is playing a set of drums. 	 0.6433
A woman is playing violin. 	 Someone in a gorilla costume is playing a set of drums. 	 0.2564
A man is eating food. 	 A man is riding a horse. 	 0.2474
A woman is playing violin. 	 A monkey is playing drums. 	 0.1961
A man is eating food. 	 A man is riding a white horse on an enclosed ground. 	 0.1707
A man is eating a piece of bread. 	 A man is riding a horse. 	 0.1442
A man is riding a horse. 	 Someone in a gorilla costume is playing a set of drums. 	 0.1389
A man is riding a white horse on an enclosed ground. 	 Someone in a gorilla costume is playing a set of drums. 	 0.1191
A man is eating a piece of bread. 	 A man is riding a white horse on an enclosed ground. 	 0.1157
A man is riding a horse. 	 A monkey is pla