In [16]:
from sentence_transformers import SentenceTransformer, util

In [17]:
# Load the BAAI/bge-large-en-v1.5 sentence-embedding model.
# `device` is intentionally left unset: sentence-transformers then uses a GPU
# when one is available and otherwise falls back to CPU, so this cell no longer
# fails on machines without CUDA.
model = SentenceTransformer(
    model_name_or_path="BAAI/bge-large-en-v1.5"
)

In [18]:
# Tiny demo corpus: three farm-animal descriptions plus one passage about NumPy.
# Each document is one string, split across lines with implicit concatenation.
sample_docs = [
    (
        "Pigs are stout-bodied, short-legged, omnivorous mammals, with thick skin "
        "usually sparsely coated with short bristles"
    ),
    (
        "Cows are four-footed and have a large body. It has two horns, two eyes plus "
        "two ears and one nose and a mouth. Cows are herbivorous animals."
    ),
    (
        "Chickens are average-sized fowls, characterized by smaller heads, short "
        "beaks and wings, and a round body perched on featherless legs."
    ),
    (
        "NumPy (Numerical Python) is an open source Python library that's used in "
        "almost every field of science and engineering. It's the universal standard "
        "for working with numerical data in Python, and it's at the core of the "
        "scientific Python and PyData ecosystems."
    ),
]

In [19]:
# Embed every document in a single batch. normalize_embeddings=True asks the
# model for normalized vectors; the result has one row per entry in sample_docs.
embeddings = model.encode(sample_docs, normalize_embeddings=True)

print(embeddings)

[[ 0.03743297  0.0098348   0.0515419  ... -0.01589401  0.00090886
  -0.04270858]
 [ 0.01551647 -0.00368842 -0.00027022 ...  0.00770947 -0.01678138
   0.00631208]
 [ 0.00284758  0.01925585  0.02077549 ... -0.00314734  0.0141667
  -0.03367088]
 [ 0.01973291  0.00506833 -0.04329247 ...  0.01556299 -0.00930898
   0.01117883]]


In [35]:
from pprint import pprint

# Minimal in-memory "vector store": a list of (embedding, document) pairs that
# matches each row of `embeddings` with the text at the same position in
# `sample_docs`.
my_vector_store = list(zip(embeddings, sample_docs))

# The store is already a list, so print it directly rather than copying it first.
pprint(my_vector_store)

[(array([ 0.03743297,  0.0098348 ,  0.0515419 , ..., -0.01589401,
        0.00090886, -0.04270858], dtype=float32),
  'Pigs are stout-bodied, short-legged, omnivorous mammals, with thick skin '
  'usually sparsely coated with short bristles'),
 (array([ 0.01551647, -0.00368842, -0.00027022, ...,  0.00770947,
       -0.01678138,  0.00631208], dtype=float32),
  'Cows are four-footed and have a large body. It has two horns, two eyes plus '
  'two ears and one nose and a mouth. Cows are herbivorous animals.'),
 (array([ 0.00284758,  0.01925585,  0.02077549, ..., -0.00314734,
        0.0141667 , -0.03367088], dtype=float32),
  'Chickens are average-sized fowls, characterized by smaller heads, short '
  'beaks and wings, and a round body perched on featherless legs.'),
 (array([ 0.01973291,  0.00506833, -0.04329247, ...,  0.01556299,
       -0.00930898,  0.01117883], dtype=float32),
  "NumPy (Numerical Python) is an open source Python library that's used in "
  "almost every field of science

In [26]:
# search: the free-text query that will be embedded and compared against the stored documents

search_text = "what is a hog?"

In [27]:
# compute search text embedding, using the same normalize_embeddings=True
# setting as the document embeddings so the two are directly comparable

search_vector = model.encode(
    sentences=search_text,
    normalize_embeddings=True,
)

print(search_vector)

[ 0.02994812 -0.02032637 -0.02611768 ... -0.00986228  0.0254914
 -0.01094405]


In [42]:
# compute cosine similarities between the query vector and every stored document

similarities = []

# Each store entry is an (embedding, sentence) pair, so unpack it in the loop
# header; the store is already a list and needs no copy to be iterated.
for embedding, sentence in my_vector_store:
    # util.cos_sim yields a 1x1 tensor when given a single pair of vectors.
    cos_sim = util.cos_sim(embedding, search_vector)
    similarities.append(
        (cos_sim, search_text, sentence)
    )
    print("Sentence: ", sentence)
    print("Search: ", search_text)
    print("Cosine similarity: ", cos_sim)
    print("\n")

# sort based on similarity, best match first. The key extracts a plain Python
# float with .item() so ordering does not depend on truth-testing the result of
# tensor-to-tensor comparisons; the stored tuples themselves are unchanged.
similarities = sorted(similarities, key=lambda t: t[0].item(), reverse=True)

print(similarities)

Sentence:  Pigs are stout-bodied, short-legged, omnivorous mammals, with thick skin usually sparsely coated with short bristles
Search:  what is a hog?
Cosine similarity:  tensor([[0.6212]])


Sentence:  Cows are four-footed and have a large body. It has two horns, two eyes plus two ears and one nose and a mouth. Cows are herbivorous animals.
Search:  what is a hog?
Cosine similarity:  tensor([[0.4819]])


Sentence:  Chickens are average-sized fowls, characterized by smaller heads, short beaks and wings, and a round body perched on featherless legs.
Search:  what is a hog?
Cosine similarity:  tensor([[0.5185]])


Sentence:  NumPy (Numerical Python) is an open source Python library that's used in almost every field of science and engineering. It's the universal standard for working with numerical data in Python, and it's at the core of the scientific Python and PyData ecosystems.
Search:  what is a hog?
Cosine similarity:  tensor([[0.4864]])


[(tensor([[0.6212]]), 'what is a hog?', 'Pi

In [45]:
# get top k: keep only the document text of the k best-scoring matches

k = 2

# Entries are (cos_sim, search_text, sentence) tuples; `similarities` was sorted
# best-first when it was built, so the first k entries are the top matches.
top_k = [sentence for _score, _query, sentence in similarities[:k]]

print(top_k)


['Pigs are stout-bodied, short-legged, omnivorous mammals, with thick skin usually sparsely coated with short bristles', 'Chickens are average-sized fowls, characterized by smaller heads, short beaks and wings, and a round body perched on featherless legs.']
