In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Instantiate the SentenceTransformer checkpoint named "all-MiniLM-L6-v2".
# Later cells call model.encode(...) and model.similarity(...) on this object.
model = SentenceTransformer("all-MiniLM-L6-v2")



In [3]:
# Toy corpus: four short sentences that the next cell wraps into
# {"index", "data"} records.
docs = [
    "The dog jumped over the cat",
    "The cat jumped over the dog",
    "It is very warm today",
    "The cat is yellow and the dog is red",
]

In [4]:
# Wrap every sentence in a record that carries its position within `docs`.
documents = [
    {"index": position, "data": sentence}
    for position, sentence in enumerate(docs)
]
documents

[{'index': 0, 'data': 'The dog jumped over the cat'},
 {'index': 1, 'data': 'The cat jumped over the dog'},
 {'index': 2, 'data': 'It is very warm today'},
 {'index': 3, 'data': 'The cat is yellow and the dog is red'}]

In [5]:
# Corpus that gets embedded and searched: one {'id', 'text'} record per
# sentence, with ids assigned sequentially starting at 1.
dataset = [
    {'id': doc_id, 'text': text}
    for doc_id, text in enumerate(
        (
            'The dog jumped over the cat',
            'The cat jumped over the dog',
            'It is very warm today',
            'The cat is yellow and the dog is red',
            'The dog jumped over the purple cow',
        ),
        start=1,
    )
]

In [7]:
# Encode each record's text on its own and pair the resulting vector with the
# record id, giving a list of (id, embedding) tuples in `dataset` order.
document_embeddings = [
    (
        record.get('id'),
        model.encode(record.get('text'), clean_up_tokenization_spaces=False),
    )
    for record in dataset
]

document_embeddings

[(1,
  array([ 5.44742756e-02,  3.71413566e-02,  7.23646879e-02,  7.01894835e-02,
         -5.60807297e-03, -2.30735517e-03, -3.77298556e-02,  1.51365120e-02,
          9.50729940e-03, -3.03878740e-04,  5.88888414e-02,  2.10996270e-02,
          6.14684308e-03, -6.37330860e-02,  1.38773452e-02, -9.31993593e-03,
         -1.19946703e-01, -2.11697072e-03,  6.35646209e-02, -2.08781026e-02,
         -2.80396361e-02, -3.85088623e-02,  2.66364682e-03, -5.36900423e-02,
         -2.24582162e-02,  4.27320674e-02, -6.99243620e-02, -6.33601025e-02,
          1.81462858e-02, -1.30238328e-02, -1.66832563e-02, -1.05505204e-02,
         -3.54123898e-02,  5.32195866e-02, -6.18179962e-02, -6.84991181e-02,
          6.55241832e-02, -1.77633934e-04,  4.45996523e-02,  1.20984331e-01,
         -4.50547561e-02,  1.97268352e-02, -1.27503229e-02,  1.16575370e-03,
         -3.26760225e-02,  6.10489957e-02, -1.04520954e-02, -6.91033676e-02,
          3.60625982e-02,  4.39831913e-02, -1.22471740e-02,  9.17075500

In [19]:
# Free-text query that the stored documents are ranked against.
query = "The dog and the cat are not the same."
# The query is passed to encode() as a one-element list, whereas the documents
# were encoded one string at a time and with clean_up_tokenization_spaces=False.
# NOTE(review): presumably the list input yields a (1, dim) batch, which would
# explain the 1x1 similarity tensors shown later — confirm against encode() docs.
# NOTE(review): confirm the missing clean_up_tokenization_spaces kwarg is intended.
query_embedding = model.encode([query])

In [20]:
# Score every stored embedding against the query and keep (doc_id, rank)
# pairs in the same order as `document_embeddings`.
results = [
    (doc_id, model.similarity(doc_embedding, query_embedding))
    for doc_id, doc_embedding in document_embeddings
]
results

[(1, tensor([[0.5749]])),
 (2, tensor([[0.5860]])),
 (3, tensor([[-0.0084]])),
 (4, tensor([[0.6711]])),
 (5, tensor([[0.3221]]))]

In [23]:
# Order the (doc_id, rank) pairs from highest to lowest rank.
results = sorted(results, key=lambda hit: hit[1], reverse=True)
results

[(4, tensor([[0.6711]])),
 (2, tensor([[0.5860]])),
 (1, tensor([[0.5749]])),
 (5, tensor([[0.3221]])),
 (3, tensor([[-0.0084]]))]

In [24]:
# Index the corpus by id once, so each hit is resolved with a dict lookup
# instead of rescanning `dataset` for every result.
dataset_by_id = {}
for entry in dataset:
    # setdefault keeps the first record seen for an id, which is the record a
    # front-to-back scan of `dataset` would return.
    dataset_by_id.setdefault(entry['id'], entry)

# Print each hit with the record it refers to, in the order of `results`.
for dataset_id, rank in results:
    # Raises KeyError (naming the id) if a hit references an unknown record.
    doc = dataset_by_id[dataset_id]
    print(dataset_id, rank, doc)

4 tensor([[0.6711]]) {'id': 4, 'text': 'The cat is yellow and the dog is red'}
2 tensor([[0.5860]]) {'id': 2, 'text': 'The cat jumped over the dog'}
1 tensor([[0.5749]]) {'id': 1, 'text': 'The dog jumped over the cat'}
5 tensor([[0.3221]]) {'id': 5, 'text': 'The dog jumped over the purple cow'}
3 tensor([[-0.0084]]) {'id': 3, 'text': 'It is very warm today'}
