In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

# Hugging Face Hub id of the checkpoint used for both the tokenizer and the model.
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

text = "Your input text goes here."
inputs = tokenizer(text, return_tensors="pt")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

# Mean-pool the token embeddings into one vector per input (you can adjust the
# aggregation method). The mean is weighted by the attention mask so that
# padding positions do not dilute the result if `text` is later replaced by a
# padded batch. For a single unpadded string the mask is all ones, so this
# gives the same value as a plain `.mean(dim=1)`.
token_embeddings = outputs.last_hidden_state
token_mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
# clamp() guards against a division by zero for an all-masked row.
embeddings = (token_embeddings * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1e-9)
print(embeddings)

  from .autonotebook import tqdm as notebook_tqdm


tensor([[-5.2826e-02, -2.0603e-01, -1.5062e-02,  8.0527e-02,  8.0432e-03,
          4.2127e-02,  5.2671e-02,  1.3812e-02,  3.6934e-02, -6.2355e-02,
         -1.9774e-02,  6.6487e-02,  1.9214e-01,  2.2150e-01,  2.3683e-02,
         -1.8020e-01,  8.9604e-02, -5.7049e-02, -6.4599e-02,  4.4140e-02,
         -7.2506e-02,  5.6609e-02, -6.1662e-02,  6.8534e-02, -3.7086e-03,
          3.1052e-02, -2.4233e-02, -3.4827e-02, -1.6331e-02, -1.6136e-01,
         -1.1168e-01,  6.6041e-02, -5.1588e-02, -2.0026e-01,  4.9207e-06,
         -1.1117e-01, -7.3784e-03,  6.1920e-03, -1.2841e-01,  3.0920e-02,
         -1.3898e-01,  3.1287e-01, -9.0167e-02,  6.3635e-02,  1.2263e-01,
         -2.3499e-02,  9.0801e-02,  2.0635e-01,  2.4918e-02,  1.3234e-01,
          3.2308e-02, -1.9407e-01,  9.4426e-02, -1.5740e-01,  3.2649e-01,
         -5.0985e-02,  7.4780e-02, -3.9603e-02,  3.2042e-02,  2.1790e-01,
          7.7504e-02,  1.5769e-01, -2.0265e-02, -6.4309e-02,  1.7502e-01,
          1.0918e-01,  1.0677e-01, -1.

In [6]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model by name.
model = SentenceTransformer('all-mpnet-base-v2')

# The sentences that will be compared against each other.
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

# Encode all sentences in one call, asking for tensor output.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Cosine similarity of the embeddings against themselves.
cosine_scores = util.cos_sim(embeddings, embeddings)

# Collect every unordered pair (i < j) exactly once, skipping self-pairs.
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(len(cosine_scores) - 1)
    for j in range(i + 1, len(cosine_scores))
]

# Order the pairs from highest to lowest score.
pairs.sort(key=lambda x: x['score'], reverse=True)

# Print the ten best-scoring pairs.
for pair in pairs[:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 586kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 64.0kB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 5.28MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 284kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 55.6kB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 191kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [01:23<00:00, 5.24MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 39.9kB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 702kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 182kB/s]
Downloading (…)8e1d/train_script.py: 100%|████

The new movie is awesome 		 The new movie is so great 		 Score: 0.9101
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6738
I love pasta 		 Do you like pizza? 		 Score: 0.5189
I love pasta 		 The new movie is so great 		 Score: 0.2035
The new movie is awesome 		 Do you like pizza? 		 Score: 0.1931
The new movie is so great 		 Do you like pizza? 		 Score: 0.1788
I love pasta 		 The new movie is awesome 		 Score: 0.1752
I love pasta 		 The cat plays in the garden 		 Score: 0.1108
The cat sits outside 		 The new movie is so great 		 Score: 0.1059
The cat plays in the garden 		 Do you like pizza? 		 Score: 0.0911


In [None]:
class SimilarityFinder:
    """Wrap a SentenceTransformer model to embed sentences and compare embeddings."""

    def __init__(self, model_name):
        """Create the SentenceTransformer named by ``model_name`` and keep it on ``self.model``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def calculate_embeddings(self, sentences):
        """Return ``self.model.encode(sentences)`` with tensor output requested."""
        encoded = self.model.encode(sentences, convert_to_tensor=True)
        return encoded

    def calculate_similarity(self, embeddings1, embeddings2):
        """Return the result of ``util.cos_sim`` for the two embedding arguments."""
        similarity = util.cos_sim(embeddings1, embeddings2)
        return similarity