In [1]:
from sentence_transformers import SentenceTransformer, util

### Similarity Search

In [2]:


# Models already loaded in this session, keyed by model name, so repeated
# calls do not re-instantiate the same SentenceTransformer.
_MODEL_CACHE = {}


def get_embeddings(texts, model_name='all-mpnet-base-v2'):
    """Encode ``texts`` with the SentenceTransformer named ``model_name``.

    The model is instantiated on the first call for a given ``model_name``
    and reused on later calls instead of being loaded again each time.

    Args:
        texts: Input handed straight to ``model.encode`` (this notebook
            passes both a single string and a list of strings).
        model_name: Name passed to ``SentenceTransformer(...)``.

    Returns:
        Whatever ``model.encode(texts)`` returns for the given input.
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        # First request for this model name: load it once and remember it.
        model = SentenceTransformer(model_name)
        _MODEL_CACHE[model_name] = model
    return model.encode(texts)


# Embed the question and the two candidate passages separately.
query_text = 'How big is London'
passage_texts = [
    'London has 9,787,426 inhabitants at the 2011 census',
    'London is known for its finacial district',
]
query_embedding = get_embeddings(query_text)
passage_embedding = get_embeddings(passage_texts)

# Dot-product score of the query against each passage.
dot_scores = util.dot_score(query_embedding, passage_embedding)
print("Similarity:", dot_scores)

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


Similarity: tensor([[0.6286, 0.4823]])


In [3]:

# Same query/passage comparison, scored with cosine similarity.
cosine_similarity = util.cos_sim(query_embedding, passage_embedding)
print("Similarity:", cosine_similarity)

Similarity: tensor([[0.6286, 0.4823]])


### Cross-Encoders: Relevance Scoring and Contradiction Detection

In [5]:
from sentence_transformers import CrossEncoder

# Score each (query, passage) pair with the ms-marco cross-encoder.
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

query = 'How many people live in Berlin?'
passages = [
    'Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.',
    'Berlin is well known for its museums.',
]

# One (query, passage) tuple per passage, in the order listed above.
scores = model.predict([(query, passage) for passage in passages])

scores

array([ 9.218912 , -4.0780315], dtype=float32)

In [7]:
model = CrossEncoder('cross-encoder/nli-deberta-v3-base')

# (first sentence, second sentence) pairs to classify.
sentence_pairs = [
    ('A man is eating pizza', 'A man eats something'),
    ('A black race car starts up in front of a crowd of people.', 'A man is driving down a lonely road.'),
]
scores = model.predict(sentence_pairs)

# For each pair, take the index of the highest score and look up its label.
label_mapping = ['contradiction', 'entailment', 'neutral']
predicted_indices = scores.argmax(axis=1)
labels = [label_mapping[index] for index in predicted_indices]
labels

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['entailment', 'contradiction']

In [15]:
# Inspect the passage embeddings computed in the similarity-search cell.
passage_embedding

array([[-0.02732209,  0.00409229, -0.02857092, ..., -0.00056617,
        -0.00517719, -0.00318573],
       [-0.04808873, -0.01012499, -0.02384099, ...,  0.01751459,
         0.00476047, -0.00586022]], dtype=float32)

### Change the Maximum Input Sequence Length

In [17]:
model = SentenceTransformer('all-mpnet-base-v2')

# Report the limit the model starts with.
default_limit = model.max_seq_length
print('Max Sequence Length:', default_limit)

# Raise the limit and report it again.
# NOTE(review): confirm the underlying transformer supports 512 tokens.
model.max_seq_length = 512
updated_limit = model.max_seq_length
print('Max Sequence Length:', updated_limit)

Max Sequence Length: 384
Max Sequence Length: 512


In [18]:
import pickle
# Bundle the passages with their embeddings and write both to disk.
sentences = [
    'London has 9,787,426 inhabitants at the 2011 census',
    'London is known for its finacial district',
]
payload = {'sentences': sentences, 'embeddings': passage_embedding}

with open('embeddings.pkl', 'wb') as fOut:
    pickle.dump(payload, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Read the stored dictionary back, then unpack its two entries.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('embeddings.pkl', 'rb') as fIn:
    data = pickle.load(fIn)

sentences, embeddings = data['sentences'], data['embeddings']

### Pairwise Sentence Similarity

In [20]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

# Sentences to compare against one another
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

# Encode every sentence in one call, keeping the result as a tensor
embeddings = model.encode(sentences, convert_to_tensor=True)

# Cosine similarity of each sentence against every sentence
cosine_scores = util.cos_sim(embeddings, embeddings)

# Collect each unordered pair (i < j) exactly once, skipping self-comparisons
num_sentences = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(num_sentences - 1)
    for j in range(i + 1, num_sentences)
]

# Highest similarity first
pairs.sort(key=lambda pair: pair['score'], reverse=True)

# Show the ten most similar pairs
for pair in pairs[:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6788
I love pasta 		 Do you like pizza? 		 Score: 0.5096
I love pasta 		 The new movie is so great 		 Score: 0.2560
I love pasta 		 The new movie is awesome 		 Score: 0.2440
A man is playing guitar 		 The cat plays in the garden 		 Score: 0.2105
The new movie is awesome 		 Do you like pizza? 		 Score: 0.1969
The new movie is so great 		 Do you like pizza? 		 Score: 0.1692
The cat sits outside 		 A woman watches TV 		 Score: 0.1310
The cat plays in the garden 		 Do you like pizza? 		 Score: 0.0900
