In [1]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


## Explore STS model

In [2]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [3]:
# Try the STS model on two paraphrases expressing the same feeling.
sentences1 = ["I'm happy", "I'm full of happiness"]

# Encode each sentence on its own (two separate encode calls, one embedding each).
embedding_1, embedding_2 = (
    model.encode(sentence, convert_to_tensor=True) for sentence in sentences1
)

# Cosine similarity between the two embeddings; shown as the cell output.
util.pytorch_cos_sim(embedding_1, embedding_2)

tensor([[0.6003]])

In [4]:
# Two differently worded versions of the same opinion.
sentences3, sentence4 = 'The new movie is awesome', 'The new movie is so great'

# Embed each sentence separately, then compare the embeddings with cosine similarity.
embedding_3 = model.encode(sentences3, convert_to_tensor=True)
embedding_4 = model.encode(sentence4, convert_to_tensor=True)
score = util.pytorch_cos_sim(embedding_3, embedding_4)

# Display the resulting score tensor.
score

tensor([[0.8939]])

In [6]:
# Pull the bare similarity value out of the 1x1 score tensor.
# Move the tensor to the CPU first: Tensor.numpy() only works on CPU tensors,
# so without .cpu() this cell fails when the model runs on a GPU.
score.cpu().numpy()[0][0]

0.89390373

In [7]:
# Depends on the notebook-level `model`
# (SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')).
# NOTE(review): the original note here said "call in KNN" -- presumably the model must
# also be created wherever the KNN code uses this function; confirm.
def semantic_text_sim(title1, title2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded with the module-level ``model`` and the two
    embeddings are compared with ``util.cos_sim``.

    Args:
        title1: First text to compare.
        title2: Second text to compare.

    Returns:
        The single similarity value taken from the score tensor, as a NumPy scalar.
    """
    embedding_1 = model.encode(title1, convert_to_tensor=True)
    embedding_2 = model.encode(title2, convert_to_tensor=True)
    # Same helper the pairwise-comparison cell uses; for two single texts it
    # yields a 1x1 tensor.
    score = util.cos_sim(embedding_1, embedding_2)
    # Tensor.numpy() only works for CPU tensors, so move the score to the CPU
    # first; otherwise this raises when the model runs on a GPU.
    return score.cpu().numpy()[0][0]

In [8]:
semantic_text_sim("I always eat cheese", "I prefer foods with cheese")

0.7872645

In [9]:
# Opposite sentiment about the same topic: the recorded score is still high (0.83),
# so embedding cosine similarity on its own does not separate "hate" from "love" here.
semantic_text_sim("I hate disco", "I love disco")

0.83356345

## Explore Distance Measures

In [10]:
# NOTE(review): the second vector is all zeros, so its norm is 0 and the cosine
# distance divides by zero. The recorded output shows a warning raised from scipy's
# `uv / np.sqrt(uu * vv)` line, and the 0 it prints should not be read as "identical".
from scipy import spatial
spatial.distance.cosine([.833], [0])

  dist = 1.0 - uv / np.sqrt(uu * vv)


0

In [11]:
# Both inputs are one-element vectors with the same sign, i.e. they point in the same
# direction, so the cosine distance is 0 however different the magnitudes are.
# Cosine distance compares directions, not scalar magnitudes.
spatial.distance.cosine([.83], [0.00001])

0

## Explore NLI Model

In [12]:
from sentence_transformers import CrossEncoder
nli_model = CrossEncoder('cross-encoder/nli-roberta-base')

In [13]:
# Score two premise/hypothesis pairs; each row of `scores_nli` holds one score per NLI class.
scores_nli = nli_model.predict([
    ('A man is eating pizza', 'A man eats something'),
    ('A black race car starts up in front of a crowd of people.', 'A man is driving down a lonely road.'),
])

# Translate each row's highest-scoring column into its class name.
label_mapping = ['contradiction', 'entailment', 'neutral']
labels = [label_mapping[best_idx] for best_idx in scores_nli.argmax(axis=1)]
labels

['entailment', 'contradiction']

In [14]:
scores_nli

array([[-4.2572556 ,  3.5655646 ,  0.00601628],
       [ 5.398506  , -2.89872   , -1.6895918 ]], dtype=float32)

In [15]:
# Score one premise/hypothesis pair and keep only the entailment score,
# which is the middle of the three class scores.
scores_nli_2 = nli_model.predict([("I always eat cheese", "I prefer foods with cheese")])
scores_nli_2[0, 1]

2.744587

In [16]:
# Entailment score (middle class score) for a closer premise/hypothesis pair.
scores_nli_2 = nli_model.predict([("I snack on cheese", "I eat cheese")])
scores_nli_2[0, 1]

3.7934318

In [17]:
# Same two sentences as the earlier "always eat cheese" cell, with premise and
# hypothesis swapped; the recorded entailment score drops from 2.74 to -1.08.
# NOTE(review): a single bare pair (not a list of pairs) is passed here and the result
# is indexed with [1] rather than [0][1] -- presumably predict() returns a flat score
# vector for a single pair; confirm against the CrossEncoder docs.
scores_nli_2= nli_model.predict(("I prefer foods with cheese","I always eat cheese"))
scores_nli_2[1]

-1.0834459

In [18]:
# The antonym pair that scored high on cosine similarity earlier. Reading the recorded
# output in `label_mapping` order (contradiction, entailment, neutral), the largest
# score is the first one, i.e. contradiction.
scores_nli_3 = nli_model.predict([("I hate disco", "I love disco")])
scores_nli_3

array([[ 5.3351293, -2.6727824, -1.9018666]], dtype=float32)

In [19]:
# Corpus for the pairwise-similarity experiment: one flat list of sentences.
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

# Encode the whole list in a single call.
embeddings = model.encode(sentences, convert_to_tensor=True)

In [20]:
embeddings

tensor([[ 0.1392,  0.0030,  0.0470,  ...,  0.0641, -0.0163,  0.0636],
        [ 0.0227, -0.0014, -0.0056,  ..., -0.0225,  0.0846, -0.0283],
        [-0.1025, -0.0541,  0.0108,  ...,  0.1097,  0.0851, -0.0738],
        ...,
        [ 0.0054, -0.0920,  0.0140,  ...,  0.0167, -0.0086, -0.0424],
        [-0.0842, -0.0592, -0.0010,  ..., -0.0157,  0.0764,  0.0389],
        [-0.1047,  0.0302, -0.0049,  ...,  0.0555,  0.0570, -0.0948]])

In [21]:
# Similarity of every sentence against every other sentence (square score matrix).
cosine_scores = util.cos_sim(embeddings, embeddings)

# Collect each unordered pair exactly once by walking the upper triangle (j > i),
# keeping the two sentence indices next to their score.
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(len(cosine_scores) - 1)
    for j in range(i + 1, len(cosine_scores))
]

In [22]:
cosine_scores

tensor([[ 1.0000,  0.0363,  0.0081, -0.0247,  0.6788,  0.1310, -0.0029,  0.0254],
        [ 0.0363,  1.0000, -0.0368,  0.0093,  0.2105, -0.0327, -0.0136,  0.0116],
        [ 0.0081, -0.0368,  1.0000,  0.2440,  0.0230,  0.0359,  0.2560,  0.5096],
        [-0.0247,  0.0093,  0.2440,  1.0000,  0.0275, -0.0502,  0.8939,  0.1969],
        [ 0.6788,  0.2105,  0.0230,  0.0275,  1.0000,  0.0629,  0.0591,  0.0900],
        [ 0.1310, -0.0327,  0.0359, -0.0502,  0.0629,  1.0000, -0.0509,  0.0417],
        [-0.0029, -0.0136,  0.2560,  0.8939,  0.0591, -0.0509,  1.0000,  0.1692],
        [ 0.0254,  0.0116,  0.5096,  0.1969,  0.0900,  0.0417,  0.1692,  1.0000]])

In [23]:
pairs

[{'index': [0, 1], 'score': tensor(0.0363)},
 {'index': [0, 2], 'score': tensor(0.0081)},
 {'index': [0, 3], 'score': tensor(-0.0247)},
 {'index': [0, 4], 'score': tensor(0.6788)},
 {'index': [0, 5], 'score': tensor(0.1310)},
 {'index': [0, 6], 'score': tensor(-0.0029)},
 {'index': [0, 7], 'score': tensor(0.0254)},
 {'index': [1, 2], 'score': tensor(-0.0368)},
 {'index': [1, 3], 'score': tensor(0.0093)},
 {'index': [1, 4], 'score': tensor(0.2105)},
 {'index': [1, 5], 'score': tensor(-0.0327)},
 {'index': [1, 6], 'score': tensor(-0.0136)},
 {'index': [1, 7], 'score': tensor(0.0116)},
 {'index': [2, 3], 'score': tensor(0.2440)},
 {'index': [2, 4], 'score': tensor(0.0230)},
 {'index': [2, 5], 'score': tensor(0.0359)},
 {'index': [2, 6], 'score': tensor(0.2560)},
 {'index': [2, 7], 'score': tensor(0.5096)},
 {'index': [3, 4], 'score': tensor(0.0275)},
 {'index': [3, 5], 'score': tensor(-0.0502)},
 {'index': [3, 6], 'score': tensor(0.8939)},
 {'index': [3, 7], 'score': tensor(0.1969)},
 {'i

In [24]:
# Rank every sentence pair from most to least similar.
pairs = sorted(pairs, key=lambda entry: entry['score'], reverse=True)

# Report the ten best-matching pairs together with their similarity score.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for pair in pairs[:10]:
    left, right = pair['index']
    print(row_template.format(sentences[left], sentences[right], pair['score']))

The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6788
I love pasta 		 Do you like pizza? 		 Score: 0.5096
I love pasta 		 The new movie is so great 		 Score: 0.2560
I love pasta 		 The new movie is awesome 		 Score: 0.2440
A man is playing guitar 		 The cat plays in the garden 		 Score: 0.2105
The new movie is awesome 		 Do you like pizza? 		 Score: 0.1969
The new movie is so great 		 Do you like pizza? 		 Score: 0.1692
The cat sits outside 		 A woman watches TV 		 Score: 0.1310
The cat plays in the garden 		 Do you like pizza? 		 Score: 0.0900
