In [None]:
%%capture

%cd ..

# Similarity exploration

In [None]:
from sentence_transformers import SentenceTransformer, util


In [None]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model; the bare name on the
# last line makes the notebook display the model's module stack.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
model


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:
# Two index-aligned lists of sentences, written here as (sentences1[i], sentences2[i])
# pairs and then split column-wise into the two lists used below.
sentences1, sentences2 = map(
    list,
    zip(
        ("The cat sits outside", "The dog plays in the garden"),
        ("A man is playing guitar", "A woman watches TV"),
        ("The new movie is awesome", "The new movie is so great"),
    ),
)


In [None]:
# Encode each sentence list into a tensor of embeddings (one row per sentence).
embeddings1, embeddings2 = (
    model.encode(batch, convert_to_tensor=True) for batch in (sentences1, sentences2)
)
print(embeddings1.shape)
# Sanity check: both lists must yield embedding tensors of the same shape.
assert embeddings1.shape == embeddings2.shape


torch.Size([3, 384])


In [None]:
# Pairwise cosine similarity between the two embedding sets:
# entry [i, j] scores sentences1[i] against sentences2[j].
cosine_scores = util.cos_sim(a=embeddings1, b=embeddings2)
cosine_scores.shape


torch.Size([3, 3])

In [None]:
# Print every index-aligned pair together with its score, i.e. the diagonal
# entries of the similarity matrix.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for i, left_sentence in enumerate(sentences1):
    print(row_template.format(left_sentence, sentences2[i], cosine_scores[i][i]))


The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939


In [None]:
# Display the full similarity matrix: row i corresponds to sentences1[i],
# column j to sentences2[j].
cosine_scores


tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0327, -0.0136],
        [ 0.0543, -0.0502,  0.8939]])

For each sentence in `sentences1`, rank the sentences in `sentences2` from most to least similar.

In [None]:
import torch

# Sort every row of the score matrix from highest to lowest similarity.
# The result carries both the sorted `values` and the original column `indices`.
ranks = torch.sort(input=cosine_scores, descending=True, dim=-1)
ranks


torch.return_types.sort(
values=tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0136, -0.0327],
        [ 0.8939,  0.0543, -0.0502]]),
indices=tensor([[0, 1, 2],
        [0, 2, 1],
        [2, 0, 1]]))

In [None]:
# Gap between the best score of each row and every score in that row
# (the first column is therefore always zero). The `:1` slice keeps the
# column dimension so the subtraction broadcasts row-wise.
diff_with_top = ranks.values[:, :1] - ranks.values

# For each query sentence, list the candidates in ranked order, then the gaps.
per_query = zip(ranks.values, ranks.indices, diff_with_top)
for i, (sorted_scores, sorted_cols, diff) in enumerate(per_query):
    print(f"Most similar sentences to {sentences1[i]!r}:")
    for score, col in zip(sorted_scores, sorted_cols):
        # `col` is the candidate's position in sentences2 before sorting.
        print((sentences2[col], score))
    print(f"Delta similarity with rank 1: {diff}")
    print()


Most similar sentences to 'The cat sits outside':
('The dog plays in the garden', tensor(0.2838))
('A woman watches TV', tensor(0.1310))
('The new movie is so great', tensor(-0.0029))
Delta similarity with rank 1: tensor([0.0000, 0.1527, 0.2866])

Most similar sentences to 'A man is playing guitar':
('The dog plays in the garden', tensor(0.2277))
('The new movie is so great', tensor(-0.0136))
('A woman watches TV', tensor(-0.0327))
Delta similarity with rank 1: tensor([0.0000, 0.2413, 0.2604])

Most similar sentences to 'The new movie is awesome':
('The new movie is so great', tensor(0.8939))
('The dog plays in the garden', tensor(0.0543))
('A woman watches TV', tensor(-0.0502))
Delta similarity with rank 1: tensor([0.0000, 0.8396, 0.9441])



# Preprocessing: remove speaker roles

In [None]:
from baseline.multi_choice.utils_multiple_choice import MuTualProcessor
from baseline.conf import DATA_DIR

# Read the test examples of the "mutual_plus" dataset through the repo's own
# processor, so the examples keep the structure the baseline code expects.
p = MuTualProcessor()
test_split = p.get_test_examples(DATA_DIR / "mutual_plus")
# Peek at the first example (the repr shows context, endings, label and id).
test_split[0]


read files: 100%|██████████| 886/886 [00:00<00:00, 18149.80it/s]


<InputExample(context=M: I'm not sure what to order to drink. Beer, white wine or red wine? F: They say red meat go with red wine, light colored meat go with white and oily foods are good with beer. M: I know chicken is white meat., endings=['F: So we should order a bottle of white wine.',
 'F: I drank too much white wine and got drunk.',
 'F: I didn’t hear you. Please could you tell me again?',
 'F: I think we can order beer to go with chicken.'], label=-33, id=test-data/mutual_plus/test/test_543.txt)>

To keep using the dataset that ships with the repo, together with its accompanying
utilities, we can read the examples exactly as the original code does and preserve
the datapoint structure by processing them in memory. The processing is done in place,
so `deepcopy` the original objects first if you don't want to lose them.

In [None]:
# Strip the speaker prefixes (e.g. "M:" / "F:") from every example.
# NOTE: this mutates the objects in `test_split` in place; the original
# text is gone afterwards unless a copy was made beforehand.
for dp in test_split:
    dp.inplace_remove_speakers()
# Show the first example again to confirm the speaker roles were removed.
test_split[0]


<InputExample(context=I'm not sure what to order to drink. Beer, white wine or red wine? They say red meat go with red wine, light colored meat go with white and oily foods are good with beer. I know chicken is white meat., endings=['So we should order a bottle of white wine.',
 'I drank too much white wine and got drunk.',
 'I didn’t hear you. Please could you tell me again?',
 'I think we can order beer to go with chicken.'], label=-33, id=test-data/mutual_plus/test/test_543.txt)>