In [None]:
%%capture

# Change the working directory to the parent folder (output is silenced by
# `%%capture`). NOTE(review): presumably needed so that the `baseline` package
# imported further down resolves — confirm. Re-running this cell moves up one
# more level each time.
%cd ..

# Similarity exploration

In [None]:
from sentence_transformers import SentenceTransformer, util


In [None]:
# Name of the sentence-embedding model handed to `SentenceTransformer`.
MODEL_NAME = "all-MiniLM-L6-v2"

model = SentenceTransformer(MODEL_NAME)
model  # trailing expression: display the loaded pipeline


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:
# Two lists of sentences, written as aligned pairs: the i-th entry of
# `sentences1` sits next to the i-th entry of `sentences2`.
sentence_pairs = [
    ("The cat sits outside", "The dog plays in the garden"),
    ("A man is playing guitar", "A woman watches TV"),
    ("The new movie is awesome", "The new movie is so great"),
]
sentences1 = [first for first, _second in sentence_pairs]
sentences2 = [second for _first, second in sentence_pairs]


In [None]:
# Embed each sentence list as one batch, first `sentences1`, then `sentences2`.
# `convert_to_tensor=True` asks `encode` for a tensor (its shape is printed).
embeddings1, embeddings2 = (
    model.encode(batch, convert_to_tensor=True)
    for batch in (sentences1, sentences2)
)
print(embeddings1.shape)
# Both lists hold the same number of sentences, so the shapes must agree.
assert embeddings1.shape == embeddings2.shape


torch.Size([3, 384])


In [None]:
# Compute cosine-similarities
# Every sentence of `sentences1` is compared with every sentence of
# `sentences2`: `cosine_scores[i][j]` is the score for `sentences1[i]` versus
# `sentences2[j]`, hence the 3 x 3 shape displayed below.
cosine_scores = util.cos_sim(embeddings1, embeddings2)
cosine_scores.shape


torch.Size([3, 3])

In [None]:
# Output the aligned pairs with their score, i.e. the diagonal entries
# `cosine_scores[i][i]` of the similarity matrix.
row_template = "{} \t\t {} \t\t Score: {:.4f}"
for i, first in enumerate(sentences1):
    print(row_template.format(first, sentences2[i], cosine_scores[i][i]))


The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939


In [None]:
# Full similarity matrix: row i belongs to `sentences1[i]`, column j to
# `sentences2[j]`; the diagonal holds the pair scores printed above.
cosine_scores


tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0327, -0.0136],
        [ 0.0543, -0.0502,  0.8939]])

For each sentence in `sentences1`, rank the sentences in `sentences2` from most to least similar.

In [None]:
import torch

# do the ranking
# Sort every row (dim=-1) from highest to lowest score. The result bundles the
# sorted `values` with the `indices` of the original columns, i.e. which entry
# of `sentences2` each sorted score belongs to.
ranks = torch.sort(cosine_scores, dim=-1, descending=True)
ranks


torch.return_types.sort(
values=tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0136, -0.0327],
        [ 0.8939,  0.0543, -0.0502]]),
indices=tensor([[0, 1, 2],
        [0, 2, 1],
        [2, 0, 1]]))

In [None]:
# Gap between every score and the best score of its row. Slicing with `:1`
# keeps a column axis so the subtraction broadcasts across each row.
best_in_row = ranks.values[:, :1]
diff_with_top = best_in_row - ranks.values

# `ranks` unpacks into the sorted scores and the matching `sentences2` columns.
sorted_scores, sorted_columns = ranks
for i, diff in enumerate(diff_with_top):
    print(f"Most similar sentences to {sentences1[i]!r}:")
    for score, column in zip(sorted_scores[i], sorted_columns[i]):
        print((sentences2[column], score))
    print(f"Delta similarity with rank 1: {diff}")
    print()


Most similar sentences to 'The cat sits outside':
('The dog plays in the garden', tensor(0.2838))
('A woman watches TV', tensor(0.1310))
('The new movie is so great', tensor(-0.0029))
Delta similarity with rank 1: tensor([0.0000, 0.1527, 0.2866])

Most similar sentences to 'A man is playing guitar':
('The dog plays in the garden', tensor(0.2277))
('The new movie is so great', tensor(-0.0136))
('A woman watches TV', tensor(-0.0327))
Delta similarity with rank 1: tensor([0.0000, 0.2413, 0.2604])

Most similar sentences to 'The new movie is awesome':
('The new movie is so great', tensor(0.8939))
('The dog plays in the garden', tensor(0.0543))
('A woman watches TV', tensor(-0.0502))
Delta similarity with rank 1: tensor([0.0000, 0.8396, 0.9441])



# Preprocessing: remove speaker roles

In [None]:
from baseline.multi_choice.utils_multiple_choice import MuTualProcessor
from baseline.conf import DATA_DIR

# The `mutual_plus` sub-directory of the project's data directory.
mutual_plus_dir = DATA_DIR / "mutual_plus"

p = MuTualProcessor()
test_split = p.get_test_examples(mutual_plus_dir)
test_split[0]  # peek at the first example


read files:   0%|          | 0/886 [00:00<?, ?it/s]

read files: 100%|██████████| 886/886 [00:00<00:00, 2514.39it/s]


<InputExample(context=M: Well, that's great that you have happy experiences of teaching in Indonesia and following up on what you just mentioned. What would you recommend for students who do not live in an English speaking country? I don't know about perfecting but they want at least to be able to communicate decently. How can they go about this? F: Yeah, it is really hard that is the real struggle because right now I do live in Holland. But I really don't socialize much with Dutch People and my boyfriend's English is so good that we just basically speak English all the time. So I have to make a real effort to practice. There isn't as much listening exposure as I want, all I have to do is turn on the TV., endings=['M: So, now you are in Holland and you and your boyfriend just basically '
 'speak English all the time?',
 'M: Really? Now you are in Indonesia and you and your boyfriend just '
 'basically speak Chinese all the time?',
 'M: I’m sorry, I don’t understand. Could you say it ag

To keep using the dataset that ships with the repo, together with the utilities
accompanying it, we can read the examples exactly as the original code does and
preserve the datapoint structure by processing them in memory. The processing is
done in place, so `deepcopy` the original objects first if you don't want to lose them.

In [None]:
# Strip the speaker prefixes ("M:" / "F:") from every example. The objects in
# `test_split` are modified in place, so the speaker-tagged text is gone from
# them afterwards (compare this `test_split[0]` display with the earlier one).
for dp in test_split:
    dp.inplace_remove_speakers()
test_split[0]


<InputExample(context=Well, that's great that you have happy experiences of teaching in Indonesia and following up on what you just mentioned. What would you recommend for students who do not live in an English speaking country? I don't know about perfecting but they want at least to be able to communicate decently. How can they go about this? Yeah, it is really hard that is the real struggle because right now I do live in Holland. But I really don't socialize much with Dutch People and my boyfriend's English is so good that we just basically speak English all the time. So I have to make a real effort to practice. There isn't as much listening exposure as I want, all I have to do is turn on the TV., endings=['So, now you are in Holland and you and your boyfriend just basically speak '
 'English all the time?',
 'Really? Now you are in Indonesia and you and your boyfriend just basically '
 'speak Chinese all the time?',
 'I’m sorry, I don’t understand. Could you say it again?',
 "So, yo

In [None]:
# Read the training examples of the `mutual_plus` data with the same processor.
train_split = p.get_train_examples(DATA_DIR / "mutual_plus")

read files: 100%|██████████| 7088/7088 [00:00<00:00, 17101.57it/s]

{'answers': 'B', 'options': ['F:  I am so glad to hear that you will work in the library for your friend.', 'F:  Sure. You can make more money if you work as a waiter in the restaurant.', 'F:  You worked as a waiter last year. It seems that you want make more money.', 'F: Just a minute! I do not quite follow what you are saying, would you mind repeating that?'], 'article': "F: Hi, can I help you? M: I hope so. My name is Mark. I'm... F: Don't I remember you from last year? You worked in, uh, where was it? The art library? M: You have a good memory. Yeah, that was me, and I really enjoyed the work. But for this year. F: Well, if you come in earlier. You could probably have gotten the library job again, but now... M: I plan to get a job in a restaurant this year. I really need it to make more money and working as a waiter. There always the tips. But I've tried a ton of places and I haven't found anything, and then my friend Susan. She takes photography classes in Harrison Hall. And she s




In [None]:
# Read the dev examples of the `mutual_plus` data with the same processor.
dev_split = p.get_dev_examples(DATA_DIR / "mutual_plus")

read files: 100%|██████████| 886/886 [00:00<00:00, 1620.35it/s]

{'answers': 'C', 'options': ['F: Oh, you have been a bus driver for about one year.', 'F: You have been a bus driver for about 10 years.', 'F: Wow. You have been a bus driver for about 11 years.', 'F: I’m sorry, I didn’t catch that. Would you mind speaking more slowly?'], 'article': 'F: So, how long have you been driving this bus? M: For about a year. But before I drove this bus, I had been driving buses for 10 years.', 'id': 'data/mutual_plus/dev/dev_681.txt'}





In [None]:
# NOTE(review): this rebinds `test_split` to the result of a new read, so the
# speaker-stripped list produced by the earlier in-place loop is no longer
# reachable under this name — confirm that this is intended.
test_split = p.get_test_examples(DATA_DIR / "mutual_plus")

read files:   0%|          | 0/886 [00:00<?, ?it/s]

read files: 100%|██████████| 886/886 [00:00<00:00, 19078.53it/s]

{'answers': ' ', 'options': ['M: So, now you are in Holland and you and your boyfriend just basically speak English all the time?', 'M: Really? Now you are in Indonesia and you and your boyfriend just basically speak Chinese all the time?', 'M: I’m sorry, I don’t understand. Could you say it again?', "M: So, you don't have to make a real effort to practice and now you are in Sweden? Okay, I see."], 'article': "M: Well, that's great that you have happy experiences of teaching in Indonesia and following up on what you just mentioned. What would you recommend for students who do not live in an English speaking country? I don't know about perfecting but they want at least to be able to communicate decently. How can they go about this? F: Yeah, it is really hard that is the real struggle because right now I do live in Holland. But I really don't socialize much with Dutch People and my boyfriend's English is so good that we just basically speak English all the time. So I have to make a real 




In [None]:
# The test record printed above carries a blank answer (' ') instead of a
# letter; check which offset from "A" such a blank produces.
el = " "
offset_from_a = ord(el) - ord("A")
str(offset_from_a)

'-33'