In [15]:
from transformers import AutoTokenizer, AutoModel
import torch

In [16]:
# Load the tokenizer and the model from the same pretrained checkpoint:
MODEL_NAME = 'sentence-transformers/bert-base-nli-mean-tokens'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Sentences that will be tokenized, embedded and compared below:
sent = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

In [17]:
# Tokenize all sentences in one batched call instead of looping over the
# deprecated encode_plus() and stacking the per-sentence tensors by hand.
# Every sentence is truncated/padded to exactly 128 tokens, so the returned
# tensors have one row per sentence and 128 columns.
encoded = tokenizer(sent, max_length=128, truncation=True,
                    padding='max_length', return_tensors='pt')
# Keep only the two entries used downstream, as a plain dict of tensors
token = {key: encoded[key] for key in ('input_ids', 'attention_mask')}

In [18]:
# Run the tokenized batch through the model. This is inference only, so
# autograd is disabled to avoid building a computation graph for the outputs.
with torch.no_grad():
    output = model(**token)
output.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [19]:
# The dense per-token vector representations are stored under the output's
# 'last_hidden_state' entry:
embeddings = output['last_hidden_state']
embeddings

tensor([[[-0.0692,  0.6230,  0.0354,  ...,  0.8033,  1.6314,  0.3281],
         [ 0.0367,  0.6842,  0.1946,  ...,  0.0848,  1.4747, -0.3008],
         [-0.0121,  0.6543, -0.0727,  ..., -0.0326,  1.7717, -0.6812],
         ...,
         [ 0.1953,  1.1085,  0.3390,  ...,  1.2826,  1.0114, -0.0728],
         [ 0.0902,  1.0288,  0.3297,  ...,  1.2940,  0.9865, -0.1113],
         [ 0.1240,  0.9737,  0.3933,  ...,  1.1359,  0.8768, -0.1043]],

        [[-0.3212,  0.8251,  1.0554,  ..., -0.1855,  0.1517,  0.3937],
         [-0.7146,  1.0297,  1.1217,  ...,  0.0331,  0.2382, -0.1563],
         [-0.2352,  1.1353,  0.8594,  ..., -0.4310, -0.0272, -0.2968],
         ...,
         [-0.5400,  0.3236,  0.7839,  ...,  0.0022, -0.2994,  0.2659],
         [-0.5643,  0.3187,  0.9576,  ...,  0.0342, -0.3030,  0.1878],
         [-0.5172,  0.3599,  0.9336,  ...,  0.0243, -0.2232,  0.1672]],

        [[-0.7576,  0.8399, -0.3792,  ...,  0.1271,  1.2514,  0.1365],
         [-0.6591,  0.7613, -0.4662,  ...,  0

In [20]:
# Mean pooling needs the attention mask; grab it and check its shape before
# expanding it to match the embeddings:
att_mask = token['attention_mask']
att_mask.size()

torch.Size([4, 128])

In [21]:
# Append a trailing axis to the mask, broadcast it to the embeddings' shape,
# and convert to float so it can be multiplied with the embeddings.
mask = att_mask[..., None].expand_as(embeddings).float()
mask.size()

torch.Size([4, 128, 768])

In [22]:
# Zero out the embedding of every position whose mask value is 0:
mask_embeddings = torch.mul(embeddings, mask)
mask_embeddings.size()

torch.Size([4, 128, 768])

In [23]:
# Then we sum the remaining (masked) embeddings along axis 1:
summed = mask_embeddings.sum(dim=1)
summed.size()

torch.Size([4, 768])

In [24]:
# Count how many positions the mask keeps along axis 1; the clamp puts a tiny
# floor on the count so the division that follows can never be by zero.
summed_mask = mask.sum(dim=1).clamp(min=1e-9)
summed_mask.size()

torch.Size([4, 768])

In [25]:
# Mean pooling: divide the summed embeddings by the number of kept positions
mean_pooled = torch.div(summed, summed_mask)
mean_pooled

tensor([[ 0.0745,  0.8637,  0.1795,  ...,  0.7734,  1.7247, -0.1803],
        [-0.3715,  0.9729,  1.0840,  ..., -0.2552, -0.2759,  0.0358],
        [-0.5030,  0.7950, -0.1240,  ...,  0.1441,  0.9704, -0.1791],
        [-0.2131,  1.0175, -0.8833,  ...,  0.7371,  0.1947, -0.3011]],
       grad_fn=<DivBackward0>)

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
# Let's calculate cosine similarity between sentence 0 and every other sentence.
# Convert the pooled embeddings to a NumPy array for scikit-learn. Going through
# torch.as_tensor() keeps this cell re-runnable: on a second run mean_pooled is
# already an ndarray, which has no .detach() method. .cpu() is a no-op for
# tensors that already live on the CPU.
mean_pooled = torch.as_tensor(mean_pooled).detach().cpu().numpy()
# mean_pooled[:1] keeps the leading axis, giving the 2-D input cosine_similarity expects
cosine_similarity(mean_pooled[:1], mean_pooled[1:])

array([[0.33088917, 0.72192585, 0.55483645]], dtype=float32)