In [32]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch

# Example sentences to embed. The first entry is the one that the remaining
# sentences are compared against in the similarity cells further down.
sentences = [
    "What is the card on the bottom of my graveyard",
    "You win some you lose some",
    "Nobody messes with me! My turn",
    "You have to guess the card on my graveyard right at the bottom",
    "Go Spider Roulette",
    "Red eyes black dragon, this pal of mine is a super rare monster",
]

In [36]:
model_name = 'sentence-transformers/bert-base-nli-mean-tokens'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize the whole list in a single call: the tokenizer pads/truncates every
# sentence to 128 tokens and returns already-batched tensors, so no
# per-sentence loop or manual torch.stack is needed.
encoded = tokenizer(
    sentences,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Keep only the two inputs that are passed to the model and read by later cells.
tokens = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

# Inference only: disable autograd so no graph is built and the resulting
# embeddings do not require grad (which would block direct NumPy conversion).
with torch.no_grad():
    output = model(**tokens)


In [37]:
# Inspect the tokenized batch (the `input_ids` and `attention_mask` tensors built above).
tokens

{'input_ids': tensor([[  101,  2054,  2003,  1996,  4003,  2006,  1996,  3953,  1997,  2026,
          16685,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [38]:
# Token-level embeddings: the model's last hidden state for every position.
embeddings = output.last_hidden_state
# The output below shows (6, 128, 768): 6 sentences, 128 token positions
# (the max_length used when tokenizing), and presumably the hidden size.
embeddings.shape

torch.Size([6, 128, 768])

Manual Mean Pooling

In [40]:
# Attention mask produced by the tokenizer; it is used below to weight the
# token embeddings when averaging them into one vector per sentence.
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([6, 128])

In [41]:
# Add a trailing axis to the mask and broadcast it to the shape of
# `embeddings`, as floats, so the two can be multiplied element-wise.
mask = attention_mask[..., None].expand_as(embeddings).float()

In [42]:
# Masked mean over the token axis (dim 1): zero out the positions where the
# mask is 0, sum what remains, and divide by the per-sentence mask total.
masked_embeddings = embeddings * mask
summed = masked_embeddings.sum(dim=1)
# The clamp keeps the divisor strictly positive even if a mask row sums to zero.
counts = mask.sum(dim=1).clamp(min=1e-9)
mean_pooled = summed / counts
mean_pooled

tensor([[-0.2616, -0.1636,  0.3825,  ...,  0.2016, -0.1835,  0.1085],
        [ 0.1315, -0.9066,  1.1332,  ...,  0.1003, -0.4232,  0.2612],
        [ 0.5267,  0.8520,  2.3070,  ...,  0.5416, -0.5958,  0.0028],
        [-0.4863, -0.0615,  0.3979,  ...,  0.2569,  0.0807,  0.4447],
        [-1.0186,  0.2733,  0.2572,  ..., -0.2005,  0.6120,  0.9960],
        [-0.1570,  0.5414, -0.9212,  ..., -0.4736,  0.6766, -0.1710]],
       grad_fn=<DivBackward0>)

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

# scikit-learn operates on NumPy arrays. A tensor that requires grad cannot be
# converted to NumPy directly, so detach it first (and move it to the CPU).
# The result goes into a new name so this cell can be re-run safely without
# clobbering the `mean_pooled` tensor.
mean_pooled_np = mean_pooled.detach().cpu().numpy()

# Similarity of the first sentence (kept 2-D via the [:1] slice) against each
# of the remaining sentences.
similarity = cosine_similarity(mean_pooled_np[:1], mean_pooled_np[1:])

similarity

array([[0.18419643, 0.08872287, 0.90662074, 0.2615884 , 0.23011158]],
      dtype=float32)

Using the sentence-transformers Library


In [33]:
# Load the model through sentence-transformers, which handles tokenization and
# pooling internally. Presumably this is the same checkpoint as the manual
# approach (the similarities printed for both approaches agree) - confirm.
# NOTE(review): this rebinds `model_name` and `model`, which the manual-pooling
# cell also assigns; re-running that section afterwards needs its own cell re-run.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Encode all sentences at once; the output below shows shape (6, 768).
embedding = model.encode(sentences)

embedding.shape

(6, 768)

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity of the first sentence embedding (kept 2-D via the [:1] slice)
# against each of the remaining sentence embeddings.
cosine_similarity(embedding[:1], embedding[1:])


array([[0.1841965 , 0.08872286, 0.9066208 , 0.26158822, 0.23011152]],
      dtype=float32)