## Score similarity of movie descriptions
- given 2 movie descriptions, return a number between 0 and 1
- we'll use BERT transformers to get dense sentence(paragraph) embeddings, and then apply a similairty metric
- https://github.com/jamescalam/transformers/tree/main/course/similarity

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
model_name = 'sentence-transformers/stsb-distilbert-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [3]:
text = "Hercules must go from zero to hero to save the universe from Hades."

# https://huggingface.co/transformers/internal/tokenization_utils.html?highlight=encode_plus#transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus
# Tokenize and prepare for the model a sequence or a pair of sequences.
tokens = tokenizer.encode_plus(text, max_length=128,
                               truncation=True, padding='max_length',
                               return_tensors='pt')

In [4]:
type(tokens)

transformers.tokenization_utils_base.BatchEncoding

In [5]:
tokens.data['input_ids']

tensor([[  101, 15067,  2442,  2175,  2013,  5717,  2000,  5394,  2000,  3828,
          1996,  5304,  2013, 23003,  1012,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [6]:
# We process these tokens through our model:
outputs = model(**tokens)
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.1681,  0.0845,  0.3183,  ..., -0.4482,  0.8099, -0.6819],
         [ 0.4318, -0.0383,  1.0073,  ..., -0.2067,  0.2495, -0.9275],
         [ 0.3545, -0.0485,  0.2166,  ...,  0.1237,  0.0781, -0.6478],
         ...,
         [-0.1806, -0.2921,  0.5588,  ..., -0.1152,  0.1929, -1.2145],
         [-0.1254, -0.2497,  0.7482,  ..., -0.0880,  0.5619, -0.9426],
         [-0.2178, -0.2813,  0.6314,  ..., -0.1467,  0.2723, -1.0243]]],
       grad_fn=<NativeLayerNormBackward>), hidden_states=None, attentions=None)

In [7]:
# produce dense vectors embeddings
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-0.1681,  0.0845,  0.3183,  ..., -0.4482,  0.8099, -0.6819],
         [ 0.4318, -0.0383,  1.0073,  ..., -0.2067,  0.2495, -0.9275],
         [ 0.3545, -0.0485,  0.2166,  ...,  0.1237,  0.0781, -0.6478],
         ...,
         [-0.1806, -0.2921,  0.5588,  ..., -0.1152,  0.1929, -1.2145],
         [-0.1254, -0.2497,  0.7482,  ..., -0.0880,  0.5619, -0.9426],
         [-0.2178, -0.2813,  0.6314,  ..., -0.1467,  0.2723, -1.0243]]],
       grad_fn=<NativeLayerNormBackward>)

In [8]:
embeddings.shape

torch.Size([1, 128, 768])

In [9]:
# mean pooling to create a single vector encoding
# multiply each value in our embeddings tensor by its respective attention_mask value so that we ignore non-real tokens

# resize attention_mask tensor
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([1, 128])

In [10]:
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([1, 128, 768])

In [11]:
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [12]:
mask[0][0].shape

torch.Size([768])

In [13]:
mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [14]:
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([1, 128, 768])

In [15]:
masked_embeddings

tensor([[[-0.1681,  0.0845,  0.3183,  ..., -0.4482,  0.8099, -0.6819],
         [ 0.4318, -0.0383,  1.0073,  ..., -0.2067,  0.2495, -0.9275],
         [ 0.3545, -0.0485,  0.2166,  ...,  0.1237,  0.0781, -0.6478],
         ...,
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [-0.0000, -0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000]]],
       grad_fn=<MulBackward0>)

In [16]:
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([1, 768])

In [17]:
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([1, 768])

In [18]:
summed_mask

tensor([[16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16., 16.,
         16., 16., 16., 16., 16., 16., 16., 16., 16.

Finally, we calculate the mean as the sum of the embedding activations `summed` divided by the number of values that should be given attention in each position `summed_mask`:

In [19]:
mean_pooled = summed / summed_mask

In [20]:
mean_pooled

tensor([[ 2.7997e-02,  3.9717e-02,  3.1979e-01, -1.4136e-01,  5.5878e-01,
          8.2253e-02,  5.0833e-01,  6.5488e-01,  3.4281e-01, -6.8770e-01,
         -4.9388e-01,  6.8750e-01, -1.1760e+00, -2.5420e-01,  2.1274e-01,
         -5.6078e-01, -6.0346e-01, -2.0626e-01, -9.3915e-02, -2.7426e-01,
          8.3787e-02,  1.7910e-01,  1.8278e-01,  4.8750e-01, -8.0592e-01,
          1.0510e+00, -9.6357e-02, -1.5066e-01,  1.2120e-01,  3.8352e-01,
          5.1736e-01,  5.0449e-01, -5.2063e-01, -2.3304e-01, -3.3285e-01,
          3.4771e-01,  2.4014e-01, -4.6893e-01,  9.0721e-01, -2.3205e-01,
          4.2056e-01,  2.9983e-01,  3.3246e-01, -4.8929e-01, -1.0834e+00,
         -7.1698e-01, -1.9738e-01, -6.5017e-01,  4.4382e-02, -5.4016e-01,
         -2.4586e-01, -2.3958e-01, -5.5259e-02,  4.7614e-01,  8.4970e-01,
          5.8249e-02, -1.1031e+00, -2.0930e-02, -1.3984e-01,  7.5236e-01,
         -3.4633e-01,  4.5343e-01, -4.1268e-01, -2.2891e-01, -4.4620e-01,
         -6.1007e-01, -1.4399e-01,  3.

In [21]:
mean_pooled.shape

torch.Size([1, 768])

In [22]:
import h
v = h.calculate_sentence_embedding(text)

## Cosine similarity

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
# calculate
cosine_similarity(
    [mean_pooled[0]],
    mean_pooled[1:]
)

NameError: name 'a' is not defined