<a href="https://colab.research.google.com/github/brownsloth/transformers_concepts_notebooks/blob/main/transformers_6_using_sentence_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
ckpt = 'sentence-transformers/all-mpnet-base-v2'
# Load both from the same checkpoint so the token ids produced by the tokenizer
# match the vocabulary the model expects.
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt)

In [None]:
# Example inputs for the similarity comparison further down: the first three
# sentences are about international conflict/peace, the last one is unrelated.
sentences = [
    "India and Pakistan are not on the best terms right now.",
    "Wars looming large for some countries at the moment",
    "international peace is important for proper functioning of global commerce",
    "i had an ice cream today"
]

In [None]:
# Display the maximum sequence length (in tokens) configured on the tokenizer;
# this is the truncation limit for the tokenization step that follows.
tokenizer.model_max_length

In [None]:
# Tokenize all sentences in one batched call instead of looping with the
# deprecated encode_plus. padding=True pads only up to the longest sentence in
# this batch (rather than to tokenizer.model_max_length), so the model does not
# spend compute on hundreds of padding positions per sentence. truncation=True
# still caps over-long inputs at the model's maximum length.
encoded = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Keep `inputs` a plain dict holding exactly the two tensors used downstream;
# both have shape (number of sentences, padded sequence length).
inputs = {
    'input_ids': encoded['input_ids'],
    'attention_mask': encoded['attention_mask'],
}

In [None]:
# Sanity check: one row of token ids per sentence -> (num_sentences, padded_length).
inputs['input_ids'].size()

In [None]:
# Inference only: run the forward pass under torch.no_grad() so no autograd graph
# is recorded. This saves memory, and the returned tensors do not require grad,
# which means they can later be converted to NumPy without an explicit detach().
with torch.no_grad():
  output = model(**inputs)

In [None]:
# List every field returned by the model together with the shape of its tensor.
# Original author's note on the pooled output: the [CLS] (first-token) embedding
# passed through a Linear layer and then tanh.
# NOTE(review): the original note also said that Linear layer was trained on
# next-sentence prediction; that describes BERT-style pretraining - confirm
# whether it applies to this checkpoint.
for field_name, field_tensor in output.items():
  print(field_name, "-->")
  print(field_tensor.shape)

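We want one fixed-size embedding per sentence, regardless of the sentence's length. To get it, we mean-pool the token embeddings over all non-padding tokens, i.e. the positions where the attention mask is 1 (this still includes the special start/end tokens; only padding is excluded).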

In [None]:
# Inputs for pooling: the attention mask (1 = attended token, 0 = padding) and the
# per-token hidden states taken from the model output.
attention_masks = inputs['attention_mask']
embeddings = output['last_hidden_state']

# Append a trailing axis to the mask and broadcast it to the embeddings' shape so
# the two line up element for element; cast to float so it can scale the embeddings.
resized_attention_mask = attention_masks[..., None].expand_as(embeddings).to(torch.float32)
print(resized_attention_mask.shape)

In [None]:
# Element-wise product with the 0/1 mask zeroes out the embeddings at masked
# (padding) positions, so they contribute nothing to the sum taken next.
masked_embeddings = torch.mul(embeddings, resized_attention_mask)
print(masked_embeddings.shape)

In [None]:
# Summing over dim 1 (the token axis) collapses it, leaving one vector per sentence.
torch.sum(masked_embeddings, dim=1).size()

In [None]:
# Count the non-zero mask entries along the token axis: for each sentence (and each
# hidden dimension, since the mask was broadcast) this is the number of real tokens.
num_1s = torch.count_nonzero(resized_attention_mask, dim=1)

In [None]:
# The counts have the same shape as the summed embeddings, so they divide element-wise.
num_1s.size()

In [None]:
# Mean pooling: sum of the masked token embeddings divided by the number of real
# (unmasked) tokens in each sentence.
# clamp(min=1) keeps the denominator non-zero, so a row whose attention mask is
# entirely zero yields a zero vector instead of NaN (0/0). Rows with at least one
# attended token already have a count >= 1 and are unaffected.
mean_pooled_embeddings = masked_embeddings.sum(dim=1) / num_1s.clamp(min=1)

In [None]:
# One fixed-size embedding per sentence, independent of how many tokens it had.
mean_pooled_embeddings.size()

## Use the mean pooled embeddings to calculate similarity


In [None]:
# scikit-learn operates on NumPy arrays, so convert the pooled embeddings
# explicitly. detach() drops any autograd history (a tensor that requires grad
# cannot be converted to NumPy), and cpu() makes this work if the tensor lives on
# an accelerator. Wrapping the scikit-learn call in torch.no_grad() had no effect,
# because no torch operations run inside it.
pooled_np = mean_pooled_embeddings.detach().cpu().numpy()

# Pairwise cosine similarity between every pair of sentence embeddings;
# sim_matrix[i][j] compares sentence i with sentence j.
sim_matrix = cosine_similarity(pooled_np, pooled_np)

In [None]:
# Print the similarity score for every ordered pair of distinct sentences
# (each sentence is skipped against itself).
n_sentences = len(mean_pooled_embeddings)
for i in range(n_sentences):
  for j in range(n_sentences):
    if i == j:
      continue
    print(sentences[i], " VS ", sentences[j])
    print(sim_matrix[i][j])