In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 1.1 MB/s 
Collecting torchvision
  Downloading torchvision-0.11.1-cp38-cp38-macosx_10_9_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 1.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 13.3 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.1.0-py3-none-any.whl size=120999 sha256=b328303735c777bddc74f63b8cbd25f0abbf6eb40d491b5b2f0d8920a2dc0bf5
  Stored in directory: /Users/divyansh/Library/Caches/pip/wheels/52/19/88/6625593382e23a926740e6fcee0f2df0a0de25766094842a28
Successfully built sentence-transformers
Installing collected packages: torchvision, se

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
def calculateSimilarities(sentences, model = 'nreimers/TinyBERT_L-6_H-768_v2', tokeniser = 'nreimers/TinyBERT_L-6_H-768_v2'):

  """
    Calculate the similarity between each sentence in the list of sentences using the BERT model.
    The sentences are tokenised using the tokeniser.
    The model is the BERT model to use.
    The tokeniser is the tokeniser to use.
    The sentences are tokenised using the tokeniser.

    :param sentences: The list of sentences to calculate the similarity between first and rest of all.
    :param model: The BERT model to use.
    :param tokeniser: The tokeniser to use.
    :return: A list containing the similarity between the first sentence and the rest of the sentences.

    Example:
    sentences = ['I like to eat', 'I like to eat too', 'I like to eat too much']
    calculateSimilarities(sentences)
    [0.9, 0.8]

    Example:
    sentences = ['I like to eat', 'I like to eat too', 'I like to eat too much']
    calculateSimilarities(sentences, model = 'nreimers/TinyBERT_L-6_H-768_v2', tokeniser = 'nreimers/TinyBERT_L-6_H-768_v2')
    [0.9, 0.8]
  """

  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
  model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
  
  # initialize dictionary to store tokenized sentences
  tokens = {'input_ids': [], 'attention_mask': []}

  for sentence in sentences:
    # encode each sentence and append to dictionary
    new_tokens = tokenizer.encode_plus(sentence, max_length=128,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

  
  # reformat list of tensors into single tensor
  tokens['input_ids'] = torch.stack(tokens['input_ids'])
  tokens['attention_mask'] = torch.stack(tokens['attention_mask'])

  # We process these tokens through our model:
  outputs = model(**tokens)

  # The dense vector representations of our text are contained within the outputs 
  # 'last_hidden_state' tensor, which we access like so:

  embeddings = outputs.last_hidden_state


  # To perform this operation, we first resize our attention_mask tensor:
  attention_mask = tokens['attention_mask']
  
  mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

  # Each vector above represents a single token attention mask - each token now has a vector of size 768 representing it's attention_mask status. Then we multiply the two tensors to apply the attention mask:

  masked_embeddings = embeddings * mask

  # Then we sum the remained of the embeddings along axis 1:
  summed = torch.sum(masked_embeddings, 1)

  # Then sum the number of values that must be given attention in each position 
  # of the tensor:
  summed_mask = torch.clamp(mask.sum(1), min=1e-9)

  # Finally, we calculate the mean as the sum of the embedding activations summed 
  # divided by the number of values that should be given attention in each
  # position summed_mask:

  mean_pooled = summed / summed_mask

  return cosineSimilarity(mean_pooled)[0]


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

def cosineSimilarity(mean_pooled):
  """
    Calculate the cosine similarity between the mean pooled embeddings of the sentences.

    :param mean_pooled: The mean pooled embeddings of the sentences.
    :return: The cosine similarity between the mean pooled embeddings of the sentences.
  """
  mean_pooled = mean_pooled.detach().numpy()

  # calculate
  return cosine_similarity(
      [mean_pooled[0]],
      mean_pooled[1:]
  )

In [6]:
!pip install prettytable
from prettytable import PrettyTable
def findSimilarities(theSentence, sentences, model, tokeniser):
  """
    Find the similarity between the given sentence and the rest of the sentences.

    :param theSentence: The sentence to find the similarity of.
    :param sentences: The list of sentences to find the similarity with.
    :param model: The BERT model to use.
    :param tokeniser: The tokeniser to use.
    :prints: A list containing the similarity between the given sentence and the rest of the sentences.

    Example:
    sentences = ['I like to eat', 'I like to eat too', 'I like to eat too much']
    findSimilarities('I like to eat', sentences)
    Output: 
          Sentence :  I like to eat.
      NUmber of sentences to compare to :  3
      +-------+------------------------+------------+
      | Index |        Sentence        | Similarity |
      +-------+------------------------+------------+
      |   1   |     I like to eat      | 0.99466836 |
      |   2   |   I like to eat too    | 0.97190154 |
      |   3   | I like to eat too much | 0.8314854  |
      +-------+------------------------+------------+
  """
  allSentences = [theSentence] + sentences
  similarities = calculateSimilarities(allSentences)

  print("Sentence : " ,  theSentence)
  print("Number of sentences to compare to : ", len(similarities))
  myTable = PrettyTable(["Index", "Sentence", "Similarity %"])

  for i in range(len(similarities)):
    myTable.add_row([i+1, sentences[i], similarities[i]*100]) 

  print(myTable)

Collecting prettytable
  Downloading prettytable-2.4.0-py3-none-any.whl (24 kB)
Installing collected packages: prettytable
Successfully installed prettytable-2.4.0


In [7]:
sentences = ['I like to eat', 'I like to eat too', 'I like to eat too much']

findSimilarities("I like to eat", sentences, 'nreimers/TinyBERT_L-6_H-768_v2', 'nreimers/TinyBERT_L-6_H-768_v2')

Downloading: 100%|██████████| 399/399 [00:00<00:00, 154kB/s]
Downloading: 100%|██████████| 625/625 [00:00<00:00, 446kB/s]
Downloading: 100%|██████████| 226k/226k [00:01<00:00, 188kB/s]
Downloading: 100%|██████████| 455k/455k [00:02<00:00, 159kB/s]
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 690B/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 111kB/s]
Downloading: 100%|██████████| 418M/418M [00:15<00:00, 28.7MB/s]


Sentence :  I like to eat
Number of sentences to compare to :  3
+-------+------------------------+-------------------+
| Index |        Sentence        |    Similarity %   |
+-------+------------------------+-------------------+
|   1   |     I like to eat      |       100.0       |
|   2   |   I like to eat too    | 97.61621952056885 |
|   3   | I like to eat too much | 83.08659195899963 |
+-------+------------------------+-------------------+


In [9]:
sentences = [
"He was very excited when they officially decided to rename 'Columbus Day' as 'Indigenous People's Day'.",
"She didn't like to support Amazon, but she was too lazy to go shopping anywhere else.",
"She learned the different types of clouds in second grade.",
"He was so surprised that he dropped the dumbbells right on his foot.",
"Any cop who claims not to like donuts is a liar.",
"What was I supposed to do, let her throw up on my begonias?"
]

findSimilarities("He didnt like walmart, but dont shop anywhere.", sentences, 'nreimers/TinyBERT_L-6_H-768_v2', 'nreimers/TinyBERT_L-6_H-768_v2')

Sentence :  He didnt like walmart, but dont shop anywhere.
Number of sentences to compare to :  6
+-------+---------------------------------------------------------------------------------------------------------+--------------------+
| Index |                                                 Sentence                                                |    Similarity %    |
+-------+---------------------------------------------------------------------------------------------------------+--------------------+
|   1   | He was very excited when they officially decided to rename 'Columbus Day' as 'Indigenous People's Day'. | 11.694115400314331 |
|   2   |          She didn't like to support Amazon, but she was too lazy to go shopping anywhere else.          | 78.61791849136353  |
|   3   |                        She learned the different types of clouds in second grade.                       | 17.35464036464691  |
|   4   |                   He was so surprised that he dropped the dumbbells ri