In [1]:
# imports

from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# uncased bert - all words to lower case

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print(f'Length of BERT base vocabulary: {len(tokenizer.vocab)}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Length of BERT base vocabulary: 30522


In [3]:
text = 'A simple sentence!'

tokens = tokenizer.encode(text)
print(tokens)

[101, 1037, 3722, 6251, 999, 102]


In [4]:
tokenizer.decode(tokens)

'[CLS] a simple sentence! [SEP]'

In [5]:
text = 'Berlin is the capital of Germany but it it is not as beautiful as Jena'

tokens = tokenizer.encode(text)
print(tokens)

[101, 4068, 2003, 1996, 3007, 1997, 2762, 2021, 2009, 2009, 2003, 2025, 2004, 3376, 2004, 27510, 102]


In [6]:
print(f'Text: {text}')
print(f'Number of tokens: {len(tokens)}')
for t in tokens:
  print(f'token: {t}, token string: {tokenizer.decode([t])}')

Text: Berlin is the capital of Germany but it it is not as beautiful as Jena
Number of tokens: 17
token: 101, token string: [CLS]
token: 4068, token string: berlin
token: 2003, token string: is
token: 1996, token string: the
token: 3007, token string: capital
token: 1997, token string: of
token: 2762, token string: germany
token: 2021, token string: but
token: 2009, token string: it
token: 2009, token string: it
token: 2003, token string: is
token: 2025, token string: not
token: 2004, token string: as
token: 3376, token string: beautiful
token: 2004, token string: as
token: 27510, token string: jena
token: 102, token string: [SEP]


In [8]:
text = 'I am Arjun and i love both berlin and jena'
tokens = tokenizer.encode(text)
print(f'Text: {text}')
print(f'Number of tokens: {len(tokens)}')
for t in tokens:
  print(f'token: {t}, token string: {tokenizer.decode([t])}')

Text: I am Arjun and i love both berlin and jena
Number of tokens: 12
token: 101, token string: [CLS]
token: 1045, token string: i
token: 2572, token string: am
token: 26024, token string: arjun
token: 1998, token string: and
token: 1045, token string: i
token: 2293, token string: love
token: 2119, token string: both
token: 4068, token string: berlin
token: 1998, token string: and
token: 27510, token string: jena
token: 102, token string: [SEP]


In [9]:
## encode plus - gives token ids, attention mask, and segment ids

text = 'I am Arjun and i love both berlin and jena'
tokens = tokenizer.encode_plus(text)
print(f'Text: {text}')
print(f'Number of tokens: {len(tokens)}')

Text: I am Arjun and i love both berlin and jena
Number of tokens: 3


In [10]:
print(tokens)

{'input_ids': [101, 1045, 2572, 26024, 1998, 1045, 2293, 2119, 4068, 1998, 27510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
python_pet = tokenizer.encode('I love my pet python')

python_language = tokenizer.encode('I love coding in python')

print(python_pet)
print(python_language)

[101, 1045, 2293, 2026, 9004, 18750, 102]
[101, 1045, 2293, 16861, 1999, 18750, 102]


In [17]:
# contextful embeddings

model = BertModel.from_pretrained('bert-base-uncased')

python_pet_embedding = model(torch.tensor(python_pet).unsqueeze(0))[0][:,5,:].detach().numpy()
python_language_embedding = model(torch.tensor(python_language).unsqueeze(0))[0][:,5,:].detach().numpy()

snake_alone_embedding = model(torch.tensor(tokenizer.encode('snake')).unsqueeze(0))[0][:,1,:].detach().numpy()
programming_alone_embedding = model(torch.tensor(tokenizer.encode('programming')).unsqueeze(0))[0][:,1,:].detach().numpy()

In [19]:
python_pet_embedding.shape

(1, 768)

In [18]:
# cosine similarity

cosine_similarity(python_language_embedding, snake_alone_embedding)

array([[0.5843479]], dtype=float32)

In [20]:
cosine_similarity(python_pet_embedding, snake_alone_embedding)

array([[0.6928656]], dtype=float32)