# Objective : Generating and Getting BERT embeddings 

Consider the sentence 'I love Paris'. Let's see how to obtain the contextualized embedding of every token in the sentence using the pre-trained
BERT model with Hugging Face's transformers library.


In [19]:
# Keep the notebook output clean: raise the Transformers library's log threshold to ERROR
# so that its "INFO" and "WARNING" messages are not shown.
from transformers import logging
logging.set_verbosity(logging.ERROR)

# 1. Import necessary libraries

In [20]:
from transformers import BertModel, BertTokenizer
import torch

# 2. Download and load the pretrained model

We download the pre-trained BERT model. All the available pre-trained BERT models are listed on the Hugging Face model hub
(https://huggingface.co/models). We use the bert-base-uncased model. As the name suggests, it is the BERT-base model with 12 encoder layers,
and it is trained with uncased (lowercased) tokens. Since we are using BERT-base, the representation size will be 768.


In [21]:
# Download and load the pre-trained bert-base-uncased BERT model.
model = BertModel.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

# 3. Download and load the tokenizer

We download and load the tokenizer that was used to pre-train the bert-base-uncased model.

In [22]:
# Download and load the tokenizer that matches the bert-base-uncased model.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

# 4. Preprocessing the input 

a. Define the sentence 

b. Tokenize the inputs using a tokenizer

c. Adding the [CLS] and [SEP] tokens

d. Adding the [PAD] token (assuming the token list must be padded to a fixed length of 7)

e. Adding the attention mask : We create the attention mask. We set the attention mask value to 1 if the token is not a [PAD] token, else we set the 
attention mask to 0

f. Converting tokens to the token IDs


In [23]:
# Input sentence whose tokens we want embeddings for.
sentence = 'I love Paris'

In [24]:
# Split the sentence into tokens with the model's tokenizer
# (the printed result shows the tokens come back lowercased).
tokens = tokenizer.tokenize(sentence)

In [25]:
# Inspect the tokens produced by tokenizer.tokenize.
print(tokens)

['i', 'love', 'paris']


In [26]:
# Wrap the token list with the special tokens: [CLS] at the start, [SEP] at the end.
tokens = ['[CLS]', *tokens, '[SEP]']

In [27]:
# Inspect the token list after adding [CLS] and [SEP].
print(tokens)

['[CLS]', 'i', 'love', 'paris', '[SEP]']


In [28]:
# Pad the token list up to a fixed target length rather than appending a
# hard-coded number of [PAD] tokens. The pad count is derived from the current
# length, so re-running this cell on an already padded list adds nothing.
MAX_LEN = 7  # target sequence length assumed in this walkthrough
tokens = tokens + ['[PAD]'] * max(0, MAX_LEN - len(tokens))

In [29]:
# Inspect the token list after padding.
print(tokens)

['[CLS]', 'i', 'love', 'paris', '[SEP]', '[PAD]', '[PAD]']


In [30]:
# Build the attention mask: 1 for every real token, 0 for every [PAD] token.
attention_mask = [int(tok != '[PAD]') for tok in tokens]

In [31]:
# Inspect the mask: it should be 0 only at the [PAD] positions.
print(attention_mask)

[1, 1, 1, 1, 1, 0, 0]


In [32]:
# Map every token string (including [CLS], [SEP] and [PAD]) to its integer ID.
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [33]:
# Inspect the integer IDs, one per token.
print(token_ids)

[101, 1045, 2293, 3000, 102, 0, 0]


In [34]:
# Convert both lists to tensors and add a leading batch dimension,
# giving each input the shape [1, sequence_length].
token_ids = torch.unsqueeze(torch.tensor(token_ids), dim=0)
attention_mask = torch.unsqueeze(torch.tensor(attention_mask), dim=0)

# 5. Generating the embeddings 

a. We feed token_ids and attention_mask to the model and get the embeddings.

b. Output of the model : Note that the model returns an output object with two fields of interest. The first, last_hidden_state (stored in
hidden_rep), holds the hidden state representation of all the tokens obtained from the final encoder (encoder 12). The second, pooler_output
(stored in cls_head), holds the representation of the [CLS] token after it has been passed through BERT's pooling layer (a linear layer followed by tanh).

In [35]:
# We only need the embeddings (inference), so run the forward pass with autograd
# disabled: no computation graph is built and the outputs carry no grad history.
with torch.no_grad():
    objects = model(token_ids, attention_mask=attention_mask)

# Per-token representations from the final encoder layer.
hidden_rep = objects.last_hidden_state
# Pooled representation derived from the [CLS] token.
cls_head = objects.pooler_output

In [None]:
### hidden_rep contains the embedding (representation) of all the tokens in our input. Let's print the shape of hidden_rep

The size [1, 7, 768] corresponds to [batch_size, sequence_length, hidden_size]. Our batch size is 1. The sequence length is the number of tokens.
Since we have 7 tokens (including [CLS], [SEP] and the two [PAD] tokens), the sequence length is 7. The hidden size is the representation (embedding) size, and it is 768 for the BERT-base model.

In [36]:
# Shape of the per-token representations: [batch_size, sequence_length, hidden_size].
print(hidden_rep.shape)

torch.Size([1, 7, 768])


In [37]:
# Shape of the pooled output: a single vector for each sequence in the batch.
print(cls_head.shape)

torch.Size([1, 768])


# 6. Getting Embeddings of the Token

The hidden_rep[0][0] gives the representation of the first token, which is [CLS].

The hidden_rep[0][1] gives the representation of the second token, which is 'i' (the lowercased 'I').


In [38]:
# Representation of token position 0 in the first (and only) sequence, i.e. [CLS].
hidden_rep[0][0]

tensor([-7.1921e-02,  2.1631e-01,  4.7183e-03, -8.1534e-02, -3.0399e-01,
        -2.6997e-01,  3.6993e-01,  4.3028e-01,  1.1932e-02, -2.0674e-01,
        -8.9630e-02, -1.3917e-01,  1.7530e-01,  4.8318e-01,  3.0506e-01,
        -5.9531e-03, -1.7049e-01,  4.9769e-01,  4.6345e-01, -1.6272e-01,
         2.8592e-02, -2.6006e-01, -3.3321e-01, -8.1933e-02, -8.8631e-02,
        -3.5845e-01, -1.2788e-01, -7.6149e-02,  3.1540e-01, -1.5369e-02,
         2.4448e-01,  7.5998e-02, -6.1328e-02,  1.8551e-01,  2.3354e-01,
        -5.2520e-02,  3.3775e-01, -1.0754e-01, -3.2549e-02,  2.1909e-01,
         1.7896e-01, -8.9927e-03,  2.1548e-01, -4.8307e-02,  2.7949e-01,
        -2.8501e-01, -1.8575e+00, -3.7983e-02, -6.7010e-02, -2.6804e-01,
         2.5982e-01, -9.3902e-02,  4.1909e-01,  3.3008e-01,  5.1306e-02,
         2.5632e-01, -3.9642e-01,  6.5480e-01,  1.2961e-01,  3.6180e-01,
         1.5786e-01,  1.1035e-03, -1.5318e-01,  3.4397e-02, -1.8015e-01,
         2.6369e-01,  3.7325e-02,  2.1566e-01, -3.7

In [39]:
# Representation of token position 1 in the first (and only) sequence, i.e. 'i'.
hidden_rep[0][1]

tensor([ 2.2365e-01,  6.5364e-01, -2.2941e-01, -4.4870e-01, -9.5561e-02,
         2.1066e-01, -1.3226e-01,  1.4089e+00,  1.0668e-01, -2.9041e-02,
        -2.0937e-01, -5.2475e-01,  3.4771e-02,  2.7329e-01,  2.9269e-01,
         2.2714e-01,  4.7734e-01,  3.4942e-01,  1.2349e-01,  8.3038e-01,
         6.9123e-01,  2.3612e-01, -8.5010e-01, -2.0249e-02,  3.0894e-01,
        -2.4169e-01, -4.3335e-01,  1.5679e-01,  9.1365e-02, -3.6651e-01,
        -1.4478e-02, -9.2568e-02,  5.8239e-01,  7.3787e-01, -7.5602e-01,
        -1.9031e-01,  3.5895e-01, -2.0138e-01, -4.4486e-01,  1.4417e-01,
         8.1282e-02, -3.0344e-01, -1.2730e-01, -6.9157e-01,  2.7232e-01,
        -1.2914e+00,  2.2492e-01, -7.1515e-02,  7.0234e-01, -7.8603e-01,
        -8.6375e-02,  1.8487e-02,  5.6987e-02,  2.9224e-01, -1.8191e-01,
         1.1739e+00, -6.1941e-01, -3.6968e-01,  4.6749e-01,  5.2700e-01,
        -2.3670e-02, -1.0395e-01,  5.6715e-01, -6.3963e-01, -4.2078e-01,
         9.4930e-01, -4.8859e-01,  1.1718e-01,  1.6