In [6]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### Input

In [7]:
text = "After stealing money from the bank vault, the bank robber was seen"\
    "fishing no the Mississippi river bank."

In [9]:
token_input = tokenizer(text, return_tensors='pt')
token_input

{'input_ids': tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
          2924, 27307,  2001,  2464,  7529,  2075,  2053,  1996,  5900,  2314,
          2924,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
token_input['input_ids'].shape

torch.Size([1, 23])

#### Model forward

- Forward
    - embedding => encoder => pooler

In [12]:
model.eval()
with torch.no_grad():
    outputs = model(**token_input)

#### Output

- len(outputs) == 3
- outputs[0]
    - last_hidden_states, shape: [batch_size, seq_len, hidden_size(768)]
- outputs[1]
    - pooleer_output, shape: [batch-size, hidden_size(1*768)]
- outputs[2] (model.config.output_hidden_states=True)
    - type: tuple
    - one for the ouput of the embeddings(1), if the model has an embedding layer(12) + one for the output of each layer.
        - (1+12)*(batch_size, seq_len, hidden-size)
        
- last_hidden_state: Sequence of hidden_states at the last ouput of the last layer of the model. (token level)
- pooler_output: Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task. (sentence level)
- hidden_states, only when the 

```python 
output_hidden_states=True
```
        
Detail information: https://huggingface.co/docs/transformers/model_doc/bert

In [14]:
len(outputs)

3

In [30]:
token_level_representation = outputs[0] # last_hidden_state
sentence_level_representation = outputs[1] # poolered output
print("token representation's shape:", token_level_representation.shape)
print("sentence representation's shape", sentence_level_representation.shape)

token representation's shape: torch.Size([1, 23, 768])
sentence representation's shape torch.Size([1, 768])


In [31]:
outputs[2][-1] == token_level_representation

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])

- outputs[2][0] is the embedding's representation
- outputs[2][-1]/[12] is the token_level_representation