<a href="https://colab.research.google.com/github/ZahraDehghani99/AI-Internship/blob/master/AutoModel_vs_AutomodelForSequenceClassification_outputs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Output parameters of the `AutoModel` and `AutoModelForSequenceClassification` classes

## Prerequisites

In [None]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip uninstall -y transformers accelerate
!pip install -q transformers accelerate

Found existing installation: transformers 4.35.2
Uninstalling transformers-4.35.2:
  Successfully uninstalled transformers-4.35.2
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [None]:
import torch

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

## BERT-base

BertModel => [source code](https://huggingface.co/docs/transformers/v4.35.2/en/model_doc/bert#transformers.BertModel)


BertForSequenceClassification => [source code](https://huggingface.co/docs/transformers/v4.35.2/en/model_doc/bert#transformers.BertForSequenceClassification)

### Tokenizer

In [None]:
from transformers import AutoTokenizer

model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
tokenizer.vocab_size

30522

In [None]:
tokenizer.model_max_length

512

In [None]:
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

In [None]:
text = "this is a test"

In [None]:
inputs = tokenizer(text, return_tensors="pt")

In [None]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

transformers.tokenization_utils_base.BatchEncoding

### AutoModel class (output_hidden_states = False)

In [None]:
from transformers import AutoModel

# Number of classes, forwarded to `from_pretrained` below.
# NOTE(review): the outputs of this base model carry no logits, so this value is
# presumably only stored on the config — confirm whether it is needed here.
num_labels = 4

# Load the pretrained checkpoint first, then move its weights to the selected device.
model = AutoModel.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
model.encoder.layer[-2]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [None]:
# The tokenizer output is not tied to any device, so copy every tensor to the same
# device as the model; if the model is on the GPU and the inputs are not, the
# forward pass runs into a device mismatch.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0260,  0.1920,  0.0344,  ..., -0.1346,  0.0773,  0.2477],
         [-0.5674, -0.2561,  0.3124,  ..., -0.7818,  0.2190,  0.5927],
         [-0.2362, -0.3177,  0.6953,  ..., -0.2110,  0.1040,  0.9891],
         [-0.1820, -0.3069,  1.0218,  ..., -0.1590,  0.0841,  1.3962],
         [ 0.0534, -0.4904, -0.0244,  ...,  0.1503,  0.1478, -0.2623],
         [ 0.7536,  0.1498, -0.2997,  ...,  0.0948, -0.8660, -0.2169]]]), pooler_output=tensor([[-8.1014e-01, -1.6611e-01,  6.8871e-01,  5.6120e-01, -4.2450e-01,
         -1.0617e-01,  8.4508e-01,  1.5211e-01,  4.0698e-01, -9.9904e-01,
          3.3192e-01, -2.4647e-01,  9.5928e-01, -3.5368e-01,  8.7687e-01,
         -4.3937e-01,  1.3539e-02, -4.6144e-01,  2.8962e-01, -7.3881e-01,
          4.2921e-01,  2.6882e-01,  6.9294e-01,  1.8854e-01,  2.5857e-01,
         -3.1407e-01, -3.8146e-01,  8.7916e-01,  9.1173e-01,  5.7387e-01,
         -6.2304e-01,  1.1765e-01, -9.5845e-01, -1

In [None]:
# List the names of the fields present in the model output.
for field_name in outputs.keys():
  print(field_name)

last_hidden_state
pooler_output


As we can see, we didn't set `output_hidden_states = True`, so the `hidden_states` parameter has no value in the output; only the `last_hidden_state` and `pooler_output` parameters are populated.

In [None]:
y = outputs["last_hidden_state"]

In [None]:
y

tensor([[[-0.0260,  0.1920,  0.0344,  ..., -0.1346,  0.0773,  0.2477],
         [-0.5674, -0.2561,  0.3124,  ..., -0.7818,  0.2190,  0.5927],
         [-0.2362, -0.3177,  0.6953,  ..., -0.2110,  0.1040,  0.9891],
         [-0.1820, -0.3069,  1.0218,  ..., -0.1590,  0.0841,  1.3962],
         [ 0.0534, -0.4904, -0.0244,  ...,  0.1503,  0.1478, -0.2623],
         [ 0.7536,  0.1498, -0.2997,  ...,  0.0948, -0.8660, -0.2169]]])

In [None]:
y.shape

torch.Size([1, 6, 768])

It is clear that `y` is the last hidden state of our model (the hidden state that corresponds to the last, i.e. 12th, layer). The shape of this object is (batch_size, number of tokens, hidden size). Our sentence has 4 tokens, and in BERT-style models the tokenizer puts a `[CLS]` token at the beginning of each **sequence** and a `[SEP]` token at the end of each **sentence**. So, all in all, we have 6 tokens (4 tokens in the sentence + 2 special tokens). In this example we want to extract the `[CLS]` token, so we select the first token of every sequence in the batch.

For this purpose, the following two lines of code are equivalent and both give us the `[CLS]` token.

```
outputs.last_hidden_state[:,0]
```
and
```
outputs.last_hidden_state[:,0,:]
```

In [None]:
zx = outputs.last_hidden_state[:,0,:]

In [None]:
zx.size()

torch.Size([1, 768])

In [None]:
zy = outputs.last_hidden_state[:,0]

In [None]:
zy.size()

torch.Size([1, 768])

In this line of code we verify that those two lines of code give the same result.

In [None]:
torch.all(zx == zy)

tensor(True)

### AutoModel class (output_hidden_states = True)

This time we set `output_hidden_states=True`. So, the output will contain the hidden states of every layer in addition to the `last_hidden_state` parameter.

In [None]:
from transformers import AutoModel

num_labels = 4

# Load the checkpoint with `output_hidden_states=True` so the forward pass also
# returns the per-layer hidden states, then move the model to the selected device.
model = AutoModel.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    output_hidden_states=True,
)
model = model.to(device)

In [None]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
inputs = {k:v.to(device) for k,v in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0260,  0.1920,  0.0344,  ..., -0.1346,  0.0773,  0.2477],
         [-0.5674, -0.2561,  0.3124,  ..., -0.7818,  0.2190,  0.5927],
         [-0.2362, -0.3177,  0.6953,  ..., -0.2110,  0.1040,  0.9891],
         [-0.1820, -0.3069,  1.0218,  ..., -0.1590,  0.0841,  1.3962],
         [ 0.0534, -0.4904, -0.0244,  ...,  0.1503,  0.1478, -0.2623],
         [ 0.7536,  0.1498, -0.2997,  ...,  0.0948, -0.8660, -0.2169]]]), pooler_output=tensor([[-8.1014e-01, -1.6611e-01,  6.8871e-01,  5.6120e-01, -4.2450e-01,
         -1.0617e-01,  8.4508e-01,  1.5211e-01,  4.0698e-01, -9.9904e-01,
          3.3192e-01, -2.4647e-01,  9.5928e-01, -3.5368e-01,  8.7687e-01,
         -4.3937e-01,  1.3539e-02, -4.6144e-01,  2.8962e-01, -7.3881e-01,
          4.2921e-01,  2.6882e-01,  6.9294e-01,  1.8854e-01,  2.5857e-01,
         -3.1407e-01, -3.8146e-01,  8.7916e-01,  9.1173e-01,  5.7387e-01,
         -6.2304e-01,  1.1765e-01, -9.5845e-01, -1

In [None]:
# List the names of the fields present in the model output.
for field_name in outputs.keys():
  print(field_name)

last_hidden_state
pooler_output
hidden_states


As we can see, this time `hidden_states` parameter has value.

In [None]:
len(outputs["hidden_states"])

13

The length of the `hidden_states` parameter equals the number of hidden states of the model at the output of each layer plus the optional initial embedding outputs. So, all in all, the length equals 12 hidden states (one per layer) + 1 (initial embedding outputs) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 6, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 6, 768])

We can see that the shape of each hidden state is the same; only the embeddings become richer in the top layers (because of multi-head attention).

In [None]:
# extract last hidden state
x = outputs["hidden_states"][-1]

In [None]:
x.shape

torch.Size([1, 6, 768])

In [None]:
y = outputs["last_hidden_state"]

In [None]:
y.shape

torch.Size([1, 6, 768])

We want to see that x and y objects are the same and check their equality.

x => last hidden state from `hidden_states` parameter in output

y => `last_hidden_state` parameter in output

In [None]:
torch.all(x == y)

tensor(True)

In [None]:
# Output of the pooler layer, read by field name (it is the second entry of the output).
a = outputs.pooler_output

In [None]:
a

tensor([[-8.1014e-01, -1.6611e-01,  6.8871e-01,  5.6120e-01, -4.2450e-01,
         -1.0617e-01,  8.4508e-01,  1.5211e-01,  4.0698e-01, -9.9904e-01,
          3.3192e-01, -2.4647e-01,  9.5928e-01, -3.5368e-01,  8.7687e-01,
         -4.3937e-01,  1.3539e-02, -4.6144e-01,  2.8962e-01, -7.3881e-01,
          4.2921e-01,  2.6882e-01,  6.9294e-01,  1.8854e-01,  2.5857e-01,
         -3.1407e-01, -3.8146e-01,  8.7916e-01,  9.1173e-01,  5.7387e-01,
         -6.2304e-01,  1.1765e-01, -9.5845e-01, -1.3886e-01,  6.2574e-01,
         -9.4867e-01,  5.1345e-02, -6.7462e-01,  1.4575e-02,  3.4477e-02,
         -8.2107e-01,  1.9136e-01,  9.7657e-01, -4.6747e-01, -1.6186e-01,
         -2.9812e-01, -9.9659e-01,  1.4625e-01, -7.9653e-01, -6.0099e-01,
         -4.7811e-01, -7.4876e-01,  1.0094e-01,  2.5489e-01,  2.9220e-01,
          4.2842e-01, -1.2625e-01,  1.3058e-01, -4.4772e-02, -4.1464e-01,
         -5.1866e-01,  1.4801e-01,  4.3303e-01, -8.2359e-01, -6.0051e-01,
         -6.6067e-01,  1.2570e-03, -1.

In [None]:
a.size()

torch.Size([1, 768])

In [None]:
b = outputs.last_hidden_state

In [None]:
b

tensor([[[-0.0260,  0.1920,  0.0344,  ..., -0.1346,  0.0773,  0.2477],
         [-0.5674, -0.2561,  0.3124,  ..., -0.7818,  0.2190,  0.5927],
         [-0.2362, -0.3177,  0.6953,  ..., -0.2110,  0.1040,  0.9891],
         [-0.1820, -0.3069,  1.0218,  ..., -0.1590,  0.0841,  1.3962],
         [ 0.0534, -0.4904, -0.0244,  ...,  0.1503,  0.1478, -0.2623],
         [ 0.7536,  0.1498, -0.2997,  ...,  0.0948, -0.8660, -0.2169]]])

In [None]:
b.size()

torch.Size([1, 6, 768])

In [None]:
zx.size()

torch.Size([1, 768])

In [None]:
# `a` is the pooler output, while `zx` is the raw first-token vector taken from
# `last_hidden_state` earlier, so the two are not expected to match element-wise.
(a == zx).all()

tensor(False)

#### Concatenate 4 last hidden states

In [None]:
# Join the outputs of the last four layers along the feature axis, so every token
# ends up with 4 x hidden_size features.
hidden_states = outputs["hidden_states"]
pooled_output = torch.cat(hidden_states[-4:], dim=-1)


In [None]:
pooled_output.size()

torch.Size([1, 6, 3072])

In [None]:
pooled_output = pooled_output[:, 0, :]

In [None]:
pooled_output.size()

torch.Size([1, 3072])

### Open question ?

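What is the pooler layer? Is it applied after the last hidden state? (According to the `pooler_output` documentation quoted later in this notebook, it is: the pooler takes the last-layer hidden state of the first token and passes it through a linear layer and a tanh activation.)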

### AutoModelForSequenceClassification class (output_hidden_states = False)

In [None]:
from transformers import AutoModelForSequenceClassification

# Number of classes, i.e. how many logits the classification head produces per input.
num_labels = 4

# Load the pretrained checkpoint first, then move its weights to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Make sure every input tensor lives on the same device as the newly loaded model;
# a model on the GPU cannot consume tensors that are on a different device.
# (`inputs` was already moved by the earlier cells of this section, so this repeats the move.)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2216,  0.1413, -0.0584, -0.2314]]), hidden_states=None, attentions=None)

As we can see, since we use the `AutoModelForSequenceClassification` class, we have a classification head on top of the pooler layer, so the output gives us logits, which are the scores for each class. (There is no `pooler_output` in the output.)

In [None]:
# List the names of the fields present in the model output.
for field_name in outputs.keys():
  print(field_name)

logits


### AutoModelForSequenceClassification  class (output_hidden_states = True)

This time we set `output_hidden_states=True`.

In [None]:
from transformers import AutoModelForSequenceClassification

# Number of classes, i.e. how many logits the classification head produces per input.
num_labels = 4

# Load the checkpoint with `output_hidden_states=True` so the forward pass also
# returns the per-layer hidden states, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    output_hidden_states=True,
)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
inputs = {k:v.to(device) for k,v in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[0.2475, 0.4059, 0.2219, 0.2966]]), hidden_states=(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [-0.6485,  0.6739, -0.0932,  ...,  0.4475,  0.6696,  0.1820],
         [-0.6270, -0.0633, -0.3143,  ...,  0.3427,  0.4636,  0.4594],
         [ 0.3964,  0.3157,  0.0201,  ...,  0.3490,  0.1725,  0.4031],
         [ 0.6010, -0.6970, -0.2001,  ...,  0.2960,  0.2060, -1.7181],
         [-0.3251, -0.3188, -0.1163,  ..., -0.3960,  0.4112, -0.0776]]]), tensor([[[ 0.1594, -0.0126, -0.0831,  ...,  0.0845,  0.0346,  0.0609],
         [-0.4994,  0.3792, -0.0207,  ...,  0.1002,  0.5073, -0.0968],
         [-0.9795, -0.3698, -0.5920,  ...,  0.1966,  0.4721,  0.4123],
         [ 0.6016,  0.2221,  0.1561,  ...,  0.4176,  0.0414,  0.5796],
         [ 0.7979, -0.5858, -0.7035,  ...,  0.2299,  0.2113, -2.1547],
         [-0.2022, -0.0519, -0.0880,  ..., -0.3461,  0.7971, -0.0570]]]), tensor([[[-2.2218e-02, -1.9562e-01, -1.5

In [None]:
# List the names of the fields present in the model output.
for field_name in outputs.keys():
  print(field_name)

logits
hidden_states


As we can see, this time we have `hidden_states` in addition to `logits` parameter.

For more information about [`pooler_output`](https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPooling.pooler_output)

pooler_output (torch.FloatTensor of shape (batch_size, hidden_size)) — Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns the classification token after processing through a linear layer and a tanh activation function. The linear layer weights are trained from the next sentence prediction (classification) objective during pretraining.

In [None]:
len(outputs["hidden_states"])

13

The length of the `hidden_states` parameter equals the number of hidden states of the model at the output of each layer plus the optional initial embedding outputs. So, all in all, the length equals 12 hidden states (one per layer) + 1 (initial embedding outputs) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 6, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 6, 768])

We can see that the shape of each hidden state is the same; only the embeddings become richer in the top layers (because of multi-head attention).

In [None]:
# extract last hidden state
x = outputs["hidden_states"][-1]

In [None]:
x.shape

torch.Size([1, 6, 768])

## RoBERTa-base

RobertaModel => [source code](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel)

RobertaForSequenceClassification => [source code](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaForSequenceClassification)

### Tokenizer

In [None]:
from transformers import AutoTokenizer

model_ckpt = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
tokenizer

RobertaTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [None]:
tokenizer.vocab_size

50265

In [None]:
tokenizer.model_max_length

512

In [None]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [None]:
text = "this is a test"

In [None]:
inputs = tokenizer(text, return_tensors="pt")

In [None]:
inputs

{'input_ids': tensor([[   0, 9226,   16,   10, 1296,    2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

transformers.tokenization_utils_base.BatchEncoding

### AutoModel class (output_hidden_states = False)

In [None]:
from transformers import AutoModel

num_labels = 4

# Load the pretrained checkpoint first, then move its weights to the selected device.
# NOTE: the load warning for this checkpoint reports the pooler weights as newly
# initialized, so `pooler_output` of this model comes from untrained weights.
model = AutoModel.from_pretrained(model_ckpt, num_labels=num_labels)
model = model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [None]:
# The tokenizer output is not tied to any device, so copy every tensor to the same
# device as the model; if the model is on the GPU and the inputs are not, the
# forward pass runs into a device mismatch.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[   0, 9226,   16,   10, 1296,    2]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0449,  0.0641, -0.0302,  ..., -0.0481, -0.0586, -0.0241],
         [ 0.1651, -0.1411,  0.1097,  ..., -0.0006,  0.0644, -0.1205],
         [ 0.2951,  0.1887,  0.2250,  ..., -0.1786,  0.1688,  0.0601],
         [ 0.1389, -0.1818,  0.0681,  ...,  0.0767,  0.0948,  0.3480],
         [ 0.2178, -0.2184,  0.1444,  ...,  0.2590,  0.0483,  0.1576],
         [-0.0422,  0.0573, -0.0571,  ..., -0.0836, -0.0624, -0.0582]]]), pooler_output=tensor([[ 4.5743e-02, -8.1662e-02,  9.0485e-02,  3.9966e-01,  1.6364e-01,
          1.7953e-01, -1.2574e-02, -8.4830e-02, -2.0164e-01,  3.7782e-01,
         -1.9069e-01, -3.4390e-01, -1.1624e-01,  1.0695e-02, -8.3627e-02,
          2.1695e-01,  2.8111e-01,  3.2979e-01, -1.8854e-01,  1.5988e-01,
         -2.0353e-01,  2.4059e-01,  1.0932e-01, -1.8814e-01, -1.7266e-02,
         -5.7698e-02,  8.0516e-03,  4.7877e-01,  1.5729e-01,  2.8460e-01,
         -2.1817e-01, -4.3100e-02, -2.6352e-01,  1

In [None]:
# List the names of the fields present in the model output.
for field_name in outputs.keys():
  print(field_name)

last_hidden_state
pooler_output


As we can see, we didn't set `output_hidden_states = True`, so the `hidden_states` parameter has no value in the output; only the `last_hidden_state` and `pooler_output` parameters are populated.

In [None]:
y = outputs["last_hidden_state"]

In [None]:
y

tensor([[[-0.0449,  0.0641, -0.0302,  ..., -0.0481, -0.0586, -0.0241],
         [ 0.1651, -0.1411,  0.1097,  ..., -0.0006,  0.0644, -0.1205],
         [ 0.2951,  0.1887,  0.2250,  ..., -0.1786,  0.1688,  0.0601],
         [ 0.1389, -0.1818,  0.0681,  ...,  0.0767,  0.0948,  0.3480],
         [ 0.2178, -0.2184,  0.1444,  ...,  0.2590,  0.0483,  0.1576],
         [-0.0422,  0.0573, -0.0571,  ..., -0.0836, -0.0624, -0.0582]]])

In [None]:
y.shape

torch.Size([1, 6, 768])

It is clear that `y` is the last hidden state of our model (the hidden state that corresponds to the last, i.e. 12th, layer). The shape of this object is (batch_size, number of tokens, hidden size). Our sentence has 4 tokens, and the RoBERTa tokenizer puts a `<s>` token (its `cls_token`, the counterpart of BERT's `[CLS]`) at the beginning of each **sequence** and a `</s>` token (its `sep_token`, the counterpart of BERT's `[SEP]`) at the end of each **sentence**. So, all in all, we have 6 tokens (4 tokens in the sentence + 2 special tokens). In this example we want to extract the `<s>` token, so we select the first token of every sequence in the batch.

For this purpose, the following two lines of code are equivalent and both give us the `<s>` (classification) token.

```
outputs.last_hidden_state[:,0]
```
and
```
outputs.last_hidden_state[:,0,:]
```

In [None]:
zx = outputs.last_hidden_state[:,0,:]

In [None]:
zx.size()

torch.Size([1, 768])

In [None]:
zy = outputs.last_hidden_state[:,0]

In [None]:
zy.size()

torch.Size([1, 768])

In this line of code we verify that those two lines of code give the same result.

In [None]:
torch.all(zx == zy)

tensor(True)

### AutoModel class (output_hidden_states = True)

This time we set `output_hidden_states=True`. So, the output will contain the hidden states of every layer in addition to the `last_hidden_state` parameter.

In [None]:
from transformers import AutoModel

# Load the base encoder only. `num_labels` is intentionally not passed here:
# AutoModel builds no classification head, so it would not affect any of the
# outputs inspected below.
# `output_hidden_states=True` makes the forward pass also return the hidden
# states of every layer.
model = AutoModel.from_pretrained(model_ckpt, output_hidden_states=True).to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [None]:
# Place every input tensor on the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[   0, 9226,   16,   10, 1296,    2]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
# Run the forward pass with gradient tracking disabled; the result is only
# inspected below, never back-propagated through.
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0449,  0.0641, -0.0302,  ..., -0.0481, -0.0586, -0.0241],
         [ 0.1651, -0.1411,  0.1097,  ..., -0.0006,  0.0644, -0.1205],
         [ 0.2951,  0.1887,  0.2250,  ..., -0.1786,  0.1688,  0.0601],
         [ 0.1389, -0.1818,  0.0681,  ...,  0.0767,  0.0948,  0.3480],
         [ 0.2178, -0.2184,  0.1444,  ...,  0.2590,  0.0483,  0.1576],
         [-0.0422,  0.0573, -0.0571,  ..., -0.0836, -0.0624, -0.0582]]]), pooler_output=tensor([[-3.5062e-01, -2.2375e-01,  1.8808e-01, -3.7619e-02, -4.5012e-01,
         -2.6006e-02, -7.5475e-02, -1.0866e-01, -2.3861e-01, -2.2611e-01,
         -1.7075e-01,  5.3057e-01, -4.9478e-01,  1.6467e-01, -2.7540e-02,
         -5.3349e-01,  1.7986e-01, -3.0532e-02,  1.5792e-01, -1.0123e-01,
         -1.9165e-01, -1.1438e-01, -4.1457e-02,  1.1304e-01,  1.4005e-01,
          6.0899e-02,  9.0977e-03,  2.6057e-01, -1.2366e-01,  1.0040e-01,
         -4.5075e-01, -8.0302e-02, -3.8394e-02, -3

In [None]:
# Print the name of every field present in the model output. Only the names
# are needed, so iterate over the keys instead of unpacking unused values.
for name in outputs.keys():
  print(name)

last_hidden_state
pooler_output
hidden_states


As we can see, this time `hidden_states` parameter has value.

In [None]:
len(outputs["hidden_states"])

13

The length of the `hidden_states` parameter equals the number of hidden states of the model at the output of each layer, plus the optional initial embedding output. So, all in all, the length is 12 (12 layers => 12 hidden states) + 1 (initial embedding output) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 6, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 6, 768])

We can see that the shape of each hidden state is the same; only the embeddings become richer in the higher layers (because of multi-head attention).

In [None]:
# extract last hidden state
x = outputs["hidden_states"][-1]

In [None]:
x.shape

torch.Size([1, 6, 768])

In [None]:
y = outputs["last_hidden_state"]

In [None]:
y.shape

torch.Size([1, 6, 768])

We want to see that x and y objects are the same and check their equality.

x => last hidden state from `hidden_states` parameter in output

y => `last_hidden_state` parameter in output

In [None]:
# Compare both tensors element-wise and reduce to a single boolean tensor.
torch.eq(x, y).all()

tensor(True)

In [None]:
# Output of the pooler layer. Accessed by name rather than by position, so the
# lookup does not depend on which other fields the output object contains.
a = outputs.pooler_output

In [None]:
a

tensor([[-8.1014e-01, -1.6611e-01,  6.8871e-01,  5.6120e-01, -4.2450e-01,
         -1.0617e-01,  8.4508e-01,  1.5211e-01,  4.0698e-01, -9.9904e-01,
          3.3192e-01, -2.4647e-01,  9.5928e-01, -3.5368e-01,  8.7687e-01,
         -4.3937e-01,  1.3539e-02, -4.6144e-01,  2.8962e-01, -7.3881e-01,
          4.2921e-01,  2.6882e-01,  6.9294e-01,  1.8854e-01,  2.5857e-01,
         -3.1407e-01, -3.8146e-01,  8.7916e-01,  9.1173e-01,  5.7387e-01,
         -6.2304e-01,  1.1765e-01, -9.5845e-01, -1.3886e-01,  6.2574e-01,
         -9.4867e-01,  5.1345e-02, -6.7462e-01,  1.4575e-02,  3.4477e-02,
         -8.2107e-01,  1.9136e-01,  9.7657e-01, -4.6747e-01, -1.6186e-01,
         -2.9812e-01, -9.9659e-01,  1.4625e-01, -7.9653e-01, -6.0099e-01,
         -4.7811e-01, -7.4876e-01,  1.0094e-01,  2.5489e-01,  2.9220e-01,
          4.2842e-01, -1.2625e-01,  1.3058e-01, -4.4772e-02, -4.1464e-01,
         -5.1866e-01,  1.4801e-01,  4.3303e-01, -8.2359e-01, -6.0051e-01,
         -6.6067e-01,  1.2570e-03, -1.

In [None]:
a.size()

torch.Size([1, 768])

In [None]:
b = outputs.last_hidden_state

In [None]:
b

tensor([[[-0.0260,  0.1920,  0.0344,  ..., -0.1346,  0.0773,  0.2477],
         [-0.5674, -0.2561,  0.3124,  ..., -0.7818,  0.2190,  0.5927],
         [-0.2362, -0.3177,  0.6953,  ..., -0.2110,  0.1040,  0.9891],
         [-0.1820, -0.3069,  1.0218,  ..., -0.1590,  0.0841,  1.3962],
         [ 0.0534, -0.4904, -0.0244,  ...,  0.1503,  0.1478, -0.2623],
         [ 0.7536,  0.1498, -0.2997,  ...,  0.0948, -0.8660, -0.2169]]])

### AutoModelForSequenceClassification class (output_hidden_states = False)

In [None]:
from transformers import AutoModelForSequenceClassification

num_labels = 4

# Load the checkpoint with a sequence-classification head sized for
# `num_labels` classes, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# The tensors in `inputs` must live on the same device as the model. If this
# step is skipped and the model is on the GPU, the forward pass fails because
# the tensors and the model are on different devices.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[   0, 9226,   16,   10, 1296,    2]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
# Run the forward pass with gradient tracking disabled; the result is only
# inspected below, never back-propagated through.
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0832,  0.2356, -0.0045,  0.0424]]), hidden_states=None, attentions=None)

As we can see, since we use the `AutoModelForSequenceClassification` class, we have a classification head on top of the base model, so in the output we get `logits`, which are the scores for each class. (There is no `pooler_output` in the output.)

In [None]:
# Print the name of every field present in the model output. Only the names
# are needed, so iterate over the keys instead of unpacking unused values.
for name in outputs.keys():
  print(name)

logits


### AutoModelForSequenceClassification  class (output_hidden_states = True)

This time we set `output_hidden_states=True`.

In [None]:
from transformers import AutoModelForSequenceClassification

num_labels = 4

# Load the checkpoint with a sequence-classification head sized for
# `num_labels` classes and ask it to also return the hidden states of every
# layer, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    output_hidden_states=True,
)
model = model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Place every input tensor on the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

In [None]:
inputs

{'input_ids': tensor([[   0, 9226,   16,   10, 1296,    2]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [None]:
type(inputs)

dict

In [None]:
# Run the forward pass with gradient tracking disabled; the result is only
# inspected below, never back-propagated through.
with torch.no_grad():
  outputs = model(**inputs)

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0220,  0.0056, -0.0436, -0.0366]]), hidden_states=(tensor([[[ 0.1664, -0.0541, -0.0014,  ..., -0.0811,  0.0794,  0.0155],
         [ 0.0615,  0.4178,  0.2521,  ..., -0.0394, -0.1210,  0.1449],
         [ 0.0956,  0.2541,  0.0636,  ..., -0.0035, -0.1861, -0.1094],
         [-0.1625, -0.2608, -0.0704,  ...,  0.2587, -0.1823,  0.5187],
         [ 0.1381, -0.1489,  0.0545,  ...,  0.5065, -0.6059, -0.0133],
         [ 0.2591, -0.2605, -0.0797,  ...,  0.4292,  0.0867,  0.2130]]]), tensor([[[-0.0526, -0.0266,  0.0974,  ...,  0.0314,  0.0046, -0.1211],
         [ 0.0135,  0.0806,  0.7658,  ...,  0.1807, -0.2419,  0.3058],
         [ 0.2338,  0.1245,  0.1815,  ...,  0.0284, -0.6805, -0.0709],
         [-0.2702, -0.6758,  0.0561,  ...,  0.5978, -0.3892,  0.8312],
         [ 0.0235, -0.6660,  0.3830,  ...,  1.2095, -0.8966,  0.1152],
         [ 0.2046, -0.4344,  0.1746,  ...,  0.7963, -0.0742, -0.2797]]]), tensor([[[ 0.0496,  0.0144,  0.0052,

In [None]:
# Print the name of every field present in the model output. Only the names
# are needed, so iterate over the keys instead of unpacking unused values.
for name in outputs.keys():
  print(name)

logits
hidden_states


As we can see, this time we have `hidden_states` in addition to `logits` parameter.

For more information about `pooler_output`, see the [documentation](https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPooling.pooler_output), which describes it as follows:

pooler_output (torch.FloatTensor of shape (batch_size, hidden_size)) — Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns the classification token after processing through a linear layer and a tanh activation function. The linear layer weights are trained from the next sentence prediction (classification) objective during pretraining.

In [None]:
len(outputs["hidden_states"])

13

The length of the `hidden_states` parameter equals the number of hidden states of the model at the output of each layer, plus the optional initial embedding output. So, all in all, the length is 12 (12 layers => 12 hidden states) + 1 (initial embedding output) = 13.

In [None]:
outputs["hidden_states"][0].shape

torch.Size([1, 6, 768])

In [None]:
outputs["hidden_states"][1].shape

torch.Size([1, 6, 768])

We can see that the shape of each hidden state is the same; only the embeddings become richer in the higher layers (because of multi-head attention).

In [None]:
# extract last hidden state
x = outputs["hidden_states"][-1]

In [None]:
x.shape

torch.Size([1, 6, 768])

In [None]:
# The last element of the output (index -1) is `hidden_states`, the last field
# listed above, so this length is the number of hidden-state tensors returned.
len(outputs[-1])

13