In [163]:
import torch
from transformers import BertTokenizer, BertTokenizerFast, BertForTokenClassification

In [164]:
CHECKPOINT = 'neuralmind/bert-base-portuguese-cased'
tokenizer = BertTokenizerFast.from_pretrained(CHECKPOINT)

In [165]:
model = BertForTokenClassification.from_pretrained(CHECKPOINT)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [166]:
input1 = [1,1,1, 0]
input2 = [2,2,2, 4]
batch = [input1, input2]

Se o attention mask estiver sendo aplicado corretamente, a saída do modelo pro batch deve ser igual ao input com pad.

In [167]:
input1_without_pad = input1[:-1]

In [168]:
input1

[1, 1, 1, 0]

In [169]:
batch

[[1, 1, 1, 0], [2, 2, 2, 4]]

In [170]:
input1, input2

([1, 1, 1, 0], [2, 2, 2, 4])

Passando dados isoladamente pelo modelo

In [171]:
out1 = model(torch.tensor([input1_without_pad]))['logits']

In [172]:
print(out1)

tensor([[[ 0.3487, -0.1619],
         [ 0.1516, -0.1921],
         [ 0.2658, -0.1833]]], grad_fn=<AddBackward0>)


In [173]:
out2 = model(torch.tensor([input2]))['logits']

In [174]:
out2

tensor([[[ 0.1650,  0.1033],
         [ 0.1337, -0.0854],
         [ 0.3468, -0.0913],
         [ 0.3077, -0.0307]]], grad_fn=<AddBackward0>)

Em tese a gente espera que o out3 contendo as duas entradas seja uma aglutinação dos dois, mas não. Por causa do batch

In [175]:
out3 = model(torch.tensor(batch))['logits']

In [176]:
out3

tensor([[[ 0.3736, -0.1760],
         [ 0.3407, -0.3460],
         [ 0.4535, -0.3298],
         [ 0.4237, -0.2578]],

        [[ 0.1650,  0.1033],
         [ 0.1337, -0.0854],
         [ 0.3468, -0.0913],
         [ 0.3077, -0.0307]]], grad_fn=<AddBackward0>)

In [177]:
mask = torch.tensor([[1,1,1,0],[1,1,1,1]])

In [178]:
out4 = model(torch.tensor(batch), attention_mask=mask)['logits']

Passando a mascara, o pad é ignorado.

In [179]:
out4

tensor([[[ 0.3487, -0.1619],
         [ 0.1516, -0.1921],
         [ 0.2658, -0.1833],
         [ 0.3059, -0.0367]],

        [[ 0.1650,  0.1033],
         [ 0.1337, -0.0854],
         [ 0.3468, -0.0913],
         [ 0.3077, -0.0307]]], grad_fn=<AddBackward0>)

In [180]:
out1

tensor([[[ 0.3487, -0.1619],
         [ 0.1516, -0.1921],
         [ 0.2658, -0.1833]]], grad_fn=<AddBackward0>)

In [181]:
out2

tensor([[[ 0.1650,  0.1033],
         [ 0.1337, -0.0854],
         [ 0.3468, -0.0913],
         [ 0.3077, -0.0307]]], grad_fn=<AddBackward0>)

## Agora a mesma coisa, mas com o código que implementei

In [182]:
from model import NERClassifier
model = NERClassifier(2, CHECKPOINT)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [189]:
out1 = model(torch.tensor([input1_without_pad]), mask=None)['logits']

In [190]:
out1

tensor([[[ 0.0538, -0.1933],
         [-0.0574, -0.1865],
         [-0.1432, -0.0405]]], grad_fn=<AddBackward0>)

In [185]:
out2 = model(torch.tensor([input2]), mask=None)['logits']

In [191]:
out2

tensor([[[-0.0109, -0.0509],
         [-0.3895,  0.1112],
         [-0.5186,  0.1902],
         [-0.4196,  0.1357]]], grad_fn=<AddBackward0>)

In [192]:
out4 = model(torch.tensor(batch), mask=mask)['logits']

In [194]:
out5 = model(torch.tensor(batch), mask=None)['logits']

In [193]:
out4

tensor([[[ 0.0538, -0.1933],
         [-0.0574, -0.1865],
         [-0.1432, -0.0405],
         [-0.1176, -0.0262]],

        [[-0.0109, -0.0509],
         [-0.3895,  0.1112],
         [-0.5186,  0.1902],
         [-0.4196,  0.1357]]], grad_fn=<AddBackward0>)

In [195]:
out5

tensor([[[-0.0469, -0.1732],
         [-0.2796, -0.1695],
         [-0.4308, -0.0570],
         [-0.4027, -0.1184]],

        [[-0.0109, -0.0509],
         [-0.3895,  0.1112],
         [-0.5186,  0.1902],
         [-0.4196,  0.1357]]], grad_fn=<AddBackward0>)

Conclusão: meu modelo tá passando a mascara de atenção corretamente. O hugging face até gera uma predição pro padding (o que não faz sentido algum), mas a existencia do padding não altera a previsão para m dado modelo