In [13]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from pprint import pprint

In [2]:
# Name of the pretrained checkpoint; reused below when loading the model.
checkpoint = "bert-base-uncased"
# Load the tokenizer that matches this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
# Display the tokenizer's repr (class, vocabulary size, max length, special tokens).
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [10]:
# Let's see intermidiate steps
tokens = tokenizer.tokenize("Hello world")
ids = tokenizer.convert_tokens_to_ids(tokens)

# Inverse step
_tokens = tokenizer.convert_ids_to_tokens(ids)

# tokenize + convert_tokens_to_ids
encoded = tokenizer.encode("Hello world")
decoded = tokenizer.decode(encoded)

# Let's check encoded content
encoded_ids_to_tokens = tokenizer.convert_ids_to_tokens(encoded)

# Let's print results
print(f"Tokens: {_tokens}")
print(f"IDs: {ids}")
print(f"Tokens from ids: {_tokens}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")
print(f"Encoded ids to tokens: {encoded_ids_to_tokens}")


Tokens: ['hello', 'world']
IDs: [7592, 2088]
Tokens from ids: ['hello', 'world']
Encoded: [101, 7592, 2088, 102]
Decoded: [CLS] hello world [SEP]
Encoded ids to tokens: ['[CLS]', 'hello', 'world', '[SEP]']


In [15]:
# Calling the tokenizer directly returns everything the model expects
# (input ids, token type ids, attention mask) as plain Python lists.
model_inputs = tokenizer("Hello world")
pprint(model_inputs, sort_dicts=False)

{'input_ids': [101, 7592, 2088, 102],
 'token_type_ids': [0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1]}


In [19]:
# Two sentences of different lengths, used for the batching examples.
data = [
    "I like cats",
    "Do you like cats too?",
]
data

['I like cats', 'Do you like cats too?']

In [26]:
# Encode the whole batch without padding: each sentence keeps its own length.
pprint(
    tokenizer(data),
    sort_dicts=False,
)

{'input_ids': [[101, 1045, 2066, 8870, 102],
               [101, 2079, 2017, 2066, 8870, 2205, 1029, 102]],
 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]],
 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


In [27]:
# Encode the batch with padding and truncation enabled and ask for PyTorch
# tensors; the shorter sentence is padded so both rows have the same length.
pprint(
    tokenizer(
        data,
        padding=True,
        truncation=True,
        return_tensors='pt',
    ),
    sort_dicts=False,
)

{'input_ids': tensor([[ 101, 1045, 2066, 8870,  102,    0,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}


In [29]:
# Load the checkpoint with a sequence-classification head on top.
# The head is not part of the checkpoint, so it is freshly initialized
# (see the warning printed below).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Intentional error: at this point `model_inputs` holds plain Python lists
# (the tokenizer was called without `return_tensors`), which the model
# cannot consume. The exception is caught and printed so the demonstration
# stays visible while the notebook can still run top to bottom.
try:
    outputs = model(**model_inputs)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'list' object has no attribute 'size'

In [31]:
# Encode the sentence again, this time requesting PyTorch tensors
# so the result can be passed to the model.
model_inputs = tokenizer(
    "Hello world",
    padding=True,
    truncation=True,
    return_tensors='pt',
)
pprint(model_inputs, sort_dicts=False)

{'input_ids': tensor([[ 101, 7592, 2088,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1]])}


In [32]:
# Reload the model with a 3-class classification head
# (the logits below therefore have three values per input).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Forward pass: unpack the encoded inputs as keyword arguments.
outputs = model(**model_inputs)
# Display the full output object (loss, logits, hidden_states, attentions).
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0733,  0.1373,  0.0767]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [34]:
# Access the logits as an attribute ...
outputs.logits

tensor([[-0.0733,  0.1373,  0.0767]], grad_fn=<AddmmBackward0>)

In [35]:
# ... or by key, like a dictionary ...
outputs['logits']

tensor([[-0.0733,  0.1373,  0.0767]], grad_fn=<AddmmBackward0>)

In [36]:
# ... or by position, like a tuple (here the first element is the logits, since loss is None).
outputs[0]

tensor([[-0.0733,  0.1373,  0.0767]], grad_fn=<AddmmBackward0>)

In [37]:
# Convert the logits to a NumPy array: detach from the autograd graph
# (the tensor carries a grad_fn), move to CPU, then convert.
outputs.logits.detach().cpu().numpy()

array([[-0.0732698 ,  0.13731125,  0.07668796]], dtype=float32)

In [41]:
# Intentional error: the two sentences tokenize to different lengths, so
# without padding they cannot be stacked into a single tensor. The exception
# is caught and printed so the demonstration stays visible while the
# notebook can still run top to bottom.
data = ["I like cats",
        "Do you like cats too?"]
try:
    model_inputs = tokenizer(data, return_tensors='pt')
except ValueError as err:
    print(f"{type(err).__name__}: {err}")
else:
    pprint(model_inputs, sort_dicts=False)

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [42]:
# Same two sentences, now encoded with truncation and padding enabled so
# that both rows share one length and fit into a single tensor.
data = [
    "I like cats",
    "Do you like cats too?",
]
model_inputs = tokenizer(
    data,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
pprint(model_inputs, sort_dicts=False)

{'input_ids': tensor([[ 101, 1045, 2066, 8870,  102,    0,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}


In [45]:
# Run the classifier on the padded batch; the logits have one row per sentence.
outputs = model(**model_inputs)
pprint(outputs)

{'logits': tensor([[-0.0610,  0.1648,  0.1807],
        [-0.3100,  0.2396,  0.0304]], grad_fn=<AddmmBackward0>)}
