In [2]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the cased BERT base checkpoint; its files are
# fetched on first use (see the download progress bars in the cell output).
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="bert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

## Understanding the tokenizer output
- input_ids: the vocabulary indices corresponding to each token in the sentence.
- attention_mask: indicates whether a token should be attended to or not.
- token_type_ids: identifies which sequence a token belongs to when there is more than one sequence.


In [7]:
# Tokenize a single sentence with the tokenizer loaded earlier in the notebook.
encoded_input = tokenizer(
    "Do not meddle in the affairs of wizards, for they are subtle and quick to anger."
)

# One field per output line: token ids, attention mask, then token type ids.
print(
    *(encoded_input[field] for field in ("input_ids", "attention_mask", "token_type_ids")),
    sep="\n",
)

[101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116, 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [5]:
# Map the ids back to text; the result shows the [CLS] and [SEP] markers that
# the tokenizer placed around the sentence.
# NOTE(review): this reads `encoded_input` from the single-sentence cell. The
# batch cell further down rebinds that name, so run this cell before that one.
tokenizer.decode(token_ids=encoded_input["input_ids"])

'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'

## Example of batch tokenizer
- **padding**: adds a special padding token id (0 in the output below) so that every sentence in the batch has the same length
- **truncation (max length)**: if a sentence exceeds `max_length`, it is truncated to that length
- **return_tensors**: `"tf"` to return TensorFlow tensors or `"pt"` to return PyTorch tensors

In [9]:
# Three sentences of different lengths, so the effect of padding is visible.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]

# Encode the whole list in one call. Shorter rows are padded, anything longer
# than max_length would be cut, and the fields come back as TensorFlow tensors.
encoded_input = tokenizer(
    batch_sentences,
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="tf",
)
print(encoded_input)

{'input_ids': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=
array([[  101,  1252,  1184,  1164,  1248,  6462,   136,   102,     0,
            0,     0,     0,     0,     0,     0],
       [  101,  1790,   112,   189,  1341,  1119,  3520,  1164,  1248,
         6462,   117, 21902,  1643,   119,   102],
       [  101,  1327,  1164,  5450, 23434,   136,   102,     0,     0,
            0,     0,     0,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}
