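- Reference (Towards Data Science article on why there are so many tokenization methods for transformers): https://towardsdatascience.com/why-are-there-so-many-tokenization-methods-for-transformers-a340e493b3a8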

In [31]:
# Sample inputs reused by every tokenizer demo below:
# a single sentence, and a small batch whose first entry is that same sentence.
text = 'hello world!'
text_list = [text, 'hello earth and mars!']
print(text, text_list, sep='\n')

hello world!
['hello world!', 'hello earth and mars!']


# BartTokenizer

In [24]:
from transformers import BartTokenizer

# Load the pretrained BART tokenizer, then list each special token next to its vocabulary id.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
print(type(tokenizer))

for role in ("pad", "unk", "mask", "bos", "eos", "cls", "sep"):
    # Each role is read from a pair of attributes: `<role>_token` and `<role>_token_id`.
    print(f"{role}:", getattr(tokenizer, f"{role}_token"), getattr(tokenizer, f"{role}_token_id"))

<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>
pad: <pad> 1
unk: <unk> 3
mask: <mask> 50264
bos: <s> 0
eos: </s> 2
cls: <s> 0
sep: </s> 2


In [17]:
# Tokenize once and reuse the result for every conversion shown below
# (the text was previously tokenized a second time just to print it).
tokens = tokenizer.tokenize(text)                  # string -> tokens
print(tokens)
print(tokenizer.convert_tokens_to_string(tokens))  # tokens -> string
print(tokenizer.convert_tokens_to_ids(tokens))     # tokens -> ids

['hello', 'Ġworld', '!']
hello world!
[42891, 232, 328]


In [18]:
# Encode once; `ids` is both displayed and mapped back to tokens
# (the text was previously encoded a second time just to print it).
ids = tokenizer.encode(text)                 # string -> tokens -> ids
print(ids)
print(tokenizer.encode_plus(text))           # string -> tokens -> ids + more
print(tokenizer.convert_ids_to_tokens(ids))  # ids -> tokens

[0, 42891, 232, 328, 2]
{'input_ids': [0, 42891, 232, 328, 2], 'attention_mask': [1, 1, 1, 1, 1]}
['<s>', 'hello', 'Ġworld', '!', '</s>']


In [19]:
# Pad the encoding out to a fixed length of 10 and ask for PyTorch tensors.
encoded = tokenizer.encode_plus(
    text,
    max_length=10,
    padding='max_length',
    return_tensors='pt',
)
# Show the pad token and its id for comparison with the padded input_ids printed below.
print("pad:", tokenizer.pad_token, tokenizer.pad_token_id)
print(encoded.keys())
# Print every field of the encoding, one per line.
for field, value in encoded.items():
    print(field, value)

pad: <pad> 1
dict_keys(['input_ids', 'attention_mask'])
input_ids tensor([[    0, 42891,   232,   328,     2,     1,     1,     1,     1,     1]])
attention_mask tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])


# BertTokenizer

In [25]:
from transformers import BertTokenizer

# Swap in the pretrained uncased BERT tokenizer and list its special tokens with their ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(type(tokenizer))

for role in ("pad", "unk", "mask", "bos", "eos", "cls", "sep"):
    # Each role is read from a pair of attributes: `<role>_token` and `<role>_token_id`.
    print(f"{role}:", getattr(tokenizer, f"{role}_token"), getattr(tokenizer, f"{role}_token_id"))


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>
pad: [PAD] 0
unk: [UNK] 100
mask: [MASK] 103
bos: None None
eos: None None
cls: [CLS] 101
sep: [SEP] 102


In [26]:
# Encode once; `ids` is both displayed and mapped back to tokens
# (the text was previously encoded a second time just to print it).
ids = tokenizer.encode(text)                 # string -> tokens -> ids
print(ids)
print(tokenizer.encode_plus(text))           # string -> tokens -> ids + more
print(tokenizer.convert_ids_to_tokens(ids))  # ids -> tokens

[101, 7592, 2088, 999, 102]
{'input_ids': [101, 7592, 2088, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
['[CLS]', 'hello', 'world', '!', '[SEP]']


In [22]:
# encode() with padding to length 10; the result is printed together with its shape.
ids = tokenizer.encode(
    text,
    max_length=10,
    padding='max_length',
    return_tensors='pt',
)
print(ids, ids.shape, sep='\n')

tensor([[ 101, 7592, 2088,  999,  102,    0,    0,    0,    0,    0]])
torch.Size([1, 10])


In [23]:
# encode_plus() with the same padding settings; print each field it returns.
encoded = tokenizer.encode_plus(
    text,
    max_length=10,
    padding='max_length',
    return_tensors='pt',
)
print(encoded.keys())
for field, value in encoded.items():
    print(field, value)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
input_ids tensor([[ 101, 7592, 2088,  999,  102,    0,    0,    0,    0,    0]])
token_type_ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])


In [32]:
# Encode the whole list in one call, padding every entry to length 10.
# NOTE(review): no truncation is requested, so an input longer than max_length
# would presumably not be shortened — confirm before reusing with longer text.
encoded = tokenizer.batch_encode_plus(
    text_list,
    max_length=10,
    padding='max_length',
    return_tensors='pt',
)
print(encoded.keys())
for field, value in encoded.items():
    print(field, value)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
input_ids tensor([[ 101, 7592, 2088,  999,  102,    0,    0,    0,    0,    0],
        [ 101, 7592, 3011, 1998, 7733,  999,  102,    0,    0,    0]])
token_type_ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])


In [36]:
# The tokenizer object can be called directly with either kind of input.
single_encoding = tokenizer(text)       # one string      -> = encode_plus()
batch_encoding = tokenizer(text_list)   # list of strings -> = batch_encode_plus()
print(single_encoding)
print(batch_encoding)

{'input_ids': [101, 7592, 2088, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
{'input_ids': [[101, 7592, 2088, 999, 102], [101, 7592, 3011, 1998, 7733, 999, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
