In [2]:
from transformers import AutoTokenizer, BertTokenizer

In [3]:
model_name = "bert-base-uncased"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [5]:
test_examples = ["today is not so bad", "It is so bad", "It's good"]

# 认识Tokenizer

In [10]:
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [11]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [12]:
tokenizer.convert_tokens_to_ids(tokenizer.special_tokens_map.values())

[100, 102, 0, 101, 103]

# 基本用法

1. 分词
2. token转为id
3. 加特殊token、截断、补白
4. 转为tensor

In [14]:
in_tensors = tokenizer(
    test_examples, padding=True, truncation=True, max_lenght=32, return_tensors="pt"
)
print(in_tensors.keys())
print(in_tensors["input_ids"])
print(in_tensors["attention_mask"])

Keyword arguments {'max_lenght': 32} not recognized.
Keyword arguments {'max_lenght': 32} not recognized.
Keyword arguments {'max_lenght': 32} not recognized.


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
tensor([[ 101, 2651, 2003, 2025, 2061, 2919,  102],
        [ 101, 2009, 2003, 2061, 2919,  102,    0],
        [ 101, 2009, 1005, 1055, 2204,  102,    0]])
tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0]])


# 分词： tokenize

In [13]:
tokens = [tokenizer.tokenize(text) for text in test_examples]
print(tokens)

[['today', 'is', 'not', 'so', 'bad'], ['it', 'is', 'so', 'bad'], ['it', "'", 's', 'good']]


## 中文支持很有限

In [15]:
' | '.join(tokenizer.tokenize("你好，中国，Bert对中文的支持很有限"))

'[UNK] | [UNK] | ， | 中 | 国 | ， | bert | [UNK] | 中 | 文 | 的 | [UNK] | [UNK] | [UNK] | 有 | [UNK]'

In [14]:
' | '.join(tokenizer.tokenize("hello-cat!, ksdh1223, 123456"))

'hello | - | cat | ! | , | ks | ##dh | ##12 | ##23 | , | 123 | ##45 | ##6'

## 对于一些生僻，错误的word，会进行拆为wordpiece

# 转为id： convert_tokens_to_ids

In [14]:
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
print(ids)

[[2651, 2003, 2025, 2061, 2919], [2009, 2003, 2061, 2919], [2009, 1005, 1055, 2204]]


# `encode`

encode = tokenize + convert_tokens_to_ids +  add special_token

encode_plus = tokenize + convert_tokens_to_ids +  add special_token,  generate mask


In [24]:
tokenizer.encode(test_examples[0])

[101, 2651, 2003, 2025, 2061, 2919, 102]

In [26]:
tokenizer.convert_ids_to_tokens(tokenizer.encode(test_examples[0]))

['[CLS]', 'today', 'is', 'not', 'so', 'bad', '[SEP]']

# `encode_plus` 编码一对句子

In [20]:
ids = tokenizer.encode_plus(test_examples[0], test_examples[1])
print(*ids.items(), sep='\n')

('input_ids', [101, 2651, 2003, 2025, 2061, 2919, 102, 2009, 2003, 2061, 2919, 102])
('token_type_ids', [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
('attention_mask', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


token_type_ids中0表示第一个句子的id，1表示第2个句子的id

# decode

decode是encode的逆运算：将id list 转化为一个字符串

In [22]:
tokenizer.decode(ids["input_ids"])

'[CLS] today is not so bad [SEP] it is so bad [SEP]'

# Fast Tokenizer / Slow Tokenizer

Fast Tokenizer 是基于rust来实现的，速度快；而Slow tokenizer是基于python实现，速度慢；

In [4]:
fast_tokenizer = AutoTokenizer.from_pretrained(model_name)
slow_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

In [8]:
%%timeit
for _ in range(1000):
    fast_tokenizer(test_examples)

122 ms ± 3.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [9]:
%%timeit
for _ in range(1000):
    slow_tokenizer(test_examples)

419 ms ± 8.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


FastTokenizer有一些特殊的返回值

* offset_mapping：标记了每一个token在原输出str中的索引位置
* word_ids：标记了每个token对应原输出中word的索引

这个对于NER或QA来说比较重要。

In [14]:
inputs = fast_tokenizer("In the big big world, I have a big dreamming", return_offsets_mapping=True)
inputs

{'input_ids': [101, 1999, 1996, 2502, 2502, 2088, 1010, 1045, 2031, 1037, 2502, 3959, 6562, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 2), (3, 6), (7, 10), (11, 14), (15, 20), (20, 21), (22, 23), (24, 28), (29, 30), (31, 34), (35, 40), (40, 44), (0, 0)]}

In [15]:
print(fast_tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

['[CLS]', 'in', 'the', 'big', 'big', 'world', ',', 'i', 'have', 'a', 'big', 'dream', '##ming', '[SEP]']


In [13]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, None]