# Tokenizer 基本使用

In [1]:
from transformers import AutoTokenizer

In [2]:
sen = "弱小的我也有大梦想!"

## Step1 加载与保存

In [3]:
# 从HuggingFace加载，输入模型名称，即可加载对应的分词器
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
# tokenizer 保存到本地
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer/tokenizer_config.json',
 './roberta_tokenizer/special_tokens_map.json',
 './roberta_tokenizer/vocab.txt',
 './roberta_tokenizer/added_tokens.json',
 './roberta_tokenizer/tokenizer.json')

In [6]:
# 从本地加载tokenizer
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## Step2 句子分词

In [8]:
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

## Step3 查看词典

In [9]:
tokenizer.vocab

{'##練': 18287,
 '##韆': 20556,
 '审': 2144,
 '2016': 8112,
 '噴': 1695,
 '璿': 4474,
 '滿': 4021,
 '##浯': 16917,
 '颠': 7585,
 '妪': 1983,
 '禀': 4880,
 'atom': 8941,
 '口': 1366,
 '哽': 1531,
 '禁': 4881,
 '##艺': 18743,
 '糞': 5135,
 '3t': 12338,
 '貴': 6523,
 '##醫': 20072,
 '杵': 3348,
 '箱': 5056,
 '论': 6389,
 '乌': 723,
 '##砧': 17841,
 '##泻': 16868,
 '15000': 10064,
 '##という': 12383,
 '309b': 9799,
 '##pact': 12624,
 '1978': 8774,
 '##襠': 19257,
 'あります': 12737,
 '☼': 485,
 '婿': 2053,
 '##魯': 20855,
 '糰': 5141,
 '##門': 20328,
 '##oid': 10523,
 '##囱': 14795,
 '##府': 15481,
 '##ᆨ': 11953,
 '##煜': 17264,
 '##哆': 14561,
 '炫': 4149,
 'weibo': 10565,
 'dd': 10391,
 '蔭': 5923,
 '##隽': 20466,
 '鐵': 7136,
 'top10': 10466,
 '洪': 3825,
 '龔': 7985,
 '##ni': 8833,
 '##啕': 14617,
 '移': 4919,
 '味': 1456,
 '##販': 19573,
 '##匙': 14324,
 '##告': 14497,
 'dj': 9135,
 '邢': 6928,
 '##层': 15288,
 '##疇': 17596,
 '##舂': 18699,
 '##观': 19282,
 '258': 10870,
 '[unused46]': 46,
 '醚': 7009,
 'line': 8323,
 '##鍥': 20162,
 '##暧':

In [11]:
tokenizer.vocab_size

21128

## Step4 索引转换

In [13]:
# 将词序列转换为id序列
ids = tokenizer.convert_tokens_to_ids(tokens)
ids, len(ids)

([2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106], 10)

In [14]:
# 将id序列转换为token序列
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

In [15]:
# 将token序列转换为string
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

###  更便捷的实现方式

In [17]:
# 将字符串转换为id序列，又称之为编码
ids = tokenizer.encode(sen, add_special_tokens=True)
ids, len(ids)

([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102], 12)

In [19]:
# 将id序列转换为字符串，又称之为解码
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'

是 BERT（Bidirectional Encoder Representations from Transformers） 模型在处理输入文本时的标记化格式。BERT 模型要求在输入文本的两端添加特殊标记来处理文本。具体来说：

[CLS]：这是 Classification 的缩写，通常在 BERT 中表示 分类任务 的开始标记。在进行分类任务时，[CLS] 标记通常用于从模型的最后一层输出中提取信息，作为句子的整体表示。换句话说，模型会把所有的信息聚合到这个 [CLS] 标记上，以便用于分类。

[SEP]：这是 Separator 的缩写，用于分隔句子或文本片段。在 BERT 中，[SEP] 标记用于标识句子或文本片段的结束。它可以用来分隔两个句子，或在单句文本的结尾处。

为什么要使用这些标记？

[CLS] 标记帮助模型识别文本的开始部分，并通过该标记的输出进行分类。

[SEP] 标记用于区分不同的句子或文本片段，特别是在句子对输入任务（如问答任务、文本对比任务）中，BERT 需要知道哪些部分是不同的句子或文本。

## Step5 填充与截断

In [21]:
# 填充
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [22]:
# 截断
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

## Step6 其他输入部分

In [23]:
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [25]:
attention_mask = [1 if idx != 0 else 0 for idx in ids]
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Step7 快速调用方式

In [26]:
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

token_type_ids 是一个用来标识句子或文本片段所属类别的向量。在 BERT 中，token_type_ids 用于区分输入的两个句子或文本片段。

例如，在句子对任务（如问答、文本对比）中，BERT 需要知道哪些 token 属于句子 1，哪些属于句子 2。这个向量通过 0 和 1 来区分这两部分文本。

如果你只有一个句子输入，那么所有的 token_type_ids 通常是 0，表示该输入属于同一个句子。

0 代表第一个句子（或文本片段）。

1 代表第二个句子（或文本片段）。

In [27]:
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

## Step8 处理batch数据

In [28]:
sens = ["弱小的我也有大梦想",
        "有梦想谁都了不起",
        "追逐梦想的心，比梦想本身，更可贵"]
res = tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [31]:
%%time
# 单条循环处理
for i in range(1000):
    tokenizer(sen)

CPU times: user 76.7 ms, sys: 0 ns, total: 76.7 ms
Wall time: 76.4 ms


In [32]:
%%time
# 处理batch数据
res = tokenizer([sen] * 1000)

CPU times: user 67.9 ms, sys: 4.99 ms, total: 72.9 ms
Wall time: 42.8 ms


In [33]:
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

# Fast / Slow Tokenizer

In [34]:
sen = "弱小的我也有大Dreaming!"

In [35]:
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [36]:
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [37]:
%%time
# 单条循环处理
for i in range(10000):
    fast_tokenizer(sen)

CPU times: user 925 ms, sys: 2.82 ms, total: 927 ms
Wall time: 956 ms


In [38]:
%%time
# 单条循环处理
for i in range(10000):
    slow_tokenizer(sen)

CPU times: user 2.16 s, sys: 0 ns, total: 2.16 s
Wall time: 2.17 s


In [39]:
%%time
# 处理batch数据
res = fast_tokenizer([sen] * 10000)

CPU times: user 898 ms, sys: 56.5 ms, total: 955 ms
Wall time: 657 ms


In [40]:
%%time
# 处理batch数据
res = slow_tokenizer([sen] * 10000)

CPU times: user 2.13 s, sys: 1.07 ms, total: 2.14 s
Wall time: 2.17 s


In [41]:
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [42]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [51]:
inputs = slow_tokenizer(sen, return_offsets_mapping=True)

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

# 特殊Tokenizer的加载

In [44]:
from transformers import AutoTokenizer

In [45]:
# 新版本的transformers（>4.34），加载 THUDM/chatglm 会报错，因此这里替换为了天宫的模型
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer

tokenizer_config.json:   0%|          | 0.00/857 [00:00<?, ?B/s]

tokenization_skywork.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Skywork/Skywork-13B-base:
- tokenization_skywork.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/994k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]



SkyworkTokenizer(name_or_path='Skywork/Skywork-13B-base', vocab_size=65519, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [46]:
tokenizer.save_pretrained("skywork_tokenizer")

('skywork_tokenizer/tokenizer_config.json',
 'skywork_tokenizer/special_tokens_map.json',
 'skywork_tokenizer/tokenizer.model',
 'skywork_tokenizer/added_tokens.json')

In [47]:
tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)



In [48]:
tokenizer.decode(tokenizer.encode(sen))

'<s>弱小的我也有大Dreaming!'