# Tokenizer 基本使用

In [2]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sample sentence reused by the cells that follow.
sen = "弱小的我也有大夢想!"

## Step1 加載與保存

In [4]:
# Load from the Hugging Face Hub: passing the model name is enough to get
# the corresponding tokenizer.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# Save the tokenizer to a local directory; the call returns the paths of the
# files it wrote (shown in the cell output).
tokenizer.save_pretrained("./tmp/roberta_tokenizer")

('./tmp/roberta_tokenizer\\tokenizer_config.json',
 './tmp/roberta_tokenizer\\special_tokens_map.json',
 './tmp/roberta_tokenizer\\vocab.txt',
 './tmp/roberta_tokenizer\\added_tokens.json',
 './tmp/roberta_tokenizer\\tokenizer.json')

In [6]:
# Load the tokenizer back from the local directory instead of the Hub.
tokenizer = AutoTokenizer.from_pretrained("./tmp/roberta_tokenizer/")
# Bare expression so the notebook displays the tokenizer's repr.
tokenizer

BertTokenizerFast(name_or_path='./tmp/roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Step2 句子分詞

In [7]:
# Split the sentence into a list of token strings (not ids yet).
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '夢', '想', '!']

## Step3 查看詞典

In [8]:
# Display the vocabulary as a token -> id mapping.
tokenizer.vocab

{'##oud': 12867,
 '##序': 15472,
 'ocean': 12546,
 '觐': 6233,
 'language': 8348,
 '##互': 13814,
 '##淀': 16952,
 'pet': 10495,
 '##别': 14223,
 '荳': 5791,
 '##拘': 15929,
 '##綽': 18269,
 '##]': 13341,
 '##dget': 11857,
 '##掃': 16011,
 '嬰': 2087,
 '##刊': 14206,
 '滄': 3993,
 '犢': 4303,
 '酩': 6990,
 '##rc': 10227,
 'panasonic': 10752,
 'java': 8507,
 '刨': 1163,
 '紜': 5161,
 '￥799': 9417,
 '##テ': 11903,
 '##夹': 14988,
 '##蔗': 18972,
 '##申': 17566,
 '淅': 3897,
 'ltxsw': 8793,
 'メーカー': 12052,
 '潼': 4065,
 '##诀': 19451,
 '##鈞': 20104,
 '##怖': 15644,
 '閻': 7291,
 '##า': 13449,
 '##萦': 18910,
 '維': 5204,
 '欒': 3610,
 '1500': 8443,
 '##身': 19773,
 '帥': 2371,
 '深': 3918,
 '1908': 11046,
 '叡': 1364,
 '##⋅': 13552,
 'nhk': 9689,
 '##蕩': 18996,
 '碛': 4816,
 'pu': 11227,
 '##绳': 18391,
 '##邰': 19993,
 '##イフ': 12692,
 'x5': 10871,
 'fi': 8533,
 '##鈺': 20109,
 'glass': 12535,
 '稀': 4921,
 '逊': 6849,
 '蘑': 5982,
 '舖': 5655,
 '铤': 7204,
 '癸': 4631,
 '頹': 7535,
 '##ニア': 12650,
 '##圾': 14826,
 '##钞': 20220,
 ...}

In [9]:
# Number of entries in the vocabulary.
tokenizer.vocab_size

21128

## Step4 索引轉換

In [10]:
# Convert the token sequence into an id sequence.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106]

In [None]:
# Convert the id sequence back into a token sequence.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

In [None]:
# Convert the token sequence into a single string.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

### 更便捷的實現方式

In [11]:
# Convert a string directly into an id sequence, also known as encoding.
# With add_special_tokens=True the result is wrapped in 101 ([CLS]) and
# 102 ([SEP]), as the cell output shows.
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 1918, 2682, 106, 102]

In [None]:
# Convert the id sequence back into a string, also known as decoding.
# skip_special_tokens=False keeps the special tokens in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

## Step5 填充與截斷

In [None]:
# Padding: pad the encoded sequence up to max_length ids.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

In [None]:
# Truncation: cut the encoded sequence down to max_length ids.
# NOTE(review): the 5-id budget presumably includes the special tokens
# added by encode — confirm against the cell output.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

## Step6 其他輸入部分

In [None]:
# Re-encode with padding so the next cell has padded ids to build masks from.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

In [None]:
# Hand-build the extra model inputs that accompany the padded ids.
# attention_mask: 1 for a real token, 0 for padding. Compare against the
# tokenizer's own pad id rather than a literal 0, so the mask stays correct
# for vocabularies where [PAD] is not id 0 (here it is 0, so the result is
# unchanged).
attention_mask = [1 if idx != tokenizer.pad_token_id else 0 for idx in ids]
# token_type_ids: this is a single-sentence input, so every position is 0.
token_type_ids = [0] * len(ids)
# Bare tuple so the notebook displays all three lists together.
ids, attention_mask, token_type_ids

## Step7 快速調用方式

In [None]:
# One call that produces the ids together with the auxiliary inputs built by
# hand in the previous section.
# NOTE(review): the library docs mark encode_plus as deprecated in favour of
# calling the tokenizer directly (next cell) — confirm for the installed version.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

In [None]:
# Calling the tokenizer object directly, with the same arguments as the
# encode_plus call.
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

## Step8 處理batch數據

In [None]:
# A batch of sentences of different lengths.
sens = ["弱小的我也有大夢想",
        "有夢想誰都了不起",
        "追逐夢想的心，比夢想本身，更可貴"]
# Passing a list tokenizes the whole batch in one call; no padding is
# requested here.
res = tokenizer(sens)
res

In [None]:
%%time
# Process one sentence at a time in a loop (timed by the %%time cell magic).
for i in range(1000):
    tokenizer(sen)

In [None]:
%%time
# Process the same 1000 sentences as a single batch, for comparison with the
# per-sentence loop.
res = tokenizer([sen] * 1000)

In [None]:
# Display the tokenizer's repr again (it includes the is_fast flag).
tokenizer

# Fast / Slow Tokenizer

In [None]:
# Redefine the sample sentence with an English word mixed in.
sen = "弱小的我也有大Dreaming!"

In [None]:
# Default load: the same call earlier in this notebook reported is_fast=True,
# i.e. this is the fast tokenizer.
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

In [None]:
# use_fast=False requests the slow tokenizer for the same model.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

In [None]:
%%time
# Fast tokenizer: process one sentence at a time in a loop.
for i in range(10000):
    fast_tokenizer(sen)

In [None]:
%%time
# Slow tokenizer: process one sentence at a time in a loop.
for i in range(10000):
    slow_tokenizer(sen)

In [None]:
%%time
# Fast tokenizer: process the 10000 sentences as a single batch.
res = fast_tokenizer([sen] * 10000)

In [None]:
%%time
# Slow tokenizer: process the 10000 sentences as a single batch.
res = slow_tokenizer([sen] * 10000)

In [None]:
# Ask the fast tokenizer to also return, for each token, its (start, end)
# character span in the original string.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

In [None]:
# For each token, the index of the word it came from.
# NOTE(review): per the library docs, special tokens map to None and this
# method exists only on fast-tokenizer output — confirm.
inputs.word_ids()

In [None]:
# Same request on the slow tokenizer, for contrast with the fast one.
# NOTE(review): per the library docs, offset mappings are only available on
# fast tokenizers, so this call is expected to raise NotImplementedError.
inputs = slow_tokenizer(sen, return_offsets_mapping=True)

# 特殊Tokenizer的加載

In [None]:
from transformers import AutoTokenizer

In [None]:
# Newer transformers releases (>4.34) raise an error when loading
# THUDM/chatglm, so a Skywork model is used here instead.
# NOTE: trust_remote_code=True allows tokenizer code shipped in the model
# repository to run locally; only enable it for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer

In [None]:
# Save this tokenizer to a local directory named "skywork_tokenizer".
tokenizer.save_pretrained("skywork_tokenizer")

In [None]:
# Reload from the local directory; trust_remote_code=True is passed again
# here, as in the original Hub load.
tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)

In [None]:
# Round-trip check: encode the sample sentence and decode it straight back.
tokenizer.decode(tokenizer.encode(sen))