# 数据预处理
1. 分词
2. 构建词典： token 和 数字的对应
3. 数据转换：文本序列 -> 数字序列
4. 数据的填充和截断

In [1]:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint whose tokenizer is used in this notebook.
checkpoint = "uer/roberta-base-finetuned-dianping-chinese"
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# Load the tokenizer that belongs to the checkpoint; the repr shown below
# reports a BertTokenizerFast with vocab_size=21128.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

#  分词

In [2]:
# Step 1: split the raw text into tokens. For this Chinese sentence every
# character (and the trailing "!") becomes its own token, as the output shows.
sentence = "这家餐厅很好吃!"
tokens = tokenizer.tokenize(sentence)
print(tokens)

['这', '家', '餐', '厅', '很', '好', '吃', '!']


# 查看词典

In [3]:
# Token -> id dictionary of the tokenizer (the captured output is truncated).
tokenizer.vocab

{'alex': 10179,
 '##へ': 11320,
 'opec': 9416,
 '##曇': 16336,
 '##溉': 17030,
 '##鹫': 20971,
 '甙': 4492,
 '鹧': 7913,
 '怼': 2601,
 '刻': 1174,
 '辣': 6793,
 '##領': 20583,
 '##悴': 15708,
 '##緋': 18273,
 '125': 8752,
 '禦': 4888,
 '匹': 1276,
 '搖': 3015,
 '##离': 17952,
 'から': 8526,
 '貳': 6522,
 '##ov': 11689,
 '##玛': 17434,
 '慨': 2717,
 '##娘': 15080,
 '##殊': 16711,
 '鹘': 7908,
 '##為': 17215,
 '鱼': 7824,
 '3000': 8283,
 '##闯': 20367,
 '##share': 11042,
 '##隽': 20466,
 '梢': 3456,
 '鸡': 7883,
 '卖': 1297,
 '五': 758,
 '##查': 16446,
 '巩': 2343,
 '屹': 2256,
 '埠': 1819,
 '成': 2768,
 '##嘉': 14706,
 '紫': 5166,
 '##汩': 16798,
 '5757': 12007,
 '##茴': 18818,
 '嫩': 2075,
 'minkoff': 11018,
 '##潘': 17107,
 '##個': 14000,
 '枕': 3359,
 '##躏': 19771,
 '##朮': 16374,
 '槿': 3554,
 '##敎': 16186,
 '昂': 3203,
 '噜': 1686,
 'anthony': 10219,
 'ta': 8346,
 '噶': 1696,
 '6m': 12485,
 '##瀅': 17157,
 '##籽': 18161,
 '昕': 3213,
 '胱': 5537,
 '445': 12834,
 '翼': 5437,
 '##のか': 9584,
 '吋': 1397,
 '##纰': 18341,
 'are': 8995,
 '##沈'

In [4]:
# Size of the vocabulary (21128 for this checkpoint).
tokenizer.vocab_size

21128

# 数据转换

In [5]:
# Step 3: map each token to its integer id in the vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[6821, 2157, 7623, 1324, 2523, 1962, 1391, 106]


In [6]:
# Inverse mapping: ids back to the original tokens.
tokenizer.convert_ids_to_tokens(ids)

['这', '家', '餐', '厅', '很', '好', '吃', '!']

In [7]:
# Join the tokens back into one string; note that the result puts spaces
# between the Chinese characters, so it is not identical to the input text.
tokenizer.convert_tokens_to_string(tokens)

'这 家 餐 厅 很 好 吃!'

## 预处理

In [8]:
# One-step text -> ids. Unlike tokenize + convert_tokens_to_ids, encode() also
# wraps the sequence in the special ids 101 ([CLS]) and 102 ([SEP]).
tokenizer.encode(sentence)

[101, 6821, 2157, 7623, 1324, 2523, 1962, 1391, 106, 102]

In [9]:
# Same encoding without [CLS]/[SEP]; equals the ids from convert_tokens_to_ids.
tokenizer.encode(sentence, add_special_tokens=False)

[6821, 2157, 7623, 1324, 2523, 1962, 1391, 106]

In [10]:
# Show the encoded ids as tokens to make the added [CLS]/[SEP] visible.
tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence))

['[CLS]', '这', '家', '餐', '厅', '很', '好', '吃', '!', '[SEP]']

In [11]:
# Decode ids straight back to a string; special tokens are kept in the text.
tokenizer.decode(tokenizer.encode(sentence))

'[CLS] 这 家 餐 厅 很 好 吃! [SEP]'

In [12]:
# skip_special_tokens=True drops [CLS]/[SEP] from the decoded string.
tokenizer.decode(tokenizer.encode(sentence), skip_special_tokens=True)

'这 家 餐 厅 很 好 吃!'

# 填充与截断

In [13]:
# Pad on the right with the [PAD] id (0) until the sequence has 15 ids.
tokenizer.encode(sentence, max_length=15, padding="max_length")

[101, 6821, 2157, 7623, 1324, 2523, 1962, 1391, 106, 102, 0, 0, 0, 0, 0]

In [14]:
# Truncate to at most 5 ids. [CLS] and [SEP] count toward the limit and are
# kept, so only the first three content tokens survive.
tokenizer.encode(sentence, max_length=5, truncation=True)

[101, 6821, 2157, 7623, 102]

# 其他输入 - attention mask 与 token type ids

In [15]:
# Build the attention mask by hand for a sequence padded to 15 ids:
# 1 marks a real token, 0 marks padding.
ids = tokenizer.encode(sentence, max_length=15, padding="max_length")
# Compare against the tokenizer's own pad id instead of a literal 0, so the
# mask stays correct for tokenizers whose [PAD] id is not 0.
attention_mask = [int(token_id != tokenizer.pad_token_id) for token_id in ids]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [16]:
# Segment ids: one entry per position in `ids`, all 0 because the input is a
# single sentence.
token_type_ids = [0] * len(ids)
print(token_type_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [17]:
# Calling the tokenizer directly returns input_ids, token_type_ids and
# attention_mask in one dict, so none of them has to be built by hand.
tokenizer(sentence, max_length=15, padding="max_length")

{'input_ids': [101, 6821, 2157, 7623, 1324, 2523, 1962, 1391, 106, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}

In [19]:
# No `padding` argument is passed here, so nothing is padded: the output holds
# only the 10 real token ids. `max_length=15` with `truncation=True` is just an
# upper bound. Padding to the longest sentence of a batch has to be requested
# separately (e.g. `padding=True`) or done later when batches are collated.
tokenizer(sentence, max_length=15, truncation=True)

{'input_ids': [101, 6821, 2157, 7623, 1324, 2523, 1962, 1391, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
# A list of sentences is encoded as a batch. Without padding, each row keeps
# its own length (4 and 5 ids here), so the result is not rectangular.
tokenizer(['你好', '早上好'], max_length=15, truncation=True)

{'input_ids': [[101, 872, 1962, 102], [101, 3193, 677, 1962, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1]]}

# batched data

In [50]:
# Encode two sentences together and pad both to the same fixed length of 12,
# so every field in `res` is a rectangular list of lists.
sentences = ["这个菜很好吃!", "服务员很热情赞赞!"]
res = tokenizer(sentences, max_length=12, padding="max_length")

In [51]:
# Padded id rows, one per sentence; trailing 0s are [PAD].
res['input_ids']

[[101, 6821, 702, 5831, 2523, 1962, 1391, 106, 102, 0, 0, 0],
 [101, 3302, 1218, 1447, 2523, 4178, 2658, 6614, 6614, 106, 102, 0]]

In [52]:
# Matching masks: 0 exactly where input_ids holds padding.
res['attention_mask']

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]

# Fast / Slow Tokenizer

In [53]:
# New example that mixes Chinese characters with an English word, so that one
# word is split into several sub-tokens in the cells below.
sentence = "这家餐厅很好吃delicious!"

In [55]:
# Load the same checkpoint twice: once with the default settings and once
# with use_fast=False, to compare the two implementations below.
fast_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
slow_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)

In [56]:
%%time
# Time 10000 tokenize() calls with the slow tokenizer (about 1.21 s wall time
# in the captured run).
for i in range(10000):
    slow_tokenizer.tokenize(sentence)

CPU times: user 1.21 s, sys: 2.69 ms, total: 1.21 s
Wall time: 1.21 s


In [57]:
%%time
# Same loop with the fast tokenizer (about 0.6 s wall time in the captured
# run, i.e. roughly half the slow tokenizer's time).
for i in range(10000):
    fast_tokenizer.tokenize(sentence)

CPU times: user 603 ms, sys: 3.19 ms, total: 606 ms
Wall time: 603 ms


In [59]:
# return_offsets_mapping=True adds an 'offset_mapping' entry to the result.
# NOTE(review): called on fast_tokenizer on purpose; offsets are presumably not
# available from the slow tokenizer, confirm in the transformers docs.
inputs = fast_tokenizer(sentence, return_offsets_mapping=True)
inputs

{'input_ids': [101, 6821, 2157, 7623, 1324, 2523, 1962, 1391, 10843, 8317, 11737, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 10), (10, 12), (12, 16), (16, 17), (0, 0)]}

In [60]:
# (start, end) character span of each token in `sentence`; the special tokens
# at both ends get (0, 0).
inputs['offset_mapping']

[(0, 0),
 (0, 1),
 (1, 2),
 (2, 3),
 (3, 4),
 (4, 5),
 (5, 6),
 (6, 7),
 (7, 10),
 (10, 12),
 (12, 16),
 (16, 17),
 (0, 0)]

In [63]:
# Index of the word each token came from; None marks the special tokens. The
# three sub-tokens of the English word all share index 7.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, None]

offset_mapping 与 word_ids 的对应关系：英文单词 delicious 被切分为 3 个子词，它们的字符区间各不相同，但 word id 都是 7
```
(7, 10),
(10, 12),
(12, 16),

---> 

7, 7, 7

```