In [2]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# 加载预训练模型 tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 输入文本
text = "[CLS] Who is Li Jinhong ? [SEP] Li Jinhong is a programmer [SEP]"
tokenized_text = tokenizer.tokenize(text)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenized_text

['[CLS]',
 'who',
 'is',
 'li',
 'jin',
 '##hong',
 '?',
 '[SEP]',
 'li',
 'jin',
 '##hong',
 'is',
 'a',
 'programmer',
 '[SEP]']

In [4]:
tokenizer(text)

{'input_ids': [101, 101, 2040, 2003, 5622, 9743, 19991, 1029, 102, 5622, 9743, 19991, 2003, 1037, 20273, 102, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence = "A Titan RTX has 24GB of VRAM"

In [6]:
tokenizer.tokenize(sequence)

['A',
 'Titan',
 'R',
 '##T',
 '##X',
 'has',
 '24',
 '##GB',
 'of',
 'V',
 '##RA',
 '##M']

In [7]:
encoded_sequence = tokenizer(sequence)['input_ids']

In [8]:
tokenizer.decode(encoded_sequence)

'[CLS] A Titan RTX has 24GB of VRAM [SEP]'

In [9]:
sequence

'A Titan RTX has 24GB of VRAM'

In [10]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [11]:
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

In [12]:
encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
encoded_sequence_b = tokenizer(sequence_b)["input_ids"]

In [13]:
encoded_sequence_a

[101, 1188, 1110, 170, 1603, 4954, 119, 102]

In [14]:
encoded_sequence_b

[101,
 1188,
 1110,
 170,
 1897,
 1263,
 4954,
 119,
 1135,
 1110,
 1120,
 1655,
 2039,
 1190,
 1103,
 4954,
 138,
 119,
 102]

In [15]:
padded_sequence = tokenizer([sequence_a, sequence_b], padding=True)

In [16]:
# 不够的位数进行补零
padded_sequence['input_ids']

[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [101,
  1188,
  1110,
  170,
  1897,
  1263,
  4954,
  119,
  1135,
  1110,
  1120,
  1655,
  2039,
  1190,
  1103,
  4954,
  138,
  119,
  102]]

In [17]:
# Attention mask会标出哪些是补充0的
padded_sequence["attention_mask"]

[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

## 自动帮忙加上分隔符

In [18]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"
encoded_dict = tokenizer(sequence_a, sequence_b)
decoded = tokenizer.decode(encoded_dict["input_ids"])

In [19]:
decoded

'[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]'

In [20]:
encoded_dict

{'input_ids': [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# 特殊类词语的使用

## tokenizer类的方法，可以对字符进行处理，分词，转换成id

In [32]:
# encode 方法把字符转换成 id
one = tokenizer.encode("who is li jinhong ? ")

In [33]:
two = tokenizer.encode("Li jihong is a programmer")

In [37]:
all_sen = one + two[1:]

In [40]:
one, two, all_sen

([101, 1150, 1110, 181, 1182, 23220, 15624, 4553, 136, 102],
 [101, 5255, 23220, 15564, 1110, 170, 23981, 102],
 [101,
  1150,
  1110,
  181,
  1182,
  23220,
  15624,
  4553,
  136,
  102,
  5255,
  23220,
  15564,
  1110,
  170,
  23981,
  102])

In [42]:
# convert_ids_to_tokens 方法把id 重新转换成字符串，同时加上分隔符，是个list
tokenizer.convert_ids_to_tokens(one)

['[CLS]', 'who', 'is', 'l', '##i', 'ji', '##nh', '##ong', '?', '[SEP]']

In [43]:
# decode方法把encode的逆操作
tokenizer.decode(one)

'[CLS] who is li jinhong? [SEP]'

In [44]:
tokenizer.encode('name is hello world', 'gender is boy')

[101, 1271, 1110, 19082, 1362, 102, 5772, 1110, 2298, 102]

In [24]:
# max_length是最大长度，不足的话补充0，长度超过10的话就直接截断
tokenizer.encode('name is hello world', 'gender is a boy', max_length=10, padding='max_length', add_special_tokens=True)

[101, 1271, 1110, 19082, 1362, 102, 5772, 1110, 170, 2298, 102]

In [25]:
# encode_plus方法,对输入的一个batch的句子添加一些附加的信息，比如mask还有type等
tokenizer.encode_plus('li is a programmer', max_length=10, padding='max_length')

{'input_ids': [101, 181, 1182, 1110, 170, 23981, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [26]:
tokenizer.encode_plus('li is a programmer','name is a boy', max_length=10, padding='max_length')

{'input_ids': [101, 181, 1182, 1110, 170, 23981, 102, 1271, 1110, 170, 2298, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [27]:
# batch_encode_plus方法,是批处理方法，一次可以处理多条语句
tokenizer.batch_encode_plus(['this is a sample','this is another longer sample text'], padding='max_length', max_length=10)


{'input_ids': [[101, 1142, 1110, 170, 6876, 102, 0, 0, 0, 0], [101, 1142, 1110, 1330, 2039, 6876, 3087, 102, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]}

In [30]:
tokenizer.batch_encode_plus(['this is a sample','this is another longer sample text'], padding='longest')


{'input_ids': [[101, 1142, 1110, 170, 6876, 102, 0, 0], [101, 1142, 1110, 1330, 2039, 6876, 3087, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]]}