<a href="https://colab.research.google.com/github/boshuaiYu/huggingface_tutorial/blob/Transformers/HuggingFace4_%E5%A4%9A%E5%BA%8F%E5%88%97%E5%A4%84%E7%90%86.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[sentencepiece]



# 单序列输入情况

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# The tokenizer and the sequence-classification model are both loaded from the same
# checkpoint name so that their vocabularies match.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint
)

In [3]:
# Step 1 of manual tokenization: split the raw sentence into sub-word tokens.
# No special tokens are added by this call.
sequence = "I've been waiting for a HuggingFace course my whole life."
tokens = tokenizer.tokenize(sequence)
tokens

['i',
 "'",
 've',
 'been',
 'waiting',
 'for',
 'a',
 'hugging',
 '##face',
 'course',
 'my',
 'whole',
 'life',
 '.']

In [4]:
# Step 2: map every token string to its integer id in the tokenizer vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[1045,
 1005,
 2310,
 2042,
 3403,
 2005,
 1037,
 17662,
 12172,
 2607,
 2026,
 2878,
 2166,
 1012]

In [5]:
# Turn the plain list of ids into a 1-D tensor and show it together with its shape.
input_ids = torch.tensor(ids)
(input_ids, input_ids.shape)

(tensor([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]),
 torch.Size([14]))

In [6]:
# The model works on batches of sequences, so input_ids must have at least two
# dimensions. Feeding the 1-D tensor fails; the error is caught and printed so the
# demonstration does not stop a "Restart & Run All".
try:
    model(input_ids)
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: ignored

In [7]:
# Add a leading batch dimension -> shape (1, seq_len).
# Note: these ids do not contain the special [CLS]/[SEP] tokens.
model(input_ids.unsqueeze(0))

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [8]:
# Tokenize again, this time asking for the special tokens ([CLS] ... [SEP]) to be added.
tokens_all = tokenizer.tokenize(sequence, add_special_tokens=True)
tokens_all

['[CLS]',
 'i',
 "'",
 've',
 'been',
 'waiting',
 'for',
 'a',
 'hugging',
 '##face',
 'course',
 'my',
 'whole',
 'life',
 '.',
 '[SEP]']

In [9]:
# Convert the token list that includes the special tokens into vocabulary ids.
ids_all = tokenizer.convert_tokens_to_ids(tokens_all)
ids_all

[101,
 1045,
 1005,
 2310,
 2042,
 3403,
 2005,
 1037,
 17662,
 12172,
 2607,
 2026,
 2878,
 2166,
 1012,
 102]

In [10]:
# Wrap the id list in an outer list to obtain a (1, seq_len) batch.
# With the special tokens included, the logits differ from the earlier
# result [[-2.7276,  2.8789]].
model(torch.tensor([ids_all]))

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

---

In [11]:
# Calling the tokenizer directly performs tokenization, id conversion and the
# insertion of special tokens in one step, returning PyTorch tensors ("pt").
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [12]:
# Build a batch of one sequence from the ids without special tokens: shape (1, seq_len).
inputs_ids1 = torch.tensor(ids).unsqueeze(0)
(inputs_ids1, inputs_ids1.shape)

(tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
           2026,  2878,  2166,  1012]]),
 torch.Size([1, 14]))

In [13]:
# Same logits as the earlier result [[-2.7276,  2.8789]]: the special tokens
# are not part of this input.
result = model(inputs_ids1)
result

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [14]:
# Convert logits into class probabilities and show the index -> label mapping next to them.
result.logits.softmax(dim=-1), model.config.id2label

(tensor([[0.0037, 0.9963]], grad_fn=<SoftmaxBackward0>),
 {0: 'NEGATIVE', 1: 'POSITIVE'})

In [15]:
# A batch holding the same sequence twice: both rows must yield identical probabilities.
batch = torch.tensor([ids] * 2)
model(batch).logits.softmax(dim=-1)

tensor([[0.0037, 0.9963],
        [0.0037, 0.9963]], grad_fn=<SoftmaxBackward0>)

# 填充输入(多序列长度不同)

In [16]:
# Two sequences of different length: such a ragged list cannot be turned into a tensor directly.
batched_ids = [
    [200] * 3,  # sentence 1 consists of 3 tokens
    [200] * 2,  # sentence 2 consists of 2 tokens
]

## 1.仅通过填充进行

In [17]:
# Use the tokenizer's own padding token id (0 for this checkpoint) instead of an
# unrelated hard-coded number.
padding_id = tokenizer.pad_token_id
se1_ids = [200, 200, 200]
se2_ids = [200, 200]
# Right-pad the shorter sequence so both rows have the same length and can be
# stacked into one tensor. The rows are derived from se1_ids / se2_ids (as copies)
# so the batch cannot drift out of sync with the individual sequences.
batch = [
    list(se1_ids),
    se2_ids + [padding_id] * (len(se1_ids) - len(se2_ids)),
]
# Reference logits for sentence 1 on its own (batch of one).
model(torch.tensor(se1_ids).reshape(1, -1)).logits

tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)

In [18]:
# Reference logits for sentence 2 on its own (batch of one, no padding).
model(torch.tensor([se2_ids])).logits

tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)

In [19]:
# The Transformer computes each token's representation from its context, so a padding
# token that is not hidden by an attention mask influences the result.
# Here the padding id is 0 and the second row's logits change because of it.
model(torch.tensor(batch)).logits

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895],
        [ 1.3373, -1.2163]], grad_fn=<AddmmBackward0>)

## 2.注意力Mask

In [20]:
# Same padded batch as before, now accompanied by an attention mask.
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
# 1 = real token to attend to, 0 = padded position to ignore.
attention_mask = [[1, 1, 1], [1, 1, 0]]
outputs = model(
    input_ids=torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
# With the attention mask the padding no longer affects the computation.
print(outputs.logits)
# 采用注意力掩码的方式不会对计算造成影响

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


# 长序列(长序列模型/截断序列)

# 测试

In [21]:
# Two test sentences of different length.
# Note: sentence1 uses a typographic apostrophe (’), which is tokenized differently
# from the ASCII apostrophe (') used in `sequence` earlier in the notebook.
sentence1 = "I’ve been waiting for a HuggingFace course my whole life."
sentence2 = "I hate this so much!"
se1_token = tokenizer.tokenize(sentence1)
se2_token = tokenizer.tokenize(sentence2)
se1_token, se2_token

(['i',
  '’',
  've',
  'been',
  'waiting',
  'for',
  'a',
  'hugging',
  '##face',
  'course',
  'my',
  'whole',
  'life',
  '.'],
 ['i', 'hate', 'this', 'so', 'much', '!'])

In [22]:
# Map both token lists to vocabulary ids (no special tokens are added here).
id1 = tokenizer.convert_tokens_to_ids(se1_token)
id2 = tokenizer.convert_tokens_to_ids(se2_token)
id1, id2

([1045,
  1521,
  2310,
  2042,
  3403,
  2005,
  1037,
  17662,
  12172,
  2607,
  2026,
  2878,
  2166,
  1012],
 [1045, 5223, 2023, 2061, 2172, 999])

In [23]:
# encode() goes from text to ids in one call and wraps the ids in the
# start/end special tokens (101 ... 102).
id1_1 = tokenizer.encode(sentence1)
id1_1

[101,
 1045,
 1521,
 2310,
 2042,
 3403,
 2005,
 1037,
 17662,
 12172,
 2607,
 2026,
 2878,
 2166,
 1012,
 102]

In [24]:
id1_1_pad = tokenizer.encode(sentence1, padding="max_length", max_length=25)
id1_1_pad  # padded up to max_length; the missing positions are filled with 0

[101,
 1045,
 1521,
 2310,
 2042,
 3403,
 2005,
 1037,
 17662,
 12172,
 2607,
 2026,
 2878,
 2166,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [25]:
id1_1_trun = tokenizer.encode(sentence1, max_length=5, truncation=True)
id1_1_trun  # truncated to max_length; the start and end special tokens are kept

[101, 1045, 1521, 2310, 102]

In [26]:
# Calling the tokenizer directly is the supported replacement for the deprecated
# encode_plus(). In addition to input_ids, the result carries the attention_mask
# that marks which positions are real tokens (1) and which are padding (0).
id1_1_plus = tokenizer(sentence1, padding="max_length", max_length=25)
id1_1_plus

{'input_ids': [101, 1045, 1521, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [27]:
# Reference logits: run each sentence through the model on its own (batch of one).
tuple(model(torch.tensor([seq_ids])).logits for seq_ids in (id1, id2))

(tensor([[-2.5720,  2.6852]], grad_fn=<AddmmBackward0>),
 tensor([[ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>))

In [28]:
# Right-pad id2 with the padding token id until it is as long as id1.
# Computing the missing length up front (instead of looping until the lengths are
# equal) cannot run forever: if id2 is already as long as, or longer than, id1 the
# repeat count is <= 0 and nothing is appended.
id2.extend([tokenizer.pad_token_id] * (len(id1) - len(id2)))

In [29]:
# Stack the two equal-length id lists into one (2, seq_len) batch tensor.
# NOTE: this rebinds `ids`, which earlier in the notebook held a plain list of ids.
ids = torch.stack([torch.tensor(row) for row in (id1, id2)])
ids

tensor([[ 1045,  1521,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012],
        [ 1045,  5223,  2023,  2061,  2172,   999,     0,     0,     0,     0,
             0,     0,     0,     0]])

In [30]:
# Attention mask: 1 where the position holds a real token, 0 where it holds padding.
# Compare against the tokenizer's pad id rather than a hard-coded 0 so the cell
# keeps working with tokenizers whose padding id is not 0.
mask = (ids != tokenizer.pad_token_id).long()
mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])

In [31]:
# With the mask applied, both rows reproduce the logits of the individually processed sentences.
model(input_ids=ids, attention_mask=mask).logits

tensor([[-2.5720,  2.6852],
        [ 3.1931, -2.6685]], grad_fn=<AddmmBackward0>)

# Fast/Slow Tokenizer

In [32]:
# Chinese sample sentence used for the fast-vs-slow tokenizer comparison below.
sen = "我是一个有梦想的咸鱼"

## Fast Tokenizer是基于Rust实现的，速度比较快

In [33]:
# Without use_fast=False, AutoTokenizer returns the fast tokenizer here
# (the repr below reports is_fast=True).
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-chinanews-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Slow Tokenizer是基于Python实现的，速度比较慢

In [34]:
# use_fast=False requests the slow tokenizer for the same checkpoint.
slow_tokenizer = AutoTokenizer.from_pretrained(
    "uer/roberta-base-finetuned-chinanews-chinese",
    use_fast=False,
)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-chinanews-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## 测试结果

### 单条语句循环分词

In [35]:
%%time

for i in range(10000):
  fast_tokenizer(sen)

CPU times: user 601 ms, sys: 3.29 ms, total: 605 ms
Wall time: 614 ms


In [36]:
%%time

for i in range(10000):
  slow_tokenizer(sen)

CPU times: user 1.57 s, sys: 8.76 ms, total: 1.57 s
Wall time: 1.59 s


### batch语句分词

In [37]:
%%time
res_fast = fast_tokenizer([sen]*10000)

CPU times: user 700 ms, sys: 55.2 ms, total: 756 ms
Wall time: 484 ms


In [38]:
%%time
res_slow = slow_tokenizer([sen]*10000)

CPU times: user 1.21 s, sys: 8.48 ms, total: 1.22 s
Wall time: 1.22 s


### fast_tokenizer的返回值

In [39]:
# Ask the fast tokenizer for each token's (start, end) character offsets in the
# original string; the special tokens are reported as (0, 0).
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2769, 3221, 671, 702, 3300, 3457, 2682, 4638, 1496, 7824, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (0, 0)]}

In [40]:
# Index of the word each token belongs to (None for special tokens); tokens that
# share an index are pieces of the same word.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, None]

# 特殊Tokenizer的加载

In [41]:
from transformers import AutoTokenizer

In [42]:
# NOTE: this rebinds `tokenizer`, replacing the tokenizer loaded from `checkpoint`
# earlier in the notebook.
# NOTE(review): trust_remote_code=True presumably runs tokenizer code fetched from
# the model repository — confirm, and consider pinning a `revision`.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
tokenizer  # tokenizers without an official implementation live in the remote repo and need trust_remote_code

ChatGLMTokenizer(name_or_path='THUDM/chatglm3-6b', vocab_size=64798, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	
}

In [43]:
# Save the tokenizer files into the local "chatglm3_tokenizer" directory;
# the call returns the paths of the files it wrote.
tokenizer.save_pretrained("chatglm3_tokenizer")

('chatglm3_tokenizer/tokenizer_config.json',
 'chatglm3_tokenizer/special_tokens_map.json',
 'chatglm3_tokenizer/tokenizer.model',
 'chatglm3_tokenizer/added_tokens.json')