<a href="https://colab.research.google.com/github/cche0214/HuggingFaceLLM/blob/main/02pipeline%E7%9A%84%E5%86%85%E9%83%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import pipeline

# A pipeline bundles three steps: preprocessing, model inference and postprocessing.
classifier = pipeline("sentiment-analysis")
# Classify two example sentences; as the last expression of the cell the result is displayed.
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

In [None]:
from transformers import AutoTokenizer

# A checkpoint is a set of weights for a given architecture; the architecture here is DistilBERT.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!"
]

# The tokenizer turns raw text into token IDs (more precisely: raw text -> tokens -> token IDs).
# return_tensors selects the tensor type to return: PyTorch, TensorFlow or plain NumPy.
# Transformers models only accept tensors as input.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

In [None]:
# The output contains two keys: input_ids and attention_mask.
# input_ids holds the token IDs of every sentence.
# The data is now ready to be fed to a model, so the next step is to download the model.
print(inputs)

In [None]:
from transformers import AutoModel

# No model head is loaded here, so this model cannot produce sentiment-classification results.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

In [None]:
outputs = model(**inputs)

# Given the input sentences, the base model outputs hidden states: a large, three-dimensional tensor.
# The hidden states are NOT the model head; they are the input that is passed on to a head (downstream task).
# Shape: [batch size, sequence length, hidden size]
# i.e. the number of sequences processed at once, the length of each sequence (sentence),
# and the vector dimension of each model input.
print(outputs.last_hidden_state.shape)

In [None]:
# Print the full output object returned by the base model.
print(outputs)

In [None]:
# Now load a model that has a sequence-classification head, i.e. the sentiment-classification model.
from transformers import AutoModelForSequenceClassification

# The checkpoint is the same one used with plain AutoModel; the only difference
# between the two is whether the downstream head is attached.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)


In [None]:
# The dimensionality is much smaller now: the head took the high-dimensional vectors as input
# and outputs a vector with two values per sentence (one per label).
print(outputs.logits.shape)

In [None]:
# Print the full output object of the sequence-classification model.
print(outputs)

In [None]:
# The predictions are still logits: the raw, unnormalized scores from the model's last layer.
# To obtain probabilities they must go through a softmax layer; all Transformers models output logits.
print(outputs.logits)

In [None]:
import torch

# Softmax over the last (label) dimension turns the logits into probabilities.
# The model config must still be inspected to see which label each column refers to.
predictions = torch.softmax(outputs.logits, dim=-1)
print(predictions)

In [None]:
# id2label shows which index maps to which label: the first value is the
# negative probability and the second is the positive probability.
model.config.id2label

In [None]:
# Test more sentences and compare the hand-built pipeline with the one provided by transformers.
from transformers import pipeline

# Pin the checkpoint explicitly instead of relying on the task's implicit default model.
# This guarantees the comparison with the manual steps uses exactly the same weights,
# and avoids the library warning about pipelines created without a model name.
classifierTF = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
classifierTF(
    [
        "I absolutely love this project—it exceeded every expectation I had.",
        "This is the most rewarding experience I’ve had in years.",
        "I’m extremely disappointed with the final outcome.",
        "The entire process was frustrating and poorly managed.",
        "The outcome is good, and it meets most of my expectations."
    ]
)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the sentiment-analysis checkpoint (first step of the manual pipeline).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Same five sentences that were passed to the transformers pipeline.
# NOTE: this rebinds `inputs`, which earlier held a tokenized batch.
inputs = [
    "I absolutely love this project—it exceeded every expectation I had.",
    "This is the most rewarding experience I’ve had in years.",
    "I’m extremely disappointed with the final outcome.",
    "The entire process was frustrating and poorly managed.",
    "The outcome is good, and it meets most of my expectations."
]

# Tokenize with padding/truncation and return PyTorch tensors.
inputsid = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
print(inputsid)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the model with its sequence-classification head (second step of the manual pipeline).
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# The leading ** is Python syntax: it unpacks the inputsid mapping so that every key
# is passed to the function as a keyword argument. It is equivalent to:
#   model(input_ids=inputsid["input_ids"], attention_mask=inputsid["attention_mask"])
outs = model(**inputsid)

# Output of the model's last layer, before any softmax is applied.
print(outs)

In [None]:
import torch

# Normalize the logits over the label dimension to get probabilities.
# The result matches what the ready-made pipeline reports.
predictions = torch.softmax(outs.logits, dim=-1)
print(predictions)

In [None]:
# Display the index-to-label mapping stored in the model config.
model.config.id2label

In [None]:
from transformers import BertConfig, BertModel

config = BertConfig()

# Building a model from the default configuration initializes it with random values.
# The model runs and produces results, but its output is gibberish.
model = BertModel(config)

In [None]:
# The config holds the attributes used to build the model; they are fixed values and nothing is trained yet.
# "Not trained" means the weights are still random; the config only describes the architecture.
# hidden_size is the size of the hidden_states vectors, and num_hidden_layers is the
# number of layers the Transformer model has.
print(config)

In [None]:
# An untrained model outputs gibberish. Since this is the base model (no head),
# the author notes it is not obvious how to verify that from the output alone.
outputs = model(**inputsid)
print(outputs)
print(outputs.last_hidden_state.shape)

In [None]:
# Load an already trained model, using the same approach as in the previous section.
from transformers import BertModel

# Difference from AutoModel: the BERT architecture is fixed here, so the checkpoint must be a BERT checkpoint.
# The logic is therefore: load the bert-base-cased checkpoint for the BERT architecture.
# This is a pretrained model; it can be used directly or fine-tuned on a new task.
# Switching from a concrete model class to AutoModel removes that restriction:
# the Transformers library detects the architecture from the checkpoint automatically.
model = BertModel.from_pretrained("bert-base-cased")

In [None]:
# Save the model to a local directory.
model.save_pretrained("directory_on_my_computer")

In [None]:
import os

# List the files written by save_pretrained(). os.listdir is used instead of the
# IPython `ls` automagic so the cell is valid Python and works on any platform.
print(sorted(os.listdir("directory_on_my_computer")))
# config.json describes the model architecture and metadata.
# model.safetensors holds all of the model's parameters.

In [None]:
# Load the model back from the local directory on this machine.
modelload = BertModel.from_pretrained("directory_on_my_computer")

In [None]:
# Naive word-level tokenization: split the sentence on whitespace.
tokenized_text = "Jim Henson was a puppeteer".split()
print(tokenized_text)

In [None]:
# Explicitly use the tokenizer class of the BERT model.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Likewise, AutoTokenizer can be used directly instead.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
tokenizer("Using a Transformers network is simple")
# This output seems to differ from the sentiment-classification model's: it has an extra token_type_ids entry.

In [None]:
from transformers import AutoTokenizer

# Switch to the tokenizer of the sentiment-classification checkpoint for comparison.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [None]:
tokenizer("Using a Transformers network is simple")
# Output of the sentiment-classification tokenizer.

In [None]:
# Save the tokenizer.
# NOTE(review): when the cells run in order, `tokenizer` is the DistilBERT (uncased) tokenizer here,
# while this directory name is the one the bert-base-cased model was saved to earlier.
# Confirm that mixing them in one directory is intended.
tokenizer.save_pretrained("directory_on_my_computer")

In [None]:
# Walk through what the tokenizer does internally:
# raw text -> tokens (the algorithm differs per model) -> vocabulary IDs -> tensors the model accepts.
from transformers import AutoTokenizer

# Instantiate the tokenizer of a BERT model. Normally you just call it; the detailed steps follow.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformers network is simple, I like tokenization"
print(sequence)

# Step 1: tokenization, i.e. split the text into tokens.
tokens = tokenizer.tokenize(sequence)
print(tokens) # This also shows that BERT uses subword tokenization.

# Step 2: look the tokens up in the vocabulary to get input IDs.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

# Step 3: post-process the ids by adding the special tokens the model needs.
final_inputs = tokenizer.prepare_for_model(ids)
print(final_inputs)

# Decoding:
# besides mapping indices back to tokens, decode() merges the tokens of one word
# back together to produce a readable sentence.
decoded_string = tokenizer.decode(ids)
print(decoded_string)


# After these three steps the result is close to what calling the tokenizer directly gives.
# Caveat: the three manual steps do NOT convert the result to tensors, so it cannot be
# fed to the model as is. The direct call below does, via return_tensors="pt".
inputsid = tokenizer(sequence, padding=True, truncation=True, return_tensors="pt")
print(inputsid)

In [None]:
# Try it out: run the first two tokenizer steps by hand on two sentences.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = [
    "I’ve been waiting for a HuggingFace course my whole life.",
    "I hate this so much!"
]
print(inputs)

# tokenizer.tokenize() works on a single string, not on a batch. Passing the whole list
# does not tokenize the sentences independently, so tokenize each one on its own.
tokens = [tokenizer.tokenize(sentence) for sentence in inputs]
print(tokens)

# One list of IDs per sentence, aligned with `tokens`.
ids = [tokenizer.convert_tokens_to_ids(sentence_tokens) for sentence_tokens in tokens]
print(ids)

In [None]:
# Models expect a batch of inputs.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# The tensor built from the flat id list is one-dimensional.
input_ids = torch.tensor(ids)
print(input_ids)

# Call the tokenizer directly to see what is actually fed to the model.
tokenized_inputs = tokenizer(sequence, padding=True, truncation=True, return_tensors="pt")
# Note that it is two-dimensional.
print(tokenized_inputs["input_ids"])

# Wrapping the ids in a list adds the batch dimension manually.
final_inputs = torch.tensor([ids])
print(final_inputs)

In [None]:
# Passing the one-dimensional `input_ids` would raise an error: it is a single sentence,
# but the model expects a list (batch) of sentences.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# With the extra batch dimension (`final_inputs`) the call succeeds.
# As a refresher, the logits are then turned back into probabilities below.
out = model(final_inputs)
print(out)

predictions = torch.nn.functional.softmax(out.logits, dim=-1)
print(predictions)

In [None]:
# The "handling multiple sequences" part of chapter 2 was confusing a few days ago, so redo it from scratch.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)

prepared_input = tokenizer.prepare_for_model(ids)

print(input_ids)
print(prepared_input) # Compared with the direct tokenizer call below, this is not converted to tensors.
print(tokenizer(sequence, padding=True, truncation=True, return_tensors="pt"))

In [None]:
tokenized_inputs = tokenizer(sequence, padding=True, truncation=True, return_tensors="pt")
print(tokenized_inputs["input_ids"])
# Compared with the manual tensor above, this has one more dimension.

In [None]:
# Add the batch dimension by wrapping the id list in another list.
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

# Done manually without prepare_for_model, the input also has no special tokens.
out = model(input_ids)
print("out:", out["logits"])

In [None]:
# Even with a single sentence you can build a batch out of it.
# Try it out: here the same ids are repeated to form a batch of two.
batched_ids = [ids, ids]
input_ids = torch.tensor(batched_ids)
print("Input IDs:", input_ids)

out = model(input_ids)
print("out:", out["logits"])

In [None]:
# Tensors must be rectangular, so a ragged list such as
#     [[200, 200, 200],
#      [200, 200]]
# cannot be converted to a tensor. The remedy is to pad the inputs.

# Padding appends a special padding_id token to the shorter sentences
# so that every sentence ends up with the same length.
padding_id = 100
batched_ids = [[200] * 3, [200] * 2 + [padding_id]]


In [None]:
# tokenizer.pad_token_id gives the ID of the tokenizer's padding token.
print(tokenizer.pad_token_id) # prints 0

In [None]:
# Compare processing sequences one by one with processing them as a padded batch.
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]

batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]

# The batched result is not simply the two individual results stacked together. Why?
# Attention layers take the context of every token into account.
# After padding, the pad token is therefore also considered by the attention layers,
# even though it carries no real meaning.
# An attention mask is needed to make the attention layers ignore the padding tokens.
print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
# This is the result without an attention mask: every token is attended to, so the output differs.
print(model(torch.tensor(batched_ids)).logits)

In [None]:
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id]
]

# A tensor with exactly the same shape as the input IDs, filled with 0s and 1s:
# 1 means the token should be attended to, 0 means it should be ignored.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0]
]

outputs = model(torch.tensor(batched_ids), attention_mask = torch.tensor(attention_mask))
print(outputs)
# The values now match those of the two sentences processed individually,
# which shows that the padding token was ignored.
print(outputs.logits)

In [None]:
# Try it out: tokenize two sentences by hand, pad them to a common length and
# run them through the model as one batch with a matching attention mask.
sequence_1 = "I’ve been waiting for a HuggingFace course my whole life."
sequence_2 = "I hate this so much!"

tokens_1 = tokenizer.tokenize(sequence_1)
tokens_2 = tokenizer.tokenize(sequence_2)

input_ids1 = tokenizer.convert_tokens_to_ids(tokens_1)
input_ids2 = tokenizer.convert_tokens_to_ids(tokens_2)

# Derive the padding from the actual lengths instead of hard-coding the number of pad
# tokens, so the cell keeps working when either sentence is changed.
max_len = max(len(input_ids1), len(input_ids2))

batched_ids = [
    seq_ids + [tokenizer.pad_token_id] * (max_len - len(seq_ids))
    for seq_ids in (input_ids1, input_ids2)
]
# Padded version of the second sentence, kept under its original name.
input_ids22 = batched_ids[1]

print(batched_ids)

input_ids = torch.tensor(batched_ids)

print(input_ids)

# 1 for real tokens, 0 for padding; same shape as the input IDs.
attention_mask = [
    [1] * len(seq_ids) + [0] * (max_len - len(seq_ids))
    for seq_ids in (input_ids1, input_ids2)
]

print(torch.tensor(attention_mask))

outputs = model(input_ids, attention_mask=torch.tensor(attention_mask))
print(outputs.logits)


In [None]:
# Tokenize the first sentence with the tokenizer's direct call and run the model on it.
output_1 = tokenizer(sequence_1, padding=True, truncation=True, return_tensors="pt")
print(output_1)
answer = model(**output_1)
print(answer) # Differs from the manual run because no special tokens were added there.

In [None]:
# Putting it all together.
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Calling the tokenizer directly on a single sentence.
model_inputs = tokenizer(sequence)

In [None]:
# Show the tokenizer output for the single sentence.
print(model_inputs)

In [None]:
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!"
]

# The tokenizer also accepts a list of sentences.
model_inputs = tokenizer(sequences)

In [None]:
# Show the tokenizer output for the list of sentences (no padding requested).
print(model_inputs)

In [None]:
# Pad the sequences up to the length of the longest sequence in the batch.
model_inputs = tokenizer(sequences, padding="longest")
print(model_inputs)

In [None]:
# Pad to "max_length"; no max_length is given, so presumably the model's maximum length is used.
model_inputs = tokenizer(sequences, padding="max_length")
print(model_inputs)

In [None]:
# Pad the sequences up to the specified max_length of 8 (truncation is not requested here).
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
print(model_inputs)

In [None]:
# Enable truncation; no max_length is given, so presumably the model's maximum length is the limit.
model_inputs = tokenizer(sequences, truncation=True)
print(model_inputs)

In [None]:
# Enable truncation with an explicit max_length of 8.
model_inputs = tokenizer(sequences, truncation=True, max_length=8)
print(model_inputs)

In [None]:
# Ask the tokenizer to return PyTorch tensors ("pt") for the single sentence.
model_inputs = tokenizer(sequence, padding=True, return_tensors="pt")
print(model_inputs)

In [None]:
# IDs produced by calling the tokenizer directly.
model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# Unlike the direct call, the manual steps do not add the special tokens
# at the beginning and end of the sequence.
print(ids)

In [None]:
# Decode both ID lists back to text to compare the direct call with the manual steps.
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

In [None]:
# Complete manual pipeline: tokenize -> run the model -> softmax over the logits.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]

# Padded, truncated PyTorch batch, unpacked into the model as keyword arguments.
model_inputs = tokenizer(sequence, padding=True, truncation=True, return_tensors="pt")
out = model(**model_inputs)
print(out)
print(out.logits)

# Normalize the logits over the label dimension to obtain probabilities.
predictions = torch.softmax(out.logits, dim=-1)
print(predictions)

In [None]:
from transformers import pipeline

# Reference result from the ready-made pipeline for the same two sentences.
# The checkpoint is pinned explicitly rather than left to the task's implicit default,
# so this is guaranteed to use the same weights as the manual steps.
classificater = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
out = classificater(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "So have I!"
    ]
)
print(out)