<a href="https://colab.research.google.com/github/cche0214/HuggingFaceLLM/blob/main/03%E5%BE%AE%E8%B0%83%E4%B8%80%E4%B8%AA%E6%A8%A1%E5%9E%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life",
    "This course is amazing!"
]

# Tokenize both sequences into one padded, truncated batch of PyTorch tensors.
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

print(batch)

# New: add one label per sequence to the batch; the loss is read from the model output below.
batch["labels"] = torch.tensor([1,1])

print(batch)

# A single optimization step: forward pass for the loss, backpropagation, parameter update.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the "mrpc" configuration of the "glue" dataset and display the result.
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

In [None]:
# Think of `raw_datasets` as a structure holding three tables (train, validation, test).
# Think of a single dataset as one table: `features` is the header and `num_rows` is the row count.
# So a table can be accessed either by column or by row.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset

In [None]:
# Row access: show the first row (index 0) of the training split.
raw_train_dataset[0]

In [None]:
# Column access: show the whole `label` column of the training split.
raw_train_dataset["label"]

In [None]:
# Number of entries in the `label` column.
print(len(raw_train_dataset["label"]))

In [None]:
# Look up what each numeric label means via raw_train_dataset.features.
# Here label 0 means the two sentences are not equivalent and label 1 means they are equivalent.
raw_train_dataset.features

In [None]:
# Try it out!
# Look at the 15th row of the training set (index 14).
raw_train_dataset[14]

In [None]:
# Look at the 87th row of the validation set; first grab the validation split.
raw_validation_dataset = raw_datasets["validation"]
raw_validation_dataset

In [None]:
# 87th row of the validation split (index 86).
raw_validation_dataset[86]

In [None]:
# Column access: show the whole `sentence1` column of the training split.
raw_train_dataset["sentence1"]

In [None]:
# Number of entries in the `sentence1` column.
print(len(raw_train_dataset["sentence1"]))

In [None]:
# Preprocess the dataset, i.e. turn the text sequences into numbers with the
# tokenizer introduced in the previous section.
# Passing the column object returned by raw_datasets["train"][...] straight to the
# tokenizer raised an error (it was a Column object, not a list of strings), so each
# column is materialized into a plain list first. If the column already is a list,
# list(...) just makes a shallow copy.
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))

In [None]:
# Tokenize two sentences together as one pair.
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs
# token_type_ids tells the model which part is the first sentence and which part is the second

In [None]:
# Show the 15th row of the training split (index 14) again.
raw_train_dataset[14]

In [None]:
# Tokenize the two sentences of the "15th row" separately. Index 14 is used so this
# matches the row displayed just before and the indexing used for the
# "15th row" lookup earlier in this notebook.
train_151 = tokenizer(raw_train_dataset[14]["sentence1"])
train_152 = tokenizer(raw_train_dataset[14]["sentence2"])
# Difference from the sentence pair above: when the two sentences are tokenized
# separately, token_type_ids are all 0, whereas the pair above also contains 1s.
print(train_151, train_152)

In [None]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])
# For a sentence pair, the input the model expects is [CLS] sentence1 [SEP] sentence2 [SEP]
# Comparing with token_type_ids: the [CLS] sentence1 [SEP] part is all 0, the sentence2 [SEP] part is all 1
# Not every model necessarily has token_type_ids

In [None]:
# One way to preprocess: tokenize every sentence pair of the training split in one call.
# The column objects returned by raw_datasets["train"][...] are converted to plain lists
# first, because passing them directly made the tokenizer raise an error.
tokenized_dataset = tokenizer(
    list(raw_datasets["train"]["sentence1"]),
    list(raw_datasets["train"]["sentence2"]),
    padding=True,
    truncation=True,
)

In [None]:
# Use Dataset.map() so that the preprocessed data is kept in Dataset format.
# After all the analysis above, the point is simply that the preprocessed data has to be
# stored as a Dataset before the model can use it.
# "Preprocessing" here just means tokenization.
# map() works by applying a function to every element of the dataset, so that function
# has to be defined first.
# padding is left out on purpose: specifying it here would pad every sample to the maximum
# length, whereas padding per batch only needs to reach the longest sample of that batch,
# which saves space.
def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` entries of `example` as pairs, with truncation."""
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [None]:
# Apply tokenize_function to all splits, in batches.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets
# After processing, new fields are added to the dataset: input_ids, token_type_ids and attention_mask appear below

In [None]:
# Show the tokenized training split.
tokenized_datasets["train"]

In [None]:
# Show the first row (index 0) of the tokenized training split.
tokenized_datasets["train"][0]

In [None]:
# Finally, all examples must be padded to the length of the longest element in their batch: dynamic padding
# This needs a collate function that pads the sentences of each batch to the right length
# The transformers library provides such a function through DataCollatorWithPadding
# It takes a tokenizer on instantiation, to know which padding token to use and which side the model expects padding on
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Take the first 8 rows of the tokenized training split.
samples = tokenized_datasets["train"][:8]
print(samples.items())
# Drop the "idx", "sentence1" and "sentence2" entries before handing the samples to the collator.
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print(samples)
# Every sample has a different length, so dynamic padding means this batch should be padded to 67, the longest length in this batch
# That is dynamic padding; without it, samples would be padded to the longest length in the dataset or the longest length the model accepts
[len(x) for x in samples["input_ids"]]

In [None]:
# data_collator converts every element into tensors and pads them dynamically
batch = data_collator(samples)
print(batch)
# Shape of every tensor in the collated batch.
{k: v.shape for k, v in batch.items()}

In [None]:
# Fine-tuning a model: recap of the previous section
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the `sentence1`/`sentence2` entries of `example` as pairs, with truncation."""
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Tokenize every split in batches; padding is left to the collator.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments

# 包含在训练和评估中使用的所有超参数，这里都是默认值，传入的参数是保存文件的目录
# training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification

# bert模型没有在句子分类方面进行过预训练，但是我们又用它生成了一个有句子分类模型头的模型
# 因此，原先预训练的模型头被丢弃，添加了一个适合句子分类的模型头
# 输出的警告说明，有一些权重没有用，对应原先的被放弃的预训练的模型头的参数，有些权重被随机初始化，也就是新加入的模型头的权重，我们微调的对象
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
from transformers import Trainer

# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset = tokenized_datasets["train"],
#     eval_dataset = tokenized_datasets["validation"],
#     data_collator = data_collator,
#     tokenizer = tokenizer
# )

In [None]:
!nvidia-smi

In [None]:
# Run training with the current `trainer`.
trainer.train()

In [None]:
# Run the trainer's prediction on the tokenized validation split and print the result.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions)

In [None]:
# Shapes of the predictions array and of the label_ids array.
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np

# axis=-1 takes the argmax over the last dimension; for a 2-D array that means the maximum
# of each row, and the returned index is the predicted class.
# This assignment must be active: `preds` is printed in the next cell.
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
# So these are the classes the model predicted for each sample; the label_ids above are the true labels
print(preds)

In [None]:
!pip install evaluate

In [None]:
import evaluate

# One-off metric computation on `preds`, currently disabled:
# metric = evaluate.load("glue", "mrpc")
# metric.compute(predictions=preds, references=predictions.label_ids)

In [None]:
def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metric for an evaluation result.

    `eval_preds` is unpacked into `(logits, labels)`; the predicted class of each
    row is the argmax of the logits over the last axis.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    scores = glue_mrpc.compute(predictions=predicted_classes, references=labels)
    return scores

In [None]:
import transformers

# Show the installed transformers version.
transformers.__version__

In [None]:
# Print the TrainingArguments class object itself.
print(TrainingArguments)

In [None]:
# Same "test-trainer" directory argument as before, now with evaluation set to run per epoch.
training_args = TrainingArguments("test-trainer", eval_strategy="epoch")
# Fresh model with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Same trainer setup, plus compute_metrics so that evaluation reports the metric.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the current `trainer`.
trainer.train()