# Hugging Face Transformers 微调语言模型-文本分类任务

## 数据集下载

In [1]:
from datasets import load_dataset

# Load the "yelp_review_full" dataset via datasets.load_dataset.
dataset = load_dataset("yelp_review_full")

### 查看数据集

In [5]:
import random
import pandas as pd
import datasets
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display ``num_examples`` distinct, randomly chosen rows of ``dataset``.

    The selected rows are loaded into a pandas DataFrame and rendered as an
    HTML table. Columns whose feature type is ``datasets.ClassLabel`` are
    shown with their label names instead of integer class ids.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature
            type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If ``num_examples`` exceeds ``len(dataset)``.
        ValueError: If ``num_examples`` is negative (raised by
            ``random.sample``).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the indices are unique
    # without a retry loop or repeated list-membership checks.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Replace integer class ids with their human-readable names.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [None]:
# Preview random rows (default num_examples=10) from the "train" split.
show_random_elements(dataset["train"])

## 数据预处理

In [7]:
from transformers import AutoTokenizer

# Tokenizer loaded from the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples``.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``, i.e. inputs are padded and truncated.

    Args:
        examples: Mapping that provides the raw text under the ``"text"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

In [None]:
# Run tokenize_function over the loaded dataset; batched=True is passed to map().
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Preview a single random tokenized example from the "train" split.
show_random_elements(tokenized_datasets["train"], num_examples=1)

### 数据抽样

In [9]:
# Train on the full training split (650,000 examples per the original note)
# and evaluate on 5,000 examples taken from the shuffled test split.
full_train_dataset = tokenized_datasets["train"]
small_eval_dataset = (
    tokenized_datasets["test"]
    .shuffle(seed=42)  # fixed seed keeps the evaluation subset reproducible
    .select(range(5000))
)

## 训练配置

### 原始模型加载

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the "bert-base-cased" checkpoint for sequence classification with 5 labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

### 训练超参数

In [11]:
from transformers import TrainingArguments

# Output directory for the fine-tuned model; also passed to save_model() later.
model_dir = "models/bert-base-cased-finetune-yelp"

# logging_steps defaults to 500; it is lowered to 30 here so that training
# metrics are logged more frequently.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",  # evaluate once per epoch to monitor metrics
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=30,
)

### 指标评估

In [13]:
import numpy as np
import evaluate

# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute the accuracy of the predictions in ``eval_pred``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions
        against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )

## 模型训练

In [None]:
from transformers import  Trainer

# Wire the model, training arguments, datasets and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,  # full training split
    eval_dataset=small_eval_dataset,  # 5,000-example subset of the test split
    compute_metrics=compute_metrics,  # accuracy over arg-max predictions
)

In [None]:
# Start fine-tuning with the configuration given to the Trainer.
trainer.train()

## 模型评估

In [18]:
# Take 100 examples from the test split, shuffled with seed=64.
# NOTE(review): drawn from the same test split as small_eval_dataset
# (seed=42, 5,000 examples), so the two subsets may overlap.
small_test_dataset = tokenized_datasets["test"].shuffle(seed=64).select(range(100))

In [None]:
# Evaluate the trainer's model on the 100-example test subset.
trainer.evaluate(small_test_dataset)

## 模型保存

In [20]:
# Save the model to model_dir (the same path used as the Trainer's output_dir).
trainer.save_model(model_dir)

In [21]:
# Persist the Trainer's state via save_state().
trainer.save_state()