# Hugging Face Transformers 微调语言模型-文本分类任务

## 数据集下载

In [1]:
from datasets import load_dataset

# Load the "yelp_review_full" dataset; later cells read its "train" and
# "test" splits and the "text" column.
dataset = load_dataset("yelp_review_full")

  from .autonotebook import tqdm as notebook_tqdm


### 查看数据集

In [2]:
import random
import pandas as pd
import datasets
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random sample of distinct rows from ``dataset`` as an HTML table.

    Args:
        dataset: Object supporting ``len()``, indexing with a list of row
            indices, and a ``features`` mapping of column name to feature
            type (for example ``dataset["train"]``).
        num_examples: Number of distinct rows to display; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample returns distinct indices in one call, so no retry loop or
    # repeated membership scan is needed.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            # Show human-readable class names instead of integer label ids.
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [3]:
# show_random_elements(dataset["train"])

## 数据预处理

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``, i.e. padding and truncation are both requested.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [5]:
# Tokenize the dataset in batches (batched=True); the result keeps the
# "train" and "test" splits that later cells index.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# show_random_elements(tokenized_datasets["train"], num_examples=1)

### 数据抽样

In [6]:
# Use the full training split (650,000 examples) and the 50,000 test examples.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]
# Alternative: 5,000-example shuffled subsets (seed=42), currently disabled.
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(5000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(5000))

## 训练配置

### 原始模型加载

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained "bert-base-cased" checkpoint for sequence classification
# with num_labels=5. The recorded output of this cell reports that
# classifier.bias / classifier.weight are newly initialized, so the model must
# be trained before it is used for predictions.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 训练超参数

In [8]:
from transformers import TrainingArguments

# Used as the Trainer output_dir and, later, as the save_model() target.
model_dir = "models/bert-base-cased-finetune-yelp"

# logging_steps defaults to 500; it is lowered to 100 here to suit our
# training data size and step count.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir=model_dir,
    evaluation_strategy="epoch",  # monitor metrics once per epoch
    per_device_train_batch_size=64,
    num_train_epochs=3,
    logging_steps=100,
)



### 指标评估

In [9]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute prediction accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions
        against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = np.asarray(raw_scores).argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

## 模型训练

In [10]:
from transformers import  Trainer

# Wire the model, training arguments, train/eval datasets and the accuracy
# callback into a Trainer.
# NOTE(review): no tokenizer is passed here, so presumably save_model() will
# not write tokenizer files next to the model weights -- confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    eval_dataset=full_eval_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
# Run fine-tuning. The recorded run below reports train_runtime of about
# 20,745 s for 3 epochs (global_step=30471).
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7061,0.711628,0.68536
2,0.6215,0.699841,0.69666
3,0.5429,0.728897,0.6964


TrainOutput(global_step=30471, training_loss=0.6536481841713516, metrics={'train_runtime': 20745.3935, 'train_samples_per_second': 93.997, 'train_steps_per_second': 1.469, 'total_flos': 5.130803778048e+17, 'train_loss': 0.6536481841713516, 'epoch': 3.0})

### 训练时的GPU状态

* CPU：12 核
* MEM：29 GB
* GPU：GeForce RTX 4090 × 1（显存 24 GB）

> 注：下方快照中 GPU-Util 为 0%、性能状态为 P8（28W / 450W），说明采集时 GPU 处于空闲状态；训练进程的显存占用（约 21 GB）仍可作为参考。
  
```shell
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:00:08.0 Off |                  Off |
|  0%   32C    P8              28W / 450W |  21698MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A      9432      G   /usr/lib/xorg/Xorg                            4MiB |
|    0   N/A  N/A     13320      C   ...conda3/envs/transformers/bin/python    21678MiB |
+---------------------------------------------------------------------------------------+
```

## 模型评估

In [12]:
# Shuffle the test split with a fixed seed (64) and keep the first 10,000
# examples as the evaluation sample.
small_test_dataset = tokenized_datasets["test"].shuffle(seed=64).select(range(10000))

In [13]:
# Evaluate on the sampled test subset; the recorded output reports eval_loss
# and eval_accuracy.
trainer.evaluate(small_test_dataset)

{'eval_loss': 0.7242669463157654,
 'eval_accuracy': 0.7002,
 'eval_runtime': 43.3769,
 'eval_samples_per_second': 230.537,
 'eval_steps_per_second': 28.817,
 'epoch': 3.0}

## 模型保存

In [14]:
# Save the fine-tuned model to model_dir.
trainer.save_model(model_dir)

In [15]:
# Persist the trainer state.
# NOTE(review): presumably written under training_args.output_dir -- confirm.
trainer.save_state()