In [1]:
# 文本分类模型微调的示例
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# 加载数据集
from datasets import load_dataset


In [3]:
def _has_review_and_label(example):
    """Keep only rows where both the review text and the label are present."""
    return not (example["review"] is None or example["label"] is None)


# Read the CSV as a single "train" split and drop incomplete rows in one chain.
dataset = load_dataset(
    "csv",
    data_files="dataset/dataset_furina.csv",
    split="train",
).filter(_has_review_and_label)
print(dataset)

Dataset({
    features: ['label', 'review'],
    num_rows: 734
})


In [4]:
# Split the data: hold out 10% of the rows as the validation ("test") split.
# A fixed seed keeps the split identical across kernel restarts, so accuracy
# numbers from different runs are measured on the same held-out rows.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 660
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 74
    })
})

In [6]:
import torch

# Checkpoint identifier handed to from_pretrained; the same string is used for the model.
# NOTE(review): presumably a local directory holding the pretrained weights - confirm.
tokenizer_checkpoint = "hflrbt3"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def process_function(examples):
    """
    Tokenize a batch of reviews and carry their labels along.

    Args:
        examples: batch mapping with a "review" entry (the texts) and a "label" entry.

    Returns:
        The tokenizer output for the reviews with an extra "label" entry.
    """
    # No padding here: sequences are only truncated to 128 tokens, and padding
    # is left to the moment batches are assembled.
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["label"] = examples["label"]
    return encoded

# Convert both splits into the tokenizer-encoded format the model consumes.
# The original columns are removed; process_function puts "label" back itself.
original_columns = datasets["train"].column_names
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=original_columns,
)
tokenized_datasets


Map:   0%|          | 0/660 [00:00<?, ? examples/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 660
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 74
    })
})

In [7]:
# 建立DataLoader
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
trainset, validset = tokenized_datasets["train"], tokenized_datasets["test"]

# One collator serves both loaders; it pads the examples when a batch is assembled.
batch_collator = DataCollatorWithPadding(tokenizer)

# Training batches of 32 are shuffled; validation batches keep their order.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=batch_collator)
validloader = DataLoader(validset, batch_size=32, shuffle=False, collate_fn=batch_collator)


In [8]:
# Peek at the first collated training batch (nothing is printed if the loader is empty).
first_batch = next(iter(trainloader), None)
if first_batch is not None:
    print(first_batch)


{'input_ids': tensor([[  101,  6763,  6763,  ...,     0,     0,     0],
        [  101,   113,  4500,  ...,     0,     0,     0],
        [  101,  1403,   519,  ...,     0,     0,     0],
        ...,
        [  101,  4500, 12619,  ...,     0,     0,     0],
        [  101,   872,   791,  ...,     0,     0,     0],
        [  101,  1146,  3358,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 1, 1, 1, 0, 0, 1])}


In [9]:
from torch.optim import AdamW
# Load the pretrained checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("hflrbt3")
# Sketch left by the author for switching to a three-class head:
#   model.classifier = torch.nn.Linear(768, 3)
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(target_device)
# Optimizer: fine-tuning from pretrained weights generally uses a small learning rate; here 2e-6.
learning_rate = 2e-6
optimizer = AdamW(model.parameters(), lr=learning_rate)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hflrbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def evaluate():
    """
    Measure the accuracy of the global ``model`` on the validation set.

    Iterates over the global ``validloader`` with gradients disabled, counts the
    argmax predictions that equal each batch's ``labels`` and divides the total
    by ``len(validset)``.

    Returns:
        float: fraction of validation examples classified correctly.

    Note:
        The model is switched to eval mode and left that way.
    """
    model.eval()
    # Follow the device the model actually lives on instead of assuming CUDA
    # whenever it is available; a CPU model on a GPU machine would otherwise
    # fail with a device mismatch.
    device = next(model.parameters()).device
    correct = 0
    with torch.no_grad():
        for batch in validloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            pred = outputs.logits.argmax(dim=-1)  # predicted class per example
            correct += (pred == batch["labels"].long()).sum().item()
    return correct / len(validset)


def train(epoch=15, log_step=100):
    """
    Fine-tune the global ``model`` on ``trainloader`` using the global ``optimizer``.

    After every epoch the validation accuracy from ``evaluate()`` is printed.

    Args:
        epoch (int, optional): number of passes over the training set. Defaults to 15.
        log_step (int, optional): print the loss every ``log_step`` optimizer steps.
            Defaults to 100.
    """
    # Follow the device the model actually lives on instead of assuming CUDA
    # whenever it is available, so model and batches can never disagree.
    device = next(model.parameters()).device
    global_step = 0
    for ep in range(epoch):
        # Switch back to training mode each epoch; evaluate() leaves the model in eval mode.
        model.train()
        for batch in trainloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            global_step += 1
            if global_step % log_step == 0:
                print(f"epoch={ep}, global_step={global_step}, loss={outputs.loss.item()}")
        # Evaluate once at the end of every epoch.
        acc = evaluate()
        print(f"epoch={ep}, acc={acc}")

In [12]:
# Record the accuracy of the untrained classification head as a baseline, then fine-tune.
baseline_acc = evaluate()
print(f'before train {baseline_acc}')
train()

before train 0.5945945945945946
epoch=0, acc=0.6891891891891891
epoch=1, acc=0.7567567567567568
epoch=2, acc=0.8243243243243243
epoch=3, acc=0.8513513513513513
epoch=4, global_step=100, loss=0.5408715605735779
epoch=4, acc=0.8783783783783784
epoch=5, acc=0.918918918918919
epoch=6, acc=0.9459459459459459
epoch=7, acc=0.9594594594594594
epoch=8, acc=0.9594594594594594
epoch=9, global_step=200, loss=0.28677430748939514
epoch=9, acc=0.9594594594594594
epoch=10, acc=0.972972972972973
epoch=11, acc=0.972972972972973
epoch=12, acc=0.972972972972973
epoch=13, acc=0.9864864864864865
epoch=14, global_step=300, loss=0.16718952357769012
epoch=14, acc=0.9864864864864865


In [13]:
sen = "你今天怎么样"
# Class index -> readable name; later cells reuse this mapping.
id2label = {0: "no", 1: "yes"}
on_gpu = torch.cuda.is_available()
with torch.inference_mode():
    encoded = tokenizer(sen, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    # Move every tensor to the GPU only when one is available.
    encoded = {name: (tensor.cuda() if on_gpu else tensor) for name, tensor in encoded.items()}
    logits = model(**encoded).logits
    predicted_id = logits.argmax(dim=-1).item()
    print(id2label[predicted_id])

no


In [14]:
# Probe sentences classified in one pipeline call.
sen = [
    "服从命令, 芙宁娜女士",
    "你今天怎么样",
    "我要去上班了",
    "给我今天的新闻",
    "你是谁",
    "看一下最近的股票行情",
    "给我讲解一下这一个项目的情况",
    "项目的进度怎么样了",
]
from transformers import pipeline
# Attach the readable names so the pipeline output shows "no"/"yes".
model.config.id2label = id2label
# Device index 0 selects the first GPU; -1 keeps the pipeline on the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=pipeline_device)
pipe(sen)

Device set to use cuda:0


[{'label': 'no', 'score': 0.8589010238647461},
 {'label': 'no', 'score': 0.9369204044342041},
 {'label': 'no', 'score': 0.8544386625289917},
 {'label': 'yes', 'score': 0.7509503364562988},
 {'label': 'no', 'score': 0.8854764699935913},
 {'label': 'yes', 'score': 0.9277355670928955},
 {'label': 'yes', 'score': 0.9519011378288269},
 {'label': 'yes', 'score': 0.8795883655548096}]

In [15]:
#获取时间生成名字
import time
import os
def get_time():
    """Return the current local time formatted as ``YYYY_MM_DD_HH_MM_SS``."""
    now = time.localtime()
    stamp = time.strftime("%Y_%m_%d_%H_%M_%S", now)
    return stamp
# Timestamped checkpoint path, so every run writes to a new file.
model_name = f"model_furina/model_{get_time()}.pth"
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(model_name), exist_ok=True)
# Saves the whole model object (pickled), not just its state_dict.
torch.save(model, model_name)


In [16]:
# Load the model back from the checkpoint written above.
# The file holds a fully pickled model object rather than a plain state_dict, so
# weights_only=False must be passed explicitly: newer PyTorch releases default to
# weights_only=True and refuse to unpickle arbitrary classes.
# SECURITY: unpickling can execute arbitrary code - only load checkpoints you created yourself.
model = torch.load(model_name, weights_only=False)
model.eval()
model_name

  model = torch.load(model_name)


'model_furina/model_2025_02_05_12_07_44.pth'

In [17]:
# Run the probe sentences again, this time through the model reloaded from disk.
sen = [
    "服从命令, 芙宁娜女士",
    "你今天怎么样",
    "我要去上班了",
    "给我今天的新闻",
    "你是谁",
    "看一下最近的股票行情",
    "给我讲解一下这一个项目的情况",
    "项目的情况怎么样了",
]
from transformers import pipeline
# Re-attach the readable label names to the reloaded model's config.
model.config.id2label = id2label

# Device index 0 selects the first GPU; -1 keeps the pipeline on the CPU.
reloaded_device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=reloaded_device)
pipe(sen)

Device set to use cuda:0


[{'label': 'no', 'score': 0.8589010238647461},
 {'label': 'no', 'score': 0.9369204044342041},
 {'label': 'no', 'score': 0.8544386625289917},
 {'label': 'yes', 'score': 0.7509503364562988},
 {'label': 'no', 'score': 0.8854764699935913},
 {'label': 'yes', 'score': 0.9277355670928955},
 {'label': 'yes', 'score': 0.9519011378288269},
 {'label': 'yes', 'score': 0.9043154120445251}]