In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import evaluate

# S1: 加载数据

In [3]:
import pandas as pd

# Read the hotel-review CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="./ChnSentiCorp_htl_all.csv")
data.head(n=5)

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [4]:
# Select every row that has a missing value in at least one column.
na_rows = data.loc[data.isnull().any(axis="columns")]
na_rows

Unnamed: 0,label,review
6374,0,


In [5]:
# Report the row count before and after discarding rows with missing values.
print(data.shape[0])
data = data.dropna(axis=0, how="any")
print(data.shape[0])

7766
7765


# S2: 创建数据集

In [6]:
from torch.utils.data import Dataset, DataLoader


class HotelReviewDataset(Dataset):
    """Map-style dataset yielding ``(review, label)`` pairs read from a CSV file.

    The CSV must provide a ``review`` column and a ``label`` column. Rows with
    a missing value in any column are dropped when the dataset is built.
    """

    def __init__(self, csv_path: str = "./ChnSentiCorp_htl_all.csv") -> None:
        """Load the CSV at ``csv_path`` and drop incomplete rows.

        Args:
            csv_path: Location of the CSV file. Defaults to the corpus file
                this notebook uses, so ``HotelReviewDataset()`` keeps working.
        """
        super().__init__()
        self.data = pd.read_csv(csv_path).dropna()

    def __getitem__(self, index):
        """Return the ``(review, label)`` pair at positional ``index``."""
        # Look the row up once instead of once per column.
        row = self.data.iloc[index]
        return row["review"], row["label"]

    def __len__(self):
        """Return the number of rows left after dropping incomplete ones."""
        return len(self.data)


dataset = HotelReviewDataset()

# Show the first five (review, label) samples as a quick sanity check.
for sample_idx in range(5):
    print(dataset[sample_idx])

('距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.', 1)
('商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!', 1)
('早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。', 1)
('宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小，但加上低价位因素，还是无超所值的；环境不错，就在小胡同内，安静整洁，暖气好足-_-||。。。呵还有一大优势就是从宾馆出发，步行不到十分钟就可以到梅兰芳故居等等，京味小胡同，北海距离好近呢。总之，不错。推荐给节约消费的自助游朋友~比较划算，附近特色小吃很多~', 1)
('CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风', 1)


# S3: 数据集划分

In [7]:
from torch.utils.data import random_split

import torch

# Seed the split so the same 90% / 10% train/validation partition is produced
# on every run; without a generator the partition, and therefore the reported
# validation accuracy, changes each time the notebook is executed.
trainset, validset = random_split(
    dataset,
    lengths=[0.9, 0.1],
    generator=torch.Generator().manual_seed(42),
)

# S4: 创建DataLoader

In [8]:
import torch

# Tokenizer for the hfl/rbt3 checkpoint; collate_fn below reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")


def collate_fn(batch):
    """Turn a list of ``(text, label)`` samples into one tokenized batch.

    The texts are tokenized together, truncated and padded to exactly 128
    tokens, and returned as PyTorch tensors. The labels are added to the
    tokenizer output under the ``"labels"`` key as a ``torch.long`` tensor.
    """
    input_texts = [text for text, _ in batch]
    labels = [label for _, label in batch]

    encoded = tokenizer(
        input_texts,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    encoded["labels"] = torch.tensor(labels, dtype=torch.long)
    return encoded


# Settings common to both loaders; only the shuffling differs between them.
loader_kwargs = dict(batch_size=128, num_workers=4, collate_fn=collate_fn)

# Training batches are reshuffled each epoch; validation order stays fixed.
trainloader = DataLoader(trainset, shuffle=True, **loader_kwargs)
valloader = DataLoader(validset, shuffle=False, **loader_kwargs)

# S5: 创建模型与优化器

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the hfl/rbt3 checkpoint with a sequence-classification head and move it to the device.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3").to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# S6: 模型训练

In [10]:
import numpy as np

# Bind the `evaluate` package to an alias. The function defined below is also
# named `evaluate`, so once this cell has run the bare name refers to the
# function and `evaluate.load(...)` raises AttributeError, both inside the
# function and here whenever the cell is re-run.
import evaluate as hf_evaluate

accuracy = hf_evaluate.load("accuracy")


def evaluate():
    """Compute the model's accuracy over ``valloader``.

    The model is switched to eval mode for the pass and put back into
    training mode before returning.

    Returns:
        The ``"accuracy"`` entry of the metric's ``compute()`` result.
    """
    model.eval()
    # A fresh metric object per call, so no batches carry over between calls.
    eval_accuracy = hf_evaluate.load("accuracy")
    for batch in valloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.inference_mode():
            output = model(**batch)
            predicts = torch.argmax(output.logits, dim=-1)
        eval_accuracy.add_batch(predictions=predicts, references=batch["labels"])
    model.train()
    return eval_accuracy.compute()["accuracy"]


def train(epochs=10):
    """Fine-tune the global ``model`` on ``trainloader`` for ``epochs`` epochs.

    After each epoch the mean training loss and the validation accuracy
    returned by ``evaluate()`` are printed.

    Args:
        epochs: Number of full passes over ``trainloader``.
    """
    # Enter training mode explicitly. `from_pretrained` hands the model back in
    # eval mode, and the prediction cell calls `model.eval()`, so without this
    # the first epoch of any call would run with dropout disabled.
    model.train()
    for epoch in range(epochs):
        train_loss = []
        for batch in trainloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # The batch includes "labels", so the model output carries the loss.
            output = model(**batch)
            loss = output.loss
            train_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        val_acc = evaluate()
        print(
            f"epoch {epoch}: training loss: {np.array(train_loss).mean()}, val acc: {val_acc:.4f}"
        )

In [11]:
train(epochs=3)

epoch 0: training loss: 0.46828752403909507, val acc: 0.875
epoch 1: training loss: 0.2842704336751591, val acc: 0.8994845360824743
epoch 2: training loss: 0.23899261192841964, val acc: 0.9085051546391752


# S7: 模型预测

In [37]:
test_input = "我觉得这家酒店不错，房间很大"
# Maps the predicted class index to the label text that gets printed.
id2_label = {0: "差评", 1: "好评"}

model.eval()
with torch.inference_mode():
    # Tokenize the single sentence and move each tensor to the model's device.
    model_input = tokenizer(test_input, return_tensors="pt")
    model_input = {name: tensor.to(device) for name, tensor in model_input.items()}
    logits = model(**model_input).logits
    # The predicted class is the index of the largest logit.
    predict_id = logits.argmax(dim=-1)
    print(f"Input: {test_input}, CLS: {id2_label[predict_id.item()]}")

Input: 我觉得这家酒店不错，房间很大, CLS: 好评


In [38]:
from transformers.pipelines import pipeline

# Attach the class names so the pipeline output uses the labels from id2_label.
model.config.id2label = id2_label
# Choose the device the same way the rest of the notebook does: the first GPU
# when CUDA is available, otherwise the CPU (-1). A hard-coded `device=0`
# fails on a machine without a GPU.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)
pipe(test_input)

[{'label': '好评', 'score': 0.9584382176399231}]