In [157]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification, pipeline

In [158]:
import pandas as pd

# Load the corpus CSV into a DataFrame; later cells read its "review" and "label" columns.
data = pd.read_csv('ChnSentiCorp_htl_all.csv')

In [159]:
# Drop every row that has a missing value in any column. Rebinding `data` overwrites the
# raw frame, so the unfiltered rows are no longer reachable after this cell runs.
data = data.dropna()

In [160]:
from torch.utils.data import DataLoader, Dataset


class myDataset(Dataset):
    """Map-style dataset exposing DataFrame rows as ``(review, label)`` pairs."""

    def __init__(self, data):
        # Keep a reference to the frame; rows are looked up on demand.
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the review text and the integer label at position ``idx``."""
        record = self.data.iloc[idx]
        return record["review"], int(record["label"])

In [161]:
# Wrap `data` in the Dataset and check the type of one sample
# (__getitem__ returns a (review, label) tuple).
dataset = myDataset(data)
type(dataset[0])

tuple

In [162]:
from torch.utils.data import random_split

# 90/10 train/validation split. The generator is seeded so the partition is the
# same on every run; without it the split follows the global RNG state and the
# reported validation accuracy is not reproducible.
train_set, valid_set = random_split(
    dataset, [0.9, 0.1], generator=torch.Generator().manual_seed(42)
)

In [163]:
# Load the tokenizer from the local "./rbt3" directory; collate_fn reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("./rbt3")


def collate_fn(batch):
    """Turn a list of ``(text, label)`` samples into one tokenized model input.

    Every text is truncated/padded to exactly 128 tokens by the global
    ``tokenizer``; the labels are attached under the ``"labels"`` key.
    """
    reviews = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]
    encoded = tokenizer(
        reviews,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded["labels"] = torch.tensor(targets)
    return encoded

In [164]:
from torch.utils.data import DataLoader

# Training batches are shuffled; validation batches keep a fixed order.
train_loader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    valid_set,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)

In [165]:
from torch.optim import Adam

# Two-label sequence classifier loaded from the local "./rbt3" directory.
model = AutoModelForSequenceClassification.from_pretrained(
    "./rbt3",
    num_labels=2,
)

# Adam over all model parameters.
optimizer = Adam(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [166]:
def valid():
    """Compute classification accuracy of the global ``model`` on ``valid_loader``.

    The model is switched to eval mode (and left there; ``train`` re-enables
    train mode at the start of each epoch). Batches are moved to whatever
    device the model currently lives on, so this works both before and after
    the model has been moved to a GPU, and on CPU-only machines.

    Returns:
        The number of correct predictions divided by ``len(valid_set)``,
        as a 0-dim tensor.
    """
    model.eval()
    # Follow the model instead of assuming CUDA, to avoid a device mismatch.
    device = model.device
    acc_num = 0
    with torch.no_grad():
        for batch in valid_loader:
            batch = batch.to(device)
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            acc_num += (pred.long() == batch["labels"].long()).float().sum()
    return acc_num / len(valid_set)

def train(epochs=5, log_step=100):
    """Fine-tune the global ``model`` on ``train_loader`` and validate each epoch.

    Args:
        epochs: Number of full passes over ``train_loader``.
        log_step: Print the current batch loss every ``log_step`` optimizer steps.

    The model is moved to CUDA when a GPU is available and stays on CPU
    otherwise. After every epoch the accuracy returned by ``valid()`` is printed.
    """
    # Use the GPU when present, but do not crash on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    global_step = 0
    for epoch in range(epochs):
        # valid() leaves the model in eval mode, so re-enable train mode each epoch.
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            batch = batch.to(device)
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            global_step += 1
            if global_step % log_step == 0:
                print(f"epoch{epoch} global_step:{global_step} loss:{outputs.loss.item()}")
        acc = valid()
        print(f"epoch{epoch} acc:{acc}")

In [167]:
# Run fine-tuning with the defaults: 5 epochs, loss printed every 100 steps,
# validation accuracy printed after each epoch.
train()

epoch0 global_step:100 loss:0.31878790259361267
epoch0 global_step:200 loss:0.3004882335662842
epoch0 acc:0.8853092789649963
epoch1 global_step:300 loss:0.22689475119113922
epoch1 global_step:400 loss:0.16999724507331848
epoch1 acc:0.8840205669403076
epoch2 global_step:500 loss:0.13916510343551636
epoch2 global_step:600 loss:0.0849262923002243
epoch2 acc:0.8827319145202637
epoch3 global_step:700 loss:0.18778321146965027
epoch3 global_step:800 loss:0.053851135075092316
epoch3 acc:0.875
epoch4 global_step:900 loss:0.16670793294906616
epoch4 global_step:1000 loss:0.06257722526788712
epoch4 acc:0.8672680258750916


In [169]:
# Classify one sentence with the fine-tuned model.
sen = "我觉得这个酒店很差，饭很难吃"
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Send the inputs to the model's own device rather than assuming CUDA.
    inputs = inputs.to(model.device)
    logits = model(**inputs).logits
    # Index of the highest-scoring class.
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n预测结果:{pred.item()}")

输入：我觉得这个酒店很差，饭很难吃
预测结果:0


In [171]:
from transformers import pipeline
sen = "我觉得这个酒店很差，饭很难吃"
# Build the pipeline on the device the model already occupies instead of
# hard-coding GPU index 0, so the cell also runs when the model is on CPU.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)
print(pipe(sen))

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9903035759925842}]
