# 文本分类

In [7]:
from transformers import AutoModel,AutoTokenizer,AutoConfig,AutoModelForSequenceClassification

## 加载数据

In [22]:
import pandas as pd

# Load the raw train/test splits from CSV files in the working directory.
# NOTE(review): the later visible cells do not use these frames; MyDataset reads
# the same CSV paths again on its own — confirm whether this cell is still needed.
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")

In [23]:
# Drop rows that contain missing values. Both names are rebound in place, so the
# unfiltered frames are no longer available after this cell runs.
train_data = train_data.dropna()
test_data = test_data.dropna()


## 创建Dataset

In [25]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset that serves ``(text, label)`` pairs read from a CSV file.

    The CSV is expected to provide a ``text`` column and a ``label`` column.
    Rows containing missing values are discarded when the file is loaded.
    """

    def __init__(self, data):
        """Read the CSV located at ``data`` and drop rows with missing values.

        Args:
            data: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        """
        super().__init__()
        self.data = pd.read_csv(data).dropna()

    def __len__(self):
        """Return the number of rows left after dropping missing values."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair stored at positional ``index``."""
        # Positional lookup: dropna() leaves gaps in the index labels, so .iloc
        # (not .loc) is what keeps 0..len-1 valid.
        row = self.data.iloc[index]
        return row["text"], row["label"]


In [26]:
# Build the train/test datasets directly from the CSV paths (MyDataset reads and
# cleans the files itself), then display the first (text, label) sample as a check.
train_dataset = MyDataset("./train.csv")
test_dataset = MyDataset("./test.csv")
train_dataset[0]

('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
 np.int64(1))

## 创建DataLoader

In [37]:
import torch

# Tokenizer loaded from the "hfl/rbt3" checkpoint name; collate_fn uses it to
# encode every batch of raw texts.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def collate_fn(batch):
    """Turn a list of ``(text, label)`` samples into one tokenized model input.

    Args:
        batch: Sequence of ``(text, label)`` pairs as produced by the dataset.

    Returns:
        The tokenizer output for the batch's texts (PyTorch tensors, every
        sequence padded or truncated to exactly 128 tokens) with an extra
        ``"labels"`` entry holding the labels as a tensor.
    """
    texts = [text for text, _ in batch]
    labels = [label for _, label in batch]
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    encoded["labels"] = torch.tensor(labels)
    return encoded

In [38]:
# Training batches of 32 samples, reshuffled on every pass and tokenized by collate_fn.
trainloader = DataLoader(train_dataset,batch_size=32,shuffle=True,collate_fn=collate_fn)

In [39]:
# Evaluation batches of 32 samples tokenized by collate_fn. Shuffling is disabled:
# accuracy does not depend on sample order, and a fixed order keeps evaluation
# runs deterministic and easier to debug.
testloader = DataLoader(test_dataset,batch_size=32,shuffle=False,collate_fn=collate_fn)

In [40]:
trainloader

<torch.utils.data.dataloader.DataLoader at 0x1f70f465040>

In [41]:
next(iter(trainloader))

{'input_ids': tensor([[ 101,  855, 5390,  ..., 1469,  855,  102],
        [ 101, 2595, 5543,  ...,    0,    0,    0],
        [ 101, 4294,  817,  ...,    0,    0,    0],
        ...,
        [ 101, 2419, 6956,  ...,    0,    0,    0],
        [ 101, 7360,  817,  ...,    0,    0,    0],
        [ 101, 3766, 3300,  ..., 3209, 3209,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 1, 0, 0, 1])}

## 创建模型及优化器

In [50]:
from torch.optim import Adam

# Load the "hfl/rbt3" checkpoint with a sequence-classification head. The
# classifier weights are not part of the checkpoint and are newly initialized
# (see the warning printed below), so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Move the model to the GPU when one is available; otherwise it stays where it was loaded.
if torch.cuda.is_available():
    model = model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(),lr=2e-5)

## 训练与测试

In [52]:
def evaluate():
    """Compute classification accuracy of the module-level ``model`` on ``testloader``.

    The model is put into eval mode and gradients are disabled for the whole
    pass. Batches are moved to the GPU when CUDA is available.

    Returns:
        The number of correct predictions divided by ``len(test_dataset)``.
        Because the running count is accumulated from tensor sums, the result
        is a 0-dim tensor whenever at least one batch was processed.
    """
    model.eval()
    use_cuda = torch.cuda.is_available()
    correct = 0
    with torch.no_grad():
        for batch in testloader:
            if use_cuda:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit along the last axis.
            predictions = logits.argmax(-1)
            hits = (predictions == batch["labels"].long()).float()
            correct += hits.sum()
    return correct / len(test_dataset)



def train(epochs=3, log_step=10):
    """Fine-tune the module-level ``model`` on ``trainloader`` and report accuracy.

    Each epoch runs one full pass over ``trainloader`` using the module-level
    ``optimizer``, then calls ``evaluate()`` and prints the resulting accuracy.

    Args:
        epochs: Number of passes over the training data.
        log_step: Print the current batch loss every ``log_step`` steps
            (counted within the epoch, starting at step 0).
    """
    # The device check does not change between batches, so do it once.
    use_cuda = torch.cuda.is_available()
    for epoch in range(epochs):
        # evaluate() leaves the model in eval mode, so switch back every epoch.
        model.train()
        for step, inputs in enumerate(trainloader):
            if use_cuda:
                inputs = {k: v.cuda() for k, v in inputs.items()}
            optimizer.zero_grad()
            # The batch contains "labels", so the model output carries the loss.
            outputs = model(**inputs)
            outputs.loss.backward()
            optimizer.step()
            if step % log_step == 0:
                print(f"epoch:{epoch+1}/{epochs} step:{step}/{len(trainloader)} loss:{outputs.loss.item()}")
        acc = evaluate()
        print(f"epoch:{epoch+1}/{epochs} acc:{acc}")


## 训练

In [53]:
# Run fine-tuning with the default settings (3 epochs, loss logged every 10 steps).
train()

epoch:1/3 step:0/300 loss:0.7386910915374756
epoch:1/3 step:10/300 loss:0.6403576135635376
epoch:1/3 step:20/300 loss:0.5800982713699341
epoch:1/3 step:30/300 loss:0.5473603010177612
epoch:1/3 step:40/300 loss:0.48898643255233765
epoch:1/3 step:50/300 loss:0.30891624093055725
epoch:1/3 step:60/300 loss:0.5130375623703003
epoch:1/3 step:70/300 loss:0.40818893909454346
epoch:1/3 step:80/300 loss:0.48515215516090393
epoch:1/3 step:90/300 loss:0.39430752396583557
epoch:1/3 step:100/300 loss:0.31969308853149414
epoch:1/3 step:110/300 loss:0.2986724376678467
epoch:1/3 step:120/300 loss:0.24974414706230164
epoch:1/3 step:130/300 loss:0.19234725832939148
epoch:1/3 step:140/300 loss:0.35180678963661194
epoch:1/3 step:150/300 loss:0.23677435517311096
epoch:1/3 step:160/300 loss:0.49373844265937805
epoch:1/3 step:170/300 loss:0.26226794719696045
epoch:1/3 step:180/300 loss:0.3923727869987488
epoch:1/3 step:190/300 loss:0.3655291795730591
epoch:1/3 step:200/300 loss:0.3823994994163513
epoch:1/3 st