In [1]:
# 选择根目录
%cd /home/zjdou/zpwang/InsultingLanguageDetection/

import torch

dev = torch.device('cuda:1')  # 运算设备，cpu 或 cuda
model_name = 'bert-base-uncased'

/home/zjdou/zpwang/InsultingLanguageDetection


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


class MyDataset(Dataset):
    def __init__(self, data) -> None:
        super().__init__()
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]


sample_data = [
    ['i am happy', 1],
    ['i am glad', 1],
    ['i am sad', 0],
    ['i hate it', 0],
] * 16
train_data, dev_data = train_test_split(sample_data, train_size=0.75, shuffle=True)
train_dataloader = DataLoader(MyDataset(train_data), batch_size=4, shuffle=True)
dev_dataloader = DataLoader(MyDataset(dev_data), batch_size=4, shuffle=False)
print('one sample batch:', iter(train_dataloader).__next__())

one sample batch: [('i hate it', 'i am happy', 'i am sad', 'i am glad'), tensor([0, 1, 0, 1])]


In [3]:
# Model, Tokenizer

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    cache_dir='./pretrained_model/',
    num_labels=2,  # 分类的类别数量
)
model.to(dev)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='./pretrained_model/')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
# Loss, Optimizer

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()  # 交叉熵损失
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [5]:
# Dev 开发，不同于测试（Test），使用部分训练集调整超参


import torchmetrics


def val():
    model.eval()
    with torch.no_grad():
        metric = torchmetrics.Accuracy('binary').to(dev)
        for x, y in dev_dataloader:
            x = tokenizer(x, padding=True, truncation=True, return_tensors='pt')
            x = x.to(dev)
            y = y.to(dev)
            output = model(**x)
            output = torch.argmax(output['logits'], dim=1)
            metric(output, y)
    print('Acc', metric.compute())

In [6]:
# Train

epoch = 3  # 训练轮次，不同于batch，全部训练集训练一遍为一个epoch
for p in range(1, epoch+1):
    model.train()
    total_loss = 0
    for x, y in train_dataloader:
        x = tokenizer(x, padding=True, truncation=True, return_tensors='pt')
        x = x.to(dev)
        y = y.to(dev)
        output = model(**x)
        # print(output)
        loss = criterion(output['logits'], y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss
    print(f'epoch {p}, total loss {total_loss:.3f}')
    val()

epoch 1, total loss 8.987
Acc tensor(0.4375, device='cuda:1')
epoch 2, total loss 10.084
Acc tensor(0.4375, device='cuda:1')
epoch 3, total loss 10.442
Acc tensor(0.4375, device='cuda:1')


In [7]:
# Test

test_data = [
    ['i am angry', 0],
    ['i dislike it', 0],
    ['i am hopeful', 1],
    ['i\'m glad to see you', 1]
]
test_dataloader = DataLoader(MyDataset(test_data), batch_size=4)

model.eval()
with torch.no_grad():
    metric = torchmetrics.Accuracy('binary').to(dev)
    for x, y in test_dataloader:
        x = tokenizer(x, padding=True, truncation=True, return_tensors='pt')
        x = x.to(dev)
        y = y.to(dev)
        output = model(**x)
        output = torch.argmax(output['logits'], dim=1)
        metric(output, y)
    print('Acc', metric.compute())


Acc tensor(0.5000, device='cuda:1')


写于 2023.6.6