In [8]:
from pandas import DataFrame
from MyModule import load_data, load_vocab, remap_labels

train_data = remap_labels(load_data("train_raw.txt"))
val_data = remap_labels(load_data("val_raw.txt"))
test_data = remap_labels(load_data("test_raw.txt"))

In [9]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-chinese")

# 示例
encoded = tokenizer(
    "生活很美好，但也有点疲惫。",
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='pt'
)

print(encoded['input_ids'].shape)  # torch.Size([1, 128])


torch.Size([1, 128])


In [14]:
import torch
from torch.utils.data import Dataset, DataLoader

class WeiboDataset(Dataset):
    def __init__(self, data: DataFrame, tokenizer, max_len=128):
        self.texts = data[0]
        self.labels = data[1]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], 
            truncation=True, 
            padding='max_length', 
            max_length=self.max_len, 
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0), 
            'attention_mask': encoding["attention_mask"].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


max_len = 64
train_dataset = WeiboDataset(train_data, tokenizer, max_len)
val_dataset = WeiboDataset(val_data, tokenizer, max_len)
test_dataset = WeiboDataset(test_data, tokenizer, max_len)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [15]:
from transformers import BertModel
import torch.nn as nn

class BertForWeiboSentiment(nn.Module):
    def __init__(self, model_name, num_class=3) -> None:
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_class)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        out = self.fc(self.dropout(cls_output))
        return out

In [17]:
from tqdm.auto import tqdm

# from torch.cuda.amp import autocast, GradScaler
from torch.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForWeiboSentiment("./models/chinese-macbert-base").to(device)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-3)
num_epochs = 5

total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, int(0.1*total_steps), total_steps)

scaler = GradScaler('cuda')
accum_steps = 2

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch+1}")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)


        with autocast('cuda'):
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels) / accum_steps

        scaler.scale(loss).backward()

        if (step + 1) % accum_steps == 0 or (step + 1) == len(train_loader):
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() * accum_steps

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        with autocast('cuda'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Training Epoch 1: 100%|██████████| 7844/7844 [13:43<00:00,  9.53it/s]


Epoch 1/5, Loss: 0.6315


Training Epoch 2: 100%|██████████| 7844/7844 [13:33<00:00,  9.64it/s]


Epoch 2/5, Loss: 0.4672


Training Epoch 3: 100%|██████████| 7844/7844 [13:11<00:00,  9.91it/s]


Epoch 3/5, Loss: 0.4214


Training Epoch 4: 100%|██████████| 7844/7844 [13:14<00:00,  9.87it/s]


Epoch 4/5, Loss: 0.3874


Training Epoch 5: 100%|██████████| 7844/7844 [13:13<00:00,  9.89it/s]


Epoch 5/5, Loss: 0.3613


Evaluating: 100%|██████████| 1681/1681 [00:57<00:00, 29.43it/s]

Test Accuracy: 0.8262



