#### 数据预处理

下载分词器

In [2]:
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
import os
import random
import torch.utils
import torch.utils.data

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Use BERT's tokenizer and pretrained encoder; both come from the same checkpoint.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert = BertModel.from_pretrained(_BERT_CHECKPOINT).to(device)
# Switch to evaluation mode (the module repr is displayed as the cell output).
bert.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

读取数据，预处理数据

In [4]:
# Read the two review files line by line, stripping surrounding whitespace
# from every line. The .neg file feeds data_neg and the .pos file data_pos.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches the encoding of the data files.
with open("data/rt-polarity.neg") as neg_file:
    data_neg = [sentence.strip() for sentence in neg_file]

with open("data/rt-polarity.pos") as pos_file:
    data_pos = [sentence.strip() for sentence in pos_file]

创建 Dataset

In [5]:
class SentimentClass(torch.utils.data.Dataset):
    """Dataset that pairs each sample with the label at the same position.

    Args:
        data: Indexable collection of samples.
        label: Indexable collection of labels, aligned with ``data`` by index.
    """

    def __init__(self, data, label) -> None:
        super().__init__()
        self.data, self.label = data, label

    def __len__(self) -> int:
        # The size comes from the samples alone; labels are not checked here.
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, index) -> tuple:
        """Return ``(sample, label)`` for the given index."""
        sample = self.data[index]
        target = self.label[index]
        return sample, target

# Concatenate positive then negative samples; positives get label 1, negatives 0.
all_data = [*data_pos, *data_neg]
all_labels = [1 for _ in data_pos] + [0 for _ in data_neg]

# Shuffle samples and labels as pairs so every text keeps its own label,
# then unzip back into two parallel tuples.
combined = list(zip(all_data, all_labels))
random.shuffle(combined)
all_data, all_labels = zip(*combined)

# Split into training and test sets: the first 80% train, the remainder test.
split_idx = int(0.8 * len(all_data))
train_data = list(all_data[:split_idx])
train_labels = list(all_labels[:split_idx])
test_data = list(all_data[split_idx:])
test_labels = list(all_labels[split_idx:])

# Wrap both partitions in Dataset objects.
train_dataset = SentimentClass(train_data, train_labels)
test_dataset = SentimentClass(test_data, test_labels)

print(f"训练集样本数: {len(train_dataset)}")
print(f"测试集样本数: {len(test_dataset)}")

训练集样本数: 8528
测试集样本数: 2133


创建dataloader

In [6]:
# collate_fn: batch-encode raw texts into BERT token-id tensors.
def collate_fn(batch):
    """Convert a list of ``(text, label)`` pairs into tensors for BERT.

    Uses the module-level ``tokenizer``, which must be defined before the
    first batch is collated. Texts are tokenized with truncation enabled,
    ``max_length=128`` and padding enabled, and returned as PyTorch tensors.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)``.
    """
    texts, labels = zip(*batch)
    encoded = tokenizer(
        list(texts),
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors='pt',
    )
    label_tensor = torch.tensor(labels)
    return encoded['input_ids'], encoded['attention_mask'], label_tensor

In [7]:
# Build the DataLoaders; collate_fn tokenizes each batch as it is drawn.
# NOTE(review): the test loader is shuffled as well; accuracy does not depend
# on batch order, so shuffle=False would probably do — confirm before changing.
_loader_options = dict(batch_size=64, shuffle=True, collate_fn=collate_fn)
train_dataloader = torch.utils.data.DataLoader(train_dataset, **_loader_options)
test_dataloader = torch.utils.data.DataLoader(test_dataset, **_loader_options)

#### 创建 CNN 模型

In [8]:

# Report the current value of `device`.
print(f"Using {device} device")

Using cuda device


In [9]:
class CNNModel(nn.Module):
    """1-D convolutional classifier over a sequence of embedding vectors.

    A stack of ``Conv1d`` + ``ReLU`` layers runs along the sequence axis, the
    result is max-pooled over the whole sequence, passed through dropout and
    projected to ``num_classes`` logits.

    Args:
        embed_dim: Size of each input embedding vector (input channels).
        num_classes: Number of output logits.
        conv_layers: Number of ``Conv1d`` + ``ReLU`` pairs. With 0 the input
            embeddings are pooled and classified directly.
        dropout: Dropout probability applied to the pooled features.
        kernel_size: Width of every convolution kernel.
        hidden_channels: Output channels of every convolution layer.
    """

    def __init__(self, embed_dim=768, num_classes=2, conv_layers=2, dropout=0.2, kernel_size=3, hidden_channels=128):
        super().__init__()
        layers = []
        in_channels = embed_dim
        for _ in range(conv_layers):
            # padding=kernel_size // 2 keeps the sequence length unchanged
            # for odd kernel sizes (stride 1).
            layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
            layers.append(nn.ReLU())
            in_channels = hidden_channels
        self.conv = nn.Sequential(*layers)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(dropout)
        # Size the classifier from the channel count that actually reaches it:
        # hidden_channels after at least one conv layer, embed_dim when
        # conv_layers is 0 (previously a shape mismatch in forward()).
        self.fc = nn.Linear(in_channels, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape (batch, seq_len, embed_dim).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Conv1d expects channels before the sequence axis.
        x = x.transpose(1, 2)  # (batch, embed_dim, seq_len)
        x = self.conv(x)
        # Max over the sequence axis, then drop the now size-1 axis.
        x = self.pool(x).squeeze(-1)  # (batch, channels)
        x = self.dropout(x)
        x = self.fc(x)
        return x

#### 创建优化器

In [None]:
# Create the model, the loss and the optimizer.
# NOTE(review): lr=2e-5 is small for a classifier trained from scratch —
# confirm the value is intended for this CNN rather than for fine-tuning BERT.
model = CNNModel()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

#### 创建训练代码

In [11]:
# Training script: BERT is only run under no_grad to produce embeddings;
# the optimizer updates the CNN model.
num_epochs = 3


def _bert_features(token_ids, mask):
    """Return BERT's last hidden state (batch, seq_len, embed_dim) without gradients."""
    with torch.no_grad():
        return bert(token_ids, attention_mask=mask).last_hidden_state


def _train_one_epoch():
    """Run one optimisation pass over train_dataloader; return the mean per-batch loss."""
    model.train()
    running_loss = 0
    for batch in train_dataloader:
        token_ids, mask, targets = (tensor.to(device) for tensor in batch)
        batch_loss = criterion(model(_bert_features(token_ids, mask)), targets)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(train_dataloader)


def _test_accuracy():
    """Return the fraction of test_dataloader samples whose argmax prediction matches the label."""
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch in test_dataloader:
            token_ids, mask, targets = (tensor.to(device) for tensor in batch)
            scores = model(_bert_features(token_ids, mask))
            hits += (scores.argmax(dim=1) == targets).sum().item()
            seen += targets.size(0)
    return hits / seen


for epoch in range(num_epochs):
    avg_loss = _train_one_epoch()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    # Quick evaluation after every epoch.
    acc = _test_accuracy()
    print(f"Test Accuracy: {acc:.4f}")

Epoch 1/3, Loss: 0.6733
Test Accuracy: 0.7553
Test Accuracy: 0.7553
Epoch 2/3, Loss: 0.5786
Epoch 2/3, Loss: 0.5786
Test Accuracy: 0.7843
Test Accuracy: 0.7843
Epoch 3/3, Loss: 0.4669
Epoch 3/3, Loss: 0.4669
Test Accuracy: 0.8200
Test Accuracy: 0.8200
