In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# 读取数据，第一行为文本，第二行为标签
def load_data(filename):
    texts = []
    labels = []
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):  # 每两行一起处理
            text = lines[i].strip()
            label = lines[i+1].strip()
            try:
                labels.append(int(label))  # 尝试将标签转换为整数
                texts.append(text)
            except ValueError:
                print(f"Skipping invalid label: {label}")
    return texts, labels

# 文本分词
def tokenize(text):
    return re.findall(r'\w+', text.lower())

# 构建词汇表
def build_vocab(texts):
    vocab = {}
    idx = 0
    for text in texts:
        tokens = tokenize(text)
        for token in tokens:
            if token not in vocab:
                vocab[token] = idx
                idx += 1
    return vocab

# 将文本转换为特征向量
def text_to_features(texts, vocab):
    features = []
    for text in texts:
        tokens = tokenize(text)
        token_ids = [vocab.get(token, -1) for token in tokens if token in vocab]
        features.append(token_ids)
    return features

# 填充序列
def pad_sequences(sequences, maxlen):
    return torch.tensor([seq + [0] * (maxlen - len(seq)) for seq in sequences])

# 加载训练和测试数据
train_texts, train_labels = load_data('./dataset/headlines/train.txt')
test_texts, test_labels = load_data('./dataset/headlines/test.txt')

# 构建词汇表
vocab = build_vocab(train_texts)

# 将文本转换为特征
train_features = text_to_features(train_texts, vocab)
test_features = text_to_features(test_texts, vocab)

# 标签编码
le = LabelEncoder()
train_labels = torch.tensor(le.fit_transform(train_labels))
test_labels = torch.tensor(le.transform(test_labels))

# 确定最大序列长度并填充
maxlen = max(max(len(seq) for seq in train_features), max(len(seq) for seq in test_features))
train_features = pad_sequences(train_features, maxlen)
test_features = pad_sequences(test_features, maxlen)


In [6]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)  # 嵌入层
        self.model = nn.Sequential(
            nn.Linear(embed_size * maxlen, hidden_size),  # 全连接层1
            nn.ReLU(),                                   # 激活函数
            nn.Linear(hidden_size, num_classes)           # 全连接层2（输出层）
        )

    def forward(self, x):
        embedded = self.embedding(x)  # 词嵌入
        embedded = embedded.view(embedded.size(0), -1)  # 展平成平面结构
        output = self.model(embedded)  # 使用 Sequential 定义的层
        return output


In [7]:
# 训练模型
def train_model(model, train_features, train_labels, epochs=10, batch_size=32):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        all_preds = []
        all_labels = []
        for i in range(0, len(train_features), batch_size):
            batch_features = train_features[i:i+batch_size]
            batch_labels = train_labels[i:i+batch_size]
            
            optimizer.zero_grad()
            output = model(batch_features)
            loss = loss_fn(output, batch_labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            # 收集每个批次的预测和标签用于后续的F1计算
            preds = torch.argmax(output, dim=1).cpu().numpy()
            labels = batch_labels.cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
        
        # 计算并输出整个epoch的准确率和F1值
        epoch_acc = accuracy_score(all_labels, all_preds)
        epoch_f1 = f1_score(all_labels, all_preds, average='macro')
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_features):.4f}, ACC: {epoch_acc:.4f}, F1: {epoch_f1:.4f}')

# 测试模型
def test_model(model, test_features, test_labels):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        output = model(test_features)
        preds = torch.argmax(output, dim=1).cpu().numpy()
        labels = test_labels.cpu().numpy()
        
        all_preds.extend(preds)
        all_labels.extend(labels)

        # 计算准确率和F1-score
        test_acc = accuracy_score(all_labels, all_preds)
        test_f1 = f1_score(all_labels, all_preds, average='macro')
        print(f'Test Accuracy: {test_acc:.4f}, F1-score: {test_f1:.4f}')


In [8]:
# 模型参数
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
num_classes = len(set(train_labels.tolist()))

# 初始化模型
model = CNN(vocab_size, embed_size, hidden_size, num_classes)

# 损失函数和优化器
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
train_model(model, train_features, train_labels, epochs=20, batch_size=32)

# 测试模型
test_model(model, test_features, test_labels)


Epoch 1/20, Loss: 0.0223, ACC: 0.5520, F1: 0.5514
Epoch 2/20, Loss: 0.0182, ACC: 0.6928, F1: 0.6925
Epoch 3/20, Loss: 0.0140, ACC: 0.7916, F1: 0.7914
Epoch 4/20, Loss: 0.0106, ACC: 0.8568, F1: 0.8567
Epoch 5/20, Loss: 0.0081, ACC: 0.9000, F1: 0.8999
Epoch 6/20, Loss: 0.0092, ACC: 0.8720, F1: 0.8719
Epoch 7/20, Loss: 0.0073, ACC: 0.8952, F1: 0.8950
Epoch 8/20, Loss: 0.0040, ACC: 0.9544, F1: 0.9544
Epoch 9/20, Loss: 0.0024, ACC: 0.9780, F1: 0.9780
Epoch 10/20, Loss: 0.0016, ACC: 0.9876, F1: 0.9876
Epoch 11/20, Loss: 0.0009, ACC: 0.9948, F1: 0.9948
Epoch 12/20, Loss: 0.0008, ACC: 0.9936, F1: 0.9936
Epoch 13/20, Loss: 0.0005, ACC: 0.9964, F1: 0.9964
Epoch 14/20, Loss: 0.0003, ACC: 0.9988, F1: 0.9988
Epoch 15/20, Loss: 0.0002, ACC: 0.9996, F1: 0.9996
Epoch 16/20, Loss: 0.0002, ACC: 1.0000, F1: 1.0000
Epoch 17/20, Loss: 0.0001, ACC: 1.0000, F1: 1.0000
Epoch 18/20, Loss: 0.0001, ACC: 1.0000, F1: 1.0000
Epoch 19/20, Loss: 0.0001, ACC: 1.0000, F1: 1.0000
Epoch 20/20, Loss: 0.0000, ACC: 1.0000, 