In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# 读取数据，第一行为文本，第二行为标签
def load_data(filename):
    texts = []
    labels = []
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):  # 每两行一起处理
            text = lines[i].strip()
            label = lines[i+1].strip()
            try:
                labels.append(int(label))  # 尝试将标签转换为整数
                texts.append(text)
            except ValueError:
                print(f"Skipping invalid label: {label}")
    return texts, labels

# 文本分词
def tokenize(text):
    return re.findall(r'\w+', text.lower())

# 构建词汇表
def build_vocab(texts):
    vocab = {}
    idx = 0
    for text in texts:
        tokens = tokenize(text)
        for token in tokens:
            if token not in vocab:
                vocab[token] = idx
                idx += 1
    return vocab

# 将文本转换为特征向量
def text_to_features(texts, vocab):
    features = []
    for text in texts:
        tokens = tokenize(text)
        token_ids = [vocab.get(token, -1) for token in tokens if token in vocab]
        features.append(token_ids)
    return features

# 填充序列
def pad_sequences(sequences, maxlen):
    return torch.tensor([seq + [0] * (maxlen - len(seq)) for seq in sequences])

# 加载训练和测试数据
train_texts, train_labels = load_data('train.txt')
test_texts, test_labels = load_data('test.txt')

# 构建词汇表
vocab = build_vocab(train_texts)

# 将文本转换为特征
train_features = text_to_features(train_texts, vocab)
test_features = text_to_features(test_texts, vocab)

# 标签编码
le = LabelEncoder()
train_labels = torch.tensor(le.fit_transform(train_labels))
test_labels = torch.tensor(le.transform(test_labels))

# 确定最大序列长度并填充
maxlen = max(max(len(seq) for seq in train_features), max(len(seq) for seq in test_features))
train_features = pad_sequences(train_features, maxlen)
test_features = pad_sequences(test_features, maxlen)


In [6]:
class SimpleNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(SimpleNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)  # 嵌入层
        self.model = nn.Sequential(
            nn.Linear(embed_size * maxlen, hidden_size),  # 全连接层1
            nn.ReLU(),                                   # 激活函数
            nn.Linear(hidden_size, num_classes)           # 全连接层2（输出层）
        )

    def forward(self, x):
        embedded = self.embedding(x)  # 词嵌入
        embedded = embedded.view(embedded.size(0), -1)  # 展平成平面结构
        output = self.model(embedded)  # 使用 Sequential 定义的层
        return output


In [7]:
# 训练模型
def train_model(model, train_features, train_labels, epochs=10, batch_size=32):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        all_preds = []
        all_labels = []
        for i in range(0, len(train_features), batch_size):
            batch_features = train_features[i:i+batch_size]
            batch_labels = train_labels[i:i+batch_size]
            
            optimizer.zero_grad()
            output = model(batch_features)
            loss = loss_fn(output, batch_labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            # 收集每个批次的预测和标签用于后续的F1计算
            preds = torch.argmax(output, dim=1).cpu().numpy()
            labels = batch_labels.cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)
        
        # 计算并输出整个epoch的准确率和F1值
        epoch_acc = accuracy_score(all_labels, all_preds)
        epoch_f1 = f1_score(all_labels, all_preds, average='macro')
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_features):.4f}, ACC: {epoch_acc:.4f}, F1: {epoch_f1:.4f}')

# 测试模型
def test_model(model, test_features, test_labels):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        output = model(test_features)
        preds = torch.argmax(output, dim=1).cpu().numpy()
        labels = test_labels.cpu().numpy()
        
        all_preds.extend(preds)
        all_labels.extend(labels)

        # 计算准确率和F1-score
        test_acc = accuracy_score(all_labels, all_preds)
        test_f1 = f1_score(all_labels, all_preds, average='macro')
        print(f'Test Accuracy: {test_acc:.4f}, F1-score: {test_f1:.4f}')


In [8]:
# 模型参数
vocab_size = len(vocab)
embed_size = 64
hidden_size = 128
num_classes = len(set(train_labels.tolist()))

# 初始化模型
model = SimpleNN(vocab_size, embed_size, hidden_size, num_classes)

# 损失函数和优化器
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
train_model(model, train_features, train_labels, epochs=10, batch_size=32)

# 测试模型
test_model(model, test_features, test_labels)


Epoch 1/10, Loss: 0.0224, ACC: 0.5500, F1: 0.5489
Epoch 2/10, Loss: 0.0193, ACC: 0.6604, F1: 0.6598
Epoch 3/10, Loss: 0.0152, ACC: 0.7684, F1: 0.7679
Epoch 4/10, Loss: 0.0116, ACC: 0.8384, F1: 0.8382
Epoch 5/10, Loss: 0.0085, ACC: 0.8852, F1: 0.8851
Epoch 6/10, Loss: 0.0059, ACC: 0.9280, F1: 0.9280
Epoch 7/10, Loss: 0.0041, ACC: 0.9568, F1: 0.9568
Epoch 8/10, Loss: 0.0039, ACC: 0.9488, F1: 0.9488
Epoch 9/10, Loss: 0.0044, ACC: 0.9428, F1: 0.9428
Epoch 10/10, Loss: 0.0053, ACC: 0.9336, F1: 0.9336
Test Accuracy: 0.7280, F1-score: 0.7245
