In [1]:
from pandas import DataFrame
from MyModule import load_data, load_vocab, remap_labels

train_data = remap_labels(load_data("train_vectorized.txt", sep=",", is_vectorized=True))
val_data = remap_labels(load_data("val_vectorized.txt", sep=",", is_vectorized=True))
test_data = remap_labels(load_data("test_vectorized.txt", sep=",", is_vectorized=True))

vocab = load_vocab("vocab.txt")

In [2]:
import numpy as np
import torch

def load_pretrained_embeddings(embedding_path, vocab, embedding_dim=200):
    """
    根据已有 vocab 构建嵌入矩阵
    """
    # found = {v: False for v in vocab.token_to_idx.keys()}
    embeddings = np.random.normal(scale=0.6, size=(len(vocab), embedding_dim))  # 随机初始化
    with open(embedding_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype='float32')
            if word in vocab.token_to_idx:
                embeddings[vocab[word]] = vector
                # found[word] = True
    # for word, was_found in found.items():
    #     if not was_found:
    #         print(f"词元 {word} 未找到预训练向量")
    return torch.tensor(embeddings, dtype=torch.float32)

In [3]:
embedding_matrix = load_pretrained_embeddings("Word2Vec-100000-small.txt", vocab, embedding_dim=200)

In [4]:
# 融合词向量层的多层感知机
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F


class BowDataset(Dataset):
    def __init__(self, data: DataFrame) -> None:
        self.data = data

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, i: int):
        return self.data.iloc[i]

def collate_fn_mlp(batch):
    # 从独立样本集合中构建各批次的输入输出
    # 其中 BowDataset 类定义了一个样本的数据结构，即输入标签和输出标签的元组
    # 因此，将输入 inputs 定义为一个张量的列表，其中每个张量为原始句子中词元序列对应的索引值序列
    inputs = [torch.tensor(b[0]) for b in batch]

    # 输出的目标 targets 为该批次中由全部样例输出结果构成的张量
    targets = torch.tensor([b[1] for b in batch], dtype=torch.long)

    # 获取一个批次中每个样例的序列长度
    offsets = [0] + [i.shape[0] for i in inputs]

    # 根据序列的长度，转换为每个序列起始位置的偏移量
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

    # 将 inputs 列表中的张量拼接成一个大的张量
    inputs = torch.cat(inputs)

    return inputs, offsets, targets

In [5]:
class MLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(MLP, self).__init__()
        # EmbeddingBag 层
        self.embeddingbag = nn.EmbeddingBag(vocab_size, embedding_dim)

        # 词向量层：使用 EmbeddingBag
        # self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        # 线性变换：词向量层 -> 隐含层
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)

        # 使用ReLU激活函数
        self.activate = F.relu

        # 线性变换：激活层 -> 输出层
        self.linear2 = nn.Linear(hidden_dim, num_class)
        # self.dropout = nn.Dropout(dropout)
    
    def forward(self, inputs, offsets):
        # embeddings = self.embedding(inputs)  # (batch_size, seq_length, embed_size)
        # embedding = embeddings.mean(dim=1)  # (batch_size, embed_size)
        embedding = self.embeddingbag(inputs, offsets)
        hidden = self.activate(self.linear1(embedding))  # (batch_size, hidden_size)
        outputs = self.linear2(hidden)  # (batch_size, output_size)

        # 获得每个序列属于某个类别概率的对数值
        # probs = F.log_softmax(outputs, dim=1)
        return outputs

In [6]:
# MLP 相关参数与数据加载
from tqdm.auto import tqdm

# 超参数设置
embedding_dim = 128
num_class = 3
hidden_dim = 256
batch_size = 32
num_epoch = 5

# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)

train_data_loader_mlp = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn_mlp, shuffle=True)
test_data_loader_mlp = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn_mlp, shuffle=False)

# 加载模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mlp = MLP(len(vocab), embedding_dim, hidden_dim, num_class)
mlp.to(device)

  from .autonotebook import tqdm as notebook_tqdm


MLP(
  (embeddingbag): EmbeddingBag(33152, 128, mode='mean')
  (embedding): Embedding(33152, 200)
  (linear1): Linear(in_features=128, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=3, bias=True)
)

In [7]:
# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)

mlp.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_mlp, desc=f"Training Epoch {epoch+1}"):
        inputs, offsets, targets = [x.to(device) for x in batch]

        logits = mlp(inputs, offsets)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

Training Epoch 0: 100%|██████████| 7155/7155 [00:46<00:00, 153.94it/s]


Epoch 1 - Loss: 3028.61


Training Epoch 1: 100%|██████████| 7155/7155 [00:46<00:00, 154.66it/s]


Epoch 2 - Loss: 2418.53


Training Epoch 2: 100%|██████████| 7155/7155 [00:48<00:00, 148.55it/s]


Epoch 3 - Loss: 2190.53


Training Epoch 3: 100%|██████████| 7155/7155 [00:45<00:00, 157.60it/s]


Epoch 4 - Loss: 2002.17


Training Epoch 4: 100%|██████████| 7155/7155 [00:44<00:00, 161.24it/s]

Epoch 5 - Loss: 1808.90





In [8]:
# 测试过程
acc = 0
for batch in tqdm(test_data_loader_mlp, desc="Testing"):
    inputs, offsets, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = mlp(inputs, offsets)
        acc += (output.argmax(dim=1) == targets).sum().item()
    
# 输出在测试集上的准确率
print(f"Acc: {acc / len(test_data_loader_mlp):.4f}")

Testing: 100%|██████████| 49057/49057 [00:49<00:00, 992.92it/s] 

Acc: 0.8385



