In [None]:
from pandas import DataFrame
from MyModule import Vocab, load_vocab, load_data

train_data = load_data('train_vectorized.txt', sep=',', is_vectorized=True)
val_data = load_data('val_vectorized.txt', sep=',', is_vectorized=True)
test_data = load_data('test_vectorized.txt', sep=',', is_vectorized=True)

vocab = load_vocab('vocab.txt')

### 多层感知机

In [None]:
# 融合词向量层的多层感知机
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F


class BowDataset(Dataset):
    def __init__(self, data: DataFrame) -> None:
        self.data = data

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, i: int):
        return self.data.iloc[i]

def collate_fn_mlp(batch):
    # 从独立样本集合中构建各批次的输入输出
    # 其中 BowDataset 类定义了一个样本的数据结构，即输入标签和输出标签的元组
    # 因此，将输入 inputs 定义为一个张量的列表，其中每个张量为原始句子中词元序列对应的索引值序列
    inputs = [torch.tensor(b[0]) for b in batch]

    # 输出的目标 targets 为该批次中由全部样例输出结果构成的张量
    targets = torch.tensor([b[1] for b in batch], dtype=torch.long)

    # 获取一个批次中每个样例的序列长度
    offsets = [0] + [i.shape[0] for i in inputs]

    # 根据序列的长度，转换为每个序列起始位置的偏移量
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)

    # 将 inputs 列表中的张量拼接成一个大的张量
    inputs = torch.cat(inputs)

    return inputs, offsets, targets

In [18]:
class MLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(MLP, self).__init__()
        # EmbeddingBag 层
        self.embeddingbag = nn.EmbeddingBag(vocab_size, embedding_dim)

        # 词向量层：使用 EmbeddingBag
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # 线性变换：词向量层 -> 隐含层
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)

        # 使用ReLU激活函数
        self.activate = F.relu

        # 线性变换：激活层 -> 输出层
        self.linear2 = nn.Linear(hidden_dim, num_class)
        # self.dropout = nn.Dropout(dropout)
    
    def forward(self, inputs, offsets):
        # embeddings = self.embedding(inputs)  # (batch_size, seq_length, embed_size)
        # embedding = embeddings.mean(dim=1)  # (batch_size, embed_size)
        embedding = self.embeddingbag(inputs, offsets)
        hidden = self.activate(self.linear1(embedding))  # (batch_size, hidden_size)
        outputs = self.linear2(hidden)  # (batch_size, output_size)

        # 获得每个序列属于某个类别概率的对数值
        # probs = F.log_softmax(outputs, dim=1)
        return outputs

In [19]:
# MLP 相关参数与数据加载
from tqdm.auto import tqdm

# 超参数设置
embedding_dim = 128
num_class = 3
hidden_dim = 256
batch_size = 32
num_epoch = 5

# 加载数据
train_dataset = BowDataset(train_data)
test_dataset = BowDataset(test_data)

train_data_loader_mlp = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn_mlp, shuffle=True)
test_data_loader_mlp = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn_mlp, shuffle=False)

# 加载模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mlp = MLP(len(vocab), embedding_dim, hidden_dim, num_class)
mlp.to(device)

MLP(
  (embeddingbag): EmbeddingBag(33152, 128, mode='mean')
  (embedding): Embedding(33152, 128)
  (linear1): Linear(in_features=128, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=3, bias=True)
)

In [20]:
# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=0.001)

mlp.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_mlp, desc=f"Training Epoch {epoch}"):
        inputs, offsets, targets = [x.to(device) for x in batch]

        logits = mlp(inputs, offsets)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

Training Epoch 0: 100%|██████████| 7155/7155 [00:22<00:00, 321.28it/s]


Epoch 1 - Loss: 5085.01


Training Epoch 1: 100%|██████████| 7155/7155 [00:21<00:00, 332.74it/s]


Epoch 2 - Loss: 4172.95


Training Epoch 2: 100%|██████████| 7155/7155 [00:22<00:00, 318.89it/s]


Epoch 3 - Loss: 3796.43


Training Epoch 3: 100%|██████████| 7155/7155 [00:22<00:00, 321.54it/s]


Epoch 4 - Loss: 3498.65


Training Epoch 4: 100%|██████████| 7155/7155 [00:22<00:00, 317.27it/s]

Epoch 5 - Loss: 3200.31





In [21]:
# 测试过程
acc = 0
for batch in tqdm(test_data_loader_mlp, desc="Testing"):
    inputs, offsets, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = mlp(inputs, offsets)
        acc += (output.argmax(dim=1) == targets).sum().item()
    
# 输出在测试集上的准确率
print(f"Acc: {acc / len(test_data_loader_mlp):.4f}")

Testing: 100%|██████████| 49057/49057 [00:30<00:00, 1627.29it/s]

Acc: 0.7446





### 卷积神经网络

In [22]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, filter_size, num_filter, num_class) -> None:
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1d = nn.Conv1d(embedding_dim, num_filter, filter_size, padding=1)  # padding=1 表示在卷积操作之前，将序列的前后各补充1个输入
        self.activate = F.relu
        self.linear = nn.Linear(num_filter, num_class)

    
    def forward(self, inputs):
        embedding = self.embedding(inputs)
        convolution = self.activate(self.conv1d(embedding.permute(0, 2, 1)))
        pooling = F.max_pool1d(convolution, kernel_size=convolution.shape[2])
        outputs = self.linear(pooling.squeeze(dim=2))
        return outputs

In [23]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn_cnn(batch):
    inputs = [torch.tensor(b[0]) for b in batch]
    targets = torch.tensor([b[1] for b in batch], dtype=torch.long)

    # 对批次内的样本补齐，使其具有相同的长度
    inputs = pad_sequence(inputs, batch_first=True)
    return inputs, targets

In [24]:
train_data_loader_cnn = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn_cnn, shuffle=True)
test_data_loader_cnn = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn_cnn, shuffle=False)

# CNN 相关参数
filter_size = 3
num_filter = 100

# 加载模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
cnn = CNN(len(vocab), embedding_dim, filter_size, num_filter, num_class)
cnn.to(device)

CNN(
  (embedding): Embedding(33152, 128)
  (conv1d): Conv1d(128, 100, kernel_size=(3,), stride=(1,), padding=(1,))
  (linear): Linear(in_features=100, out_features=3, bias=True)
)

In [25]:
# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=0.001)

cnn.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_cnn, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]

        logits = cnn(inputs)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

Training Epoch 0: 100%|██████████| 7155/7155 [00:25<00:00, 283.76it/s]


Epoch 1 - Loss: 5079.45


Training Epoch 1: 100%|██████████| 7155/7155 [00:24<00:00, 295.36it/s]


Epoch 2 - Loss: 4111.97


Training Epoch 2: 100%|██████████| 7155/7155 [00:24<00:00, 291.43it/s]


Epoch 3 - Loss: 3585.46


Training Epoch 3: 100%|██████████| 7155/7155 [00:24<00:00, 289.28it/s]


Epoch 4 - Loss: 3084.63


Training Epoch 4: 100%|██████████| 7155/7155 [00:24<00:00, 295.25it/s]

Epoch 5 - Loss: 2572.62





In [26]:
# 测试过程
acc = 0
for batch in tqdm(test_data_loader_cnn, desc="Testing"):
    inputs, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = cnn(inputs)
        acc += (output.argmax(dim=1) == targets).sum().item()
    
# 输出在测试集上的准确率
print(f"Acc: {acc / len(test_data_loader_cnn):.4f}")

Testing: 100%|██████████| 49057/49057 [00:38<00:00, 1282.37it/s]

Acc: 0.7368





### 长短时记忆网络

In [31]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn_lstm(batch):
    lengths = torch.tensor([len(b[0]) for b in batch], dtype=torch.long)
    inputs = [torch.tensor(b[0]) for b in batch]
    targets = torch.tensor([b[1] for b in batch], dtype=torch.long)

    # 使用 pad_sequence 函数对输入序列进行填充
    inputs = pad_sequence(inputs, batch_first=True)

    return inputs, lengths, targets

In [32]:
from torch.nn.utils.rnn import pack_padded_sequence

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(LSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs, lengths):
        embedding = self.embeddings(inputs)

        # 使用 pack_padded_sequence 函数对嵌入序列进行打包
        x_pack = pack_padded_sequence(embedding, lengths, batch_first=True, enforce_sorted=False)
        hidden, (hn, cn) = self.lstm(x_pack)
        outputs = self.output(hn[-1])
        return outputs

In [35]:
train_data_loader_lstm = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn_lstm, shuffle=True)
test_data_loader_lstm = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn_lstm, shuffle=False)

# 加载模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lstm = LSTM(len(vocab), embedding_dim, hidden_dim, num_class)
lstm.to(device)

LSTM(
  (embeddings): Embedding(33152, 128)
  (lstm): LSTM(128, 256, batch_first=True)
  (output): Linear(in_features=256, out_features=3, bias=True)
)

In [37]:
# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)

lstm.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_lstm, desc=f"Training Epoch {epoch}"):
        # inputs, lengths, targets = [x.to(device) for x in batch]
        inputs, lengths, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        logits = lstm(inputs, lengths)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

Training Epoch 0: 100%|██████████| 7155/7155 [00:44<00:00, 161.81it/s]


Epoch 1 - Loss: 4874.49


Training Epoch 1: 100%|██████████| 7155/7155 [00:42<00:00, 168.41it/s]


Epoch 2 - Loss: 3764.39


Training Epoch 2: 100%|██████████| 7155/7155 [00:42<00:00, 169.25it/s]


Epoch 3 - Loss: 3012.29


Training Epoch 3: 100%|██████████| 7155/7155 [00:41<00:00, 172.25it/s]


Epoch 4 - Loss: 2210.72


Training Epoch 4: 100%|██████████| 7155/7155 [00:42<00:00, 169.70it/s]

Epoch 5 - Loss: 1470.70





In [38]:
# 测试过程
acc = 0
for batch in tqdm(test_data_loader_lstm, desc="Testing"):
    # inputs, targets = [x.to(device) for x in batch]
    inputs, lengths, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    with torch.no_grad():
        output = lstm(inputs, lengths)
        acc += (output.argmax(dim=1) == targets).sum().item()
    
# 输出在测试集上的准确率
print(f"Acc: {acc / len(test_data_loader_lstm):.4f}")

Testing: 100%|██████████| 49057/49057 [01:03<00:00, 766.88it/s]

Acc: 0.7445





### Transformer

In [65]:
def length_to_mask(lengths):
    max_len = torch.max(lengths)
    mask = torch.arange(max_len, device=lengths.device).expand(lengths.shape[0], max_len) < lengths.unsqueeze(1)
    return mask

In [66]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)) / d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

In [67]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class, dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=128, activation: str = "relu"):
        super(Transformer, self).__init__()
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len)  # 位置编码

        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, num_head, dim_feedforward, dropout, activation)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        self.output = nn.Linear(embedding_dim, num_class)

    def forward(self, inputs, lengths):
        inputs = torch.transpose(inputs, 0, 1)

        hidden_states = self.embeddings(inputs)
        hidden_states = self.position_embedding(hidden_states)

        lengths = lengths.to(inputs.device)
        attention_mask = length_to_mask(lengths) == False

        # 根据批次中每个序列长度生成 Mask 矩阵
        hidden_states = self.transformer(hidden_states, src_key_padding_mask=attention_mask)
        hidden_states = hidden_states[0, :, :]

        # 去第一个词元的输出结果作为分类层的输入
        outputs = self.output(hidden_states)
        return outputs

In [68]:
collate_fn_trans = collate_fn_lstm

In [69]:
train_data_loader_trans = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn_trans, shuffle=True)
test_data_loader_trans = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn_trans, shuffle=False)

# 加载模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transformer = Transformer(len(vocab), embedding_dim, embedding_dim, num_class)
transformer.to(device)



Transformer(
  (embeddings): Embedding(33152, 128)
  (position_embedding): PositionalEncoding()
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output): Linear(in_features=128, out_features=3, bias=True)
)

In [70]:
# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)

transformer.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_trans, desc=f"Training Epoch {epoch}"):
        # inputs, lengths, targets = [x.to(device) for x in batch]
        inputs, lengths, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        logits = transformer(inputs, lengths)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

Training Epoch 0: 100%|██████████| 7155/7155 [01:09<00:00, 103.60it/s]


Epoch 1 - Loss: 5540.14


Training Epoch 1: 100%|██████████| 7155/7155 [01:08<00:00, 104.05it/s]


Epoch 2 - Loss: 4616.21


Training Epoch 2: 100%|██████████| 7155/7155 [01:08<00:00, 104.49it/s]


Epoch 3 - Loss: 4203.25


Training Epoch 3: 100%|██████████| 7155/7155 [01:08<00:00, 104.70it/s]


Epoch 4 - Loss: 3918.12


Training Epoch 4: 100%|██████████| 7155/7155 [01:10<00:00, 102.17it/s]

Epoch 5 - Loss: 3690.80





In [71]:
# 测试过程
acc = 0
for batch in tqdm(test_data_loader_trans, desc="Testing"):
    # inputs, targets = [x.to(device) for x in batch]
    inputs, lengths, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    with torch.no_grad():
        output = lstm(inputs, lengths)
        acc += (output.argmax(dim=1) == targets).sum().item()
    
# 输出在测试集上的准确率
print(f"Acc: {acc / len(test_data_loader_lstm):.4f}")

Testing: 100%|██████████| 49057/49057 [01:21<00:00, 605.31it/s]

Acc: 0.7445





## 使用PyTorch现成模型实现

以下使用PyTorch中预构建的模型来实现相同的分类任务。

### 1. 多层感知机（使用nn.Sequential）

In [78]:
# 使用 nn.Sequential 实现多层感知机
class SimpleMLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):
        super(SimpleMLP, self).__init__()
        self.embeddingbag = nn.EmbeddingBag(vocab_size, embedding_dim)
        
        # 使用 nn.Sequential 构建全连接层
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_class)
        )
    
    def forward(self, inputs, offsets):
        embedding = self.embeddingbag(inputs, offsets)
        outputs = self.classifier(embedding)
        return outputs

# 加载模型
simple_mlp = SimpleMLP(len(vocab), embedding_dim, hidden_dim, num_class)
simple_mlp.to(device)

# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(simple_mlp.parameters(), lr=0.001)

simple_mlp.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_mlp, desc=f"Simple MLP Epoch {epoch}"):
        inputs, offsets, targets = [x.to(device) for x in batch]

        logits = simple_mlp(inputs, offsets)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

# 测试过程
acc = 0
for batch in tqdm(test_data_loader_mlp, desc="Testing Simple MLP"):
    inputs, offsets, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = simple_mlp(inputs, offsets)
        acc += (output.argmax(dim=1) == targets).sum().item()

print(f"Simple MLP Acc: {acc / len(test_data_loader_mlp):.4f}")

Simple MLP Epoch 0: 100%|██████████| 7155/7155 [00:44<00:00, 162.03it/s]
Simple MLP Epoch 0: 100%|██████████| 7155/7155 [00:44<00:00, 162.03it/s]


Epoch 1 - Loss: 5095.20


Simple MLP Epoch 1: 100%|██████████| 7155/7155 [00:44<00:00, 161.46it/s]
Simple MLP Epoch 1: 100%|██████████| 7155/7155 [00:44<00:00, 161.46it/s]


Epoch 2 - Loss: 4169.11


Simple MLP Epoch 2: 100%|██████████| 7155/7155 [00:43<00:00, 163.66it/s]
Simple MLP Epoch 2: 100%|██████████| 7155/7155 [00:43<00:00, 163.66it/s]


Epoch 3 - Loss: 3792.21


Simple MLP Epoch 3: 100%|██████████| 7155/7155 [00:43<00:00, 163.73it/s]
Simple MLP Epoch 3: 100%|██████████| 7155/7155 [00:43<00:00, 163.73it/s]


Epoch 4 - Loss: 3491.96


Simple MLP Epoch 4: 100%|██████████| 7155/7155 [00:43<00:00, 164.24it/s]
Simple MLP Epoch 4: 100%|██████████| 7155/7155 [00:43<00:00, 164.24it/s]


Epoch 5 - Loss: 3200.95


Testing Simple MLP: 100%|██████████| 49057/49057 [00:38<00:00, 1286.00it/s]

Simple MLP Acc: 0.7471





### 2. 卷积神经网络（使用torchvision中的基础组件）

In [74]:
# 使用 nn.ModuleList 和现成组件实现 CNN
class SimpleCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, filter_size, num_filter, num_class):
        super(SimpleCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # 使用现成的卷积和激活函数组件
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filter, filter_size, padding=1)
        ])
        
        self.activation = nn.ReLU()
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.classifier = nn.Linear(num_filter, num_class)
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        embedding = self.embedding(inputs)
        x = embedding.permute(0, 2, 1)  # (batch_size, embedding_dim, seq_len)
        
        # 应用卷积层
        for conv in self.conv_layers:
            x = self.activation(conv(x))
        
        # 全局最大池化
        x = self.global_pool(x)
        x = x.squeeze(dim=2)
        
        # 分类层
        x = self.dropout(x)
        outputs = self.classifier(x)
        return outputs

# 加载模型
simple_cnn = SimpleCNN(len(vocab), embedding_dim, filter_size, num_filter, num_class)
simple_cnn.to(device)

# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(simple_cnn.parameters(), lr=0.001)

simple_cnn.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_cnn, desc=f"Simple CNN Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]

        logits = simple_cnn(inputs)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

# 测试过程
acc = 0
for batch in tqdm(test_data_loader_cnn, desc="Testing Simple CNN"):
    inputs, targets = [x.to(device) for x in batch]
    with torch.no_grad():
        output = simple_cnn(inputs)
        acc += (output.argmax(dim=1) == targets).sum().item()

print(f"Simple CNN Acc: {acc / len(test_data_loader_cnn):.4f}")

Simple CNN Epoch 0: 100%|██████████| 7155/7155 [00:47<00:00, 150.68it/s]
Simple CNN Epoch 0: 100%|██████████| 7155/7155 [00:47<00:00, 150.68it/s]


Epoch 1 - Loss: 5451.57


Simple CNN Epoch 1: 100%|██████████| 7155/7155 [00:46<00:00, 155.33it/s]
Simple CNN Epoch 1: 100%|██████████| 7155/7155 [00:46<00:00, 155.33it/s]


Epoch 2 - Loss: 4560.30


Simple CNN Epoch 2: 100%|██████████| 7155/7155 [00:45<00:00, 156.20it/s]
Simple CNN Epoch 2: 100%|██████████| 7155/7155 [00:45<00:00, 156.20it/s]


Epoch 3 - Loss: 4236.97


Simple CNN Epoch 3: 100%|██████████| 7155/7155 [00:45<00:00, 156.09it/s]
Simple CNN Epoch 3: 100%|██████████| 7155/7155 [00:45<00:00, 156.09it/s]


Epoch 4 - Loss: 4002.67


Simple CNN Epoch 4: 100%|██████████| 7155/7155 [00:45<00:00, 156.48it/s]
Simple CNN Epoch 4: 100%|██████████| 7155/7155 [00:45<00:00, 156.48it/s]


Epoch 5 - Loss: 3816.91


Testing Simple CNN: 100%|██████████| 49057/49057 [00:50<00:00, 963.39it/s]

Simple CNN Acc: 0.7375





### 3. 长短时记忆网络（使用nn.LSTM现成模块）

In [75]:
# 使用 nn.LSTM 现成模块，添加更多功能
class SimpleLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class, num_layers=2, bidirectional=True):
        super(SimpleLSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # 使用双向LSTM和多层结构
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=0.3 if num_layers > 1 else 0
        )
        
        # 如果是双向LSTM，隐藏状态维度需要乘以2
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim
        
        self.classifier = nn.Sequential(
            nn.Linear(lstm_output_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, num_class)
        )

    def forward(self, inputs, lengths):
        embedding = self.embeddings(inputs)
        
        # 打包序列
        x_pack = pack_padded_sequence(embedding, lengths, batch_first=True, enforce_sorted=False)
        hidden, (hn, cn) = self.lstm(x_pack)
        
        # 使用最后一层的隐藏状态
        # 对于双向LSTM，需要合并前向和后向的隐藏状态
        if self.lstm.bidirectional:
            # hn的形状: [num_layers*2, batch_size, hidden_dim]
            # 取最后一层的前向和后向隐藏状态
            forward_hidden = hn[-2, :, :]
            backward_hidden = hn[-1, :, :]
            last_hidden = torch.cat([forward_hidden, backward_hidden], dim=1)
        else:
            last_hidden = hn[-1, :, :]
        
        outputs = self.classifier(last_hidden)
        return outputs

# 加载模型
simple_lstm = SimpleLSTM(len(vocab), embedding_dim, hidden_dim, num_class)
simple_lstm.to(device)

# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(simple_lstm.parameters(), lr=0.001, weight_decay=1e-5)

simple_lstm.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_lstm, desc=f"Simple LSTM Epoch {epoch}"):
        inputs, lengths, targets = batch
        inputs = inputs.to(device)
        targets = targets.to(device)

        logits = simple_lstm(inputs, lengths)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

# 测试过程
acc = 0
for batch in tqdm(test_data_loader_lstm, desc="Testing Simple LSTM"):
    inputs, lengths, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    with torch.no_grad():
        output = simple_lstm(inputs, lengths)
        acc += (output.argmax(dim=1) == targets).sum().item()

print(f"Simple LSTM Acc: {acc / len(test_data_loader_lstm):.4f}")

Simple LSTM Epoch 0: 100%|██████████| 7155/7155 [01:47<00:00, 66.56it/s]
Simple LSTM Epoch 0: 100%|██████████| 7155/7155 [01:47<00:00, 66.56it/s]


Epoch 1 - Loss: 4858.33


Simple LSTM Epoch 1: 100%|██████████| 7155/7155 [01:46<00:00, 67.04it/s]
Simple LSTM Epoch 1: 100%|██████████| 7155/7155 [01:46<00:00, 67.04it/s]


Epoch 2 - Loss: 4027.14


Simple LSTM Epoch 2: 100%|██████████| 7155/7155 [01:46<00:00, 67.16it/s]
Simple LSTM Epoch 2: 100%|██████████| 7155/7155 [01:46<00:00, 67.16it/s]


Epoch 3 - Loss: 3723.49


Simple LSTM Epoch 3: 100%|██████████| 7155/7155 [01:46<00:00, 67.18it/s]
Simple LSTM Epoch 3: 100%|██████████| 7155/7155 [01:46<00:00, 67.18it/s]


Epoch 4 - Loss: 3458.42


Simple LSTM Epoch 4: 100%|██████████| 7155/7155 [01:47<00:00, 66.78it/s]
Simple LSTM Epoch 4: 100%|██████████| 7155/7155 [01:47<00:00, 66.78it/s]


Epoch 5 - Loss: 3192.09


Testing Simple LSTM: 100%|██████████| 49057/49057 [01:28<00:00, 553.54it/s]

Simple LSTM Acc: 0.7663





### 4. Transformer（使用nn.TransformerEncoder现成模块）

In [76]:
# 使用 nn.TransformerEncoder 现成模块
class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_class, nhead=8, num_layers=6, dim_feedforward=2048, dropout=0.1):
        super(SimpleTransformer, self).__init__()
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # 位置编码
        self.pos_encoder = PositionalEncoding(embedding_dim, dropout)
        
        # 使用现成的 TransformerEncoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu',
            batch_first=True
        )
        
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
            norm=nn.LayerNorm(embedding_dim)
        )
        
        # 分类头
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embedding_dim // 2, num_class)
        )

    def forward(self, inputs, lengths):
        # 生成padding mask
        batch_size, seq_len = inputs.shape
        mask = torch.arange(seq_len, device=inputs.device).expand(batch_size, seq_len) >= lengths.unsqueeze(1)
        
        # 词嵌入
        embeddings = self.embeddings(inputs) * math.sqrt(self.embedding_dim)
        
        # 位置编码
        embeddings = self.pos_encoder(embeddings.transpose(0, 1)).transpose(0, 1)
        
        # Transformer编码
        encoded = self.transformer_encoder(embeddings, src_key_padding_mask=mask)
        
        # 使用[CLS] token的表示（这里使用第一个token）
        # 或者可以使用平均池化
        cls_representation = encoded[:, 0, :]  # 取第一个位置的表示
        
        # 分类
        outputs = self.classifier(cls_representation)
        return outputs

# 加载模型
simple_transformer = SimpleTransformer(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    num_class=num_class,
    nhead=8,
    num_layers=3,
    dim_feedforward=512,
    dropout=0.1
)
simple_transformer.to(device)

# 训练过程
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(simple_transformer.parameters(), lr=0.0001, weight_decay=1e-5)

simple_transformer.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader_trans, desc=f"Simple Transformer Epoch {epoch}"):
        inputs, lengths, targets = batch
        inputs = inputs.to(device)
        lengths = lengths.to(device)
        targets = targets.to(device)

        logits = simple_transformer(inputs, lengths)
        loss = criterion(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
            
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.2f}")

# 测试过程
acc = 0
for batch in tqdm(test_data_loader_trans, desc="Testing Simple Transformer"):
    inputs, lengths, targets = batch
    inputs = inputs.to(device)
    lengths = lengths.to(device)
    targets = targets.to(device)
    with torch.no_grad():
        output = simple_transformer(inputs, lengths)
        acc += (output.argmax(dim=1) == targets).sum().item()

print(f"Simple Transformer Acc: {acc / len(test_data_loader_trans):.4f}")

Simple Transformer Epoch 0: 100%|██████████| 7155/7155 [01:29<00:00, 80.14it/s]



Epoch 1 - Loss: 5953.68


Simple Transformer Epoch 1: 100%|██████████| 7155/7155 [01:28<00:00, 80.61it/s]
Simple Transformer Epoch 1: 100%|██████████| 7155/7155 [01:28<00:00, 80.61it/s]


Epoch 2 - Loss: 5104.61


Simple Transformer Epoch 2: 100%|██████████| 7155/7155 [01:30<00:00, 78.86it/s]



Epoch 3 - Loss: 4762.32


Simple Transformer Epoch 3: 100%|██████████| 7155/7155 [01:29<00:00, 79.87it/s]
Simple Transformer Epoch 3: 100%|██████████| 7155/7155 [01:29<00:00, 79.87it/s]


Epoch 4 - Loss: 4502.60


Simple Transformer Epoch 4: 100%|██████████| 7155/7155 [01:44<00:00, 68.21it/s]
Simple Transformer Epoch 4: 100%|██████████| 7155/7155 [01:44<00:00, 68.21it/s]


Epoch 5 - Loss: 4288.91


Testing Simple Transformer: 100%|██████████| 49057/49057 [02:18<00:00, 354.88it/s]

Simple Transformer Acc: 0.7165





### 模型性能比较

现在让我们比较一下手工实现和使用PyTorch现成模块实现的模型性能：

In [79]:
# 模型性能对比总结
import pandas as pd

print("=" * 60)
print("模型性能对比")
print("=" * 60)

# 这里可以手动记录各个模型的测试准确率
# 实际运行时需要根据上面的输出结果填入
model_results = {
    '模型类型': ['手工MLP', 'PyTorch MLP', '手工CNN', 'PyTorch CNN', 
               '手工LSTM', 'PyTorch LSTM', '手工Transformer', 'PyTorch Transformer'],
    '实现方式': ['手工搭建', 'nn.Sequential', '手工搭建', 'nn.ModuleList', 
               '手工搭建', 'nn.LSTM双向', '手工搭建', 'nn.TransformerEncoder'],
    '准确率': [0.7446, 0.7368, 0.7445, 0.7445, 0.7471, 0.7375, 0.7663, 0.7165],  # 需要根据实际结果更新
    '特点': ['基础实现', '简洁的序列结构', '基础CNN', '更好的组件化', 
           '基础LSTM', '双向+多层+Dropout', '基础Transformer', '完整的编码器结构']
}

results_df = pd.DataFrame(model_results)
print(results_df.to_string(index=False))

print("\n主要改进点：")
print("1. MLP: 使用nn.Sequential简化代码结构")
print("2. CNN: 使用nn.ModuleList和更好的池化策略")
print("3. LSTM: 使用双向LSTM、多层结构和Dropout正则化")
print("4. Transformer: 使用标准的TransformerEncoder和更好的位置编码")

模型性能对比
               模型类型                  实现方式    准确率            特点
              手工MLP                  手工搭建 0.7446          基础实现
        PyTorch MLP         nn.Sequential 0.7368       简洁的序列结构
              手工CNN                  手工搭建 0.7445         基础CNN
        PyTorch CNN         nn.ModuleList 0.7445        更好的组件化
             手工LSTM                  手工搭建 0.7471        基础LSTM
       PyTorch LSTM             nn.LSTM双向 0.7375 双向+多层+Dropout
      手工Transformer                  手工搭建 0.7663 基础Transformer
PyTorch Transformer nn.TransformerEncoder 0.7165      完整的编码器结构

主要改进点：
1. MLP: 使用nn.Sequential简化代码结构
2. CNN: 使用nn.ModuleList和更好的池化策略
3. LSTM: 使用双向LSTM、多层结构和Dropout正则化
4. Transformer: 使用标准的TransformerEncoder和更好的位置编码
