## Ablation消融实验

**Attention vs 均匀池化**

baseline模型：BiLSTM + 掩码mean pooling （所有token权重一样=1 / 长度）

attention模型：BiLSTM + attention （权重由网络学习得到）

In [2]:
from datasets import load_dataset
#from sklearn.model_selection import train_test_split

raw_datasets = load_dataset("imdb")     #二分类：0=neg， 1=pos

print(raw_datasets)

train_valid = raw_datasets["train"]
test_dataset = raw_datasets["test"]

#拆成20000train和5000valid
train_valid = train_valid.train_test_split(test_size=5000, seed=42)
train_dataset = train_valid["train"]
valid_dataset = train_valid["test"]

print(len(train_dataset), len(valid_dataset), len(test_dataset))

from collections import Counter
import re

def simple_tokenizer(text):
    text = text.lower()
    #r表示原始字符串raw string, 避免/这种转义带来的麻烦
    #+表示前面的模式至少出现一次，可以连续多次
    #可在一起的意思是，匹配text里连续的小写英文字母字符串，也就是按英文单词切分
    tokens = re.findall(r"[a-z]+", text)
    return tokens

min_freq = 5 #词频小于5的当<unk>
counter = Counter()

for example in train_dataset:
    tokens = simple_tokenizer(example["text"])
    counter.update(tokens)

#special tokens
PAD = "<pad>"
UNK = "<unk>"

#<pad>用于补齐长度，编号0；<unk>表示未知词，编号1
vocab = {PAD: 0, UNK: 1}

#counter.items()返回(单词，词频)
for word, freq in counter.items():
    if freq >= min_freq:
        vocab[word] = len(vocab)

vocab_size = len(vocab)
print("vocab_size= ", vocab_size)

#encode: 把一条原始文本转换成固定长度的token id序列
def encode(text, vocab, max_len=256):
    tokens = simple_tokenizer(text)
    #把token转换成对应的词表id，如果token不在词表中，则用<UNK>的id
    ids = [vocab.get(tok, vocab[UNK]) for tok in tokens]

    if len(ids) < max_len:
        pad_len = max_len - len(ids)
        ids += [vocab[PAD]] * pad_len
    
    else:
        ids = ids[:max_len]

    return ids

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
20000 5000 25000
vocab_size=  25954


In [3]:

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    #cpu
    torch.manual_seed(seed)
    #gpu
    torch.cuda.manual_seed_all(seed)
    # 强制 PyTorch 使用“确定性”的 cuDNN 算法
    # 作用：即使速度慢一些，也保证同样的输入 → 同样的输出
    # 否则 cuDNN 可能为了加速选择非确定性实现
    torch.backends.cudnn.deterministic = True
    # 关闭 cuDNN 的自动性能优化
    # benchmark=True 会根据输入动态选择最快算法，但可能导致结果不稳定
    # 这里设为 False，是为了结果可复现而不是追求速度
    torch.backends.cudnn.benchmark = False

class TextClassificationDataset(Dataset):
    def __init__(self, hf_dataset, vocab, max_len=256):
        self.dataset = hf_dataset
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        text = self.dataset[idx]["text"]
        label = self.dataset[idx]["label"]

        input_ids = encode(text, vocab=self.vocab, max_len=self.max_len)
        input_ids = torch.tensor(input_ids, dtype=torch.long)

        return input_ids, label
    
max_len = 256
train_data = TextClassificationDataset(train_dataset, vocab, max_len=max_len)
valid_data = TextClassificationDataset(valid_dataset, vocab, max_len=max_len)
test_data = TextClassificationDataset(test_dataset, vocab, max_len=max_len)

batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

### BiLSTMPoolingClassifier

In [4]:

import torch.nn as nn
import torch.nn.functional as F

class BiLSTMPoolingClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, 
                 num_classes=2, num_layers=1, 
                 bidirectional=True, dropout=0.5, pad_idx=0, pooling="mean"):
        super().__init__()

        #输入：[B, T]
        #输出：[B, T, embed_dim]
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

        #输出 H: [B, T, hidden_dim * 2]
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True
        )

        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx
        self.pooling = pooling
        lstm_output_dim = hidden_dim * (2 if bidirectional else 1)
        self.fc = nn.Linear(lstm_output_dim, num_classes)

    def forward(self, input_ids):
        """
        input_ids: [B, T]
        """
        emb = self.embedding(input_ids)     #[B, T, E]
        H, _ = self.lstm(emb)      #H: [B, T, 2H]

        #构造mask，标记哪些位置是真实token， 哪些位置是padding
        mask = (input_ids != self.pad_idx).unsqueeze(-1)    #mask:[B, T, 1], detype=bool

        #对非pad位置做加权平均
        if self.pooling == "mean":
            #把pad位置的hidden置为0， 真实token保留原值
            H_masked = H * mask                     #[B, T, 2H]
            #延时间维度求和
            sum_hidden = H_masked.sum(dim=1)        #[B, 2H]
            #计算每个样本的真实长度
            lengths = mask.sum(dim=1).clamp(min=1)  #[B, 1]
            pooled = sum_hidden / lengths
        
        #对时间维度做最大池化
        elif self.pooling == "max":
            #对于padding位置，把hidden的值设为1e-9
            H_masked = H.maked_fill(mask == 0, 1e-9) #[B, T, 2H]
            #延时间维度T取每个维度上的最大值
            #_是max的索引，不需要
            pooled, _ = H_masked.max(dim=1)

        #取最后一个非pad_token的hidden
        elif self.pooling == "last_nonpad":
            #对mask在时间维度上求和，计算每一个样本的真是长度
            lengths = mask.squeeze(-1).max(dim=1)   #[B]
            #取最后一个pad的索引
            idx = (lengths - 1).clamp(min=1)        #[B]
            #构造一个batch索引，用来和时间索引组合取值
            batch_idx = torch.arange(H.size(0), device=H.device)    #[B]
            #取每个样本最后一个非pad位置的hidden
            pooled = H[batch_idx, idx, :]           #[B, 2H]

        else:
            raise ValueError(f"Unkown pooling type: {self.pooling}")

        #缓解过拟合
        out = self.dropout(pooled)                  #[B, 2H]
        logits = self.fc(out)

        return logits


### LSTMAttentionClassifier

In [5]:

class LSTMAttentionClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim,
                 num_classes=2, num_layers=1, bidirectional=True,
                 dropout=0.5, pad_idx=0, attn_dim=128):
        super().__init__()
        #输入：[B, T]
        #输出：[B, T, embed_dim]
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

       #输出 H: [B, T, hidden_dim * 2]
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True
        )

        self.dropout = nn.Dropout(dropout)
        lstm_out_dim = hidden_dim * (2 if bidirectional else 1)

        #attention
        self.atten_w = nn.Linear(lstm_out_dim, attn_dim, bias=True)
        self.atten_v = nn.Linear(attn_dim, 1, bias=False)

        #分类器
        self.fc = nn.Linear(lstm_out_dim, num_classes)
        self.pad_idx = pad_idx

    def attention(self, H, mask=None):
        """
        H: [B, T, H_dim]
        mask: [B, T]，padding 位置为 0, 真实token为1
        """
        u = torch.tanh(self.atten_w(H))     #[B, T, atten_dim]
        #线性映射成标量打分
        e = self.atten_v(u).squeeze(-1)     #[B, T]

        #mask掉padding的位置
        if mask is not None:
            e = e.masked_fill(mask==0, float("-inf"))

        #softmax归一化得到attention的权重分布
        alpha = F.softmax(e, dim=-1)        #[B, T]

        alpha_expanded = alpha.unsqueeze(-1)

        #加权隐藏状态
        weighted_H = H * alpha_expanded     #[B, T, H_dim]

        #沿时间求和得到上下文向量
        context = weighted_H.sum(dim=1)     #[B, H_dim]

        return context, alpha
    
    def forward(self, input_ids):
        """
        input_ids: [B, T]
        """
        emb = self.embedding(input_ids)     #[B, T, E]
        H, _ = self.lstm(emb)               #[B, T, H_dim]

        #标记真实token的位置
        mask = (input_ids != self.pad_idx).long() #[B, T]

        context, atten_weights = self.attention(H, mask)
        out = self.dropout(context)
        logits = self.fc(out)               #[B, num_classes]

        return logits, atten_weights

In [6]:
import torch
import torch.optim as optim
from sklearn.metrics import f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_one_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss, total_correct, total_examples = 0, 0, 0

    for input_ids, labels in dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        #区分两种模型，有无attention
        outputs = model(input_ids)
        if isinstance(outputs, tuple):
            logits, _ = outputs
        else:
            logits = outputs

        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * input_ids.size(0)
        preds = logits.argmax(dim=-1)
        total_correct += (preds == labels).sum().item()
        total_examples += input_ids.size(0)

    avg_loss = total_loss / total_examples
    avg_acc = total_correct / total_examples

    return avg_loss, avg_acc

@torch.no_grad()
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss, total_correct, total_examples = 0, 0, 0
    all_preds, all_labels = [], []

    for input_ids, labels in dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        outputs = model(input_ids)
        if isinstance(outputs, tuple):
            logits, _ = outputs
        else:
            logits = outputs

        loss = criterion(logits, labels)
        total_loss += loss.item() * input_ids.size(0)
        preds = logits.argmax(dim=-1)
        total_correct += (preds == labels).sum().item()
        total_examples += input_ids.size(0)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
    
    avg_loss = total_loss / total_examples
    avg_acc = total_correct / total_examples
    f1 = f1_score(all_labels, all_preds, average="macro")

    return avg_loss, avg_acc, f1

In [10]:
def run_experiment(model, train_loader, valid_loader, test_loader, lr=1e-3, epochs=5, model_name="model"):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_f1 = 0.0
    best_state_dict = None
    best_val_metrics = None

    for epoch in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc, val_f1 = evaluate(model, valid_loader, criterion)
        print(f"[{model_name}] Epoch: {epoch+1}: "
              f"trian loss: {train_loss:.4f}, train_acc: {train_acc:.4f},"
              f"val loss: {val_loss:.4f}, val acc: {val_acc:.4f}, val f1: {val_f1:.4f}")
        
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_state_dict = {k: v.cpu() for k, v in model.state_dict().items()}
            best_val_metrics = (val_loss, val_acc, val_f1)

    #用best checkpoint 在test上评估
    model.load_state_dict(best_state_dict)
    model.to(device)
    test_loss, test_acc, test_f1 = evaluate(model, test_loader, criterion)

    print(f"[{model_name}] BEST on val_f1: "
          f"val loss: {best_val_metrics[0]:.4f}, val acc: {best_val_metrics[1]:.4f}, val f1: {best_val_metrics[2]:.4f}")
    print(f"[{model_name}] Test: loss={test_loss:.4f}, acc={test_acc:.4f}, f1={test_f1:.4f}")

    return {
        "val_loss": best_val_metrics[0],
        "val_acc": best_val_metrics[1],
        "val_f1": best_val_metrics[2],
        "test_loss": test_loss,
        "test_acc": test_acc,
        "test_f1": test_f1
    }

In [11]:
embed_dim = 128
hidden_dim = 128
num_classes = 2
pad_idx = vocab[PAD]
epochs = 5

set_seed(42)

baseline_model = BiLSTMPoolingClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    num_layers=1,
    bidirectional=True,
    dropout=0.5,
    pad_idx=pad_idx,
    pooling="mean"
)
metrics_base = run_experiment(
    baseline_model, train_loader, valid_loader, test_loader, lr=1e-3, epochs=epochs, model_name="BiLSTMPoolingClassifier"
)

attn_model = LSTMAttentionClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    num_layers=1,
    bidirectional=True,
    dropout=0.5,
    pad_idx=pad_idx,
    attn_dim=128
)
metrics_attn = run_experiment(
    attn_model, train_loader, valid_loader, test_loader, lr=1e-3, epochs=epochs, model_name="LSTMAttentionClassifier"
)

[BiLSTMPoolingClassifier] Epoch: 1: trian loss: 0.4995, train_acc: 0.7538,val loss: 0.3651, val acc: 0.8330, val f1: 0.8328
[BiLSTMPoolingClassifier] Epoch: 2: trian loss: 0.3096, train_acc: 0.8711,val loss: 0.3280, val acc: 0.8626, val f1: 0.8621
[BiLSTMPoolingClassifier] Epoch: 3: trian loss: 0.2160, train_acc: 0.9147,val loss: 0.2997, val acc: 0.8770, val f1: 0.8770
[BiLSTMPoolingClassifier] Epoch: 4: trian loss: 0.1477, train_acc: 0.9459,val loss: 0.3279, val acc: 0.8794, val f1: 0.8793
[BiLSTMPoolingClassifier] Epoch: 5: trian loss: 0.0911, train_acc: 0.9685,val loss: 0.3878, val acc: 0.8818, val f1: 0.8817
[BiLSTMPoolingClassifier] BEST on val_f1: val loss: 0.3878, val acc: 0.8818, val f1: 0.8817
[BiLSTMPoolingClassifier] Test: loss=0.4684, acc=0.8516, f1=0.8515
[LSTMAttentionClassifier] Epoch: 1: trian loss: 0.5097, train_acc: 0.7352,val loss: 0.3882, val acc: 0.8284, val f1: 0.8273
[LSTMAttentionClassifier] Epoch: 2: trian loss: 0.3010, train_acc: 0.8743,val loss: 0.2960, val a

In [13]:
def print_ablation_table(metrics_base, metrics_attn):
    print("| 模型                 | Attention | Val Acc | Val F1 | Test Acc | Test F1 | Test ΔF1 |")
    print("|----------------------|-----------|---------|--------|----------|---------|----------|")

    base_va, base_vf = metrics_base["val_acc"], metrics_base["val_f1"]
    base_ta, base_tf = metrics_base["test_acc"], metrics_base["test_f1"]

    attn_va, attn_vf = metrics_attn["val_acc"], metrics_attn["val_f1"]
    attn_ta, attn_tf = metrics_attn["test_acc"], metrics_attn["test_f1"]

    delta_f1 = attn_tf - base_tf

    print(f"| BiLSTM + MeanPooling | ✗         | {base_va:.4f} | {base_vf:.4f} | {base_ta:.4f} | {base_tf:.4f} | 0.0000   |")
    print(f"| BiLSTM + Attention   | ✓         | {attn_va:.4f} | {attn_vf:.4f} | {attn_ta:.4f} | {attn_tf:.4f} | {delta_f1:+.4f} |")

print_ablation_table(metrics_base, metrics_attn)

| 模型                 | Attention | Val Acc | Val F1 | Test Acc | Test F1 | Test ΔF1 |
|----------------------|-----------|---------|--------|----------|---------|----------|
| BiLSTM + MeanPooling | ✗         | 0.8818 | 0.8817 | 0.8516 | 0.8515 | 0.0000   |
| BiLSTM + Attention   | ✓         | 0.8800 | 0.8800 | 0.8658 | 0.8658 | +0.0143 |
