In [23]:
from datasets import load_dataset
#from sklearn.model_selection import train_test_split

raw_datasets = load_dataset("imdb")     #二分类：0=neg， 1=pos

print(raw_datasets)

train_valid = raw_datasets["train"]
test_dataset = raw_datasets["test"]

#拆成20000train和5000valid
train_valid = train_valid.train_test_split(test_size=5000, seed=42)
train_dataset = train_valid["train"]
valid_dataset = train_valid["test"]

print(len(train_dataset), len(valid_dataset), len(test_dataset))

from collections import Counter
import re

def simple_tokenizer(text):
    text = text.lower()
    #r表示原始字符串raw string, 避免/这种转义带来的麻烦
    #+表示前面的模式至少出现一次，可以连续多次
    #可在一起的意思是，匹配text里连续的小写英文字母字符串，也就是按英文单词切分
    tokens = re.findall(r"[a-z]+", text)
    return tokens

min_freq = 5 #词频小于5的当<unk>
counter = Counter()

for example in train_dataset:
    tokens = simple_tokenizer(example["text"])
    counter.update(tokens)

#special tokens
PAD = "<pad>"
UNK = "<unk>"

#<pad>用于补齐长度，编号0；<unk>表示未知词，编号1
vocab = {PAD: 0, UNK: 1}

#counter.items()返回(单词，词频)
for word, freq in counter.items():
    if freq >= min_freq:
        vocab[word] = len(vocab)

vocab_size = len(vocab)
print("vocab_size= ", vocab_size)

#encode: 把一条原始文本转换成固定长度的token id序列
def encode(text, vocab, max_len=256):
    tokens = simple_tokenizer(text)
    #把token转换成对应的词表id，如果token不在词表中，则用<UNK>的id
    ids = [vocab.get(tok, vocab[UNK]) for tok in tokens]

    if len(ids) < max_len:
        pad_len = max_len - len(ids)
        ids += [vocab[PAD]] * pad_len
    
    else:
        ids = ids[:max_len]

    return ids

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
20000 5000 25000
vocab_size=  25954


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    #cpu
    torch.manual_seed(seed)
    #gpu
    torch.cuda.manual_seed_all(seed)
    # 强制 PyTorch 使用“确定性”的 cuDNN 算法
    # 作用：即使速度慢一些，也保证同样的输入 → 同样的输出
    # 否则 cuDNN 可能为了加速选择非确定性实现
    torch.backends.cudnn.deterministic = True
    # 关闭 cuDNN 的自动性能优化
    # benchmark=True 会根据输入动态选择最快算法，但可能导致结果不稳定
    # 这里设为 False，是为了结果可复现而不是追求速度
    torch.backends.cudnn.benchmark = False

class TextClassificationDataset(Dataset):
    def __init__(self, hf_dataset, vocab, max_len=256):
        self.dataset = hf_dataset
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        text = self.dataset[idx]["text"]
        label = self.dataset[idx]["label"]

        input_ids = encode(text, vocab=self.vocab, max_len=self.max_len)
        input_ids = torch.tensor(input_ids, dtype=torch.long)

        return input_ids, label
    
max_len = 256
train_data = TextClassificationDataset(train_dataset, vocab, max_len=max_len)
valid_data = TextClassificationDataset(valid_dataset, vocab, max_len=max_len)
test_data = TextClassificationDataset(test_dataset, vocab, max_len=max_len)

batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

import torch.nn as nn
import torch.nn.functional as F

class LSTMAttentionClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim,
                 num_classes=2, num_layers=1, bidirectional=True,
                 dropout=0.5, pad_idx=0, attn_dim=128):
        super().__init__()
        #输入：[B, T]
        #输出：[B, T, embed_dim]
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

       #输出 H: [B, T, hidden_dim * 2]
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True
        )

        self.dropout = nn.Dropout(dropout)
        lstm_out_dim = hidden_dim * (2 if bidirectional else 1)

        #attention
        self.atten_w = nn.Linear(lstm_out_dim, attn_dim, bias=True)
        self.atten_v = nn.Linear(attn_dim, 1, bias=False)

        #分类器
        self.fc = nn.Linear(lstm_out_dim, num_classes)
        self.pad_idx = pad_idx

    def attention(self, H, mask=None):
        """
        H: [B, T, H_dim]
        mask: [B, T]，padding 位置为 0, 真实token为1
        """
        u = torch.tanh(self.atten_w(H))     #[B, T, atten_dim]
        #线性映射成标量打分
        e = self.atten_v(u).squeeze(-1)     #[B, T]

        #mask掉padding的位置
        if mask is not None:
            e = e.masked_fill(mask==0, float("-inf"))

        #softmax归一化得到attention的权重分布
        alpha = F.softmax(e, dim=-1)        #[B, T]

        alpha_expanded = alpha.unsqueeze(-1)

        #加权隐藏状态
        weighted_H = H * alpha_expanded     #[B, T, H_dim]

        #沿时间求和得到上下文向量
        context = weighted_H.sum(dim=1)     #[B, H_dim]

        return context, alpha
    
    def forward(self, input_ids):
        """
        input_ids: [B, T]
        """
        emb = self.embedding(input_ids)     #[B, T, E]
        H, _ = self.lstm(emb)               #[B, T, H_dim]

        #标记真实token的位置
        mask = (input_ids != self.pad_idx).long() #[B, T]

        context, atten_weights = self.attention(H, mask)
        out = self.dropout(context)
        logits = self.fc(out)               #[B, num_classes]

        return logits, atten_weights
    
import torch
import torch.optim as optim
from sklearn.metrics import f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_one_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss, total_correct, total_examples = 0, 0, 0

    for input_ids, labels in dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        #区分两种模型，有无attention
        outputs = model(input_ids)
        if isinstance(outputs, tuple):
            logits, _ = outputs
        else:
            logits = outputs

        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * input_ids.size(0)
        preds = logits.argmax(dim=-1)
        total_correct += (preds == labels).sum().item()
        total_examples += input_ids.size(0)

    avg_loss = total_loss / total_examples
    avg_acc = total_correct / total_examples

    return avg_loss, avg_acc

@torch.no_grad()
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss, total_correct, total_examples = 0, 0, 0
    all_preds, all_labels = [], []

    for input_ids, labels in dataloader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        outputs = model(input_ids)
        if isinstance(outputs, tuple):
            logits, _ = outputs
        else:
            logits = outputs

        loss = criterion(logits, labels)
        total_loss += loss.item() * input_ids.size(0)
        preds = logits.argmax(dim=-1)
        total_correct += (preds == labels).sum().item()
        total_examples += input_ids.size(0)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
    
    avg_loss = total_loss / total_examples
    avg_acc = total_correct / total_examples
    f1 = f1_score(all_labels, all_preds, average="macro")

    return avg_loss, avg_acc, f1

def run_experiment(model, train_loader, valid_loader, test_loader, lr=1e-3, epochs=5, model_name="model"):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_f1 = 0.0
    best_state_dict = None
    best_val_metrics = None

    for epoch in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion)
        val_loss, val_acc, val_f1 = evaluate(model, valid_loader, criterion)
        print(f"[{model_name}] Epoch: {epoch+1}: "
              f"trian loss: {train_loss:.4f}, train_acc: {train_acc:.4f},"
              f"val loss: {val_loss:.4f}, val acc: {val_acc:.4f}, val f1: {val_f1:.4f}")
        
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_state_dict = {k: v.cpu() for k, v in model.state_dict().items()}
            best_val_metrics = (val_loss, val_acc, val_f1)

    #用best checkpoint 在test上评估
    model.load_state_dict(best_state_dict)
    model.to(device)
    test_loss, test_acc, test_f1 = evaluate(model, test_loader, criterion)

    print(f"[{model_name}] BEST on val_f1: "
          f"val loss: {best_val_metrics[0]:.4f}, val acc: {best_val_metrics[1]:.4f}, val f1: {best_val_metrics[2]:.4f}")
    print(f"[{model_name}] Test: loss={test_loss:.4f}, acc={test_acc:.4f}, f1={test_f1:.4f}")

    return {
        "val_loss": best_val_metrics[0],
        "val_acc": best_val_metrics[1],
        "val_f1": best_val_metrics[2],
        "test_loss": test_loss,
        "test_acc": test_acc,
        "test_f1": test_f1
    }

In [7]:
embed_dim = 128
hidden_dim = 128
num_classes = 2
pad_idx = vocab[PAD]
epochs = 5

set_seed(42)

attn_model = LSTMAttentionClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
    num_layers=1,
    bidirectional=True,
    dropout=0.5,
    pad_idx=pad_idx,
    attn_dim=128
)
metrics_attn = run_experiment(
    attn_model, train_loader, valid_loader, test_loader, lr=1e-3, epochs=epochs, model_name="LSTMAttentionClassifier"
)

[LSTMAttentionClassifier] Epoch: 1: trian loss: 0.4905, train_acc: 0.7489,val loss: 0.3863, val acc: 0.8298, val f1: 0.8283
[LSTMAttentionClassifier] Epoch: 2: trian loss: 0.2961, train_acc: 0.8756,val loss: 0.3085, val acc: 0.8746, val f1: 0.8745
[LSTMAttentionClassifier] Epoch: 3: trian loss: 0.2002, train_acc: 0.9218,val loss: 0.2969, val acc: 0.8758, val f1: 0.8756
[LSTMAttentionClassifier] Epoch: 4: trian loss: 0.1321, train_acc: 0.9516,val loss: 0.3197, val acc: 0.8732, val f1: 0.8731
[LSTMAttentionClassifier] Epoch: 5: trian loss: 0.0745, train_acc: 0.9746,val loss: 0.4092, val acc: 0.8692, val f1: 0.8690
[LSTMAttentionClassifier] BEST on val_f1: val loss: 0.2969, val acc: 0.8758, val f1: 0.8756
[LSTMAttentionClassifier] Test: loss=0.3244, acc=0.8594, f1=0.8591


### 构建结果表
test_dataset：HF 的原始数据集（里面有 "text" / "label"）

attn_model：训练完并且 run_experiment 里已经 load 了 best checkpoint 的模型

encode(text, vocab, max_len)：把原始文本转成 id 序列

循环遍历 test_dataset（按 batch），对每一条跑 attn_model 得到 logits。

用 softmax 得到各类概率、argmax 得到预测标签。

把这些东西塞进 pandas.DataFrame：

text：原始文本

y_true：真实标签（0/1）

y_pred：预测标签（0/1）

prob_pos：预测为正类的概率（或者 0 类的也行，看你习惯）

is_error：是否预测错误

In [12]:
import pandas as pd
import torch.nn.functional as F

@torch.no_grad()
def build_results_table(model, hf_dataset, vocab, max_len=256, batch_size=64, device=device):
    """
    针对给定的 HuggingFace 数据集（如 test_dataset），
    用当前模型跑一遍，返回一个 DataFrame 结果表：
    - text: 原始文本
    - y_true: 真实标签
    - y_pred: 预测标签
    - prob_pos: 预测为正类(1)的概率
    - is_error: 是否预测错误
    """
    model.eval()
    model.to(device)

    all_texts = []
    all_labels = []
    all_preds = []
    all_prob_pos = []
    all_pred_conf = []

    num_samples = len(hf_dataset)

    for start in range(0, num_samples, batch_size):
        #end为当前的batch下标
        end = min(start + batch_size, num_samples)

        batch = hf_dataset[start:end]
        batch_texts = batch["text"]
        batch_labels = batch["label"]

        #编码成ids，对当前batch内每条文本做分词 / 映射到token id，并统一max_len长度
        batch_ids = [encode(t, vocab=vocab, max_len=max_len) for t in batch_texts]
        batch_ids = torch.tensor(batch_ids, dtype=torch.long, device=device)

        outputs = model(batch_ids)

        if isinstance(outputs, tuple):
          logits, attn_weights = outputs
        else:
          logits = outputs
        
        #通过softmax将logits打分转换成概率分布
        probs = F.softmax(logits, dim=-1) #[batch_size, num_classes]

        #取标签1（正类）的概率
        prob_pos = probs[:, 1].detach().cpu().tolist()          #lis[int]
        preds = probs.argmax(dim=-1).detach().cpu().tolist()    #list[float]
        #预测置信度
        pred_conf = probs.max(dim=-1).values.cpu().tolist()     #list[float]

        #收集当前batch的结果到总列表中
        all_texts.extend(batch_texts)
        all_labels.extend(batch_labels)
        all_preds.extend(preds)
        all_prob_pos.extend(prob_pos)
        all_pred_conf.extend(pred_conf)

    #长度对齐 sanity check
    assert len(all_texts) == len(all_labels) == len(all_preds) == len(all_prob_pos) == len(all_pred_conf)

    df = pd.DataFrame({
       "text": all_texts,
       "y_true": all_labels,
       "y_pred": all_preds,
       "prob_pos": all_prob_pos,
       "pred_conf": all_pred_conf
    })
    #根据真实标签与预测标签是否相等来标记每条样本是否预测错误
    df["is_error"] = df["y_true"] != df["y_pred"]

    return df

In [None]:
results_attn_test = build_results_table(
    model=attn_model,
    hf_dataset=test_dataset,
    vocab=vocab,
    max_len=256,
    batch_size=64,
    device=device
)

print(results_attn_test.head())

results_attn_test.to_csv("imdb_lstm_attention_test_results.csv", index=False, encoding="utf-8-sig")
print("LSTM + Attention 测试集结果已保存 -> imdb_lstm_attention_test_results.csv")

                                                text  y_true  y_pred  \
0  I love sci-fi and am willing to put up with a ...       0       0   
1  Worth the entertainment value of a rental, esp...       0       0   
2  its a totally average film with a few semi-alr...       0       0   
3  STAR RATING: ***** Saturday Night **** Friday ...       0       0   
4  First off let me say, If you haven't enjoyed a...       0       1   

   prob_pos  pred_conf  is_error  
0  0.002610   0.997390     False  
1  0.116442   0.883558     False  
2  0.016801   0.983199     False  
3  0.016499   0.983501     False  
4  0.935788   0.935788      True  
LSTM + Attention 测试集结果已保存 -> imdb_lstm_attention_test_results.csv


In [5]:
df = pd.read_csv("imdb_lstm_attention_test_results.csv")
errors = df[df["is_error"]].copy()
print(f"总样本数：{len(df)}\n")
print(f"错误样本数：{len(errors)}\n")
print(f"错误率：{len(errors) / len(df)}")

总样本数：25000

错误样本数：3515

错误率：0.1406


## step 1
1.整体 accuracy / F1

2.按类别的 precision / recall / F1

3.混淆矩阵（confusion matrix）

In [6]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix, classification_report
)

y_true = df["y_true"].to_numpy()
y_pred = df["y_pred"].to_numpy()

#1.整体指标
acc = accuracy_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average="macro")
f1_micro = f1_score(y_true, y_pred, average="micro")
f1_weighted = f1_score(y_true, y_pred, average="weighted")

print("=== overall metrics ===")
print(f"Accuracy       : {acc:.4f}\n"
        f"F1 macro       : {f1_macro:.4f}\n"
        f"F1 micro       : {f1_micro:.4f}\n"
        f"F1 weighted    : {f1_weighted:.4f}\n")

#2.按类别指标
rep = classification_report(y_true, y_pred, target_names=["neg(0)", "pos(1)"], digits=4, output_dict=True)
#T是转置，让报告里的每个指标成为行
rep_df = pd.DataFrame(rep).T
#在jupyter notebook中显示dataframe
display(rep_df)

#3.混淆矩阵（含归一化）
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=["true_neg(0)", "true_pos(1)"], columns=["pre_neg(0)", "pre_pos(01)"])
print("\nConfusion Matrix:")
print(cm_df)

#按行归一化, 保留原数组维度
cm_norm = cm / cm.sum(axis=1, keepdims=True)
cm_norm_df = pd.DataFrame(cm_norm, index=["true_neg(0)", "true_pos(1)"], columns=["pred_neg(0)", "pred_pos(1)"])
print("\nNormalized Confusion Matrix:")
print(cm_norm_df.round(4))

=== overall metrics ===
Accuracy       : 0.8594
F1 macro       : 0.8591
F1 micro       : 0.8594
F1 weighted    : 0.8591



Unnamed: 0,precision,recall,f1-score,support
neg(0),0.830355,0.90336,0.865321,12500.0
pos(1),0.894044,0.81544,0.852935,12500.0
accuracy,0.8594,0.8594,0.8594,0.8594
macro avg,0.8622,0.8594,0.859128,25000.0
weighted avg,0.8622,0.8594,0.859128,25000.0



Confusion Matrix:
             pre_neg(0)  pre_pos(01)
true_neg(0)       11292         1208
true_pos(1)        2307        10193

Normalized Confusion Matrix:
             pred_neg(0)  pred_pos(1)
true_neg(0)       0.9034       0.0966
true_pos(1)       0.1846       0.8154


## step 2

In [7]:
#计算 FN/FP 数量
TN, FP, FN, TP = cm[0,0], cm[0,1], cm[1,0], cm[1,1]

print(f"FP (neg->pos) = {FP}")
print(f"FN (pos->neg) = {FN}")

FP (neg->pos) = 1208
FN (pos->neg) = 2307


In [19]:
fp = errors[(errors["y_true"] == 0) & (errors["y_pred"] == 1)].copy()  # neg->pos
fn = errors[(errors["y_true"] == 1) & (errors["y_pred"] == 0)].copy()  # pos->neg
print("FP:", len(fp), "FN:", len(fn))

FP: 1208 FN: 2307


In [29]:
high_conf_fp = fp.sort_values("pred_conf", ascending=False).head(50)
high_conf_fn = fn.sort_values("pred_conf", ascending=False).head(50)
def preview(df_subset, k=5, max_chars=300):
    for _, r in df_subset.head(k).iterrows():
        print("="*60)
        print(f"true={r['y_true']} pred={r['y_pred']} pred_conf={r['pred_conf']:.4f} prob_pos={r['prob_pos']:.4f}")
        print(r["text"][:max_chars].replace("\n"," ") + ("..." if len(r["text"])>max_chars else ""))

print("---- High-conf FN (pos->neg) ----")
preview(high_conf_fn, k=10)

print("---- High-conf FP (neg->pos) ----")
preview(high_conf_fp, k=10)


---- High-conf FN (pos->neg) ----
true=1 pred=0 pred_conf=0.9999 prob_pos=0.0001
David Morse and Andre Braugher are very talented actors, which is why I'm trying so hard to support this program. Unfortunately, an irrational plot, and very poor writing is making it difficult for me. I'm hoping that the show gets a serious overhaul, or that the actors find new projects that are wo...
true=1 pred=0 pred_conf=0.9992 prob_pos=0.0008
It's not Citizen Kane, but it does deliver. Cleavage, and lots of it.<br /><br />Badly acted and directed, poorly scripted. Who cares? I didn't watch it for the dialog.
true=1 pred=0 pred_conf=0.9986 prob_pos=0.0014
Sam Firstenberg's "Ninja 3:The Domination" mixes martial arts with "The Exorcist" like horror.The horror elements thrown on screen are simply laughable,but the film works as a mindless action/martial arts flick.The fight scenes are well-choreographed and exciting,and the film is never boring.So forg...
true=1 pred=0 pred_conf=0.9983 prob_pos=0.0017
G

高置信错例偏“模型盲点”，边界错例偏“决策阈值附近/特征不足/可通过数据增强修复”。

对于二分类，用 prob_pos 做桶更直观：

FN（pos→neg）：模型预测 neg，通常 prob_pos 很低；边界 FN 往往在 prob_pos≈0.4~0.5

FP（neg→pos）：边界 FP 往往在 prob_pos≈0.5~0.6

In [21]:
# 边界FN：本来是pos，但prob_pos靠近0.5（模型其实有点犹豫）
border_fn = fn[(fn["prob_pos"] > 0.35) & (fn["prob_pos"] < 0.5)].sample(n=min(50, len(fn)), random_state=42)

# 边界FP：本来是neg，但prob_pos靠近0.5（模型也犹豫）
border_fp = fp[(fp["prob_pos"] > 0.5) & (fp["prob_pos"] < 0.65)].sample(n=min(50, len(fp)), random_state=42)

print("border_fn:", len(border_fn), "border_fp:", len(border_fp))


border_fn: 50 border_fp: 50


In [None]:
import re  # 导入正则表达式库，用于做简单的英文分词（按字母序列切词）

def simple_tokenizer(text):
    text = text.lower()  # 统一转小写，避免 "Good" 和 "good" 被当成两个词
    return re.findall(r"[a-z]+", text)  # 用正则找出所有连续的英文小写字母串，作为 token 列表


def add_diagnostics(df_in, vocab, max_len=256, unk_id=1):
    """
    给输入的 DataFrame 增加一些“诊断列”，用于错误分析的分桶抽样：
    - tok_len: 原文本分词后的 token 数（未截断前）
    - is_truncated: token 数是否超过 max_len（是否会被截断）
    - unk_ratio: 编码到词表后，<unk> 占比（词表覆盖不足的信号）
    """
    df2 = df_in.copy()  # 复制一份，避免直接修改原 DataFrame（防止副作用）

    lens = []       # 存每条样本的 token 总长度（未截断）
    truncated = []  # 存每条样本是否会被截断（True/False）
    unk_ratio = []  # 存每条样本的 <unk> 比例（0~1）

    for t in df2["text"]:  # 遍历 DataFrame 中每条文本（逐样本处理）
        toks = simple_tokenizer(t)          # 对当前文本做分词，得到 token 列表
        lens.append(len(toks))              # 记录 token 数（文本原始长度信号）
        truncated.append(len(toks) > max_len)  # 判断是否超过 max_len（超过意味着 encode 会截断）

        # 将 token 映射到词表 id；只取前 max_len 个 token（与 encode 的截断逻辑一致）
        ids = [vocab.get(tok, unk_id) for tok in toks[:max_len]]

        # 如果当前文本分词后为空（极少情况，比如全是符号），UNK 比例设为 1.0
        if len(ids) == 0:
            unk_ratio.append(1.0)
        else:
            # 统计 ids 中等于 unk_id 的个数，再除以长度，得到 <unk> 占比
            unk_ratio.append(sum(i == unk_id for i in ids) / len(ids))

    df2["tok_len"] = lens            # 新增列：token 总长度（未截断前）
    df2["is_truncated"] = truncated  # 新增列：是否会被截断
    df2["unk_ratio"] = unk_ratio     # 新增列：<unk> 比例

    return df2  # 返回带诊断信息的新 DataFrame


# 在错误样本 errors 上增加诊断列：
# unk_id 用 vocab["<unk>"]，确保与训练/编码时的 UNK id 一致
errors_diag = add_diagnostics(
    errors,
    vocab=vocab,
    max_len=256,
    unk_id=vocab["<unk>"]
)


In [25]:
# 截断桶：长文本被截断导致关键信息丢失
trunc_samples = errors_diag[errors_diag["is_truncated"]].sample(
    n=min(50, (errors_diag["is_truncated"]).sum()), random_state=42
)

# 高UNK桶：词表覆盖不足/表达很口语/拼写变体多
high_unk_samples = errors_diag[errors_diag["unk_ratio"] > 0.1].sample(
    n=min(50, (errors_diag["unk_ratio"] > 0.1).sum()), random_state=42
)

print("trunc_samples:", len(trunc_samples), "high_unk_samples:", len(high_unk_samples))


trunc_samples: 50 high_unk_samples: 22


In [None]:
# 一组“转折/让步”关键词：常见结构是“前面夸，but 后面骂”，很容易导致情感翻转
contrast = ["but", "however", "though", "yet", "although", "nevertheless"]

# 一组“否定”关键词：否定会改变情感极性（not good / not bad），模型常在这类句子上犯错
negation = ["not", "no", "never", "nothing", "none", "hardly", "barely"]

def contains_any(text, keywords):
    t = text.lower()                     # 统一小写，避免大小写影响匹配（"But" vs "but"）
    return any(k in t for k in keywords) # 只要 keywords 里任意一个子串出现在文本里，就返回 True

# 给 errors_diag 增加一列：该文本是否包含“转折/让步”关键词（True/False）
errors_diag["has_contrast"] = errors_diag["text"].apply(
    lambda x: contains_any(x, contrast)  # 对每条文本 x 调用 contains_any，判断是否包含 contrast 关键词
)

# 给 errors_diag 增加一列：该文本是否包含“否定”关键词（True/False）
errors_diag["has_negation"] = errors_diag["text"].apply(
    lambda x: contains_any(x, negation)  # 对每条文本 x 调用 contains_any，判断是否包含 negation 关键词
)

# 从“包含转折词”的错误样本里抽样（最多抽 50 条；如果不足 50 条就全抽）
contrast_samples = errors_diag[errors_diag["has_contrast"]].sample(
    n=min(50, (errors_diag["has_contrast"]).sum()),  # sum() 会把 True 当 1，统计总共有多少条 True
    random_state=42                                  # 固定随机种子，保证每次抽样结果可复现
)

# 从“包含否定词”的错误样本里抽样（最多抽 50 条；如果不足 50 条就全抽）
negation_samples = errors_diag[errors_diag["has_negation"]].sample(
    n=min(50, (errors_diag["has_negation"]).sum()),  # 同理：最多 50，少于 50 就按实际数量抽
    random_state=42                                  # 固定随机种子，保证可复现
)

# 打印两个桶分别抽到了多少条（用于 sanity check：是不是桶里压根没数据）
print("contrast_samples:", len(contrast_samples), "negation_samples:", len(negation_samples))


contrast_samples: 50 negation_samples: 50


In [31]:
to_label = pd.concat([
    high_conf_fn,
    high_conf_fp,
    border_fn,
    border_fp,
    trunc_samples,
    high_unk_samples,
    contrast_samples,
    negation_samples
], ignore_index=True).drop_duplicates(subset=["text"])

# 只取前200条（你也可以随机抽200）
to_label = to_label.sample(n=min(200, len(to_label)), random_state=42)

display(to_label)

to_label["error_type"] = ""   # 手动标：label_issue / semantic_hard / data_coverage / preprocess_length ...
to_label["comment"] = ""
to_label.to_csv("imdb_error_samples_to_label.csv", index=False, encoding="utf-8-sig")
print("Saved -> imdb_error_samples_to_label.csv, size=", len(to_label))


Unnamed: 0,text,y_true,y_pred,prob_pos,pred_conf,is_error,tok_len,is_truncated,unk_ratio,has_contrast,has_negation
224,The very first talking picture has returned fr...,1,0,0.394698,0.605301,True,523.0,True,0.074219,,
42,If you want to have a great time then this is ...,1,0,0.014021,0.985979,True,,,,,
266,Basically an endearingly chintzy and moronic $...,1,0,0.048224,0.951776,True,259.0,True,0.101562,,
312,Extraordinary Rendition is a frightening pract...,1,0,0.467025,0.532975,True,649.0,True,0.023438,True,True
56,Woody Allen (who I have to confess at the outs...,0,1,0.995810,0.995810,True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
115,"Walter Matthau and Jack Lemmon, both of whom a...",1,0,0.407434,0.592566,True,,,,,
199,I agree that Mary Woronov (Murdoch's secretary...,0,1,0.505140,0.505140,True,,,,,
305,It was the first action movie made in banned i...,0,1,0.629593,0.629593,True,86.0,False,0.000000,True,True
247,I'm sick and tired of people complaining that ...,1,0,0.090048,0.909952,True,888.0,True,0.015625,,


Saved -> imdb_error_samples_to_label.csv, size= 200
