# 基于IMDB的情感分类

## 数据加载与预处理

我们首先使用 TensorFlow Keras 的内置 IMDB 数据集（包含 25,000 条已标注的电影评论
keras.io
），将评论转换为固定长度的词索引序列，并构造 PyTorch 的 Dataset 和 DataLoader 以便后续训练。关键步骤包括：设置词汇表大小、最大序列长度、序列填充，以及将数据转换为 PyTorch 张量。

In [1]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
vocab_size = 10000    # 词汇表大小（只考虑最常见的10000个词）
maxlen = 200          # 文本序列最大长度

In [3]:
print("Loading IMDB dataset...")
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

Loading IMDB dataset...


In [4]:
# 对序列进行填充，使其长度一致
train_data = pad_sequences(train_data, maxlen=maxlen, padding='post')
test_data = pad_sequences(test_data, maxlen=maxlen, padding='post')


In [5]:
# 构建 PyTorch Dataset
class MovieDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data    # 序列数据（numpy数组）
        self.labels = labels  # 标签（0 或 1）

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # 返回单条样本（序列张量、标签张量）
        x = torch.tensor(self.data[idx], dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.float)  # 使用 float 类型以配合 BCE 损失
        return x, y

In [6]:
# 实例化 Dataset 和 DataLoader
train_dataset = MovieDataset(train_data, train_labels)
test_dataset  = MovieDataset(test_data, test_labels)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Number of training samples: {len(train_dataset)}, testing samples: {len(test_dataset)}")

Number of training samples: 25000, testing samples: 25000


## RNN模型

我们首先实现一个**简单的RNN（循环神经网络）**模型，包括词嵌入层（Embedding）和一个基本的 RNN 层，以及一个全连接输出层。模型结构为：输入文本序列 → Embedding → RNN → 全连接层 → 输出单个标量。训练过程中使用二元交叉熵损失函数（BCEWithLogitsLoss）和 Adam 优化器。

In [7]:
import torch.nn as nn
import torch.optim as optim

# 定义 RNN 模型
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)        # 词嵌入层
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)  # 基础 RNN 层
        self.fc = nn.Linear(hidden_dim, 1)                          # 输出层

    def forward(self, x):
        x = self.embedding(x)              # [batch, seq_len] -> [batch, seq_len, embed_dim]
        output, hidden = self.rnn(x)       # output: 所有时间步的输出, hidden: 最后时间步的隐藏状态
        out = self.fc(hidden.squeeze(0))   # 取最后时间步的隐藏状态预测
        return out

# 设备选择（若有GPU则使用GPU）
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 实例化模型、损失函数和优化器
model_rnn = RNNModel(vocab_size=vocab_size, embed_dim=128, hidden_dim=128).to(device)
criterion = nn.BCEWithLogitsLoss()  # 二元交叉熵（包含 Sigmoid）
optimizer = optim.Adam(model_rnn.parameters(), lr=0.001)

# 训练 RNN 模型
epochs = 3
for epoch in range(epochs):
    model_rnn.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model_rnn(inputs).squeeze(1)  # 模型输出形状 [batch, 1]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# 在测试集上评估 RNN 模型
model_rnn.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = torch.sigmoid(model_rnn(inputs).squeeze(1))  # 计算 Sigmoid 概率
        preds = (outputs >= 0.5).float()  # 阈值分类
        correct += (preds == labels).sum().item()
        total += labels.size(0)
accuracy = correct / total
print(f"RNN 模型测试准确率: {accuracy*100:.2f}%")


Epoch 1/3, Loss: 0.6885
Epoch 2/3, Loss: 0.6704
Epoch 3/3, Loss: 0.6481
RNN 模型测试准确率: 55.09%


## LSTM模型
我们实现**LSTM（长短时记忆）**模型。与 RNN 类似，模型同样包含 Embedding 层和 LSTM 层，只是在循环层中替换为 nn.LSTM。LSTM 能更好地捕捉长程依赖，往往在文本任务上效果优于普通 RNN。其余训练设置与 RNN 相同。

In [8]:
# 定义 LSTM 模型
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)           # 词嵌入层
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)   # LSTM 层
        self.fc = nn.Linear(hidden_dim, 1)                             # 输出层

    def forward(self, x):
        x = self.embedding(x)                        # [batch, seq_len, embed_dim]
        output, (hidden, cell) = self.lstm(x)        # hidden: [num_layers, batch, hidden_dim]
        out = self.fc(hidden.squeeze(0))             # 取最后一层隐藏状态
        return out

# 实例化 LSTM 模型
model_lstm = LSTMModel(vocab_size=vocab_size, embed_dim=128, hidden_dim=128).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_lstm.parameters(), lr=0.001)

# 训练 LSTM 模型
epochs = 6
for epoch in range(epochs):
    model_lstm.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model_lstm(inputs).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# 在测试集上评估 LSTM 模型
model_lstm.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = torch.sigmoid(model_lstm(inputs).squeeze(1))
        preds = (outputs >= 0.5).float()
        correct += (preds == labels).sum().item()
        total += labels.size(0)
accuracy = correct / total
print(f"LSTM 模型测试准确率: {accuracy*100:.2f}%")


Epoch 1/6, Loss: 0.6676
Epoch 2/6, Loss: 0.5463
Epoch 3/6, Loss: 0.4343
Epoch 4/6, Loss: 0.4528
Epoch 5/6, Loss: 0.4003
Epoch 6/6, Loss: 0.3421
LSTM 模型测试准确率: 79.87%


## BERT模型
最后，我们使用BERT模型（bert-base-uncased）进行情感分类。BERT 模型接收原始文本输入，因此需要先将索引序列还原为文本，然后使用 Hugging Face 的 BertTokenizer 和 BertForSequenceClassification。关键步骤如下：解码词索引序列为文本 → 使用 BERT 分词器编码文本 → 构建 PyTorch Dataset → 微调 BERT 模型。

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# 1. 将索引序列解码为原始文本
word_index = imdb.get_word_index()  # Keras IMDB 的词到索引映射
# 由于 Keras 在加载数据时使用了 index_from=3 的偏移，需要加回去
reverse_word_index = {value+3: key for key, value in word_index.items()}
reverse_word_index[0] = "<PAD>"
reverse_word_index[1] = "<START>"
reverse_word_index[2] = "<UNK>"

# 生成文本列表（去除填充符号）
texts_train = []
for seq in train_data:
    words = [reverse_word_index.get(idx, "<PAD>") for idx in seq]
    # 去掉特殊标记
    words = [w for w in words if w not in ["<PAD>", "<START>", "<UNK>"]]
    texts_train.append(" ".join(words))

texts_test = []
for seq in test_data:
    words = [reverse_word_index.get(idx, "<PAD>") for idx in seq]
    words = [w for w in words if w not in ["<PAD>", "<START>", "<UNK>"]]
    texts_test.append(" ".join(words))

# 2. 使用 BERT 分词器对文本进行编码
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128  # BERT 最大序列长度（可根据显存调整）
train_encodings = tokenizer(texts_train, truncation=True, padding=True, max_length=max_len)
test_encodings  = tokenizer(texts_test,  truncation=True, padding=True, max_length=max_len)

# 将标签转换为列表（长整数）
train_labels_list = train_labels.tolist()
test_labels_list  = test_labels.tolist()

# 3. 构建 PyTorch Dataset
class IMDBDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset_bert = IMDBDataset(train_encodings, train_labels_list)
test_dataset_bert  = IMDBDataset(test_encodings,  test_labels_list)
train_loader_bert  = DataLoader(train_dataset_bert, batch_size=16, shuffle=True)
test_loader_bert   = DataLoader(test_dataset_bert,  batch_size=16, shuffle=False)

# 4. 加载预训练 BERT 模型（2 分类）
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

izer = optim.Adam(model_bert.parameters(), lr=2e-5)

# 5. 训练 BERT 模型（微调）
epochs = 2
for epoch in range(epochs):
    model_bert.train()
    total_loss = 0.0
    for batch in train_loader_bert:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model_bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader_bert)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# 6. 在测试集上评估 BERT 模型
model_bert.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader_bert:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model_bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
accuracy = correct / total
print(f"BERT 模型测试准确率: {accuracy*100:.2f}%")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2, Loss: 0.3172
Epoch 2/2, Loss: 0.1884
BERT 模型测试准确率: 90.26%
