# IMDb 文本情感分析

## RNN分析——深度学习初体验（坐牢）

In [1]:
#测试cuda是否可用
import torch
if torch.cuda.is_available():
    print("CUDA is available.")
else:
    print("CUDA is not available.")

CUDA is available.


In [2]:
import warnings
warnings.filterwarnings("ignore")
#导入数据
import pandas as pd
data = pd.read_csv('E:/本科/数据挖掘与商务分析/hw/final/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
#将数据集分为训练集和测试集
from sklearn.model_selection import train_test_split
X = data['review']
y = data['sentiment']
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
test_dataset = [(x, y) for x, y in zip(X_test, y_test)]
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.2, random_state=1)
train_dataset = [(x, y) for x, y in zip(X_train, y_train)]
valid_dataset = [(x, y) for x, y in zip(X_valid, y_valid)]

In [4]:
import re
from collections import Counter, OrderedDict

token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized


for review, _ in train_dataset:
    tokens = tokenizer(review)
    token_counts.update(tokens)
 
    
print('Vocab-size:', len(token_counts))

Vocab-size: 86341


In [5]:
#利用torchtext的vocab将token转换为整数
from torchtext.vocab import vocab

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True) 
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[11, 7, 35, 467]


In [6]:
#定义转换函数
import torch.nn as nn
device = torch.device('cuda' )
review_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
sentiment_pipeline = lambda x: 1 if x == 'positive' else 0

def collate_batch(batch):
    review_list, sentiment_list, lengths = [], [], []
    for _review, _sentiment,in batch:
        processed_text =torch.tensor(review_pipeline(_review), dtype=torch.int64)
        review_list.append(processed_text)
        sentiment_list.append(sentiment_pipeline(_sentiment))
        lengths.append(processed_text.size(0))
    sentiment_list = torch.tensor(sentiment_list)
    lengths = torch.tensor(lengths)
    padded_review_list = nn.utils.rnn.pad_sequence(review_list, batch_first=True)
    return padded_review_list.to(device), sentiment_list.to(device), lengths.to(device)

In [7]:
#small batch 看看实力
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
review_batch, sentiment_batch, length_batch = next(iter(dataloader))
print(review_batch)
print(sentiment_batch)
print(length_batch)
print(review_batch.shape)

tensor([[   11,     7,    39,  ...,     0,     0,     0],
        [   10,  1304,     6,  ...,     0,     0,     0],
        [    2,   590,  2348,  ..., 25611,   785, 29777],
        [   10,   103,    11,  ...,     0,     0,     0]], device='cuda:0')
tensor([1, 1, 0, 1], device='cuda:0')
tensor([164, 120, 448, 254], device='cuda:0')
torch.Size([4, 448])


In [8]:
#batch data生成data loader
batch_size = 32 

train_dl = DataLoader(train_dataset, batch_size=batch_size,shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size,shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [9]:
#将句子转为嵌入编码
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)

#example
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[ 0.8969, -0.1245,  1.3060],
         [ 0.4696,  0.0376, -0.4505],
         [-1.2924,  0.7151,  0.7477],
         [-0.8698,  1.5329, -0.5256]],

        [[-1.2924,  0.7151,  0.7477],
         [-1.3677, -1.6807,  0.2428],
         [ 0.4696,  0.0376, -0.4505],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


### 构建循环神经网络

In [10]:
#用RNN进行情感分析
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size,  batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(86343, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [11]:
#编写train函数
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device).float()  # 确保标签是浮点类型
        lengths = lengths.to(device)
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    all_labels, all_preds = [], []
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            text_batch = text_batch.to(device)
            label_batch = label_batch.to(device).float()  # 确保标签是浮点类型
            lengths = lengths.to(device)
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
            # 收集所有预测分数和真实标签
            all_labels.extend(label_batch.cpu().numpy())
            all_preds.extend(pred.cpu().numpy())
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset), all_labels, all_preds

In [12]:
#编写损失函数
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 

torch.manual_seed(1)

#开始显卡炼丹 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid,_,_ = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')


Epoch 0 accuracy: 0.5945 val_accuracy: 0.6593
Epoch 1 accuracy: 0.6984 val_accuracy: 0.7422
Epoch 2 accuracy: 0.7612 val_accuracy: 0.8031
Epoch 3 accuracy: 0.8486 val_accuracy: 0.8464
Epoch 4 accuracy: 0.8827 val_accuracy: 0.8664
Epoch 5 accuracy: 0.9051 val_accuracy: 0.8744
Epoch 6 accuracy: 0.9263 val_accuracy: 0.8784
Epoch 7 accuracy: 0.9401 val_accuracy: 0.8769
Epoch 8 accuracy: 0.9527 val_accuracy: 0.8814
Epoch 9 accuracy: 0.9631 val_accuracy: 0.8855


In [13]:
#用sklearn评估测试集效果
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
acc_test, loss_test, true_labels, pred_scores= evaluate(test_dl)

# 二分类的阈值设置为0.5
pred_labels = [1 if score >= 0.5 else 0 for score in pred_scores]
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)
auc = roc_auc_score(true_labels, pred_scores)
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')
print(f'AUC Score: {auc:.3f}')

Precision: 0.858
Recall: 0.934
F1 Score: 0.894
AUC Score: 0.955


### 双向循环神经网络

In [14]:
#用RNN进行情感分析
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(86343, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [15]:
#编写train函数
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device).float()  # 确保标签是浮点类型
        lengths = lengths.to(device)
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    all_labels, all_preds = [], []
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            text_batch = text_batch.to(device)
            label_batch = label_batch.to(device).float()  # 确保标签是浮点类型
            lengths = lengths.to(device)
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
            # 收集所有预测分数和真实标签
            all_labels.extend(label_batch.cpu().numpy())
            all_preds.extend(pred.cpu().numpy())
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset), all_labels, all_preds

In [16]:
#编写损失函数
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 

torch.manual_seed(1)

#开始显卡炼丹 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid,_,_ = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6147 val_accuracy: 0.7330
Epoch 1 accuracy: 0.7090 val_accuracy: 0.7538
Epoch 2 accuracy: 0.8048 val_accuracy: 0.8373
Epoch 3 accuracy: 0.8787 val_accuracy: 0.8520
Epoch 4 accuracy: 0.9123 val_accuracy: 0.8761
Epoch 5 accuracy: 0.9342 val_accuracy: 0.8782
Epoch 6 accuracy: 0.9509 val_accuracy: 0.8818
Epoch 7 accuracy: 0.9643 val_accuracy: 0.8534
Epoch 8 accuracy: 0.9765 val_accuracy: 0.8779
Epoch 9 accuracy: 0.9850 val_accuracy: 0.8816


In [17]:
#用sklearn评估测试集效果
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
acc_test, loss_test, true_labels, pred_scores= evaluate(test_dl)

# 二分类的阈值设置为0.5
pred_labels = [1 if score >= 0.5 else 0 for score in pred_scores]
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)
auc = roc_auc_score(true_labels, pred_scores)
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')
print(f'AUC Score: {auc:.3f}')

Precision: 0.901
Recall: 0.865
F1 Score: 0.883
AUC Score: 0.952


### 增加神经网络层数

In [18]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn1 = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True, bidirectional=True)
        self.rnn2 = nn.LSTM(rnn_hidden_size*2, rnn_hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, fc_hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn1(out)
        out, _ = self.rnn2(out)  # 这里需要使用pad_packed_sequence来恢复序列
        out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)  # 添加这行代码
        out, output = out  # 现在out是恢复后的序列，output是序列的隐藏状态
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  # 这里需要确保hidden的形状正确
        out = self.fc1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.sigmoid(out)
        return out

vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(86343, 20, padding_idx=0)
  (rnn1): LSTM(20, 64, batch_first=True, bidirectional=True)
  (rnn2): LSTM(128, 64, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [19]:
#编写train函数
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device).float()  # 确保标签是浮点类型
        lengths = lengths.to(device)
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    all_labels, all_preds = [], []
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            text_batch = text_batch.to(device)
            label_batch = label_batch.to(device).float()  # 确保标签是浮点类型
            lengths = lengths.to(device)
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
            # 收集所有预测分数和真实标签
            all_labels.extend(label_batch.cpu().numpy())
            all_preds.extend(pred.cpu().numpy())
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset), all_labels, all_preds

In [20]:
#编写损失函数
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 

torch.manual_seed(1)

#开始显卡炼丹 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid,_,_ = evaluate(valid_dl)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')

Epoch 0 accuracy: 0.6402 val_accuracy: 0.6529
Epoch 1 accuracy: 0.7530 val_accuracy: 0.7700
Epoch 2 accuracy: 0.7940 val_accuracy: 0.8285
Epoch 3 accuracy: 0.8718 val_accuracy: 0.8660
Epoch 4 accuracy: 0.9062 val_accuracy: 0.8819
Epoch 5 accuracy: 0.9309 val_accuracy: 0.8862
Epoch 6 accuracy: 0.9469 val_accuracy: 0.8910
Epoch 7 accuracy: 0.9608 val_accuracy: 0.8851
Epoch 8 accuracy: 0.9722 val_accuracy: 0.8825
Epoch 9 accuracy: 0.9821 val_accuracy: 0.8882


In [21]:
#用sklearn评估测试集效果
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
acc_test, loss_test, true_labels, pred_scores= evaluate(test_dl)

# 二分类的阈值设置为0.5
pred_labels = [1 if score >= 0.5 else 0 for score in pred_scores]
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)
auc = roc_auc_score(true_labels, pred_scores)
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')
print(f'AUC Score: {auc:.3f}')

Precision: 0.906
Recall: 0.881
F1 Score: 0.893
AUC Score: 0.955


可能由于过拟合问题，难以进一步提升

## 引入Transformer——Attention is all you need

In [22]:
torch.backends.cudnn.deterministic = True #确保结果可重现
torch.manual_seed(1)
num_epochs = 3