In [29]:
# 文本情感分类
# 使用循环神经网络

import collections
import os
import random
import tarfile
# `imp` is deprecated and was removed in Python 3.12; importlib.reload is the
# drop-in replacement and keeps the name `reload` available to later cells.
from importlib import reload
import torch
from torch import nn
import torch.utils.data as Data
import torchtext.vocab as Vocab

import sys
# Make the parent directory importable so the local d2l_pytorch package resolves.
sys.path.append("..")
import d2l_pytorch.d2l as d2l

# Re-import the helper module so edits to it are picked up without a kernel restart.
reload(d2l)
# Expose only GPU 0 to CUDA; set before CUDA is first queried on the next line.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Directory that contains the extracted "aclImdb" dataset folder.
DATA_ROOT  = "..//..//ACLImdb_v1//"



In [3]:
# 检验数据集是否存在

# Sanity check: report whether the extracted dataset directory is present.
data_root = DATA_ROOT + "aclImdb"
dataset_found = os.path.exists(data_root)
print(dataset_found)

True


In [4]:
from tqdm import tqdm


def read_imdb(folder="train", data_root=DATA_ROOT + "aclImdb"):
  """Read one split of the IMDb review dataset from disk.

  Args:
    folder: sub-directory of ``data_root`` to read ("train" or "test" in this notebook).
    data_root: dataset directory containing ``<folder>/pos`` and ``<folder>/neg``.
      The default uses the on-disk spelling "aclImdb" (same as the existence
      check in this notebook) so it also resolves on case-sensitive filesystems.

  Returns:
    A shuffled list of ``[review, label]`` pairs. ``review`` is the file's
    UTF-8 text, lower-cased and with newline characters removed; ``label`` is
    1 for files under "pos" and 0 for files under "neg".
  """
  data = []
  for label in ["pos", "neg"]:
    folder_name = os.path.join(data_root, folder, label)
    target = 1 if label == "pos" else 0

    for file in tqdm(os.listdir(folder_name)):
      # Read raw bytes and decode explicitly so the platform default encoding is never used.
      with open(os.path.join(folder_name, file), "rb") as f:
        review = f.read().decode("utf-8").replace("\n", "").lower()
      data.append([review, target])

  # Shuffles in place with the global `random` state; "pos" and "neg" become interleaved.
  random.shuffle(data)
  return data


In [5]:
# Load both splits; each call shows one progress bar per label directory.
train_data = read_imdb("train")
test_data = read_imdb("test")

100%|██████████| 12500/12500 [00:08<00:00, 1501.79it/s]
100%|██████████| 12500/12500 [00:08<00:00, 1517.40it/s]
100%|██████████| 12500/12500 [00:07<00:00, 1591.05it/s]
100%|██████████| 12500/12500 [00:14<00:00, 864.26it/s] 


In [6]:
# 预处理数据

# 分词
def get_tokenized_imdb(data):
  """Tokenize every review by splitting on single spaces and lower-casing.

  data: list of [string, label]; the label is ignored here.
  Returns one list of tokens per review, in the same order as ``data``.
  Consecutive spaces produce empty-string tokens, because the split is on ' '.
  """
  token_lists = []
  for review, _ in data:
    pieces = review.split(' ')
    token_lists.append(list(map(str.lower, pieces)))
  return token_lists

In [7]:
# 创建字典
def get_vocab_imdb(data):
  """Build a torchtext vocabulary from the tokens of every review in ``data``.

  Token frequencies are accumulated over all reviews and handed to
  ``Vocab.Vocab`` with ``min_freq=5``.
  """
  counter = collections.Counter()
  for tokens in get_tokenized_imdb(data):
    counter.update(tokens)
  return Vocab.Vocab(counter, min_freq=5)

# The vocabulary is built from the training split only.
vocab = get_vocab_imdb(train_data)
print("# words in vocab:", len(vocab))

# words in vocab: 46152


In [8]:
# 每条评论长度不一致所以不能直接合成小批量
# 需要对每条评论进行分词，并通过字典转换成词索引，通过截断或补0来将每条评论长度固定到500


def preprocess_imdb(data, vocab, max_l=500):
  """Turn reviews into a fixed-length index tensor plus a label tensor.

  Args:
    data: list of ``[review, label]`` pairs.
    vocab: object whose ``stoi`` mapping converts a token to its integer index.
    max_l: number of tokens every review is truncated or padded to.
      Defaults to 500, the value that was previously hard-coded.

  Returns:
    ``(features, labels)``. For non-empty ``data``, ``features`` has shape
    ``(len(data), max_l)`` and ``labels`` has shape ``(len(data),)``.
  """
  def pad(x):
    # Keep the first max_l indices of long reviews; right-pad short ones with index 0.
    return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

  tokenized_data = get_tokenized_imdb(data)
  features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
  labels = torch.tensor([score for _, score in data])
  return features, labels


In [9]:
batch_size = 64

# Wrap the (features, labels) tensors of each split as a dataset.
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))

# Only the training batches are shuffled; the test loader keeps dataset order.
train_iter = Data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_iter = Data.DataLoader(dataset=test_set, batch_size=batch_size)

# Peek at the first training batch to confirm the tensor shapes.
for X, y in train_iter:
  print("X", X.shape, "y", y.shape)
  break

print("# batches: ", len(train_iter))

X torch.Size([64, 500]) y torch.Size([64])
# batches:  391


In [12]:
# 搭建RNN循环神经网络


class BiRNN(nn.Module):
  """Bidirectional LSTM classifier over sequences of token indices.

  Args:
    vocab: vocabulary; only ``len(vocab)`` is used, to size the embedding table.
    embed_size: dimension of each token embedding.
    num_hiddens: hidden units per LSTM direction.
    num_layers: number of stacked LSTM layers.
    num_classes: number of output classes. Defaults to 2, the value that was
      previously hard-coded.
  """

  def __init__(self, vocab, embed_size, num_hiddens, num_layers, num_classes=2):
    super().__init__()
    self.embedding = nn.Embedding(len(vocab), embed_size)
    self.encoder = nn.LSTM(
      input_size=embed_size, hidden_size=num_hiddens, num_layers=num_layers, bidirectional=True
    )
    # Each time step of the bidirectional output is 2 * num_hiddens wide and
    # forward() concatenates two time steps, hence 4 * num_hiddens inputs.
    self.decoder = nn.Linear(4 * num_hiddens, num_classes)

  def forward(self, inputs):
    """Map ``inputs`` of shape (batch size, number of tokens) to class scores
    of shape (batch size, num_classes)."""
    # The LSTM expects the sequence length as the first dimension, so transpose
    # before the embedding lookup.
    # Resulting shape: (number of tokens, batch size, embedding dimension).
    embeddings = self.embedding(inputs.permute(1, 0))
    # Only the input is passed (no initial state). `outputs` holds the last
    # layer's hidden states at every time step:
    # (number of tokens, batch size, 2 * num_hiddens).
    # The second return value, the final (h, c) states, is not needed.
    outputs, _ = self.encoder(embeddings)
    # Concatenate the hidden states of the first and the last time step as the
    # input of the fully connected layer: (batch size, 4 * num_hiddens).
    encoding = torch.cat((outputs[0], outputs[-1]), -1)
    outs = self.decoder(encoding)
    return outs

In [13]:
# Model hyper-parameters; embed_size is 100 to match the 100-d GloVe vectors loaded in this notebook.
embed_size = 100
num_hiddens = 100
num_layers = 2
net = BiRNN(vocab, embed_size=embed_size, num_hiddens=num_hiddens, num_layers=num_layers)

In [20]:
# 情感分类的训练数据集不是很大，为了应对过拟合，直接使用更大规模的语料预训练的词向量作为每个词的特征向量
# 100-dimensional GloVe "6B" vectors, cached under ../Datasets/Glove.
# The previous os.path.join call had a single argument and returned it unchanged.
glove_vocab = Vocab.GloVe(name="6B", dim=100, cache="../Datasets/Glove")


100%|█████████▉| 399999/400000 [00:31<00:00, 12800.97it/s]


In [23]:
from d2l_pytorch.d2l import load_pretrained_embedding


# Overwrite the embedding table with the pretrained vectors, then freeze it so
# training leaves it untouched.
# NOTE(review): load_pretrained_embedding presumably returns one row per entry of vocab.itos — confirm in d2l.
embedding_weight = net.embedding.weight
embedding_weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
embedding_weight.requires_grad = False


There are 21202 oov words.


In [25]:
lr = 0.01
num_epochs = 5
# Hand the optimizer only the parameters that still require gradients; the
# frozen embedding weights are skipped.
optimizer = torch.optim.Adam((p for p in net.parameters() if p.requires_grad), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.5667, train acc 0.703, test acc 0.775 , time 49.1 sec
epoch 2, loss 0.2198, train acc 0.798, test acc 0.807 , time 45.5 sec
epoch 3, loss 0.1239, train acc 0.839, test acc 0.773 , time 45.8 sec
epoch 4, loss 0.0843, train acc 0.858, test acc 0.849 , time 45.9 sec
epoch 5, loss 0.0578, train acc 0.879, test acc 0.854 , time 46.2 sec


In [31]:
# Spot-check the trained model on a sentence that should read as positive.
positive_tokens = ["this", "movie", "is", "so", "great"]
d2l.predict_sentiment(net, vocab, positive_tokens)


'positive'

In [32]:
# Spot-check the trained model on a sentence that should read as negative.
negative_tokens = ["this", "movie", "is", "so", "bad"]
d2l.predict_sentiment(net, vocab, negative_tokens)

'negative'