In [2]:
# 文本情感分类
# 使用卷积神经网络（textCNN）

# --- Standard library ---
import collections
import os
import random
import sys
import tarfile
# `imp` has been deprecated since Python 3.4 and was removed in Python 3.12;
# `importlib.reload` is the drop-in replacement for `imp.reload`.
from importlib import reload

# --- Third-party ---
import torch
import torch.nn.functional as F
import torch.utils.data as Data
import torchtext.vocab as Vocab
from torch import nn

# --- Local helpers ---
# The helper package lives one directory up, so that directory must be on
# sys.path before the import below can resolve.
sys.path.append("..")
import d2l_pytorch.d2l as d2l

# Re-import the helper module so edits made to it are picked up without
# restarting the kernel.
reload(d2l)

# --- Configuration ---
# Restrict CUDA to device index 0, then fall back to the CPU when no GPU is
# available.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DATA_ROOT = "..//..//ACLImdb_v1//"


  from imp import reload


In [3]:
def corr1d(X, K):
  """One-dimensional cross-correlation of `X` with kernel `K`.

  The kernel is slid over `X` one position at a time; at each position the
  overlapping window is multiplied element-wise with `K` and summed.

  Args:
    X: 1-D tensor holding the input sequence.
    K: 1-D tensor holding the kernel.

  Returns:
    A 1-D tensor of length ``len(X) - len(K) + 1``. It is allocated with
    ``torch.zeros`` and therefore has torch's default floating dtype,
    regardless of the dtype of `X` and `K`.
  """
  kernel_width = K.shape[0]
  num_outputs = X.shape[0] - kernel_width + 1
  Y = torch.zeros(num_outputs)
  for start in range(num_outputs):
    window = X[start:start + kernel_width]
    Y[start] = torch.sum(window * K)
  return Y

In [4]:
# Worked example: a length-7 input and a width-2 kernel.
X = torch.tensor([0, 1, 2, 3, 4, 5, 6])
K = torch.tensor([1, 2])
corr1d(X, K)

tensor([ 2.,  5.,  8., 11., 14., 17.])

In [5]:
def corr1d_multi_in(X, K):
  """Multi-input-channel 1-D cross-correlation.

  Iterates over the first dimension of `X` and `K` in lockstep, applies
  `corr1d` to each (channel, kernel) pair, and adds the per-channel results
  element-wise.

  Args:
    X: tensor whose first dimension indexes input channels.
    K: tensor whose first dimension indexes the matching kernels.

  Returns:
    The element-wise sum of the per-channel `corr1d` outputs.
  """
  per_channel = []
  for x, k in zip(X, K):
    per_channel.append(corr1d(x, k))
  return torch.stack(per_channel).sum(dim=0)


# Worked example: three input channels of length 7, each paired with its own
# width-2 kernel.
X = torch.tensor([
  [0, 1, 2, 3, 4, 5, 6],
  [1, 2, 3, 4, 5, 6, 7],
  [2, 3, 4, 5, 6, 7, 8],
])
K = torch.tensor([
  [1, 2],
  [3, 4],
  [-1, -3],
])
corr1d_multi_in(X, K)

tensor([ 2.,  8., 14., 20., 26., 32.])

In [6]:
class GlobalMaxPool1d(nn.Module):
  """Max-over-time pooling layer.

  Applies 1-D max pooling with a window as wide as the whole sequence, so
  every channel is reduced to a single value. The layer has no parameters.
  """

  def __init__(self):
    super().__init__()

  def forward(self, x):
    """Pool over the last (sequence) dimension.

    Args:
      x: tensor of shape (batch_size, channel, seq_len).

    Returns:
      Tensor of shape (batch_size, channel, 1).
    """
    seq_len = x.shape[2]
    return F.max_pool1d(x, kernel_size=seq_len)

In [7]:
batch_size = 64

# Both splits are read from the same directory. Derive it once from DATA_ROOT
# (defined in the setup cell) instead of repeating the literal path; the join
# yields exactly the path the literals spelled out.
imdb_dir = os.path.join(DATA_ROOT, "aclimdb")
train_data = d2l.read_imdb("train", data_root=imdb_dir)
test_data = d2l.read_imdb("test", data_root=imdb_dir)

# The vocabulary is built from the training split only and then reused to
# encode the test split.
vocab = d2l.get_vocab_imdb(train_data)

# preprocess_imdb's result is unpacked straight into TensorDataset, so it is
# presumably a (features, labels) pair of tensors -- TODO confirm against d2l.
train_set = Data.TensorDataset(*d2l.preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*d2l.preprocess_imdb(test_data, vocab))

# Only the training loader shuffles; the test loader keeps the dataset order.
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)


100%|██████████| 12500/12500 [00:03<00:00, 4013.38it/s]
100%|██████████| 12500/12500 [00:02<00:00, 4556.73it/s]
100%|██████████| 12500/12500 [00:02<00:00, 5362.73it/s]
100%|██████████| 12500/12500 [00:02<00:00, 5375.49it/s]


In [8]:
# 搭建textCNN模型
# 模型主要使用了一维卷积层和时序最大池化层
# 假设输入的文本序列由n个词组成，每个词用d维的词向量表示。那输入样本的宽为n，高为1，输入通道数为d

""" 
  1. 定义多个一维卷积核，并使用这些卷积核对输入分别做卷积计算。宽度不同的卷积核可能会捕捉到不同个数的相邻词的相关性
  2. 对输出的所有通道分别做时序最大池化，再将这些通道的池化输出连结为向量
  3. 通过全连接层将连结后的向量变换为有关各类别的输出。这一步可以使用丢弃层应对过拟合
"""


class TextCNN(nn.Module):
  """textCNN classifier built from parallel 1-D convolutions.

  Each token is looked up in two embedding tables whose vectors are
  concatenated. Every convolution branch is followed by ReLU and
  max-over-time pooling; the pooled branch outputs are concatenated, passed
  through dropout, and mapped to 2 output scores by a linear layer.

  Args:
    vocab: any object supporting ``len()``; its length sets the number of
      rows in both embedding tables.
    embed_size: dimensionality of each embedding table.
    kernel_sizes: kernel width of each convolution branch.
    num_channels: output channels of each convolution branch, paired
      position-by-position with `kernel_sizes`.
  """

  def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
    super().__init__()
    vocab_size = len(vocab)
    self.embedding = nn.Embedding(vocab_size, embed_size)

    # Second embedding table, intended to stay fixed during training.
    # NOTE(review): nothing in this class freezes it; presumably the caller
    # sets `requires_grad = False` on its weight -- confirm.
    self.constant_embedding = nn.Embedding(vocab_size, embed_size)
    self.dropout = nn.Dropout(0.5)
    self.decoder = nn.Linear(sum(num_channels), 2)

    # Max-over-time pooling has no weights, so one instance is shared by
    # every convolution branch.
    self.pool = GlobalMaxPool1d()

    # One Conv1d per (channels, kernel width) pair. Input channels are
    # 2 * embed_size because the two embeddings are concatenated.
    self.convs = nn.ModuleList()
    for out_channels, width in zip(num_channels, kernel_sizes):
      branch = nn.Conv1d(
        in_channels=2 * embed_size,
        out_channels=out_channels,
        kernel_size=width,
      )
      self.convs.append(branch)

  def forward(self, inputs):
    """Compute the output scores for a batch of token-index sequences.

    Args:
      inputs: tensor of token indices, shape (batch_size, num_tokens).

    Returns:
      Tensor of shape (batch_size, 2).
    """
    # Concatenate the two (batch_size, num_tokens, embed_size) embedding
    # outputs along the embedding dimension.
    learned = self.embedding(inputs)
    fixed = self.constant_embedding(inputs)
    embeddings = torch.cat((learned, fixed), dim=2)

    # Conv1d expects channels before length, so move the embedding
    # dimension in front of the token dimension.
    embeddings = embeddings.permute(0, 2, 1)

    # Each branch yields (batch_size, channels, 1) after pooling; squeeze
    # drops the trailing dimension before concatenating along channels.
    pooled = []
    for conv in self.convs:
      feature_map = F.relu(conv(embeddings))
      pooled.append(self.pool(feature_map).squeeze(-1))
    encoding = torch.cat(pooled, dim=1)

    return self.decoder(self.dropout(encoding))