In [1]:
import collections
import os
import random
import tarfile
import torch
from torch import nn,optim
import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys
sys.path.append('../code')
import d2lzh_pytorch as d2l

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Directory holding the dataset archive; also passed as the GloVe cache directory further down.
DATA_ROOT='../data'

In [2]:
# Unpack the dataset archive into DATA_ROOT, but only when the 'aclImdb'
# directory is not already present there, so re-running the cell is cheap.
fname=os.path.join(DATA_ROOT,'aclImdb_v1.tar.gz')
if not os.path.exists(os.path.join(DATA_ROOT,'aclImdb')):
    print('从压缩包解压...')
    # Mode 'r' lets tarfile detect the compression transparently.
    with tarfile.open(fname,'r') as f:
        f.extractall(DATA_ROOT)

In [3]:
from tqdm import tqdm

def read_imdb(folder='train',data_root='../data/aclImdb/'):
    """Load every review of one dataset split together with its sentiment label.

    Args:
        folder: sub-directory of ``data_root`` to read (the notebook uses
            'train' and 'test').
        data_root: directory that contains the extracted dataset.

    Returns:
        A shuffled list of ``[review, label]`` pairs. Each review is the file
        content decoded as UTF-8 with newlines removed and lower-cased; the
        label is 1 for files under ``pos`` and 0 for files under ``neg``.
    """
    examples=[]
    for sentiment,target in (('pos',1),('neg',0)):
        review_dir=os.path.join(data_root,folder,sentiment)
        for name in tqdm(os.listdir(review_dir)):
            path=os.path.join(review_dir,name)
            # Read raw bytes and decode explicitly so the platform default
            # encoding never comes into play.
            with open(path,'rb') as handle:
                raw=handle.read()
            text=raw.decode('utf-8').replace('\n','').lower()
            examples.append([text,target])
    random.shuffle(examples)
    return examples

# Load both splits; each is a shuffled list of [review, label] pairs.
train_data,test_data=read_imdb('train'),read_imdb('test')

100%|██████████| 12500/12500 [00:01<00:00, 6345.00it/s]
100%|██████████| 12500/12500 [00:01<00:00, 7234.11it/s]
100%|██████████| 12500/12500 [00:01<00:00, 9850.71it/s] 
100%|██████████| 12500/12500 [00:01<00:00, 9973.84it/s] 


In [6]:
# Number of training examples; uncomment the second line to check the test split instead.
len(train_data)
#len(test_data)

25000

In [7]:
# Inspect one [review, label] example.
train_data[0]

['this move is slow, plodding, cold, dark, and without a plot or hope. it follows that tried and true european formula that they love to subsidize, that is never seen, but that the critics think makes an "important point".<br /><br />the movie is valuable if nothing more than to show the huge difference in the thinking between americans and europeans regarding employment. in this movie the men are still nursing their wounds from years ago and feel it\'s the government\'s duty to provide them with work. whereas in the u.s. we know we have to go out there and create value for someone.<br /><br />spain never looked so backward!',
 0]

In [9]:
def get_tokenized_imdb(data):
    """Tokenize every review in ``data``.

    Args:
        data: iterable of ``(review, label)`` pairs; the labels are ignored.

    Returns:
        One list of tokens per review, in input order. Tokens are produced by
        splitting on single spaces (so consecutive spaces yield empty-string
        tokens) and lower-casing each piece.
    """
    tokenized=[]
    for review,_label in data:
        tokenized.append([piece.lower() for piece in review.split(' ')])
    return tokenized

In [10]:
def get_vocab_imdb(data):
    """Build a vocabulary from the tokens of ``data``.

    Args:
        data: iterable of ``(review, label)`` pairs.

    Returns:
        The object produced by ``Vocab.Vocab`` from the token counts over all
        reviews, with ``min_freq=5``.
    """
    token_counts=collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return Vocab.Vocab(token_counts,min_freq=5)

# Build the vocabulary from the training split only, then display its size.
vocab=get_vocab_imdb(train_data)
'# words in vocab:',len(vocab)

('# words in vocab:', 46152)

In [31]:
# Show which token owns index 0 (the value preprocess_imdb pads with).
vocab.itos[0]

'<unk>'

In [21]:
def preprocess_imdb(data,vocab,max_l=500):
    """Convert reviews into fixed-length index tensors plus a label tensor.

    Args:
        data: sequence of ``(review, label)`` pairs.
        vocab: object whose ``stoi`` maps a token to its integer index.
        max_l: length every review is truncated or padded to. Defaults to 500,
            the value that used to be hard-coded.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(len(data), max_l)`` and ``labels`` has shape ``(len(data),)``.

    Note:
        Short reviews are right-padded with index 0. In this notebook
        ``vocab.itos[0]`` is ``'<unk>'``, so padding positions share the
        unknown-word index.
    """
    def pad(x):
        # Slicing truncates long inputs; a negative repeat count yields [],
        # so one expression covers both the truncate and the pad case.
        return x[:max_l]+[0]*(max_l-len(x))

    tokenized_data=get_tokenized_imdb(data)
    features=torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels=torch.tensor([score for _,score in data])
    return features,labels

In [22]:
# Wrap the preprocessed (features, labels) tensors in datasets and batch them.
batch_size=64
train_set=Data.TensorDataset(*preprocess_imdb(train_data,vocab))
test_set=Data.TensorDataset(*preprocess_imdb(test_data,vocab))
# Only the training loader shuffles; the test loader keeps dataset order.
train_iter=Data.DataLoader(train_set,batch_size,shuffle=True)
test_iter=Data.DataLoader(test_set,batch_size)

In [24]:
# Print the shapes of the first training batch only, then display the number of batches.
for X,y in train_iter:
    print('X',X.shape,'y',y.shape)
    break
'#batches:', len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches:', 391)

In [32]:
class BiRNN(nn.Module):
    """Bidirectional LSTM classifier that emits two scores per input sequence.

    Token indices are embedded, passed through a stacked bidirectional LSTM,
    and the LSTM outputs at the first and last time steps are concatenated
    and fed to a linear layer.
    """

    def __init__(self,vocab,embed_size,num_hidens,num_layers):
        """Create the embedding, LSTM encoder and linear decoder.

        Args:
            vocab: only ``len(vocab)`` is used, to size the embedding table.
            embed_size: embedding dimension, which is also the LSTM input size.
            num_hidens: LSTM hidden size per direction.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        vocab_size=len(vocab)
        self.embedding=nn.Embedding(vocab_size,embed_size)
        self.encoder=nn.LSTM(
            input_size=embed_size,
            hidden_size=num_hidens,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Two time steps, each carrying both directions: 2 * 2 * num_hidens features.
        self.decoder=nn.Linear(4*num_hidens,2)

    def forward(self,inputs):
        """Map a 2-D index tensor to a tensor with two scores per row of ``inputs``."""
        # Swap the two axes so the LSTM (built without batch_first) sees time first.
        time_major=inputs.permute(1,0)
        states,_=self.encoder(self.embedding(time_major))
        first_step,last_step=states[0],states[-1]
        features=torch.cat([first_step,last_step],dim=-1)
        return self.decoder(features)

In [33]:
# embed_size is 100 to match the 100-dimensional GloVe vectors loaded below.
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

In [35]:
# Load the 100-dimensional GloVe '6B' vectors, using DATA_ROOT as the cache directory.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=DATA_ROOT)

100%|█████████▉| 398589/400000 [00:30<00:00, 29340.88it/s]

In [38]:
def load_pretrained_embedding(words,pretrained_vocab):
    """Collect the pretrained vector for each entry of ``words``.

    Args:
        words: sequence of tokens; row ``i`` of the result belongs to ``words[i]``.
        pretrained_vocab: object exposing ``stoi`` (token -> row index) and
            ``vectors`` (indexable collection of vectors).

    Returns:
        A ``(len(words), dim)`` tensor, ``dim`` being the length of
        ``pretrained_vocab.vectors[0]``. Rows of words whose lookup raises
        ``KeyError`` stay all-zero; when there is at least one such word the
        count is printed.
    """
    dim=pretrained_vocab.vectors[0].shape[0]
    embed=torch.zeros(len(words),dim)
    missing=0
    for row,word in enumerate(words):
        try:
            embed[row]=pretrained_vocab.vectors[pretrained_vocab.stoi[word]]
        except KeyError:
            # Out-of-vocabulary word: keep the zero row and count it.
            missing+=1
    if missing:
        print("There are %d oov words." % missing)
    return embed

# Overwrite the embedding table with the pretrained vectors (rows follow vocab.itos order) ...
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
# ... and freeze it so it receives no gradient updates.
net.embedding.weight.requires_grad = False

There are 21202 oov words.


In [40]:
lr, num_epochs = 0.01, 5
# Filter out the embedding parameters, which do not require gradients,
# so the optimizer only receives trainable parameters.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.6064, train acc 0.657, test acc 0.793, time 155.5 sec
epoch 2, loss 0.1991, train acc 0.823, test acc 0.841, time 156.9 sec
epoch 3, loss 0.1144, train acc 0.852, test acc 0.855, time 155.5 sec
epoch 4, loss 0.0736, train acc 0.878, test acc 0.850, time 156.9 sec
epoch 5, loss 0.0512, train acc 0.895, test acc 0.860, time 155.9 sec


In [41]:
# This function is also saved in the d2lzh_pytorch package for later reuse.
def predict_sentiment(net, vocab, sentence):
    """Classify a tokenized sentence as 'positive' or 'negative'.

    Args:
        net: model that accepts a ``(1, seq_len)`` index tensor and returns
            class scores with the classes along dim 1.
        vocab: object whose ``stoi`` maps a token to its integer index.
        sentence: list of word tokens.

    Returns:
        'positive' when the highest-scoring class index is 1, otherwise
        'negative'.
    """
    # Build the input on whatever device the model's parameters live on.
    device = next(net.parameters()).device
    indices = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    # Echo the index tensor. The original line printed an undefined name
    # (`sentencetence`) and raised NameError on every call.
    print(indices)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        label = torch.argmax(net(indices.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'


In [42]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great']) # expected: positive


tensor([10, 20,  7, 38, 88], device='cuda:0')


'positive'

In [43]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad']) # expected: negative

tensor([10, 20,  7, 38, 97], device='cuda:0')


'negative'