### 用 torchtext 加载 IMDB 数据

In [1]:
from torchtext import data
from torchtext.vocab import Vectors
from tqdm import tqdm
import torchtext
import torch
from torch import nn
import torch.nn.functional as F

from torch.nn import init
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [22]:
# Start tensorboardX: create the writer that the training loop logs scalars to.

from tensorboardX import SummaryWriter

# "./tensorboard" is the location handed to the writer for its log output.
writer = SummaryWriter("./tensorboard")

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Text field: lower-cased token sequences mapped through a vocabulary.
# include_lengths=True makes each batch carry a (tokens, lengths) pair, and
# batch_first=False keeps the token tensor laid out as [sent len, batch size].
SENTENCE = data.Field(
    sequential=True,
    use_vocab=True,
    lower=True,
    include_lengths=True,
    batch_first=False,
)
# Label field: one non-sequential value per example, mapped through its own vocab.
LABEL = data.LabelField(use_vocab=True, sequential=False)



In [5]:
# Load both splits from TSV files in the current directory. The header row is
# skipped and the two columns are bound to the 'sentence' and 'label' fields.
# Note that IMDBTest.tsv is what serves as the validation split here.
trainDataset, valDataset = data.TabularDataset.splits(
    path='.',
    format='tsv',
    train='IMDBTrain.tsv',
    validation='IMDBTest.tsv',
    skip_header=True,
    fields=[
        ('sentence', SENTENCE),
        ('label', LABEL),
    ],
)



In [6]:
# Pre-trained 100-d GloVe vectors, looked up / cached under ../../glove.
vectors = torchtext.vocab.Vectors(
    name='glove.6B.100d.txt',
    cache='../../glove',
)

# Both vocabularies are built from the training split only; the sentence
# vocabulary additionally receives one pre-trained vector per token.
# NOTE(review): unk_init is presumably meant to initialise tokens missing from
# GloVe, but the embedding matrix printed further down ends in all-zero rows —
# confirm that this argument is honoured when a Vectors object is passed.
SENTENCE.build_vocab(
    trainDataset,
    vectors=vectors,
    unk_init=init.xavier_normal,
)
LABEL.build_vocab(trainDataset)

In [7]:
# Inspect the ten most frequent tokens counted while building the sentence vocabulary.
SENTENCE.vocab.freqs.most_common(10)

[('the', 322198),
 ('a', 159953),
 ('and', 158572),
 ('of', 144462),
 ('to', 133967),
 ('is', 104171),
 ('in', 90527),
 ('i', 70480),
 ('this', 69714),
 ('that', 66292)]

In [8]:
# Sanity-check the vocabulary mappings: index -> token (itos) and token -> index (stoi).
print(SENTENCE.vocab.itos[1510])
print(SENTENCE.vocab.stoi['bore'])

# Shape of the vector table attached to the vocabulary.
print(SENTENCE.vocab.vectors.shape)

thats
3637
torch.Size([251639, 100])


In [9]:
# torchtext ships a large number of built-in iterators; BucketIterator is the one used here.
# Both iterators use sentence length as the sort key and place their batches on DEVICE.

trainIter = data.BucketIterator(
    trainDataset,
    batch_size=16,
    sort_key=lambda example: len(example.sentence),
    shuffle=True,
    device=DEVICE,
)
valIter = data.BucketIterator(
    valDataset,
    batch_size=32,
    sort_key=lambda example: len(example.sentence),
    shuffle=True,
    device=DEVICE,
)



In [10]:
# BucketIterator是torchtext最强大的功能之一，会自动对输入序列进行shuffle并分桶（bucket）

In [11]:
class RNN(nn.Module):
    """LSTM sentence classifier operating directly on padded token batches.

    The padded positions are fed through the LSTM like any other token (no
    packing is done here), so the final hidden state may include padding steps.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: number of classes scored by the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability used on the embeddings, between LSTM
            layers and on the final hidden representation.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value does nothing except raise a warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return per-class log-probabilities of shape [batch size, output_dim].

        Args:
            text: token indices laid out as [sent len, batch size].
        """
        embedded = self.dropout(self.embedding(text))  # [sent len, batch size, emb dim]
        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Last layer: final forward state (hidden[-2]) and final backward
            # state (hidden[-1]), concatenated along the feature axis.
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the last layer's final state exists.
            final_state = hidden[-1, :, :]
        final_state = self.dropout(final_state)  # [batch size, hid dim * num directions]

        # Normalise over the class axis explicitly; relying on the implicit
        # dimension is deprecated.
        return F.log_softmax(self.fc(final_state), dim=1)


In [12]:
# This variant uses torch.nn.utils.rnn.pack_padded_sequence so that the LSTM
# skips the padded positions of each sequence.
class RNN2(nn.Module):
    """LSTM sentence classifier that packs padded batches before the LSTM.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state per direction.
        output_dim: number of classes scored by the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability used on the embeddings, between LSTM
            layers and on the final hidden representation.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value does nothing except raise a warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tokens, inputLengths):
        """Return per-class log-probabilities of shape [batch size, output_dim].

        Args:
            tokens: token indices laid out as [sent len, batch size].
            inputLengths: unpadded length of every sequence in the batch,
                as a tensor or a list of ints.
        """
        # pack_padded_sequence requires tensor lengths to live on the CPU,
        # while the iterator delivers them on the same device as the tokens.
        if torch.is_tensor(inputLengths):
            inputLengths = inputLengths.cpu()

        embedded = self.dropout(self.embedding(tokens))  # [sent len, batch size, emb dim]
        # enforce_sorted=False because batches are not sorted by length.
        packed = pack_padded_sequence(input=embedded, lengths=inputLengths,
                                      enforce_sorted=False)
        # hiddens = [num layers * num directions, batch size, hid dim]
        _, (hiddens, _) = self.rnn(packed)

        if self.rnn.bidirectional:
            # Last layer: final forward state (hiddens[-2]) and final backward
            # state (hiddens[-1]), concatenated along the feature axis.
            final_state = torch.cat((hiddens[-2, :, :], hiddens[-1, :, :]), dim=1)
        else:
            # Unidirectional: only the last layer's final state exists.
            final_state = hiddens[-1, :, :]
        outputs = self.dropout(final_state)  # [batch size, hid dim * num directions]

        # Normalise over the class axis explicitly; relying on the implicit
        # dimension is deprecated.
        return F.log_softmax(self.fc(outputs), dim=1)


In [13]:
# Model hyper-parameters.
INPUT_DIM = len(SENTENCE.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100              # matches the 100-d GloVe vectors loaded earlier
HIDDEN_DIM = 50
OUTPUT_DIM = 2                   # size of the classifier's output layer
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = SENTENCE.vocab.stoi[SENTENCE.pad_token]

# Build the packed-sequence classifier and place it on the selected device.
model = RNN2(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
).to(DEVICE)

In [14]:
# Seed the embedding layer with the vectors attached to the sentence vocabulary.
pretrained_embeddings = SENTENCE.vocab.vectors
UNK_IDX = SENTENCE.vocab.stoi[SENTENCE.unk_token]

embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)

# The unknown-token and padding rows start out as all zeros.
embedding_weights[UNK_IDX].zero_()
embedding_weights[PAD_IDX].zero_()

print(model.embedding.weight.data)


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')


In [15]:
import torch.optim as optim

# Place the model on DEVICE before constructing the optimizer, as the
# torch.optim documentation recommends.
model = model.to(DEVICE)
optimizer = optim.Adam(model.parameters())

# The model's forward pass already ends in F.log_softmax, i.e. it returns
# log-probabilities. nn.NLLLoss is the criterion that expects exactly that;
# nn.CrossEntropyLoss would apply log-softmax a second time on top of it.
criterion = nn.NLLLoss()
criterion = criterion.to(DEVICE)


In [19]:
def train(model, iterator, optimizer, criterion, skip=500):
    """Run one training pass over ``iterator`` and update ``model`` in place.

    Each batch must expose ``sentence`` (indexable; items 0 and 1 are passed to
    the model) and ``label``. Loss and accuracy of every ``skip``-th batch are
    printed.

    Returns:
        ``(mean batch loss, mean batch accuracy)``, both divided by
        ``len(iterator)``.
    """
    model.train()
    loss_total = 0
    acc_total = 0

    for step, batch in enumerate(iterator):
        sentence = batch.sentence

        optimizer.zero_grad()
        scores = model(sentence[0], sentence[1])
        batch_loss = criterion(scores, batch.label)
        batch_loss.backward()
        optimizer.step()

        # Predicted class = index of the highest score in each row.
        predicted = torch.max(scores.detach(), dim=1)[1]
        batch_acc = (predicted == batch.label.detach()).double().mean()

        loss_value = batch_loss.item()
        acc_value = batch_acc.item()
        loss_total += loss_value
        acc_total += acc_value

        if step % skip == 0:
            print(" Train Mini batch loss ", loss_value)
            print(" Train Mini batch acc  ", acc_value)

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [17]:
def evaluate(model, iterator, criterion, skip=500):
    """Score ``model`` on ``iterator`` without computing gradients.

    Each batch must expose ``sentence`` (indexable; items 0 and 1 are passed to
    the model) and ``label``. Loss and accuracy of every ``skip``-th batch are
    printed.

    Returns:
        ``(mean batch loss, mean batch accuracy)``, both divided by
        ``len(iterator)``.
    """
    model.eval()
    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for step, batch in enumerate(iterator):
            sentence = batch.sentence
            scores = model(sentence[0], sentence[1])

            loss_value = criterion(scores, batch.label).item()
            # Predicted class = index of the highest score in each row.
            predicted = torch.max(scores, dim=1)[1]
            acc_value = (predicted == batch.label).double().mean().item()

            loss_total += loss_value
            acc_total += acc_value

            if step % skip == 0:
                print("Valid Mini batch loss ", loss_value)
                print("Valid Mini batch acc  ", acc_value)

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [18]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [24]:
N_EPOCHS = 2
# NOTE(review): best_valid_loss is initialised here but never updated or read
# in this cell — presumably a leftover from a checkpointing step; confirm.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one full train + validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, trainIter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valIter, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Log the per-epoch metrics, keyed by epoch index.
    for tag, value in (
        ('train/acc', train_acc),
        ('train/loss', train_loss),
        ('valid/acc', valid_acc),
        ('valid/loss', valid_loss),
    ):
        writer.add_scalar(tag, value, epoch)

    print(f'Epoch: {epoch+1:2} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\n\tTrain Loss: {train_loss:.3f} ')
    print(f'\tValid Loss: {valid_loss:.3f} \tValid Acc: {valid_acc:.3f} \n')



 Train Mini batch loss  0.03232467919588089
 Train Mini batch acc   1.0
 Train Mini batch loss  0.02595200389623642
 Train Mini batch acc   1.0
 Train Mini batch loss  0.0103584760800004
 Train Mini batch acc   1.0
 Train Mini batch loss  0.004505004733800888
 Train Mini batch acc   1.0
Valid Mini batch loss  0.1831435114145279
Valid Mini batch acc   0.90625
Valid Mini batch loss  0.6489431858062744
Valid Mini batch acc   0.78125
Epoch:  1 | Epoch Time: 5m 47s

	Train Loss: 0.054 
	Valid Loss: 0.381 	Valid Acc: 0.862 

 Train Mini batch loss  0.002244626171886921
 Train Mini batch acc   1.0
 Train Mini batch loss  0.005457038059830666
 Train Mini batch acc   1.0
 Train Mini batch loss  0.0001690689823590219
 Train Mini batch acc   1.0
 Train Mini batch loss  0.09442286193370819
 Train Mini batch acc   0.9375
Valid Mini batch loss  0.34614986181259155
Valid Mini batch acc   0.90625
Valid Mini batch loss  0.5760980248451233
Valid Mini batch acc   0.8125
Epoch:  2 | Epoch Time: 5m 23s

	T

In [25]:
# Dump the logged scalars to ./test.json, then close the writer.
writer.export_scalars_to_json("./test.json")
writer.close()