In [1]:
# seq2seq

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import spacy
import numpy as np

import random
import math
import time
# Record the installed PyTorch version alongside the notebook's outputs.
print(torch.__version__)

1.5.0


In [4]:
seed = 1234


def randomSeed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator, PyTorch's
    default generator and the current CUDA device, then switches cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed handed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


randomSeed(seed)

In [5]:
# Load the small German and English spaCy pipelines; only their tokenizers
# are used by tokenize_de / tokenize_en below.
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

In [6]:
# Sanity check: tokenize a sample English sentence and reverse the token
# order with [::-1], the same reversal tokenize_de applies to the source side.
a = [tok.text for tok in spacy_en.tokenizer("hello this is my world!")][::-1]
print(a)

['!', 'world', 'my', 'is', 'this', 'hello']


In [7]:
def tokenize_de(text):
    """Split German text into token strings and return them in reverse order.

    Args:
        text: Raw German sentence.

    Returns:
        List of token strings, last token first.
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """Split English text into token strings, keeping the original order.

    Args:
        text: Raw English sentence.

    Returns:
        List of token strings in sentence order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

In [9]:
# Source field: tokenized with tokenize_de (reversed German tokens), wrapped
# in <sos>/<eos> markers and lower-cased.
SRC = Field(tokenize=tokenize_de, init_token = "<sos>", eos_token="<eos>", lower=True)

# Target field: tokenized with tokenize_en (natural order), same markers and
# lower-casing as the source field.
TRG = Field(tokenize=tokenize_en, init_token = "<sos>", eos_token="<eos>", lower=True)

In [11]:
# Load the Multi30k train/validation/test splits; the ".de" files are
# processed with SRC and the ".en" files with TRG (exts and fields are paired
# positionally).
train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"), fields=(SRC, TRG))

In [12]:
# Inspect one training example: 'src' holds the reversed German tokens and
# 'trg' the English tokens, both already lower-cased.
print(vars(train_data.examples[2]))

{'src': ['.', 'holz', 'aus', 'spielhaus', 'ein', 'in', 'klettert', 'mädchen', 'kleines', 'ein'], 'trg': ['a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.']}


In [13]:
# Build both vocabularies from the training split only; min_freq=2 is the
# minimum frequency a token needs to be included in the vocabulary.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)  # pre-trained word vectors could be loaded here

In [50]:
# Inspect the target vocabulary: stoi maps token -> index, itos maps index -> token.
print(TRG.vocab.stoi[TRG.pad_token]) # numeric index corresponding to the token
print(TRG.vocab.itos[1])
# vectors is None here because no pre-trained embeddings were requested in build_vocab.
print(TRG.vocab.vectors)

1
<pad>
None


In [43]:
# Display the target vocabulary object built above.
TRG.vocab

<torchtext.vocab.Vocab at 0x7fd45ef705f8>

In [15]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

In [16]:
# Display which device was selected.
device

device(type='cuda')

In [17]:
# Number of sentence pairs per batch for all three splits.
BATCH_SIZE = 128

# BucketIterator groups examples of similar length into the same batch to
# minimise padding. Pass the BATCH_SIZE constant (not a repeated literal) so
# that changing the constant actually changes the batch size.
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

In [62]:
class Encoder(nn.Module):
    """LSTM encoder that turns a batch of source token ids into final LSTM states.

    Args:
        input_dim: Size of the source vocabulary (rows of the embedding table).
        emb_dim: Dimensionality of the token embeddings.
        hid_dim: Hidden size of every LSTM layer.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return the LSTM's final ``(hidden, cell)`` states.

        Args:
            src: Integer tensor of token ids; the LSTM is not batch-first, so
                the layout is [src_len, batch_size].

        Returns:
            Tuple ``(hidden, cell)``, each [n_layers, batch_size, hid_dim].
        """
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)
        # The per-step outputs are not needed; only the final states are passed on.
        _, final_state = self.rnn(token_vectors)
        hidden, cell = final_state
        return hidden, cell

In [63]:
class Decoder(nn.Module):
    """LSTM decoder that predicts the next target token from one input step.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Dimensionality of the token embeddings.
        hid_dim: Hidden size of every LSTM layer.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Token ids for the current step, shape [batch_size].
            hidden: Previous hidden state passed to the LSTM.
            cell: Previous cell state passed to the LSTM.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` holds the
            unnormalised vocabulary scores, shape [batch_size, output_dim], and
            ``hidden`` / ``cell`` are the updated LSTM states.
        """
        # Add a leading sequence dimension of length 1 for the LSTM.
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        step_output, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))
        # Drop the length-1 sequence dimension before projecting to the vocabulary.
        prediction = self.fc_out(step_output.squeeze(0))
        return prediction, hidden, cell

### seq2seq

In [64]:
class seq2seq(nn.Module):
    """Encoder-decoder wrapper that generates the target one time step at a time.

    Args:
        encoder: Module whose ``forward(src)`` returns ``(hidden, cell)``.
        decoder: Module exposing ``output_dim`` and whose
            ``forward(input, hidden, cell)`` returns ``(prediction, hidden, cell)``.
        device: Device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tag, teacher_forcing_rate = 0.5):
        """Decode ``tag.shape[0] - 1`` steps and return the per-step scores.

        Args:
            src: Source batch passed unchanged to the encoder.
            tag: Target token ids, shape [tag_len, batch_size]; row 0 is fed
                to the decoder as its first input.
            teacher_forcing_rate: Probability, drawn independently per step,
                of feeding the ground-truth token instead of the decoder's own
                arg-max prediction as the next input.

        Returns:
            Tensor of shape [tag_len, batch_size, decoder.output_dim]. Row 0 is
            never written and stays all zeros.
        """
        tag_len, batch_size = tag.shape[0], tag.shape[1]
        tag_vocab_size = self.decoder.output_dim
        # Allocate on the device given to the constructor; the previous code
        # read a notebook-level global `device`, ignoring `self.device`.
        outputs = torch.zeros(tag_len, batch_size, tag_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)
        # The first decoder input is row 0 of the target.
        input = tag[0, :]
        for t in range(1, tag_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_rate
            top1 = output.argmax(1)
            input = tag[t] if teacher_force else top1

        return outputs
    
        

In [65]:
# Vocabulary sizes determine the embedding tables and the output projection.
INPUT_DIM, OUTPUT_DIM = len(SRC.vocab), len(TRG.vocab)

# Model hyperparameters; the encoder and decoder share the hidden size and
# layer count because the encoder's final states initialise the decoder.
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)

# Move the assembled model's parameters to the selected device.
model = seq2seq(encoder=enc, decoder=dec, device=device).to(device)

In [66]:
def init_weights(n):
    """Re-initialise every parameter of ``n`` in place from U(-0.08, 0.08).

    Args:
        n: Module whose parameters (including those of its submodules) are
            overwritten.
    """
    lower, upper = -0.08, 0.08
    for weight in n.parameters():
        nn.init.uniform_(weight.data, lower, upper)

In [67]:
# Module.apply calls init_weights on every submodule and then on the model
# itself. Because init_weights already walks named_parameters() recursively,
# nested parameters are re-drawn more than once; the end result is still
# U(-0.08, 0.08). apply returns the model, which is what this cell displays.
model.apply(init_weights)

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(7854, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [68]:
# Adam over all model parameters, using the optimizer's default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [69]:
# Look up the index of the padding token in the target vocabulary.
TRG_PAD_INDEX = TRG.vocab.stoi[TRG.pad_token] # 1
print(TRG_PAD_INDEX)
# Cross-entropy loss that skips target positions equal to the padding index.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_INDEX)

1


In [80]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(src, trg)`` that returns scores of
            shape [trg_len, batch_size, output_dim].
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss taking flattened scores and flattened target ids.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()
        scores = model(source, target)

        # Skip position 0 and flatten time and batch so the criterion sees
        # one row of scores per target token.
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_target)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

In [81]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss without updating the model.

    The model is put in eval mode, gradients are disabled, and the model is
    called with ``0`` as its third argument so teacher forcing is turned off.

    Args:
        model: Module called as ``model(src, trg, 0)`` that returns scores of
            shape [trg_len, batch_size, output_dim].
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: Loss taking flattened scores and flattened target ids.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # Third argument 0 -> the decoder is always fed its own predictions.
            scores = model(source, target, 0)

            # scores: [trg_len, batch_size, output_dim] -> [(trg_len - 1) * batch_size, output_dim]
            # target: [trg_len, batch_size]             -> [(trg_len - 1) * batch_size]
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [83]:
# Number of passes over the training split and the gradient-norm clipping threshold.
N_EPOCHS = 2
CLIP = 1

# Lowest validation loss seen so far; the matching weights are written to 'best.pt'.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)
    # Wall-clock duration of the epoch (training + validation), split into
    # whole minutes and seconds. start_time was previously captured but never used.
    epoch_mins, epoch_secs = divmod(int(time.time() - start_time), 60)

    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    # Perplexity is exp(loss).
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

	Train Loss: 3.681 | Train PPL:  39.676
	 Val. Loss: 4.244 |  Val. PPL:  69.679
	Train Loss: 3.573 | Train PPL:  35.639
	 Val. Loss: 4.101 |  Val. PPL:  60.385
