In [152]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext 
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np
import random, math, time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [153]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [154]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [155]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [156]:
SRC = Field(tokenize = tokenize_de,
            init_token = '<sos>', 
            eos_token = '<eos>',
            lower = True,
            batch_first = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True, 
            batch_first = True)

In [157]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

In [158]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [159]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [160]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
     batch_size = BATCH_SIZE,
     device = device)

In [161]:
class Encoder(nn.Module):
    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device, 
                 max_length=100):
        super().__init__()
        
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        # src = [batch_size, seq_len]
        # src_mask = [batch_size, seq_len]
        batch_size = src.shape[0]
        src_len = src.shape[1]
        
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        # pos = [batch, src_len]
        
        src = self.dropout((self.tok_embedding(src)*self.scale) + self.pos_embedding(pos))
        
        # src = [batch_size, src_len, hid_dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
        # src = [batch_size, src_len, hid_dim]
        return src

In [162]:
class EncoderLayer(nn.Module):
    def __init__(self,
                 hid_dim,
                 n_heads,
                 pf_dim,
                 dropout,
                 device):
        super().__init__()
        
        self.multihead_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)

        
        self.dropout = nn.Dropout(dropout)
    def forward(self, src, src_mask):
        
        # src=[batch_size, src_len, hid_dim]
        # src_mask = [batch, src_len]
        
        # self attention
        _src, _ = self.multihead_attention(src, src, src, src_mask)
        
        # dropout, residual connection and layer norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        
        # positionwise feedforward
        _src = self.positionwise_feedforward(src)
        
        #dropout, residual and layer norm
        src = self.ff_layer_norm(src + self.dropout(_src))
        
        # src = [batch_size, src_len, hid_dim]
        
        return src

In [163]:
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        assert hid_dim % n_heads == 0
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
    
    def forward(self, query, key, value, mask=None):
        
        batch_size = query.shape[0]
        # query = [batch_size, qur_len, hid_dim]
        # key = [batch_size, key_len, hid_dim]
        # value = [batch_size, value_len, hid_dim]
        
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        # Q = [batch_size, qur_len, hid_dim]
        # K = [batch_size, key_len, hid_dim]
        # V = [batch_size, value_len, hid_dim]
        
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        #Q = [batch size, n_heads, query_len, head_dim]
        #K = [batch size, n_heads, key_len, head_dim]
        #V = [batch size, n_heads, value_len, head_dim]
        
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2))/self.scale
        
        # energy = [batch_size, n_heads, query_len, key_len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim=-1)
        
        x = torch.matmul(self.dropout(attention), V)
        
        # x = [batch, query_len, n_heads, head_dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        # x  = [batch, query_len, n_heads, head_dim]
        
        x = x.view(batch_size, -1, self.hid_dim)
        
        # x = [batch_size, query_len, hid_dim]
        
        x = self.fc_o(x)
        
        # x = [batch_size, query_len, hid_dim]
        
        return x, attention

In [164]:
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        # x = [batch, src_len, hid_dim]
        
        x = self.dropout(torch.relu(self.fc_1(x)))
        
        # x = [batch_size, src_len, pf_dim]
        
        x = self.fc_2(x)
        
        # x = [batch_size, src_len, hid_dim]
        
        return x

In [165]:
class Decoder(nn.Module):
    def __init__(self, 
                output_dim,
                hid_dim,
                n_layers, 
                n_heads,
                pf_dim,
                dropout,
                device,
                max_length=100):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, 
                                                 pf_dim,
                                                 dropout,
                                                 device)
                                    for _ in range(n_layers)])
    
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        # trg = [batch, trg_len]
        # enc_src = [batch_size, src_len, hid_dim]
        # trg_mask = [batch_size, trg_len]
        # src_mask = [batch_size, src_len]
        
        batch_size = trg.shape[0]
        
        trg_len = trg.shape[1]
        
        pos = torch.range(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        
        # pos = [batach, trg_len]
        
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        
        # trg = [batch_size, trg_len, hid_dim]
        
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
            
        # trg = [batch, trg_len, hid_dim]
        
        # attention = [batch, n_heads, trg_len, src_len]
        
        output = self.fc_out(trg)
        
        # output = [batch, trg_len, output_dim]
        
        return otuput, attention
        

In [166]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        
        self.decoder_multihead_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        
        self.encoder_multihead_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, trg, enc_src, trg_mask, src_mask):
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
        
        # First blocks in Decoder part: multihead attention
        _trg, _ = self.decoder_multihead_attention(trg, trg, trg, trg_mask)
        
        # dropout, residual connection and layer norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        
        # trg = [batch, trg_len, hid_dim]
        
        # Second blocks in Decoder part: encoder_attention
        _trg, attention = self.encoder_multihead_attention(trg, enc_src, enc_src, src_mask)
        
        # dropout, residual connection and layer norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        
        # Thirt blocks in Decoder part: positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        # trg = [batch_size, trg_len, hid_dim]
        # attention = [batch, n_heads, trg_len, src_len]
        
        return trg, attention

### Seq2Seq

In [167]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        
        # src = [batch, src_len]
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        
        # src_mask = [batch_size, 1, 1, src_len]
        return src_mask
    
    def make_trg_mask(self, trg):
        
        # trg = [batch_size, trg_len]
        
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        # trg_pad_mask = [batch, 1, 1, trg_len]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
        
        # trg_sub_mask = [trg_len, trg_len]
        
        trg_mask = trg_pad_mask & trg_sub_mask
        
        # trg_mask = [batch, 1, trg_len, trg_len]
        
        return trg_mask
        
    def forward(self, src, trg):
        
        # src = [batch_size, src_len]
        
        # trg = [batch_size, trg_len]
        
        src_mask = self.make_src_mask(src)
        
        trg_mask = self.make_trg_mask(src)
        
        # src_mask = [batch, 1, 1, src_len]
        
        # trg_mask = [batch, 1, trg_len, trg_len]
        
        enc_src = self.encoder(src, src_mask)
        
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        # output = [batch, trg_len, output_dim]
        
        # attention = [batch, n_heads, trg_len, src_len]
        
        return output, attention

In [168]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [169]:
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [170]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,038,597 trainable parameters


In [171]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [172]:
model.apply(initialize_weights);

In [173]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [174]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [175]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        src = batch.src
        
        trg = batch.trg
        
        optimizer.zero_grad()
        
        # output = [batch, trg_len-1, output_dim]
        
        # trg = [batch_size, trg_len]
        
        output, _ = model(src, trg[:, :-1])  # 所有batch_size 和 前trg_len-1个作为输入
        
        output_dim = output.shape[-1]
        
        output = otuput.contiguous().view(-1, output_dim)
        
        trg = trg[:, 1, :].contiguous.view(-1)
        
        # output = [batch*trg_len-1, output_dim]
        
        # trg = [batch*trg_len-1]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [176]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output, _ = model(src, trg[:,:-1])
            
            #output = [batch size, trg len - 1, output dim]
            #trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg len - 1, output dim]
            #trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [179]:
N_EPOCHS = 5
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
#     epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
#     print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)