# Convolutional Sequence to Sequence Model

- [PyTorch implementation](https://github.com/bentrevett/pytorch-seq2seq/blob/master/5%20-%20Convolutional%20Sequence%20to%20Sequence%20Learning.ipynb)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import time

In [3]:
# Fixed seed shared by every random number generator seeded below.
SEED = 1234

# Seed Python's and PyTorch's random number generators with the same value.
random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [4]:
# Load the spaCy pipelines for German and English.
# NOTE(review): 'de' / 'en' are shortcut names; newer spaCy releases presumably
# need the full package names (e.g. 'de_core_news_sm') - confirm against the
# installed spaCy version.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [5]:
def tokenize_de(text):
    """Split a German string into a list of token strings.

    Runs only the tokenizer of the module-level ``spacy_de`` pipeline and
    returns the text of every token it produces, in order.
    """
    tokens = spacy_de.tokenizer(text)
    return [token.text for token in tokens]

def tokenize_en(text):
    """Split an English string into a list of token strings.

    Runs only the tokenizer of the module-level ``spacy_en`` pipeline and
    returns the text of every token it produces, in order.
    """
    tokens = spacy_en.tokenizer(text)
    return [token.text for token in tokens]

In [6]:
# Source-side field: tokenized with tokenize_de, lower-cased, wrapped in
# <sos>/<eos> tokens. batch_first=True is requested so batches are laid out as
# [batch_size, seq_len].
SRC = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

# Target-side field: same configuration, but tokenized with tokenize_en.
TRG = Field(tokenize=tokenize_en,
            init_token='<sos>',
            eos_token='<eos>',
            lower=True,
            batch_first=True)

In [7]:
# Load the Multi30k train/validation/test splits; '.de' files are processed with
# SRC and '.en' files with TRG.
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

In [8]:
# Build both vocabularies from the training split only.
# min_freq=2: presumably tokens seen fewer than twice are left out of the
# vocabulary (torchtext Vocab semantics - verify).
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# One iterator per split; batches are created on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

In [11]:
class Encoder(nn.Module):
    """Convolutional encoder of a convolutional sequence-to-sequence model.

    Token and position embeddings are summed, projected to ``hid_dim``, passed
    through ``n_layers`` GLU-gated 1-D convolutions with scaled residual
    connections, and finally projected back to ``emb_dim``.

    Args:
        input_dim: Size of the source vocabulary.
        emb_dim: Dimension of the token and position embeddings.
        hid_dim: Number of channels inside the convolutional blocks.
        n_layers: Number of convolutional blocks.
        kernel_size: Width of every convolution. Must be odd so that the
            symmetric padding keeps the sequence length unchanged.
        dropout: Dropout probability.
        device: Device on which the residual scaling constant is created.
        max_length: Number of rows in the position-embedding table, i.e. the
            longest sequence the encoder accepts. Defaults to 100, the value
            that used to be hard-coded.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device,
                 max_length=100):
        super().__init__()

        assert kernel_size % 2 == 1, 'Kernel size must be odd!'

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.device = device
        self.max_length = max_length

        # sqrt(0.5), multiplied into every sum of two tensors in forward()
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        # Positions are embedded exactly like tokens: the table has one row per
        # position index, so it bounds the longest sequence that can be encoded.
        self.pos_embedding = nn.Embedding(max_length, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        # Every convolution doubles the number of channels (the GLU in forward()
        # halves it again). The padding of (kernel_size - 1) // 2 on both sides
        # keeps seq_len unchanged.
        self.convs = nn.ModuleList([nn.Conv1d(in_channels=hid_dim,
                                              out_channels=2 * hid_dim,
                                              kernel_size=kernel_size,
                                              padding=(kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Token indices of shape [batch_size, seq_len].

        Returns:
            A tuple ``(conved, combined)``, both [batch_size, seq_len, emb_dim]:
            ``conved`` is the output of the convolutional stack projected back
            to ``emb_dim``; ``combined`` is the scaled sum of ``conved`` and the
            input embeddings.

        Raises:
            IndexError: If ``seq_len`` exceeds the position-embedding table.
        """
        batch_size, seq_len = src.shape[0], src.shape[1]

        # Fail with a readable message instead of an opaque embedding index error.
        if seq_len > self.pos_embedding.num_embeddings:
            raise IndexError(
                f'Sequence length {seq_len} exceeds the maximum supported length '
                f'of {self.pos_embedding.num_embeddings}')

        # Position indices 0..seq_len-1 for every sequence, created on the same
        # device as the input.
        pos = torch.arange(0, seq_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)

        # pos = [batch_size, seq_len]

        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)

        # tok_embedded = pos_embedded = [batch_size, seq_len, emb_dim]

        # Summing (rather than concatenating) keeps the width at emb_dim, which
        # is what emb2hid expects.
        embedded = self.dropout(tok_embedded + pos_embedded)

        # embedded = [batch_size, seq_len, emb_dim]

        # emb_dim => hid_dim, then move channels to dim 1 because Conv1d expects
        # input of shape (N, C, L).
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)

        # conv_input = [batch_size, hid_dim, seq_len]

        for conv in self.convs:
            # conv output = [batch_size, 2 * hid_dim, seq_len]
            conved = conv(self.dropout(conv_input))

            # GLU splits the channel dimension in two halves and gates one with
            # the other, so 2 * hid_dim channels become hid_dim again.
            conved = F.glu(conved, dim=1)

            # Scaled residual connection; the result feeds the next block.
            conv_input = (conved + conv_input) * self.scale

            # conv_input = [batch_size, hid_dim, seq_len]

        # Permute back and project hid_dim => emb_dim. Using conv_input here also
        # covers n_layers == 0, where no convolution has run.
        conved = self.hid2emb(conv_input.permute(0, 2, 1))

        # conved = [batch_size, seq_len, emb_dim]

        # Elementwise sum of output (conved) and input (embedded), used by the
        # decoder's attention.
        combined = (conved + embedded) * self.scale

        # combined = [batch_size, seq_len, emb_dim]

        return conved, combined

In [12]:
# Scratch check: arange gives the indices 0..4, unsqueeze(0) adds a leading dimension.
torch.arange(0, 5).unsqueeze(0)

tensor([[0, 1, 2, 3, 4]])

In [13]:
# Scratch check: repeat(16, 1) copies the row of indices 16 times along dim 0.
pos = torch.arange(0, 5).unsqueeze(0).repeat(16, 1)
pos.shape

torch.Size([16, 5])

In [14]:
# Scratch check: embedding the position indices appends an embedding dimension of 256.
pos_embedding = nn.Embedding(5, 256)
pos_embedding(pos).shape

torch.Size([16, 5, 256])

In [37]:
class Decoder(nn.Module):
    """Convolutional decoder with attention over the encoder outputs.

    Target token and position embeddings are summed, projected to ``hid_dim``
    and passed through ``n_layers`` blocks. Each block is a left-padded (causal)
    1-D convolution with GLU gating, followed by attention over the encoder
    outputs and a scaled residual connection. The result is projected to
    ``emb_dim`` and then to vocabulary logits.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Dimension of the token and position embeddings.
        hid_dim: Number of channels inside the convolutional blocks.
        n_layers: Number of convolutional blocks.
        kernel_size: Width of every convolution.
        dropout: Dropout probability.
        pad_idx: Value used to fill the left padding of every convolution input.
        device: Device on which the residual scaling constant is created.
        max_length: Number of rows in the position-embedding table, i.e. the
            longest target sequence the decoder accepts. Defaults to 100, the
            value that used to be hard-coded.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, pad_idx, device,
                 max_length=100):
        super().__init__()

        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.pad_idx = pad_idx
        self.device = device
        self.max_length = max_length

        # sqrt(0.5), multiplied into every sum of two tensors below
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)

        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_length, emb_dim)

        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)

        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)

        self.out = nn.Linear(emb_dim, output_dim)

        # No built-in padding: forward() pads on the left only.
        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2 * hid_dim, kernel_size)
                                    for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        """Attend over the encoder outputs and merge the result into ``conved``.

        Args:
            embedded: Target embeddings, [batch_size, trg_seq_len, emb_dim].
            conved: Output of the current block, [batch_size, hid_dim, trg_seq_len].
            encoder_conved: Encoder output, [batch_size, src_seq_len, emb_dim].
            encoder_combined: Encoder output, [batch_size, src_seq_len, emb_dim].

        Returns:
            A tuple ``(attention, attended_combined)``: ``attention`` is
            [batch_size, trg_seq_len, src_seq_len] and sums to one over the
            source positions; ``attended_combined`` is
            [batch_size, hid_dim, trg_seq_len].
        """
        # permute and convert back to emb_dim
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))

        combined = (embedded + conved_emb) * self.scale

        # combined = [batch_size, trg_seq_len, emb_dim]

        # dot product of every target position with every source position
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))

        # energy = [batch_size, trg_seq_len, src_seq_len]

        attention = F.softmax(energy, dim=2)

        # attention = [batch_size, trg_seq_len, src_seq_len]

        attended_encoding = torch.matmul(attention, (encoder_conved + encoder_combined))

        # attended_encoding = [batch_size, trg_seq_len, emb_dim]

        # convert from emb_dim => hid_dim
        attended_encoding = self.attn_emb2hid(attended_encoding)

        # attended_encoding = [batch_size, trg_seq_len, hid_dim]

        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale

        # attended_combined = [batch_size, hid_dim, trg_seq_len]

        return attention, attended_combined

    def forward(self, trg, encoder_conved, encoder_combined):
        """Predict vocabulary logits for every target position.

        Args:
            trg: Target token indices, [batch_size, trg_seq_len].
            encoder_conved: Encoder output, [batch_size, src_seq_len, emb_dim].
            encoder_combined: Encoder output, [batch_size, src_seq_len, emb_dim].

        Returns:
            A tuple ``(output, attention)``: ``output`` is
            [batch_size, trg_seq_len, output_dim]; ``attention`` is the
            attention of the last block,
            [batch_size, trg_seq_len, src_seq_len], or ``None`` when the
            decoder has no blocks.

        Raises:
            IndexError: If ``trg_seq_len`` exceeds the position-embedding table.
        """
        batch_size, seq_len = trg.shape[0], trg.shape[1]

        # Fail with a readable message instead of an opaque embedding index error.
        if seq_len > self.pos_embedding.num_embeddings:
            raise IndexError(
                f'Sequence length {seq_len} exceeds the maximum supported length '
                f'of {self.pos_embedding.num_embeddings}')

        # Position indices 0..seq_len-1 for every sequence, created on the same
        # device as the input.
        pos = torch.arange(0, seq_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)

        # pos = [batch_size, seq_len]

        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)

        # tok_embedded = pos_embedded = [batch_size, seq_len, emb_dim]

        # combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded)

        # embedded = [batch_size, seq_len, emb_dim]

        # emb_dim => hid_dim, then permute for the convolutional layers
        conv_input = self.emb2hid(embedded).permute(0, 2, 1)

        # conv_input = [batch_size, hid_dim, seq_len]

        # Left padding so that position t only sees positions <= t (the decoder
        # cannot look at future tokens). It has the same shape for every block,
        # so it is built once, on the device and dtype of the activations
        # rather than on a global `device`.
        # NOTE(review): the padding is filled with the numeric value of pad_idx
        # (a token index) rather than zeros; kept as in the original so that
        # results do not change - confirm this is intended.
        padding = conv_input.new_full(
            (batch_size, conv_input.shape[1], self.kernel_size - 1), self.pad_idx)

        # Stays None only when the decoder has no blocks.
        attention = None

        for conv in self.convs:
            conv_input = self.dropout(conv_input)

            padded_conv_input = torch.cat((padding, conv_input), dim=2)

            # padded_conv_input = [batch_size, hid_dim, seq_len + kernel_size - 1]

            # conv output = [batch_size, 2 * hid_dim, seq_len]; GLU halves the
            # channel dimension back to hid_dim
            conved = F.glu(conv(padded_conv_input), dim=1)

            # conved = [batch_size, hid_dim, seq_len]

            attention, conved = self.calculate_attention(embedded, conved,
                                                         encoder_conved, encoder_combined)

            # attention = [batch_size, trg_seq_len, src_seq_len]
            # conved = [batch_size, hid_dim, seq_len]

            # Scaled residual connection; the result feeds the next block.
            conv_input = (conved + conv_input) * self.scale

            # conv_input = [batch_size, hid_dim, seq_len]

        # Using conv_input here also covers n_layers == 0.
        conved = self.hid2emb(conv_input.permute(0, 2, 1))

        # conved = [batch_size, seq_len, emb_dim]

        output = self.out(self.dropout(conved))

        # output = [batch_size, seq_len, output_dim]

        return output, attention

In [38]:
class Seq2Seq(nn.Module):
    """Thin wrapper that chains an encoder and a decoder.

    Args:
        encoder: Callable taking ``src`` and returning the pair
            ``(encoder_conved, encoder_combined)``.
        decoder: Callable taking ``(trg, encoder_conved, encoder_combined)``
            and returning the pair ``(output, attention)``.
        device: Stored on the instance; not used by ``forward``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.device = device
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        """Encode ``src`` and decode ``trg`` against the encoder outputs.

        Args:
            src: Source batch, [batch_size, src_seq_len].
            trg: Target batch, [batch_size, trg_seq_len].

        Returns:
            ``(output, attention)`` as produced by the decoder: the predictions
            for the target words, [batch_size, trg_seq_len, output_dim], and the
            attention, [batch_size, trg_seq_len, src_seq_len].
        """
        # enc_conved is the output of the encoder's convolutional blocks;
        # enc_combined additionally has the source token and positional
        # embeddings summed in.
        # Both are [batch_size, src_seq_len, emb_dim].
        enc_conved, enc_combined = self.encoder(src)

        # Predict the next target word for every target position.
        predictions, attn_weights = self.decoder(trg, enc_conved, enc_combined)

        return predictions, attn_weights

In [48]:
# Vocabulary sizes come from the fields built on the training data.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding width and number of channels inside the convolutional blocks.
EMB_DIM = 256
HID_DIM = 512
# Number of convolutional blocks and their kernel widths (encoder / decoder).
ENC_LAYERS = 10
DEC_LAYERS = 10
ENC_KERNEL_SIZE = 3
DEC_KERNEL_SIZE = 3
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
# Index of the '<pad>' token in the target vocabulary.
PAD_IDX = TRG.vocab.stoi['<pad>']

# NOTE: this rebinds the module-level `device` that was assigned earlier in the
# file, now with an explicit GPU index.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Build encoder, decoder and the wrapping model, then move the model to `device`.
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, PAD_IDX, device)
model = Seq2Seq(enc, dec, device).to(device)

In [49]:
# Display the module hierarchy of the assembled model.
model

Seq2Seq(
  (encoder): Encoder(
    (tok_embedding): Embedding(7855, 256)
    (pos_embedding): Embedding(100, 256)
    (emb2hid): Linear(in_features=256, out_features=512, bias=True)
    (hid2emb): Linear(in_features=512, out_features=256, bias=True)
    (convs): ModuleList(
      (0): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (2): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (3): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (4): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (5): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (6): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (7): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (8): Conv1d(512, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
      (9): Conv1d(512, 1024, kernel_size=(3,), stride=(1,),

In [50]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of *model*.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 37,351,685 trainable parameters


In [51]:
# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())
# Cross-entropy loss; targets equal to PAD_IDX are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [52]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training pass over *iterator* and return the mean batch loss.

    Args:
        model: Called as ``model(src, trg_input)``; must return a pair whose
            first element holds the predictions
            [batch_size, trg_seq_len - 1, output_dim].
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: Optimizer over the model parameters.
        criterion: Loss called as ``criterion(predictions, targets)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # The model is fed the target without its last token ...
        predictions, _ = model(source, target[:, :-1])

        # predictions = [batch_size, trg_seq_len - 1, output_dim]
        # target = [batch_size, trg_seq_len]

        # ... and scored against the target without its first token, with the
        # batch and time dimensions flattened together.
        n_classes = predictions.shape[-1]
        flat_predictions = predictions.contiguous().view(-1, n_classes)
        flat_target = target[:, 1:].contiguous().view(-1)

        # flat_predictions = [batch_size * (trg_seq_len - 1), output_dim]
        # flat_target = [batch_size * (trg_seq_len - 1)]

        batch_loss = criterion(flat_predictions, flat_target)
        batch_loss.backward()

        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [53]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over *iterator* without updating the model.

    Args:
        model: Called as ``model(src, trg_input)``; must return a pair whose
            first element holds the predictions
            [batch_size, trg_seq_len - 1, output_dim].
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: Loss called as ``criterion(predictions, targets)``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # The model is fed the target without its last token ...
            predictions, _ = model(source, target[:, :-1])

            # predictions = [batch_size, trg_seq_len - 1, output_dim]
            # target = [batch_size, trg_seq_len]

            # ... and scored against the target without its first token.
            n_classes = predictions.shape[-1]
            flat_predictions = predictions.contiguous().view(-1, n_classes)
            flat_target = target[:, 1:].contiguous().view(-1)

            # flat_predictions = [batch_size * (trg_seq_len - 1), output_dim]
            # flat_target = [batch_size * (trg_seq_len - 1)]

            running_loss += criterion(flat_predictions, flat_target).item()

    return running_loss / len(iterator)

In [54]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        A tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes.
    seconds = int(duration - (minutes * 60))
    return minutes, seconds

In [55]:
# Number of passes over the training data and the gradient-norm clipping threshold.
N_EPOCHS = 10
CLIP = 1

# Lowest validation loss seen so far; any real loss compares below infinity.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    # One pass over the training data, then a pass over the validation data.
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut5-model.pt')
        
    # Perplexity is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\tValid Loss: {valid_loss:.3f} | Valid PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 32s
	Train Loss: 4.352 | Train PPL:  77.635
	Valid Loss: 3.086 | Valid PPL:  21.889
Epoch: 02 | Time: 0m 32s
	Train Loss: 3.019 | Train PPL:  20.469
	Valid Loss: 2.419 | Valid PPL:  11.229
Epoch: 03 | Time: 0m 32s
	Train Loss: 2.594 | Train PPL:  13.382
	Valid Loss: 2.170 | Valid PPL:   8.763
Epoch: 04 | Time: 0m 32s
	Train Loss: 2.363 | Train PPL:  10.620
	Valid Loss: 2.011 | Valid PPL:   7.470
Epoch: 05 | Time: 0m 32s
	Train Loss: 2.207 | Train PPL:   9.092
	Valid Loss: 1.922 | Valid PPL:   6.837
Epoch: 06 | Time: 0m 32s
	Train Loss: 2.091 | Train PPL:   8.094
	Valid Loss: 1.901 | Valid PPL:   6.693
Epoch: 07 | Time: 0m 32s
	Train Loss: 2.003 | Train PPL:   7.409
	Valid Loss: 1.848 | Valid PPL:   6.345
Epoch: 08 | Time: 0m 32s
	Train Loss: 1.935 | Train PPL:   6.921
	Valid Loss: 1.809 | Valid PPL:   6.107
Epoch: 09 | Time: 0m 32s
	Train Loss: 1.871 | Train PPL:   6.498
	Valid Loss: 1.794 | Valid PPL:   6.016
Epoch: 10 | Time: 0m 32s
	Train Loss: 1.824 | Train PPL

In [56]:
# Load the weights saved to 'tut5-model.pt' back into the model.
model.load_state_dict(torch.load('tut5-model.pt'))

# Score the loaded model on the test iterator.
test_loss = evaluate(model, test_iterator, criterion)

# Perplexity is reported as exp(loss).
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 1.826 | Test PPL:   6.210 |
