In [12]:
import math
import time

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import transformer.Constants as Constants
from dataset import TranslationDataset, paired_collate_fn
from transformer.Models import Transformer
from transformer.Optim import ScheduledOptim

from train import prepare_dataloaders

In [13]:
cuda = False
device = torch.device('cuda' if cuda else 'cpu')

In [14]:
class Transformer(nn.Module):
    ''' A sequence to sequence model with attention mechanism. '''

    def __init__(
            self,
            n_src_vocab, n_tgt_vocab, len_max_seq,
            d_word_vec=512, d_model=512, d_inner=2048,
            n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
            tgt_emb_prj_weight_sharing=True,
            emb_src_tgt_weight_sharing=True):

        super().__init__()

        self.encoder = Encoder(
            n_src_vocab=n_src_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        self.decoder = Decoder(
            n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)

        assert d_model == d_word_vec, \
        'To facilitate the residual connections, \
         the dimensions of all module outputs shall be the same.'

        if tgt_emb_prj_weight_sharing:
            # Share the weight matrix between target word embedding & the final logit dense layer
            self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
            self.x_logit_scale = (d_model ** -0.5)
        else:
            self.x_logit_scale = 1.

        if emb_src_tgt_weight_sharing:
            # Share the weight matrix between source & target word embeddings
            assert n_src_vocab == n_tgt_vocab, \
            "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight

    def forward(self, src_seq, src_pos, tgt_seq, tgt_pos):

        tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]

        enc_output, *_ = self.encoder(src_seq, src_pos)
        dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
        seq_logit = self.tgt_word_prj(dec_output) * self.x_logit_scale

        return seq_logit.view(-1, seq_logit.size(2))


In [16]:
max_token_seq_len = 15 #data['settings'].max_token_seq_len
training_data, validation_data = prepare_dataloaders(data, opt)
src_vocab_size = 50000  # training_data.dataset.src_vocab_size
tgt_vocab_size = 50000  # training_data.dataset.tgt_vocab_size

NameError: name 'data' is not defined

In [8]:
transformer = Transformer(
    n_src_vocab = src_vocab_size,
    n_tgt_vocab = tgt_vocab_size,
    len_max_seq = max_token_seq_len, 
    tgt_emb_prj_weight_sharing = True, # True
    emb_src_tgt_weight_sharing = True, # True
    d_k = 64,           # 64
    d_v = 64,           # 64
    d_model = 512,      # 512
    d_word_vec = 512,   # 512
    d_inner = 2048,     # 2048
    n_layers = 6,       # 6
    n_head = 8,         # 8
    dropout = 0.1       # 0.1
).to(device)



NameError: name 'opt' is not defined

In [3]:
optimizer = ScheduledOptim(
    optim.Adam(
        filter(lambda x: x.requires_grad, transformer.parameters()),
        betas=(0.9, 0.98), eps=1e-09),
    opt.d_model, opt.n_warmup_steps)



NameError: name 'ScheduledOptim' is not defined

In [4]:
train(transformer, training_data, validation_data, optimizer, device ,opt)

NameError: name 'train' is not defined