In [1]:
!pip install spacy



In [2]:
import spacy

In [3]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [4]:
!python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [5]:
pip install torchtext



In [6]:
import spacy
from torchtext.data import Field
from torchtext.datasets import Multi30k, WMT14
import torch
from torch.utils.data import TensorDataset, DataLoader

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

BOS= '<s>'
EOS = '</s>'
PAD = '<pad>'

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

def buildTensor(dataset, max_seq, src_vocab2num, trg_vocab2num, train=True, dev=False, test=False):
    src_data = torch.ones(len(dataset), max_seq + 2)
    trg_data = None
    if not train:
        raw_trg = []
    
    trg_data = torch.ones(len(dataset), max_seq + 2)

    for i, sentence in enumerate(dataset):
        src_data[i][0] = src_vocab2num[BOS]
        src_data[i][-1] = trg_vocab2num[EOS]
        for j in range(1, min(max_seq + 1, len(sentence.src))):
            word = sentence.src[j - 1]
            src_data[i][j] = src_vocab2num[word]

        if trg_data is not None:
            trg_data[i][0] = trg_vocab2num[BOS]
            trg_data[i][-1] = trg_vocab2num[EOS]
            for j in range(1, min(max_seq + 1, len(sentence.trg))):
                word = sentence.trg[j - 1]
                trg_data[i][j] = trg_vocab2num[word]

            if not train:
                raw_trg.append(sentence.trg)

    if train:
        return src_data, trg_data

    elif dev:
        return src_data, trg_data, raw_trg

    else:
        return src_data, raw_trg

print('Loading dataset')

SRC = Field(tokenize=tokenize_en, pad_token=PAD)
TGT = Field(tokenize=tokenize_de, init_token=BOS, eos_token = EOS, pad_token=PAD)
max_seq = 50
train, dev, test = Multi30k.splits(exts=('.en', '.de'), fields=(SRC, TGT), filter_pred=lambda x: len(vars(x)['src']) <= max_seq and len(vars(x)['trg']) <= max_seq)

min_freq = 2
batch_size = 100
SRC.build_vocab(train.src, min_freq=min_freq)
TGT.build_vocab(train.trg, min_freq=min_freq)

src_vocab = SRC.vocab
trg_vocab = TGT.vocab

src_train_data, trg_train_data = buildTensor(train, max_seq, src_vocab.stoi, trg_vocab.stoi, train=True)
src_dev_data, trg_dev_data, raw_dev_trg = buildTensor(dev, max_seq, src_vocab.stoi, trg_vocab.stoi, train=False, dev=True)
src_test_data, raw_test_trg = buildTensor(test, max_seq, src_vocab.stoi, trg_vocab.stoi, train=False, test=True)

src_train_data, trg_train_data = src_train_data.type(torch.long), trg_train_data.type(torch.long)
src_dev_data, trg_dev_data = src_dev_data.type(torch.long), trg_dev_data.type(torch.long)
src_test_data = src_test_data.type(torch.long)

print('Building dataset')
train = TensorDataset(src_train_data, trg_train_data)
dev = TensorDataset(src_dev_data, trg_dev_data)
test = TensorDataset(src_test_data)

print('Building dataloader')
train_loader = DataLoader(train, batch_size=batch_size, pin_memory=True)
dev_loader = DataLoader(dev, batch_size=batch_size, pin_memory=True)
test_loader = DataLoader(test, batch_size=batch_size, pin_memory=True)

print('Saving dataloader')
torch.save(src_vocab.stoi, 'src_vocab2num.pt')
torch.save(src_vocab.itos, 'src_num2vocab.pt')
torch.save(trg_vocab.stoi, 'trg_vocab2num.pt')
torch.save(trg_vocab.itos, 'trg_num2vocab.pt')
torch.save(raw_dev_trg, 'raw_dev_trg.pt')
torch.save(raw_test_trg, 'raw_test_trg.pt')
torch.save(train_loader, 'train_loader.pt')
torch.save(dev_loader, 'dev_loader.pt')
torch.save(test_loader, 'test_loader.pt')

training.tar.gz:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Loading dataset
downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 6.15MB/s]
validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 1.82MB/s]


downloading validation.tar.gz
downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 1.70MB/s]


Building dataset
Building dataloader
Saving dataloader


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
import torch.nn as nn
from torch.nn import Transformer

class TransformerModel(nn.Module):
    def __init__(self, n_src_vocab, n_trg_vocab, d_model=512, n_heads=8, n_encoders=6, n_decoders=6, d_ff=2048, dropout=0.1, padding_idx=1):
        super(TransformerModel, self).__init__()

        self.src_embedding = nn.Embedding(n_src_vocab, d_model, padding_idx=padding_idx)
        self.trg_embedding = nn.Embedding(n_trg_vocab, d_model, padding_idx=padding_idx)
        self.transformer = Transformer(d_model, n_heads, num_encoder_layers=n_encoders, num_decoder_layers=n_decoders,
                                       dim_feedforward=d_ff, dropout=dropout)
        self.fc = nn.Linear(d_model, n_trg_vocab)

    def no_peek_mask(self, size):
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask.to(device)
    
    def forward(self, src_seq, trg_seq):
        trg_mask = self.no_peek_mask(trg_seq.size(1))
        src_seq, trg_seq = self.src_embedding(src_seq), self.trg_embedding(trg_seq)
        src_seq, trg_seq = src_seq.permute(1, 0, 2), trg_seq.permute(1, 0, 2)  # size = (S, B, E), where S = max_seq, B = batch_size, E = embedding_size.
        output = self.transformer(src_seq, trg_seq, tgt_mask=trg_mask)
        return self.fc(output).permute(1, 2, 0)

In [9]:
class TransformerOptim():
    def __init__(self, optimizer, d_model=512, warmup_steps=4000):
        """
        :param optimizer: the optimizer that we used, in the original paper, Vaswani et al. use Adam.
        :param d_model: the embedding dimension.
        :param warmup_steps: the number of warm up training steps.
        """
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.n_steps = 0

    def update_lr(self):
        """
        Updated learning rate. lr := d_model^(-0.5) * min(step_num^(-0.5), step_num * warmup_steps^(-1.5))
        """
        self.n_steps += 1
        lr = self.d_model**(-0.5) * min(self.n_steps**(-0.5), self.n_steps * self.warmup_steps**(-1.5))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.update_lr()
        self.optimizer.step()

In [81]:
"""
Author: Yanting Miao
"""
import time
import torch
import torch.nn as nn
import torch.optim as optim

def embedding(x, n_vocab, device, d_model=512):
    embed = nn.Embedding(n_vocab, d_model, padding_idx=0).to(device)
    return embed(x).permute(1, 0, 2)

def calculate_time(start):
    end = time.time()
    t = end - start
    m = t // 60
    s = t - m * 60
    return m, s

def evaluating(model, data, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for src, trg in data:
            src, trg = src.to(device), trg.to(device)
            trg_input = trg[:, :-1]
            trg_real = trg[:, 1:]
            translate = model(src, trg_input)
            loss = criterion(translate, trg_real)
            total_loss += loss.item()

    return total_loss / len(data)

def training(model, train_data, dev_data, n_epochs, criterion, optimizer, device, path):
    train_loss_list = []
    val_loss_list = []
    model.train()
    step = 1
    print_every = len(train_data)
    min_loss = None
    start = time.time()
    for epoch in range(n_epochs):
        running_loss = 0.0
        for src, trg in train_data:
            optimizer.zero_grad()
            src = src.to(device)
            # shifted to right, for example, trg = "<s>I love cats</s>", trg_input = "<s>I love cats", trg_real = "I love cats</s>"
            trg_input = trg[:, :-1].to(device)
            trg_real = trg[:, 1:].to(device)
            translate = model(src, trg_input)
            loss = criterion(translate, trg_real)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            step += 1
            if step % print_every == 0:
                val_loss = evaluating(model, dev_data, criterion, device)
                m, s = calculate_time(start)
                train_loss_list.append(running_loss / len(train_data))
                val_loss_list.append(val_loss)
                print('%d/%d, (%dm%ds), train loss: %.3f, val loss: %.3f' %
                      (epoch + 1, n_epochs, m, s, running_loss / len(train_data), val_loss))
                if min_loss is None or min_loss > val_loss:
                    if min_loss:
                        print('Validation loss decreaseing: %.4f --> %.4f' % (min_loss, val_loss))
                    else:
                        print('Validation loss in first epoch is: %.4f' % (val_loss))
                    print('Saving model')
                    min_loss = val_loss
                    torch.save(model, path)
                running_loss = 0.0
                model.train()
    return train_loss_list, val_loss_list

if __name__ == '__main__':
    n_epochs = 10
    max_seq = 100
    optim_name = 'Adam'
    print('Loading Multi30K dataset')
    train_data = torch.load('train_loader.pt')
    dev_data = torch.load('dev_loader.pt')
    test_data = torch.load('test_loader.pt')
    src_vocab2num = torch.load('src_vocab2num.pt')
    trg_vocab2num = torch.load('trg_vocab2num.pt')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TransformerModel(len(src_vocab2num), len(trg_vocab2num), 512, 1, n_encoders=6, n_decoders=6, d_ff=2048).to(device)
    path = 'best_adam_transformer.pt'
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    adam_optim = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9)
    optimizer = TransformerOptim(adam_optim)
    print('Start training')
    start = time.time()
    train_loss, val_loss = training(model, train_data, dev_data, n_epochs, criterion, optimizer, device, path)
    m, s = calculate_time(start)
    print('Training took %dm%ds' % (m, s))
    print('Start testing')
    model = torch.load(path)
    model = model.to(device)
    # test_loss = evaluating(model, test_data, criterion, device)
    # print('Test loss: %.3f' % (test_loss))
    print('Saving experiment result')
    train_loss_path = optim_name + '_train_loss.pt'
    val_loss_path = optim_name + '_val_loss.pt'
    # test_loss_path = optim_name + '_test_loss.pt'
    torch.save(train_loss, train_loss_path)
    torch.save(val_loss, val_loss_path)
    # torch.save(test_loss, test_loss_path)

Loading Multi30K dataset
Start training
1/10, (1m57s), train loss: 2.782, val loss: 1.435
Validation loss in first epoch is: 1.4349
Saving model
2/10, (3m55s), train loss: 1.256, val loss: 1.063
Validation loss decreaseing: 1.4349 --> 1.0628
Saving model
3/10, (5m53s), train loss: 1.002, val loss: 0.899
Validation loss decreaseing: 1.0628 --> 0.8986
Saving model
4/10, (7m51s), train loss: 0.864, val loss: 0.792
Validation loss decreaseing: 0.8986 --> 0.7920
Saving model
5/10, (9m49s), train loss: 0.762, val loss: 0.750
Validation loss decreaseing: 0.7920 --> 0.7504
Saving model
6/10, (11m47s), train loss: 0.685, val loss: 0.734
Validation loss decreaseing: 0.7504 --> 0.7340
Saving model
7/10, (13m45s), train loss: 0.622, val loss: 0.744
8/10, (15m42s), train loss: 0.571, val loss: 0.675
Validation loss decreaseing: 0.7340 --> 0.6755
Saving model
9/10, (17m41s), train loss: 0.527, val loss: 0.672
Validation loss decreaseing: 0.6755 --> 0.6718
Saving model
10/10, (19m39s), train loss: 0.

In [107]:
def translate(model, sentence, trg_num2vocab, device):
    model.eval()
    sentence = tokenize_en(sentence[0])
    tmp = []
    for token in sentence:
        if token in src_vocab2num:
            tmp.append(src_vocab2num[token])
        else:
            tmp.append(src_vocab2num['<unk>'])
    sentence = torch.LongTensor([tmp]).to(device)
    sentence = sentence.view(1, -1)
    trg_init_tok = trg_vocab2num[BOS]
    trg = torch.LongTensor([[trg_init_tok]]).to(device)
    translation = ""
    for i in range(max_seq):
        pred = model(sentence, trg)
        pred = torch.argmax(pred, dim=1)
        new_word = pred[0][i].item()
        add_word = trg_num2vocab[new_word]
        if add_word == EOS or add_word == PAD:
            break
        translation += " " + add_word
        pred = torch.LongTensor([[new_word]]).to(device)
        trg = torch.cat((trg, pred), dim=1).to(device)
    
    return translation

In [83]:
def reconstructTrgLine(raw_data, num2vocab):
    sentences = []
    for line in raw_data:
        tmp = ''
        for token in line:
            if token != BOS and token != EOS and token != PAD:
                token = token + ' '
                tmp += token
        sentences.append([tmp])
    
    return sentences

In [84]:
def reconstructSrcLine(data, num2vocab):
    sentences = []
    for line in data:
        tmp = ''
        for token in line:
            word = num2vocab[token.item()]
            if word != BOS and word != EOS and word != PAD and word != '<unk>':
                word += ' '
                tmp += word
        sentences.append([tmp])
    
    return sentences

In [61]:
raw_test_trg_sentences = reconstructLine(raw_test_trg, TGT.vocab.stoi)

In [62]:
raw_test_src_sentences = reconstructSrcLine(src_test_data, SRC.vocab.itos)

In [65]:
pip install pytorch-nlp

Collecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K     |███▋                            | 10kB 22.2MB/s eta 0:00:01[K     |███████▎                        | 20kB 18.8MB/s eta 0:00:01[K     |███████████                     | 30kB 14.9MB/s eta 0:00:01[K     |██████████████▌                 | 40kB 14.2MB/s eta 0:00:01[K     |██████████████████▏             | 51kB 11.7MB/s eta 0:00:01[K     |█████████████████████▉          | 61kB 11.5MB/s eta 0:00:01[K     |█████████████████████████▌      | 71kB 11.7MB/s eta 0:00:01[K     |█████████████████████████████   | 81kB 12.9MB/s eta 0:00:01[K     |████████████████████████████████| 92kB 5.9MB/s 
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


In [111]:
from torchnlp.metrics import get_moses_multi_bleu

model = torch.load('best_adam_transformer.pt')
translation = [None] * len(raw_test_src_sentences)
for i in range(len(translation)):
    translation[i] = [translate(model, raw_test_src_sentences[i], TGT.vocab.itos, device)]
    if i == 3:
        break