In [1]:
!pip install spacy



In [2]:
import spacy

In [3]:
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [4]:
!python -m spacy download de

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 782kB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=1e9c528d9790a6ba3568db66e3df40ed65ca1f357135297edc27dc045acec7df
  Stored in directory: /tmp/pip-ephem-wheel-cache-syahoig5/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [5]:
import spacy
from torchtext.data import Field
from torchtext.datasets import IWSLT
import torch
from torch.utils.data import TensorDataset, DataLoader

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

print('Loading dataset')
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<pad>"
SRC = Field(tokenize=tokenize_en, pad_token=BLANK_WORD)
TGT = Field(tokenize=tokenize_de, init_token=BOS_WORD, eos_token = EOS_WORD, pad_token=BLANK_WORD)
max_seq = 30
train, dev, test = IWSLT.splits(exts=('.en', '.de'), fields=(SRC, TGT), filter_pred=lambda x: len(vars(x)['src']) <= max_seq and len(vars(x)['trg']) <= max_seq)

min_freq = 2
batch_size = 100
SRC.build_vocab(train.src, min_freq=min_freq)
TGT.build_vocab(train.trg, min_freq=min_freq)

src_vocab = SRC.vocab
trg_vocab = TGT.vocab
src_train_data = torch.ones(len(train), max_seq)
trg_train_data = torch.ones(len(train), max_seq)
src_dev_data = torch.ones(len(dev), max_seq)
trg_dev_data = torch.ones(len(dev), max_seq)
src_test_data = torch.ones(len(test), max_seq)
raw_test_trg = []

for i, sentence in enumerate(train):
    for j, word in enumerate(sentence.src):
        src_train_data[i][j] = src_vocab.stoi[word]

    for j, word in enumerate(sentence.trg):
        trg_train_data[i][j] = trg_vocab.stoi[word]

for i, sentence in enumerate(dev):
    for j, word in enumerate(sentence.src):
        if word in src_vocab.stoi:
            src_dev_data[i][j] = src_vocab.stoi[word]
        else:
            src_dev_data[i][j] = 0

    for j, word in enumerate(sentence.trg):
        if word in trg_vocab.stoi:
            trg_dev_data[i][j] = trg_vocab.stoi[word]
        else:
            trg_dev_data[i][j] = 0

for i, sentence in enumerate(test):
    for j, word in enumerate(sentence.src):
        if word in src_vocab.stoi:
            src_test_data[i][j] = src_vocab.stoi[word]
        else:
            src_test_data[i][j] = 0

    raw_test_trg.append(sentence.trg)

src_train_data, trg_train_data = src_train_data.type(torch.long), trg_train_data.type(torch.long)
src_dev_data, trg_dev_data = src_dev_data.type(torch.long), trg_dev_data.type(torch.long)
src_test_data = src_test_data.type(torch.long)

print('Building dataset')
train = TensorDataset(src_train_data, trg_train_data)
dev = TensorDataset(src_dev_data, trg_dev_data)
test = TensorDataset(src_test_data)

print('Building dataloader')
train_loader = DataLoader(train, batch_size=batch_size, pin_memory=True)
dev_loader = DataLoader(dev, batch_size=batch_size, pin_memory=True)
test_loader = DataLoader(test, batch_size=batch_size, pin_memory=True)

print('Saving dataloader')
torch.save(src_vocab.stoi, 'src_vocab2num.pt')
torch.save(src_vocab.itos, 'src_num2vocab.pt')
torch.save(trg_vocab.stoi, 'trg_vocab2num.pt')
torch.save(trg_vocab.itos, 'trg_num2vocab.pt')
torch.save(raw_test_trg, 'raw_test_trg.pt')
torch.save(train_loader, 'train_loader.pt')
torch.save(dev_loader, 'dev_loader.pt')
torch.save(test_loader, 'test_loader.pt')

Loading dataset
downloading en-de.tgz


en-de.tgz: 100%|██████████| 23.6M/23.6M [00:12<00:00, 1.86MB/s]


.data/iwslt/en-de/IWSLT16.TED.tst2014.en-de.de.xml
.data/iwslt/en-de/IWSLT16.TED.dev2010.en-de.en.xml
.data/iwslt/en-de/IWSLT16.TED.tst2011.en-de.de.xml
.data/iwslt/en-de/IWSLT16.TED.tst2013.en-de.en.xml
.data/iwslt/en-de/IWSLT16.TED.tst2012.en-de.de.xml
.data/iwslt/en-de/IWSLT16.TED.dev2010.en-de.de.xml
.data/iwslt/en-de/IWSLT16.TED.tst2013.en-de.de.xml
.data/iwslt/en-de/IWSLT16.TED.tst2011.en-de.en.xml
.data/iwslt/en-de/IWSLT16.TED.tst2012.en-de.en.xml
.data/iwslt/en-de/IWSLT16.TED.tst2010.en-de.en.xml
.data/iwslt/en-de/IWSLT16.TED.tst2010.en-de.de.xml
.data/iwslt/en-de/IWSLT16.TED.tst2014.en-de.en.xml
.data/iwslt/en-de/train.tags.en-de.de
.data/iwslt/en-de/train.tags.en-de.en
Building dataset
Building dataloader
Saving dataloader


In [7]:
"""
Author: Yanting Miao
"""
import time
import torch
import torch.nn as nn
import torch.optim as optim
from Model_Pytorch import TransformerModel
from Optim import TransformerOptim

def embedding(x, n_vocab, device, d_model=512):
    embed = nn.Embedding(n_vocab, d_model, padding_idx=0).to(device)
    return embed(x).permute(1, 0, 2)

def calculate_time(start):
    end = time.time()
    t = end - start
    m = t // 60
    s = t - m * 60
    return m, s

def evaluating(model, data, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for src, trg in data:
            src, trg = src.to(device), trg.to(device)
            trg_input = trg[:, :-1]
            trg_real = trg[:, 1:]
            translate = model(src, trg_input)
            loss = criterion(translate, trg_real)
            total_loss += loss.item()

    return total_loss / len(data)

def training(model, train_data, dev_data, n_epochs, criterion, optimizer, device, path):
    train_loss_list = []
    val_loss_list = []
    model.train()
    step = 1
    print_every = len(train_data)
    min_loss = None
    start = time.time()
    for epoch in range(n_epochs):
        running_loss = 0.0
        for src, trg in train_data:
            optimizer.zero_grad()
            src = src.to(device)
            # shifted to right, for example, trg = "<s>I love cats</s>", trg_input = "<s>I love cats", trg_real = "I love cats</s>"
            trg_input = trg[:, :-1].to(device)
            trg_real = trg[:, 1:].to(device)
            translate = model(src, trg_input)
            loss = criterion(translate, trg_real)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
            step += 1
            if step % print_every == 0:
                val_loss = evaluating(model, dev_data, criterion, device)
                m, s = calculate_time(start)
                train_loss_list.append(running_loss / len(train_data))
                val_loss_list.append(val_loss)
                print('%d/%d, (%dm%ds), train loss: %.3f, val loss: %.3f' %
                      (epoch + 1, n_epochs, m, s, running_loss / len(train_data), val_loss))
                if min_loss is None or min_loss > val_loss:
                    if min_loss:
                        print('Validation loss decreaseing: %.4f --> %.4f' % (min_loss, val_loss))
                    else:
                        print('Validation loss in first epoch is: %.4f' % (val_loss))
                    min_loss = val_loss
                    torch.save(model, path)
                running_loss = 0.0
                model.train()
    return train_loss_list, val_loss_list

if __name__ == '__main__':
    n_epochs = 10
    max_seq = 30
    optim_name = 'Adam'
    print('Loading IWSLT dataset')
    train_data = torch.load('train_loader.pt')
    dev_data = torch.load('dev_loader.pt')
    test_data = torch.load('test_loader.pt')
    src_vocab2num = torch.load('src_vocab2num.pt')
    trg_vocab2num = torch.load('trg_vocab2num.pt')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TransformerModel(len(src_vocab2num), len(trg_vocab2num), 512, 1, 1, 1, d_ff=1024).to(device)
    path = 'best_adam_transformer.pt'
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    adam_optim = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9)
    optimizer = TransformerOptim(adam_optim)
    print('Start training')
    start = time.time()
    train_loss, val_loss = training(model, train_data, dev_data, n_epochs, criterion, optimizer, device, path)
    m, s = calculate_time(start)
    print('Training took %dm%ds' % (m, s))
    print('Start testing')
    model = torch.load(path)
    model = model.to(device)
    # test_loss = evaluating(model, test_data, criterion, device)
    # print('Test loss: %.3f' % (test_loss))
    print('Saving experiment result')
    train_loss_path = optim_name + '_train_loss.pt'
    val_loss_path = optim_name + '_val_loss.pt'
    # test_loss_path = optim_name + '_test_loss.pt'
    torch.save(train_loss, train_loss_path)
    torch.save(val_loss, val_loss_path)
    # torch.save(test_loss, test_loss_path)

Loading IWSLT dataset
Start training
1/10, (6m12s), train loss: 3.317, val loss: 2.122
Validation loss in first epoch is: 2.1215
2/10, (12m22s), train loss: 2.068, val loss: 1.775
Validation loss decreaseing: 2.1215 --> 1.7746
3/10, (18m33s), train loss: 1.789, val loss: 1.589
Validation loss decreaseing: 1.7746 --> 1.5889
4/10, (24m43s), train loss: 1.568, val loss: 1.457
Validation loss decreaseing: 1.5889 --> 1.4573
5/10, (30m54s), train loss: 1.420, val loss: 1.384
Validation loss decreaseing: 1.4573 --> 1.3843
6/10, (37m4s), train loss: 1.323, val loss: 1.333
Validation loss decreaseing: 1.3843 --> 1.3328
7/10, (43m15s), train loss: 1.254, val loss: 1.302
Validation loss decreaseing: 1.3328 --> 1.3022
8/10, (49m26s), train loss: 1.202, val loss: 1.276
Validation loss decreaseing: 1.3022 --> 1.2760
9/10, (55m36s), train loss: 1.163, val loss: 1.263
Validation loss decreaseing: 1.2760 --> 1.2625
10/10, (61m47s), train loss: 1.129, val loss: 1.241
Validation loss decreaseing: 1.2625 