In [1]:
import torch
import torch.nn as nn
import torchtext
from torchtext.data import Field, TabularDataset, Iterator, batch
from torchtext.data import Iterator, BucketIterator
from torchtext.vocab import Vectors
from src.models import get_model
from torch.optim import Adam
from tqdm.notebook import tqdm
from torch.autograd import Variable
import spacy
import re
import numpy as np
import pandas as pd
import os
from IPython.core.debugger import set_trace

In [2]:
EPOCH = 2000
K = 128
H = 4
N = 6
dropout = 0.1
MIN_FREQ = 2
MAX_LEN = 100
B = 32
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
class Tokenizer(object):
    
    def __init__(self, lang):
        self.nlp = spacy.load(lang)
            
    def tokenize(self, sentence):
        sentence = re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
        sentence = re.sub(r"[ ]+", " ", sentence)
        sentence = re.sub(r"\!+", "!", sentence)
        sentence = re.sub(r"\,+", ",", sentence)
        sentence = re.sub(r"\?+", "?", sentence)
        sentence = sentence.lower()
        return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]

In [4]:
BOS_WORD = '<sos>'
EOS_WORD = '<eos>'
TRG = Field(
    lower=True, 
    tokenize=Tokenizer('fr_core_news_sm').tokenize, 
    init_token=BOS_WORD, 
    eos_token=EOS_WORD,
    batch_first=True
)
SRC = Field(
    lower=True, 
    tokenize=Tokenizer('en_core_web_sm').tokenize,
    batch_first=True
)

In [5]:
path = 'C:/Users/bill/Documents/projects/data/tutorial/transformer/'
if not os.path.exists(os.path.join(path, 'temp.csv')):
    source = open(os.path.join(path, 'english.txt'), encoding='utf8').read().strip().split('\n')
    target = open(os.path.join(path, 'french.txt'), encoding='utf8').read().strip().split('\n')
    df = pd.DataFrame({
        'src' : source, 
        'trg': target 
    }, columns=["src", "trg"])
    df = df[(df['src'].str.count(' ') <= MAX_LEN) & (df['trg'].str.count(' ') <= MAX_LEN)]
    df.to_csv(os.path.join(path, 'temp.csv'), index=False)
data = TabularDataset(
    os.path.join(path, 'temp.csv'), 
    format='csv', 
    fields=[
        ('src', SRC), 
        ('trg', TRG)
    ])
train, valid, test = data.split(split_ratio=[0.8, 0.1, 0.1])

In [6]:
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TRG.build_vocab(train.trg, min_freq=MIN_FREQ)
src_pad = SRC.vocab.stoi['<pad>']
trg_pad = TRG.vocab.stoi['<pad>']

In [7]:
class HarvardIterator(Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in batch(self.data(), 
                           self.batch_size,
                           self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

#B = 32
#train_iter = MyIterator(
#    train, 
#     batch_size=B, 
#     device='cuda',
#     repeat=False, 
#     sort_key=lambda x: (len(x.src), len(x.trg)),
#     batch_size_fn=batch_size_fn, 
#     train=True, 
#     shuffle=True
# )

In [13]:
itrain, ivalid = BucketIterator.splits(
    (train, valid),
    batch_sizes=(B, B),
    device=device,
    sort_key=lambda x: (len(x.src), len(x.trg)),
    sort_within_batch=False,
    repeat=False
)
itest = Iterator(
    test,
    batch_size=B,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False
)

In [9]:
model = get_model(
    src_vocab=len(SRC.vocab), 
    trg_vocab=len(TRG.vocab), 
    K=K,
    H=H,
    N=N,
    dropout=dropout
)

In [15]:
def nopeak_mask(size):
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    np_mask = Variable(torch.from_numpy(np_mask) == 0)
    np_mask = np_mask.cuda()
    return np_mask

def create_masks(src, trg):
    src_mask = (src != src_pad).unsqueeze(-2)
    if trg is not None:
        trg_mask = (trg != trg_pad).unsqueeze(-2)
        size = trg.size(1)
        np_mask = nopeak_mask(size)
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None
    return src_mask, trg_mask

def scoring(device, model, criterion, iterator):
    with torch.no_grad():
        total_loss = []
        for batch in iterator:
            src = batch.src.to(device)
            trg = batch.trg.to(device)
            trg_input = trg[:, :-1]
            src_mask, trg_mask = create_masks(src, trg_input)
            preds = model(src, trg_input, src_mask, trg_mask)
            loss = criterion(
                preds.view(-1, preds.size(-1)), 
                trg[:, 1:].contiguous().view(-1))
            total_loss.append(loss.item())
    return np.mean(total_loss)

In [None]:
print("training model...")
model.to(device)
#model.train()

optimizer = Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad)
best_loss = 10

for epoch in range(EPOCH):
    
    total_loss = []
    for batch in tqdm(itrain): 
        
        # index of the tokens B x T
        src = batch.src.to(device)
        trg = batch.trg.to(device)
        
        optimizer.zero_grad()
        
        trg_input = trg[:, :-1]
        src_mask, trg_mask = create_masks(src, trg_input)
        preds = model(src, trg_input, src_mask, trg_mask)
        
        #set_trace()
        
        loss = criterion(
            preds.view(-1, preds.size(-1)), 
            trg[:, 1:].contiguous().view(-1))

        total_loss.append(loss.item())   
        
        loss.backward()
        optimizer.step()
    
    train_loss = np.mean(total_loss)
    valid_loss = scoring(device, model, criterion, ivalid)

    print("epoch %d, train_loss = %.3f, valid_loss = %.03f" % (epoch, train_loss, valid_loss))
    
    if valid_loss < best_loss:
        torch.save({
                'loss': valid_loss,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, 'C:/Users/bill/Documents/projects/tutorials/model.tar')

training model...


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 0, train_loss = 2.622, valid_loss = 2.370


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 1, train_loss = 2.266, valid_loss = 2.105


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 2, train_loss = 2.041, valid_loss = 1.939


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 3, train_loss = 1.883, valid_loss = 1.826


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 4, train_loss = 1.767, valid_loss = 1.747


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 5, train_loss = 1.675, valid_loss = 1.677


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 6, train_loss = 1.602, valid_loss = 1.626


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 7, train_loss = 1.542, valid_loss = 1.587


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 8, train_loss = 1.489, valid_loss = 1.563


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 9, train_loss = 1.444, valid_loss = 1.520


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 10, train_loss = 1.408, valid_loss = 1.502


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 11, train_loss = 1.378, valid_loss = 1.497


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 12, train_loss = 1.367, valid_loss = 1.488


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))


epoch 13, train_loss = 1.355, valid_loss = 1.469


HBox(children=(FloatProgress(value=0.0, max=3873.0), HTML(value='')))