In [1]:
import copy
import math
import time

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import IterableDataset
from torch.utils.data import DataLoader

In [2]:
# --- Paths -----------------------------------------------------------------
# NOTE: DATA_DIR already ends with '/', so the f-strings below produce paths
# such as 'data//bpecodes' (a doubled separator).
DATA_DIR = 'data/'

# Each list is unpacked into Tokenizer(bpe_file, vocab_file) by NMT_dataset.
SRC_VOCAB_FILE = [f'{DATA_DIR}/bpecodes', f'{DATA_DIR}/dict.ru.txt']
TRG_VOCAB_FILE = [f'{DATA_DIR}/bpecodes', f'{DATA_DIR}/dict.en.txt']

# Parallel training corpus, one sentence per line.
RU_DATA = f'{DATA_DIR}/train.ru'
EN_DATA = f'{DATA_DIR}/train.en'

# --- Sequence / vocabulary constants ---------------------------------------
# Batches are truncated to this many tokens; beam search also stops here.
MAX_SEQ_LEN = 255
# Index of '<pad>' in Tokenizer.vocab.
PAD_ID = 1
# Index of '<eos>' in Tokenizer.vocab; also fed to the decoder as the first
# input token during training and beam search.
BOUND_ID = 2

# --- Model hyper-parameters ------------------------------------------------
enc_ffn_dims = 8192   # hidden width of the encoder feed-forward blocks
dec_ffn_dims = 4096   # hidden width of the decoder feed-forward blocks
d_model = 1024        # embedding / residual stream width
heads = 16            # attention heads per layer
N = 6                 # number of encoder layers and of decoder layers
WARM_UP_STEPS = 16000 # linear warm-up length used by get_lr
INIT_LR = 2           # constant factor of the learning-rate schedule in get_lr

# --- Training loop ---------------------------------------------------------
EPOCHS = 10
BATCH_SIZE = 64

# NOTE(review): the second GPU ('cuda:1') is selected whenever any CUDA device
# is available; on a single-GPU machine that ordinal may not exist - confirm.
DEVICE = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda', index=1)

In [3]:
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    A table of ``max_seq_len`` rows is precomputed and stored in the ``pe``
    buffer with shape ``(1, max_seq_len, d_model)``. The first half of each row
    holds the sines and the second half the cosines.

    Args:
        d_model: width of the embeddings the encodings are added to.
        max_seq_len: number of rows in the precomputed table.
    """

    def __init__(self, d_model, max_seq_len = 256):
        super().__init__()
        self.d_model = d_model
        # Buffer name and shape are unchanged so existing checkpoints still load.
        self.register_buffer('pe', self._build_table(d_model, max_seq_len).unsqueeze(0))

    @staticmethod
    def _build_table(d_model, num_positions):
        """Return the ``(num_positions, 2 * (d_model // 2))`` sinusoid table."""
        half_dim = d_model // 2
        step = math.log(10000) / (half_dim - 1)
        inv_freq = torch.exp(torch.arange(half_dim, dtype=torch.float) * -step)
        angles = torch.arange(num_positions, dtype=torch.float).unsqueeze(1) * inv_freq.unsqueeze(0)
        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1).view(num_positions, -1)

    def forward(self, x):
        """Return ``x`` plus the encodings for table rows ``2 .. x.size(1) + 1``.

        Rows 0 and 1 of the table are never used.
        NOTE(review): presumably this offset mirrors a "padding_idx + 1"
        position convention of the checkpoint's origin - confirm.
        """
        length = x.size(1) + 2
        table = self.pe
        if length > table.size(1):
            # The stored table is too short for this input (e.g. 255 tokens with
            # the default 256 rows). Build a longer one on the fly instead of
            # failing with a shape mismatch; the registered buffer is left
            # untouched so the state_dict keeps its shape.
            table = self._build_table(self.d_model, length).unsqueeze(0)
            table = table.to(device=x.device, dtype=self.pe.dtype)
        return x + table[:, 2:length]

In [4]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        q, k, v: query, key and value tensors; the last two dimensions of
            ``q`` and ``k`` are multiplied as ``q @ k^T``.
        d_k: per-head width; the scores are divided by ``sqrt(d_k)``.
        mask: optional tensor; a dimension is inserted at index 1 and every
            position where the mask equals 0 is excluded from the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        The attention-weighted combination of ``v``.
    """
    weights = (q @ k.transpose(-2, -1)) / np.sqrt(d_k)

    if mask is not None:
        # Extra axis at dim 1 so one mask is shared by all heads.
        blocked = mask.unsqueeze(1) == 0
        weights = weights.masked_fill(blocked, -1e9)

    weights = F.softmax(weights, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return weights @ v

In [5]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate query/key/value projections.

    Args:
        heads: number of attention heads.
        d_model: model width; each head works on ``d_model // heads`` features.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        # Creation order (q, v, k, dropout, out) fixes both the parameter order
        # and the order in which the RNG is consumed at initialisation.
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, heads, seq, d_k)``."""
        return projected.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Project the inputs, attend per head, merge heads and project out."""
        batch_size = q.size(0)

        k = self._split_heads(self.k_linear(k), batch_size)
        q = self._split_heads(self.q_linear(q), batch_size)
        v = self._split_heads(self.v_linear(v), batch_size)

        context = attention(q, k, v, self.d_k, mask, self.dropout)

        # Back to (batch, seq, d_model) before the output projection.
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.out(merged)

In [6]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> dropout -> Linear.

    Args:
        d_model: input and output width.
        d_ff: hidden width.
        dropout: dropout probability applied after the ReLU (training only).
    """

    def __init__(self, d_model, d_ff, dropout = 0.1):
        super().__init__()
        self.dropout = dropout
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Apply the block to the last dimension of ``x``."""
        hidden = F.relu(self.linear_1(x))
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        return self.linear_2(hidden)

In [7]:
class EncoderLayer(nn.Module):
    """One post-norm encoder layer: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: model width.
        heads: number of attention heads.
        ffn_dim: hidden width of the feed-forward block.
        dropout: dropout probability used on the residual branches and passed
            on to the attention and feed-forward sub-modules.
    """

    def __init__(self, d_model, heads, ffn_dim, dropout = 0.1):
        super().__init__()
        self.dropout = dropout
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        # Forward `dropout` so a non-default value reaches the sub-modules too;
        # previously they always used their own default of 0.1.
        self.attn = MultiHeadAttention(heads, d_model, dropout)
        self.ff = FeedForward(d_model, ffn_dim, dropout)

    def forward(self, x, mask):
        """Run the layer on ``x``; ``mask`` is passed to self-attention."""
        x = x + F.dropout(self.attn(x, x, x, mask), p = self.dropout, training = self.training)
        x = self.norm_1(x)
        x = x + F.dropout(self.ff(x), p = self.dropout, training = self.training)
        x = self.norm_2(x)
        return x

class DecoderLayer(nn.Module):
    """One post-norm decoder layer.

    Sub-layers, in order: masked self-attention, attention over the encoder
    output, feed-forward. Each sub-layer output goes through dropout, is added
    to its input and is then layer-normalised.

    Args:
        d_model: model width.
        heads: number of attention heads.
        ffn_dim: hidden width of the feed-forward block.
        dropout: dropout probability used on the residual branches and passed
            on to the attention and feed-forward sub-modules.
    """

    def __init__(self, d_model, heads, ffn_dim, dropout=0.1):
        super().__init__()
        self.dropout = dropout
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.norm_3 = nn.LayerNorm(d_model)
        # Forward `dropout` so a non-default value reaches the sub-modules too;
        # previously they always used their own default of 0.1.
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout)
        self.ff = FeedForward(d_model, ffn_dim, dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Run the layer.

        Args:
            x: decoder hidden states.
            e_outputs: encoder output used as keys and values of ``attn_2``.
            src_mask: mask for the encoder positions.
            trg_mask: mask for the decoder self-attention.
        """
        x = x + F.dropout(self.attn_1(x, x, x, trg_mask), p = self.dropout, training = self.training)
        x = self.norm_1(x)
        x = x + F.dropout(self.attn_2(x, e_outputs, e_outputs, src_mask), p = self.dropout, training = self.training)
        x = self.norm_2(x)
        x = x + F.dropout(self.ff(x), p = self.dropout, training = self.training)
        x = self.norm_3(x)
        return x

In [8]:
class Encoder(nn.Module):
    """Stack of ``N`` encoder layers on top of scaled embeddings.

    Args:
        vocab_size: number of rows in the source embedding table.
        d_model: model width.
        N: number of encoder layers.
        heads: attention heads per layer.
        ffn_dim: hidden width of each feed-forward block.
    """

    def __init__(self, vocab_size, d_model, N, heads, ffn_dim):
        super().__init__()
        self.N = N
        self.d_model = d_model
        # padding_idx=1 is the '<pad>' index (PAD_ID in the config cell).
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.pe = PositionalEncoder(d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, heads, ffn_dim) for _ in range(N)])

    def forward(self, src, mask):
        """Encode token ids ``src``; ``mask`` is handed to every layer."""
        x = self.embed(src) * np.sqrt(self.d_model)
        x = self.pe(x)
        # Iterate over this instance's own layers. The previous loop used the
        # notebook-level global `N`, which breaks as soon as an Encoder is
        # built with a depth different from that global.
        for layer in self.layers:
            x = layer(x, mask)
        return x
    
class Decoder(nn.Module):
    """Stack of ``N`` decoder layers on top of scaled target embeddings.

    Args:
        vocab_size: number of rows in the target embedding table.
        d_model: model width.
        N: number of decoder layers.
        heads: attention heads per layer.
        ffn_dim: hidden width of each feed-forward block.
    """

    def __init__(self, vocab_size, d_model, N, heads, ffn_dim):
        super().__init__()
        self.N = N
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.pe = PositionalEncoder(d_model)

        stack = []
        for _ in range(N):
            stack.append(DecoderLayer(d_model, heads, ffn_dim))
        self.layers = nn.ModuleList(stack)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """Decode ``trg`` token ids while attending to ``e_outputs``."""
        scale = np.sqrt(self.d_model)
        hidden = self.pe(self.embed(trg) * scale)
        for depth in range(self.N):
            hidden = self.layers[depth](hidden, e_outputs, src_mask, trg_mask)
        return hidden

In [9]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer whose output layer reuses the decoder
    embedding matrix (no separate projection weights).

    Args:
        src_vocab_size: size of the source vocabulary.
        trg_vocab_size: size of the target vocabulary.
        d_model: model width.
        N: number of layers in both encoder and decoder.
        heads: attention heads per layer.
        enc_ffn_dim: feed-forward width in the encoder.
        dec_ffn_dim: feed-forward width in the decoder.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, d_model, N, heads, enc_ffn_dim, dec_ffn_dim):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, d_model, N, heads, enc_ffn_dim)
        self.decoder = Decoder(trg_vocab_size, d_model, N, heads, dec_ffn_dim)

    def forward(self, src, trg, src_mask, trg_mask):
        """Encode ``src``, decode ``trg`` against it and return vocab logits."""
        memory = self.encoder(src, src_mask)
        return self.out(trg, memory, src_mask, trg_mask)

    def out(self, trg, e_outputs, src_mask, trg_mask):
        """Decode ``trg`` against precomputed encoder output ``e_outputs``.

        Returns logits obtained by multiplying the decoder states with the
        decoder embedding matrix.
        """
        hidden = self.decoder(trg, e_outputs, src_mask, trg_mask)
        return F.linear(hidden, self.decoder.embed.weight)

In [10]:
class Tokenizer:
    """Moses tokenisation + fastBPE segmentation + vocabulary lookup.

    The vocabulary is the four special tokens ``<sos>``, ``<pad>``, ``<eos>``,
    ``<unk>`` (indices 0-3) followed by the first whitespace-separated field of
    every non-blank line of ``vocab_file``.
    """

    def __init__(self, bpe_file: str, vocab_file: str):
        """
        Args:
            bpe_file: path to the BPE codes handed to ``fastBPE.fastBPE``.
            vocab_file: UTF-8 text file with one vocabulary entry per line.
        """
        import fastBPE
        from sacremoses import MosesDetokenizer, MosesTokenizer

        self.bpe = fastBPE.fastBPE(bpe_file)
        # NOTE(review): the tokenizer is always built for 'ru' and the
        # detokenizer for 'en', whichever side this instance serves - confirm
        # this matches the preprocessing the model was trained with.
        self.tokenizer = MosesTokenizer('ru')
        self.detokenizer = MosesDetokenizer('en')

        # Read line by line inside a context manager so the handle is closed.
        # Blank lines are skipped; this keeps the last entry of a file without
        # a trailing newline and no longer crashes on an empty line.
        with open(vocab_file, encoding='utf-8') as vocab_fh:
            entries = [fields[0] for fields in (line.split() for line in vocab_fh) if fields]
        self.vocab = ['<sos>', '<pad>', '<eos>', '<unk>'] + entries
        self.t2i = {t : i for i, t in enumerate(self.vocab)}

    def encode(self, sent: str, add_eos = False):
        """Turn a raw sentence into a list of vocabulary indices.

        Unknown BPE pieces map to ``<unk>``; ``<eos>`` is appended when
        ``add_eos`` is true.
        """
        sent = self.tokenizer.tokenize(sent, aggressive_dash_splits=True, return_str=True)
        pieces = self.bpe.apply([sent])[0].split()
        unk_id = self.t2i['<unk>']
        tokens = [self.t2i.get(piece, unk_id) for piece in pieces]
        if add_eos:
            tokens.append(self.t2i['<eos>'])
        return tokens

    def decode(self, tokens: list):
        """Turn vocabulary indices back into a detokenised sentence."""
        sent = ' '.join(self.vocab[t] for t in tokens)
        # '@@ ' marks a BPE continuation; removing it re-joins the pieces.
        sent = (sent + ' ').replace('@@ ', '').rstrip()
        sent = self.detokenizer.detokenize(sent.split())
        return sent

In [11]:
class NMT_dataset(IterableDataset):
    """Streams tokenised (source, target) pairs from two parallel text files.

    Args:
        ru_data: path to the source-side file, one sentence per line.
        en_data: path to the target-side file, one sentence per line.
    """

    def __init__(self, ru_data, en_data):
        self.ru_data = ru_data
        self.en_data = en_data
        self.src_tokenizer = Tokenizer(*SRC_VOCAB_FILE)
        self.trg_tokenizer = Tokenizer(*TRG_VOCAB_FILE)

    def src_line_mapper(self, line):
        """Strip newlines from a source line and encode it with ``<eos>``."""
        line = line.replace('\n', '')
        tokens = self.src_tokenizer.encode(line, True)
        return tokens

    def trg_line_mapper(self, line):
        """Strip newlines from a target line and encode it with ``<eos>``."""
        line = line.replace('\n', '')
        tokens = self.trg_tokenizer.encode(line, True)
        return tokens

    def __iter__(self):
        """Yield ``(src_tokens, trg_tokens)`` for each pair of lines.

        Iteration stops at the end of the shorter file. Both files are opened
        in a ``with`` block so they are closed once the iterator is exhausted
        or discarded; previously the handles were never closed.
        """
        with open(self.ru_data, 'r', encoding='utf-8') as ru_file, \
                open(self.en_data, 'r', encoding='utf-8') as en_file:
            for ru_line, en_line in zip(ru_file, en_file):
                yield self.src_line_mapper(ru_line), self.trg_line_mapper(en_line)

In [12]:
def pad_batch(batch):
    """Collate ``(src_tokens, trg_tokens)`` pairs into two padded LongTensors.

    Each side is padded with ``PAD_ID`` to the longest sequence of that side
    in the batch, capped at ``MAX_SEQ_LEN``; longer sequences are truncated.

    Args:
        batch: non-empty list of ``(src_tokens, trg_tokens)`` pairs.

    Returns:
        ``(src, trg)`` tensors of dtype ``torch.long`` with shapes
        ``(len(batch), src_len)`` and ``(len(batch), trg_len)``.
    """
    src_len = min(max(len(src) for src, _ in batch), MAX_SEQ_LEN)
    trg_len = min(max(len(trg) for _, trg in batch), MAX_SEQ_LEN)

    # Fill preallocated arrays in place and convert once. Building a tensor
    # from a list of separately padded ndarrays is the slow conversion path.
    src_batch = np.full((len(batch), src_len), PAD_ID, dtype=np.int64)
    trg_batch = np.full((len(batch), trg_len), PAD_ID, dtype=np.int64)
    for row, (src, trg) in enumerate(batch):
        src = src[:src_len]
        trg = trg[:trg_len]
        src_batch[row, :len(src)] = src
        trg_batch[row, :len(trg)] = trg

    return torch.from_numpy(src_batch), torch.from_numpy(trg_batch)

In [13]:
def nopeak_mask(size):
    """Return a ``(1, size, size)`` boolean causal mask on ``DEVICE``.

    Entry ``[0, i, j]`` is True when ``j <= i``, i.e. position ``i`` may look
    at itself and at earlier positions only.
    """
    index = torch.arange(size)
    # Column index <= row index: the diagonal and everything below it.
    allowed = index.unsqueeze(0) <= index.unsqueeze(1)
    return allowed.unsqueeze(0).to(DEVICE)

def create_masks(src, trg):
    """Build the attention masks for one batch.

    Args:
        src: source token ids; positions equal to ``PAD_ID`` are masked out.
        trg: decoder input token ids, or None.

    Returns:
        ``(src_mask, trg_mask)``. ``src_mask`` is the non-padding mask of
        ``src`` with an extra axis inserted before the last one. ``trg_mask``
        is None when ``trg`` is None; otherwise it combines the non-padding
        mask of ``trg`` with the causal mask from ``nopeak_mask``.
    """
    src_mask = (src != PAD_ID).unsqueeze(-2)
    if trg is None:
        return src_mask, None

    not_pad = (trg != PAD_ID).unsqueeze(-2)
    causal = nopeak_mask(trg.size(1)).to(DEVICE)
    return src_mask, not_pad & causal

In [14]:
# Streaming dataset over the parallel corpus; constructing it also loads the
# source and target tokenizers.
dataset = NMT_dataset(RU_DATA, EN_DATA)
# pad_batch turns each list of (src, trg) token lists into two padded tensors.
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, collate_fn=pad_batch)

In [15]:
# Vocabulary sizes (special tokens included) determine the embedding tables.
src_vocab_size = len(dataset.src_tokenizer.vocab)
trg_vocab_size = len(dataset.trg_tokenizer.vocab)

# Train

In [16]:
# Build the model on DEVICE, then re-initialise every parameter that has more
# than one dimension (linear weights and both embedding tables) with
# Xavier-uniform. One-dimensional parameters (biases, LayerNorm) keep their
# default initialisation.
# NOTE(review): this also overwrites the embedding row at padding_idx - confirm
# that is intended.
model = Transformer(src_vocab_size, trg_vocab_size, d_model, N, heads, enc_ffn_dims, dec_ffn_dims).to(DEVICE)
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [17]:
def get_lr(step):
    """Learning-rate schedule: linear warm-up, then inverse-square-root decay.

    Computes ``INIT_LR * d_model**-0.5 * min(1, step / WARM_UP_STEPS)
    / sqrt(max(step, WARM_UP_STEPS))``. The value is 0 at step 0 and peaks at
    ``step == WARM_UP_STEPS``.
    """
    warmup_factor = min(1.0, step / WARM_UP_STEPS)
    decay_factor = 1 / np.sqrt(max(step, WARM_UP_STEPS))
    return INIT_LR * d_model ** (-0.5) * warmup_factor * decay_factor

In [18]:
# The base lr is 1.0 because the LambdaLR scheduler multiplies the optimizer's
# initial lr by get_lr(step), and get_lr already contains the INIT_LR factor.
# Passing lr=INIT_LR here applied that factor twice.
optim = torch.optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.98), eps=1e-9)

In [19]:
# LambdaLR sets each parameter group's lr to its initial lr multiplied by
# get_lr(step), where step counts the calls to scheduler.step().
scheduler = torch.optim.lr_scheduler.LambdaLR(optim, get_lr, last_epoch=-1)

In [20]:
import logging

# Root logger at DEBUG level, with an additional file handler on train.log.
# Mode "w" truncates train.log every time this cell runs.
# NOTE: re-running this cell attaches one more FileHandler to the root logger,
# so each record is then written once per attached handler.
logging.basicConfig(level=logging.DEBUG)
LOGGER = logging.getLogger()
FH = logging.FileHandler("train.log", "w", encoding="utf8")
FH.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
LOGGER.setLevel(logging.DEBUG)
LOGGER.addHandler(FH)

In [21]:
from tqdm import tqdm

def train_model(epochs, print_every=10):
    """Train the notebook-level ``model`` with teacher forcing.

    Uses the notebook-level ``dataloader``, ``optim``, ``scheduler`` and
    ``LOGGER``. Side effects: logs the running loss, logs a sample prediction
    every 100 iterations, and writes checkpoints every 20000 iterations and at
    the end of every epoch.

    Args:
        epochs: number of passes over ``dataloader``.
        print_every: number of iterations between loss log lines.
    """
    model.train()

    total_loss = 0.0

    for epoch in range(epochs):
        # Keeps the end-of-epoch checkpoint name defined for an empty loader.
        i = -1
        for i, (src, trg) in tqdm(enumerate(dataloader)):
            # NOTE: PyTorch expects scheduler.step() after optim.step() and warns
            # about this order. It is kept because it gives the first update the
            # lr of step 1 rather than get_lr(0) == 0.
            scheduler.step()

            src = src.to(DEVICE)
            trg = trg.to(DEVICE)

            # Decoder input is the target shifted right by one, starting with
            # BOUND_ID.
            trg_input = F.pad(trg[:, :-1], (1, 0), value=BOUND_ID)
            src_mask, trg_mask = create_masks(src, trg_input)
            preds = model(src, trg_input, src_mask, trg_mask)

            optim.zero_grad()
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), trg.view(-1), ignore_index=PAD_ID)
            loss.backward()
            optim.step()

            # Accumulate a plain float; summing the loss tensors kept autograd
            # references alive and logged a tensor repr.
            total_loss += loss.item()

            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                current_lr = optim.param_groups[0]['lr']
                LOGGER.info(f"iter = {i + 1}, lr = {current_lr}, loss = {loss_avg}")
                total_loss = 0.0
            if (i + 1) % 100 == 0:
                # Argmax of the first sentence only. The previous code converted
                # the whole batch of logits to a Python list first, and used the
                # non-existent attribute `dataset.tokenizer`.
                predicted = preds[0].argmax(dim=-1).tolist()
                LOGGER.info(f'Prediction: {dataset.trg_tokenizer.decode(predicted)}')
                LOGGER.info(f'True: {dataset.trg_tokenizer.decode(trg[0].tolist())}')
            if (i + 1) % 20000 == 0:
                torch.save(model.state_dict(), f'model.epoch{epoch}it{i+1}.pt')

        torch.save(model.state_dict(), f'model.epoch{epoch}.it{i+1}.pt')

In [22]:
# train_model(EPOCHS)

# Save model

In [23]:
# torch.save(model.state_dict(), f'model.pt')

# Load model

In [24]:
# Rebuild the architecture and restore the trained weights from PATH.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted
# source.
PATH = "model.pt"
model = Transformer(src_vocab_size, trg_vocab_size, d_model, N, heads, enc_ffn_dims, dec_ffn_dims).to(DEVICE)
model.load_state_dict(torch.load(PATH, map_location=DEVICE))

<All keys matched successfully>

# Beam search

In [25]:
import math

def init_vars(sentence, model, K):
    """Encode the source once and seed ``K`` beams with the best first tokens.

    Args:
        sentence: source token ids; only the first batch entry of the encoder
            output is used.
        model: model exposing ``encoder`` and ``out``.
        K: beam width.

    Returns:
        ``(outputs, e_outputs, log_scores)``: ``outputs`` is a
        ``(K, MAX_SEQ_LEN)`` LongTensor with ``BOUND_ID`` in column 0 and the
        top-``K`` first tokens in column 1; ``e_outputs`` is the encoder output
        repeated ``K`` times; ``log_scores`` is a ``(1, K)`` tensor of the
        log-probabilities of those first tokens.
    """
    # Inference only: without no_grad the encoder graph stays alive for the
    # whole search.
    with torch.no_grad():
        src_mask = (sentence != PAD_ID)
        e_output = model.encoder(sentence, src_mask)

        start = torch.full((1, 1), BOUND_ID, dtype=torch.long, device=DEVICE)
        out = model.out(start, e_output, src_mask, nopeak_mask(1))

        # log_softmax + topk selects the same tokens as softmax + topk but
        # cannot fail on a probability that underflowed to 0, unlike math.log.
        log_probs, ix = F.log_softmax(out[:, -1], dim=-1).topk(K)
        log_scores = log_probs[0].float().unsqueeze(0).to(DEVICE)

        outputs = torch.zeros(K, MAX_SEQ_LEN, dtype=torch.long, device=DEVICE)
        outputs[:, 0] = BOUND_ID
        outputs[:, 1] = ix[0]

        # Every beam attends to the same encoded source sentence.
        e_outputs = e_output[:1].repeat(K, 1, 1).to(DEVICE)

    return outputs, e_outputs, log_scores

def k_best_outputs(outputs, out, log_scores, i, k):
    """Advance the beams by one token and keep the ``k`` best hypotheses.

    Args:
        outputs: ``(k, max_len)`` LongTensor of current hypotheses; it is
            updated in place.
        out: probabilities over the vocabulary for every beam and position
            (the caller applies softmax first); only the last position is used.
        log_scores: ``(1, k)`` cumulative log-probabilities of the beams.
        i: column of ``outputs`` to fill.
        k: beam width.

    Returns:
        ``(outputs, log_scores)`` for the ``k`` surviving hypotheses, best
        first.
    """
    probs, ix = out[:, -1].detach().topk(k)

    # Vectorised log. A probability that underflowed to 0 becomes -inf and
    # simply loses the ranking; math.log raised ValueError on it.
    log_probs = torch.log(probs).to(log_scores.dtype) + log_scores.transpose(0, 1)

    k_probs, k_ix = log_probs.view(-1).topk(k)

    # Flat candidate index -> (parent beam, rank within that beam's top-k).
    row = k_ix // k
    col = k_ix % k

    outputs[:, :i] = outputs[row, :i]
    outputs[:, i] = ix[row, col]

    log_scores = k_probs.unsqueeze(0)

    return outputs, log_scores

def beam_search(sentence, model, K = 10):
    """Translate one encoded sentence with beam search and return the text.

    The search stops as soon as all ``K`` hypotheses contain ``BOUND_ID`` after
    the start token. The hypothesis with the best length-normalised score
    (``log_score / length ** 0.6``) is then returned. If that never happens
    within ``MAX_SEQ_LEN`` steps, the top-ranked hypothesis is returned, cut at
    its first ``BOUND_ID`` if it has one.

    Args:
        sentence: source token ids for a single sentence.
        model: trained model exposing ``encoder`` and ``out``.
        K: beam width.

    Returns:
        The decoded target sentence.
    """
    alpha = 0.6  # length-penalty exponent
    ind = None

    # Inference only: avoid building autograd graphs at every decoding step.
    with torch.no_grad():
        outputs, e_outputs, log_scores = init_vars(sentence, model, K)
        src_mask = (sentence != PAD_ID).to(DEVICE)

        max_len = outputs.size(1)
        positions = torch.arange(max_len, device=outputs.device).unsqueeze(0).expand_as(outputs)
        no_end = torch.full_like(outputs, max_len)

        for step in range(2, MAX_SEQ_LEN):
            trg_mask = nopeak_mask(step)
            out = model.out(outputs[:, :step], e_outputs, src_mask, trg_mask)
            out = F.softmax(out, dim=-1)

            outputs, log_scores = k_best_outputs(outputs, out, log_scores, step, K)

            # Position of the first BOUND_ID after column 0 for every beam
            # (column 0 always holds the start token); 0 means "not finished".
            is_end = (outputs == BOUND_ID) & (positions > 0)
            first_end = torch.where(is_end, positions, no_end).min(dim=1)[0]
            sentence_lengths = torch.where(first_end < max_len, first_end, torch.zeros_like(first_end))

            if int((sentence_lengths > 0).sum()) == K:
                div = 1 / (sentence_lengths.type_as(log_scores) ** alpha)
                _, ind = torch.max(log_scores * div, 1)
                ind = ind.data[0]
                break

    best = 0 if ind is None else ind
    hits = (outputs[best] == BOUND_ID).nonzero()
    # hits[0] is the start token at column 0, so a real end marker exists only
    # when there is a second hit. Previously the fallback path indexed hits[1]
    # unconditionally and raised IndexError when the top beam never finished.
    end = int(hits[1]) if len(hits) > 1 else max_len
    res = outputs[best][1:end].data.tolist()
    return dataset.trg_tokenizer.decode(res)

# Inference

In [26]:
# Switch to evaluation mode so the dropout calls (which all depend on the
# module's `training` flag) are disabled during inference.
model.eval()

Transformer(
  (encoder): Encoder(
    (embed): Embedding(31232, 1024, padding_idx=1)
    (pe): PositionalEncoder()
    (layers): ModuleList(
      (0): EncoderLayer(
        (norm_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (norm_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadAttention(
          (q_linear): Linear(in_features=1024, out_features=1024, bias=True)
          (v_linear): Linear(in_features=1024, out_features=1024, bias=True)
          (k_linear): Linear(in_features=1024, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ff): FeedForward(
          (linear_1): Linear(in_features=1024, out_features=8192, bias=True)
          (linear_2): Linear(in_features=8192, out_features=1024, bias=True)
        )
      )
      (1): EncoderLayer(
        (norm_1): LayerNorm((1024,), eps=1e-05, elementwise_aff

In [27]:
def translate(sent, beam_size=5):
    """Translate one raw source sentence with the notebook-level ``model``.

    Args:
        sent: source sentence as plain text.
        beam_size: beam width handed to ``beam_search``; the default of 5 is
            the value that used to be hard-coded.

    Returns:
        The translated sentence as a string.
    """
    token_ids = dataset.src_line_mapper(sent)
    # Batch of one sentence, created directly on the model's device.
    batch = torch.tensor([token_ids], dtype=torch.long, device=DEVICE)
    return beam_search(batch, model, beam_size)

In [28]:
# Smoke test: translate a single Russian sentence.
translate('Я устал делать эту домашку')

"I'm tired of doing this housework"

In [29]:
# Evaluation sentences, one per line. The `with` block closes the file handle,
# which was previously left open.
with open('data/eval-ru-100.txt', encoding='utf-8') as eval_file:
    data = eval_file.read().split('\n')

In [30]:
from tqdm.autonotebook import tqdm

In [31]:
# Translate every evaluation sentence, with a progress bar.
trans = [translate(x) for x in tqdm(data)]

HBox(children=(IntProgress(value=0), HTML(value='')))




In [32]:
# Write one translation per line. The `with` block flushes and closes the file
# deterministically instead of relying on garbage collection.
with open('answer.txt', 'w', encoding='utf-8') as answer_file:
    answer_file.write('\n'.join(trans))

20913

In [36]:
import sacrebleu
# corpus_bleu takes a list of reference streams; there is a single stream here.
# The `with` block closes the reference file, which was previously left open.
with open('data/truth-100.txt', encoding='utf-8') as truth_file:
    refs = [truth_file.read().split('\n')]
bleu = sacrebleu.corpus_bleu(trans, refs)
print(bleu.score)

52.508918591346166
