In [1]:
import math
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F

In [2]:
# Notebook-wide settings shared by the cells below.
device = "cpu"  # target passed to the `.to(device)` calls on modules and tensors
EDL = 5         # encoder-decoder layer count (not referenced by the visible cells)
DR = 0.2        # dropout rate handed to the encoder layers of `Transformer`


In [3]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension of the inputs. Both even
            and odd values are supported.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions that are precomputed; inputs longer than
            this along dim 0 cannot be encoded.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine half only takes the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over dim 1 of the input.
        # Registered as a buffer: it follows .to()/.state_dict() but is not trained.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe)`` for ``x`` of shape (seq_len, batch, d_model)."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

    
        
class Transformer(nn.Module):
    """Encoder-only sequence model: embedding -> positional encoding -> TransformerEncoder -> linear.

    Relies on the module-level ``DR`` (dropout rate for the encoder layers) and
    ``device`` (where the positional encoder is placed).

    Args:
        in_vocab_size: number of distinct input token ids (rows of the embedding table).
        word_emb_size: embedding width, which is also the transformer model dimension;
            it must be divisible by ``heads``.
        out_vocab_size: number of output classes produced per position.
        heads: number of attention heads in every encoder layer.
        hidden_dim: width of the feed-forward sub-layer in every encoder layer.
        layers: number of stacked encoder layers.
    """

    def __init__(self, in_vocab_size,word_emb_size ,out_vocab_size, heads,hidden_dim,layers):
        super().__init__()
        self.in_vocab_size = in_vocab_size
        self.out_vocab_size = out_vocab_size
        self.word_emb_size = word_emb_size
        self.positional_encoder = PositionalEncoding(word_emb_size).to(device)

        # The encoder runs on embedded tokens, so its model dimension is the
        # embedding width (not the vocabulary size), and it uses the requested
        # number of heads.
        encoder_layers = TransformerEncoderLayer(word_emb_size, heads, hidden_dim, DR)
        self.transformer_encoder = TransformerEncoder(encoder_layers, layers)

        # One embedding row per input token id.
        self.encoder = nn.Embedding(in_vocab_size, word_emb_size)
        # Alias kept so code that accesses `word_embeds` keeps working; it shares
        # its weights with `encoder` instead of holding a second, unused table.
        self.word_embeds = self.encoder
        self.decoder = nn.Linear(word_emb_size, out_vocab_size)

    def forward(self, src):
        """Map token ids of shape (seq_len, batch) to scores of shape (seq_len, batch, out_vocab_size)."""
        # Embeddings are scaled by sqrt(model dimension) before positions are added.
        src = self.encoder(src) * math.sqrt(self.word_emb_size)
        src = self.positional_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output
    
class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> causally masked TransformerEncoder -> linear -> log-softmax.

    Args:
        ntoken: vocabulary size, used for both the input embedding table and the
            output projection.
        ninp: embedding width / transformer model dimension.
        nhead: number of attention heads in every encoder layer.
        nhid: width of the feed-forward sub-layer in every encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoder and encoder layers.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.src_mask = None  # causal mask cache, (re)built lazily in forward()
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        """Map token ids (seq_len, batch) to log-probabilities (seq_len, batch, ntoken)."""
        seq_len = src.size(0)
        # Rebuild the cached mask when the sequence length or the device changes.
        if (self.src_mask is None
                or self.src_mask.size(0) != seq_len
                or self.src_mask.device != src.device):
            self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        # Normalise over the vocabulary axis (the last one); dim=1 is the batch axis.
        return F.log_softmax(output, dim=-1)

In [4]:
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, BucketIterator
from nltk.tokenize import TweetTokenizer
import spacy
tknzr = TweetTokenizer()
import os 


def load_data(path):
    """
    Load a UTF-8 text file and return its lines as a list of strings.

    Lines are split on '\\n' only, so a parallel corpus keeps one entry per
    physical line. Interior blank lines are preserved. The single empty piece
    produced by a trailing newline is dropped, so a file that ends with '\\n'
    and one that does not yield the same number of entries. An empty file
    yields an empty list.
    """
    with open(path, "r", encoding='utf-8') as f:
        lines = f.read().split('\n')

    # "a\nb\n".split('\n') ends with '' -- that is not a sentence.
    if lines and lines[-1] == '':
        lines.pop()
    return lines


# Parallel corpus: line i of the English file is paired with line i of the Russian file.
english_sentences = load_data('corpus.en_ru.1m.en')
russian_sentences = load_data('corpus.en_ru.1m.ru')

# Fail here, before the expensive tokenisation, if the two files are not aligned.
assert len(english_sentences) == len(russian_sentences), \
    "parallel corpus files contain a different number of lines"

KeyboardInterrupt: 

In [None]:
from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences


def tokenize(x, num_words=40000):
    """Fit a word-level Keras ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: iterable of text strings.
        num_words: vocabulary cap forwarded to ``Tokenizer``. The default keeps
            the value that was previously hard-coded.

    Returns:
        A tuple ``(sequences, tokenizer)``: the texts encoded as lists of
        integer ids, and the fitted tokenizer.
    """
    x_tk = Tokenizer(char_level=False, num_words=num_words)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk

def pad(x, length=None):
    """Pad every sequence in ``x`` at the end ('post') to a common length.

    When ``length`` is None the longest sequence in ``x`` sets the target length.
    """
    target_len = max(map(len, x)) if length is None else length
    return pad_sequences(x, maxlen=target_len, padding='post')

def preprocess(x, y):
    """Tokenize two corpora independently and pad both to one shared length.

    Returns:
        ``(padded_x, padded_y, x_tokenizer, y_tokenizer)`` -- note the order:
        the tokenizer fitted on ``x`` comes before the one fitted on ``y``.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    # The shared length is the longest sequence found in either corpus.
    longest = max(max(map(len, x_ids)), max(map(len, y_ids)))

    padded_x = pad(x_ids, longest)
    padded_y = pad(y_ids, longest)
    return padded_x, padded_y, x_tk, y_tk

In [None]:
# preprocess(x, y) returns (x_padded, y_padded, x_tokenizer, y_tokenizer). Russian is
# passed as x, so the third value is the Russian tokenizer and the fourth the English one.
preproc_russian_sentences, preproc_english_sentences, russian_tokenizer, english_tokenizer =\
    preprocess(russian_sentences, english_sentences)
    


In [None]:
def _vocab_size(tokenizer):
    """Return how many distinct ids ``tokenizer`` can emit, counting the padding id 0.

    Keras numbers words from 1 (0 is reserved for padding), so an embedding
    needs ``len(word_index) + 1`` rows. When ``num_words`` is set, only ids
    below ``num_words`` are emitted by ``texts_to_sequences``, so the size is
    capped there.
    """
    size = len(tokenizer.word_index) + 1
    if tokenizer.num_words:
        size = min(size, tokenizer.num_words)
    return size


max_english_sequence_length = preproc_english_sentences.shape[1]
max_russian_sequence_length = preproc_russian_sentences.shape[1]
english_vocab_size = _vocab_size(english_tokenizer)
russian_vocab_size = _vocab_size(russian_tokenizer)



In [None]:
from torch.utils import data

# Convert the padded id matrices to int64 tensors on `device`.
ru_data = torch.from_numpy(preproc_russian_sentences).long().to(device)
en_data = torch.from_numpy(preproc_english_sentences).long().to(device)
print(en_data.shape, ru_data.shape, sep="\n")

# Each dataset item is a (Russian ids, English ids) pair; batches are served in file order.
dataset = data.TensorDataset(ru_data, en_data)
dataloader = data.DataLoader(dataset, batch_size=2)

In [None]:

# Model hyperparameters.
ntokens = english_vocab_size  # vocabulary size seen by the embedding and the output layer
emsize = 32                   # embedding width
nhead = 2                     # attention heads per encoder layer
nhid = 32                     # feed-forward width inside each encoder layer
nlayers = 3                   # stacked encoder layers
dropout = 0.2                 # dropout probability

model = TransformerModel(
    ntoken=ntokens,
    ninp=emsize,
    nhead=nhead,
    nhid=nhid,
    nlayers=nlayers,
    dropout=dropout,
).to(device)
cnt = 0




In [None]:

# Training setup. The model emits log-probabilities, hence NLLLoss.
# NOTE(review): padded positions (presumably id 0 from pad_sequences) are counted
# in the loss; consider masking them once all-padding batches are ruled out.
criterion = nn.NLLLoss()
lr = 0.01 # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 after every epoch (step_size is an epoch count).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)
log_interval = 30  # batches between checkpoints / progress reports

for epoch in range(1):
    model.train()
    total_loss = 0.
    cnt = 0
    for src_batch, tgt_batch in dataloader:
        optimizer.zero_grad()

        # The DataLoader yields (batch, seq) tensors, while the model treats dim 0
        # as the sequence axis (mask size and positions are taken from dim 0),
        # so both sides are transposed to (seq, batch).
        predicted = model(src_batch.t())  # (seq, batch, vocab)
        cnt += 1

        # Flatten to (seq * batch, vocab) vs (seq * batch,); the vocab size is read
        # from the prediction itself rather than from a notebook-level variable.
        loss = criterion(predicted.reshape(-1, predicted.size(-1)),
                         tgt_batch.t().reshape(-1))

        loss.backward()
        total_loss += loss.item()
        # Clip after backward and before the update so the step uses clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

        optimizer.step()

        if cnt % log_interval == 0:
            # Checkpoint both tokenizers together with the weights.
            torch.save((english_tokenizer, russian_tokenizer, model.state_dict()), "model")
            cur_loss = total_loss / log_interval

            print("Cur loss:", cur_loss, "Epoch:", epoch, "%:", cnt/len(dataloader)*100 )
            total_loss = 0

    # The scheduler was created but never advanced; step it once per epoch.
    scheduler.step()
      