In [1]:
# Mount Google Drive at /content/drive (this import is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTS

In [2]:
import re
import time
import math
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data

from tqdm import notebook
pd.set_option('display.max_colwidth', 200)

import spacy
from spacy.lang.ru import Russian

In [3]:
# dependency for spaCy Russian tokenizer
!pip install pymorphy2

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/07/57/b2ff2fae3376d4f3c697b9886b64a54b476e1a332c67eee9f88e7f1ae8c9/pymorphy2-0.9.1-py3-none-any.whl (55kB)
[K     |████████████████████████████████| 61kB 5.1MB/s 
[?25hCollecting pymorphy2-dicts-ru<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/3a/79/bea0021eeb7eeefde22ef9e96badf174068a2dd20264b9a378f2be1cdd9e/pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2MB)
[K     |████████████████████████████████| 8.2MB 11.4MB/s 
Collecting dawg-python>=0.7.1
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [4]:
# check GPU availability
# `device` is reused below for the batch iterators, the model and checkpoint handling.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# CREATING FIELD OBJECTS

In [5]:
# spaCy Russian language object; used for tokenization in tokenize_ru and translate_sentence.
nlp_ru = Russian()

In [6]:
# English spaCy pipeline with parser, tagger and NER disabled; tokenize_en only calls its tokenizer.
nlp_en = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "ner"])

In [7]:
def tokenize_ru(text):
  """Split a Russian string into a list of token strings with the `nlp_ru` tokenizer."""
  pieces = []
  for token in nlp_ru.tokenizer(text):
    pieces.append(token.text)
  return pieces

In [8]:
def tokenize_en(text):
  """Split an English string into a list of token strings with the `nlp_en` tokenizer."""
  pieces = []
  for token in nlp_en.tokenizer(text):
    pieces.append(token.text)
  return pieces

In [9]:
# Field object for Russian
# Source side: lower-cased tokens from tokenize_ru, no start/end markers.
# include_lengths = True is why seq2seq.forward unpacks the batch as a tuple.
SRC = data.Field(tokenize = tokenize_ru, include_lengths = True, lower = True)

In [10]:
# Field object for English (target side): lower-cased tokens from tokenize_en,
# wrapped in explicit start / end markers.
TRG = data.Field(
    tokenize=tokenize_en,
    init_token='<sos>',  # "start" token
    eos_token='<eos>',  # "end" token
    include_lengths=True,
    lower=True,
)


In [11]:
# (attribute name, Field) pairs: examples expose the Russian text as `.rus` and the
# English text as `.eng`. Presumably listed in the CSV's column order — verify against nmt_data.csv.
fields = [('rus', SRC), ('eng', TRG)]

# DATA PREPARATION

## BUILDING VOCABULARY

In [12]:
# Project folder on the mounted Drive. The trailing slash is required because
# other cells build file paths by plain concatenation (dir + "nmt_data.csv",
# dir + 'best_model.pt').
# NOTE: `dir` shadows the Python builtin; the name is kept because other cells use it.
dir = "drive/MyDrive/Translating-Text-From-Russian-To-English-With-Attention/"

In [14]:
# Load the parallel corpus from CSV; each example gets the `.rus` / `.eng` attributes named in `fields`.
nmt_data = data.TabularDataset(path = dir + "nmt_data.csv", format = 'csv', fields = fields)

In [15]:
# Source vocabulary, capped at 4000 entries (special tokens come on top of that, see the sizes printed below).
SRC.build_vocab(nmt_data, max_size = 4000)

In [16]:
# Target vocabulary, capped at 4000 entries (special tokens come on top of that, see the sizes printed below).
TRG.build_vocab(nmt_data, max_size = 4000)

In [17]:
# Final vocabulary sizes (source, target), special tokens included.
len(SRC.vocab), len(TRG.vocab)

(4002, 4004)

## DATALOADERS

In [18]:
# Hold out 20% of the sentence pairs for validation. The shuffle is seeded so the
# same split is produced on every run.
random.seed(1234)
train_data, val_data = nmt_data.split(split_ratio = 0.8, random_state = random.getstate())

In [19]:
# Batch iterators for both splits: 64 examples per batch, each batch sorted by
# the length of its Russian sentence, tensors created on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size=64,
    sort_within_batch=True,
    sort_key=lambda example: len(example.rus),
    device=device,
)

# MODEL

## ENCODER

In [20]:
class Encoder(nn.Module):
  """Multi-layer GRU encoder over embedded source tokens.

  Args:
    hidden_size: size of the GRU hidden state.
    embedding_size: size of each token embedding.
    num_layers: number of stacked GRU layers.
    dropout: dropout probability passed to nn.GRU (applied between layers).
    input_size: number of rows in the embedding table, i.e. the source
      vocabulary size. When omitted it falls back to ``len(SRC.vocab)`` from
      the notebook namespace, which is what this class always used before.
  """

  def __init__(self, hidden_size, embedding_size, num_layers=2, dropout=0.3, input_size=None):

    super(Encoder, self).__init__()

    # Only touch the notebook-level SRC field when the caller did not say
    # how large the vocabulary is.
    if input_size is None:
      input_size = len(SRC.vocab)

    self.hidden_size = hidden_size
    self.embedding_size = embedding_size
    self.input_size = input_size
    self.num_layers = num_layers
    self.dropout = dropout
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.gru = nn.GRU(embedding_size, hidden_size, num_layers = num_layers, dropout = dropout)

  def forward(self, input_sequence):
    """Encode a batch of token ids.

    Args:
      input_sequence: LongTensor of token ids laid out (seq_len, batch), the
        default (non batch_first) layout of nn.GRU.

    Returns:
      (outputs, hidden): the GRU outputs for every position and the final
      hidden state of every layer. Padding positions are fed through the GRU
      like ordinary tokens (no packing is done here).
    """
    embedded = self.embedding(input_sequence)

    outputs, hidden = self.gru(embedded)

    return outputs, hidden

## DECODER

In [22]:
class Decoder(nn.Module):
  """One-step GRU decoder with dot-product attention over the encoder outputs."""

  def __init__(self, embedding_size, hidden_size, output_size, n_layers = 2, dropout = 0.3):
    super().__init__()

    # Hyper-parameters, kept on the module for later inspection.
    self.hidden_size, self.output_size = hidden_size, output_size
    self.n_layers, self.dropout = n_layers, dropout

    # Layers (same creation order as before, so seeded initialisation is unchanged).
    self.embedding = nn.Embedding(output_size, embedding_size)
    self.gru = nn.GRU(embedding_size, hidden_size, n_layers, dropout = dropout)
    self.concat = nn.Linear(2 * hidden_size, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, current_token, hidden_state, encoder_outputs, mask):
    """Advance the decoder by a single time step.

    Args:
      current_token: token ids for this step, with a leading length-1 time axis.
      hidden_state: GRU hidden state carried over from the previous step.
      encoder_outputs: encoder outputs for every source position (time-major).
      mask: per-batch-row mask over source positions; entries equal to 0 are
        excluded from attention.

    Returns:
      (logits over the output vocabulary, updated hidden state)
    """
    # Embed the token and run one GRU step.
    step_embedding = self.embedding(current_token)
    step_output, next_hidden = self.gru(step_embedding, hidden_state)

    # Dot-product score of the decoder output against every encoder position,
    # transposed so that rows are batch elements.
    scores = (step_output * encoder_outputs).sum(dim = 2).t()
    # Masked positions get a large negative score so softmax ignores them.
    scores = scores.masked_fill(mask == 0, -1e5)
    weights = F.softmax(scores, dim = 1).unsqueeze(1)

    # Context vector: attention-weighted average of the encoder outputs.
    context = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)

    # Merge the GRU output with the context and project to vocabulary logits.
    combined = torch.cat((step_output.squeeze(0), context), 1)
    logits = self.out(torch.tanh(self.concat(combined)))

    return logits, next_hidden

## SEQ-TO-SEQ

In [50]:
class seq2seq(nn.Module):
  """Encoder-decoder translation model with attention.

  Args:
    embedding_size: embedding dimension used by both encoder and decoder.
    hidden_size: GRU hidden size used by both encoder and decoder.
    vocab_size: size of the output (target) vocabulary.
    device: device on which helper tensors are allocated.
    pad_idx: index of the padding token; used to mask source positions.
    eos_idx: index of the end-of-sequence token; stops greedy decoding.
    sos_idx: index of the start-of-sequence token; first decoder input.
    max_len: maximum number of time steps (including the initial <sos> slot)
      generated in inference mode. Defaults to 100, the previous fixed value.
  """

  def __init__(self, embedding_size, hidden_size, vocab_size, device, pad_idx, eos_idx, sos_idx, max_len = 100):
    super(seq2seq, self).__init__()

    self.encoder = Encoder(hidden_size, embedding_size, num_layers = 2, dropout = 0.3)

    self.decoder = Decoder(embedding_size, hidden_size, vocab_size, n_layers = 2, dropout = 0.3)

    # Indices of special tokens and hardware device
    self.pad_idx = pad_idx
    self.eos_idx = eos_idx
    self.sos_idx = sos_idx
    self.device = device
    self.max_len = max_len

  def forward(self, input_sequence, output_sequence):
    """Run the model with teacher forcing (training) or greedily (inference).

    Args:
      input_sequence: (tokens, lengths) tuple for the source batch; only the
        tokens, laid out (src_len, batch), are used.
      output_sequence: (tokens, lengths) tuple for the target batch, or None
        to decode greedily.

    Returns:
      Logits of shape (steps, batch, vocab). Row 0 is never written and stays
      zero. With teacher forcing `steps` is the target length. In inference
      decoding stops once every sequence in the batch has produced <eos>, and
      the step at which the last sequence finished is not included; if that
      never happens all `max_len` rows are returned.
    """
    # Unpack input_sequence tuple
    input_tokens = input_sequence[0]
    batch_size = input_tokens.shape[1]

    inference = output_sequence is None
    if inference:
      # Placeholder targets: only row 0 (<sos>) is ever read.
      output_tokens = torch.full((self.max_len, batch_size), self.sos_idx, dtype = torch.long, device = self.device)
    else:
      output_tokens = output_sequence[0]

    vocab_size = self.decoder.output_size
    max_seq_len = len(output_tokens)

    # tensor to store decoder outputs
    outputs = torch.zeros(max_seq_len, batch_size, vocab_size, device = self.device)

    encoder_outputs, hidden = self.encoder(input_tokens)

    # first input to the decoder is the <sos> tokens
    output = output_tokens[0, :]

    # True for real source tokens, False for padding; rows are batch elements.
    mask = (input_tokens != self.pad_idx).permute(1, 0)

    # Per-sequence flag: has this sequence emitted <eos> yet? Tracking it per
    # element (instead of calling .item()) makes inference work for any batch size.
    finished = torch.zeros_like(output, dtype = torch.bool)

    for t in range(1, max_seq_len):
      output, hidden = self.decoder(output.unsqueeze(0), hidden, encoder_outputs, mask)
      outputs[t] = output

      if inference:
        # Greedy decoding: feed back the most likely token.
        output = output.max(1)[1]
        finished = finished | (output == self.eos_idx)
        # Keep generating until every sequence has produced an <eos> token.
        if finished.all():
          return outputs[:t]
      else:
        # Teacher forcing: feed the ground-truth token.
        output = output_tokens[t]

    return outputs

## TRAIN MODEL

In [51]:
# extract special tokens
# NOTE(review): these indices come from the *target* vocabulary, but seq2seq.forward
# also uses pad_idx to mask padding in the *source* tokens, so it assumes '<pad>'
# has the same index in SRC.vocab — TODO confirm.
pad_idx = TRG.vocab.stoi['<pad>']
eos_idx = TRG.vocab.stoi['<eos>']
sos_idx = TRG.vocab.stoi['<sos>']

# Size of embedding_dim should match the dim of pre-trained word embeddings!
# (No pre-trained embeddings are loaded in the cells shown here, so this only
# matters if they are added later.)
embedding_dim = 100
hidden_dim = 256
# Output layer / decoder embedding size: one entry per target-vocabulary token.
vocab_size = len(TRG.vocab)

In [52]:
# Build the model and move its parameters to `device`; the same device is also passed in
# so the model can allocate its helper tensors there.
model = seq2seq(embedding_dim, hidden_dim, vocab_size, device, pad_idx, eos_idx, sos_idx).to(device)

In [53]:
# Show the module tree (layer types and sizes).
model

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(4002, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(4004, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
    (concat): Linear(in_features=512, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=4004, bias=True)
  )
)

In [54]:
# Adam over all model parameters with the library's default settings.
optimizer = optim.Adam(model.parameters())

In [55]:
# Cross-entropy over vocabulary logits; target positions equal to pad_idx do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [56]:
def train(model, iterator, criterion, optimizer):
  """Run one optimisation pass over `iterator` and return the mean per-batch loss.

  Each batch must expose `.rus` and `.eng` as (tokens, lengths) tuples. The
  model is called with teacher forcing and its parameters are updated once
  per batch.
  """
  model.train()

  batch_losses = []
  progress = notebook.tqdm(enumerate(iterator), total = len(iterator))

  for _, batch in progress:
    optimizer.zero_grad()

    logits = model(batch.rus, batch.eng)
    n_classes = logits.shape[-1]

    # Time step 0 is the <sos> slot: the model never predicts it, so it is
    # dropped from both logits and targets before flattening.
    flat_logits = logits[1:].view(-1, n_classes)
    flat_targets = batch.eng[0][1:].view(-1)

    batch_loss = criterion(flat_logits, flat_targets)
    batch_loss.backward()
    optimizer.step()

    batch_losses.append(batch_loss.item())

  return sum(batch_losses) / len(iterator)

## EVAL MODEL

In [57]:
def evaluate(model, iterator, criterion):
  """Compute the mean per-batch loss over `iterator` without updating the model.

  Each batch must expose `.rus` and `.eng` as (tokens, lengths) tuples. The
  model is put in eval mode and called with teacher forcing, exactly like in
  training, so the result is comparable to the training loss.
  """
  model.eval()

  epoch_loss = 0

  # No parameter update happens here, so autograd is switched off: this avoids
  # building a computation graph for every validation batch.
  with torch.no_grad():
    for idx, batch in notebook.tqdm(enumerate(iterator), total = len(iterator)):
      input_sequence = batch.rus
      output_sequence = batch.eng

      target_tokens = output_sequence[0]

      output = model(input_sequence, output_sequence)

      # Time step 0 is the <sos> slot, which the model never predicts.
      output = output[1:].view(-1, output.shape[-1])
      target_tokens = target_tokens[1:].view(-1)

      loss = criterion(output, target_tokens)

      epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [58]:
def epoch_time(start_time, end_time):
  """Turn two timestamps (in seconds) into whole (minutes, seconds) elapsed between them."""
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [59]:
# Number of full passes over the training data.
N_EPOCHS = 10

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
  # The reported time covers both the training pass and the validation pass.
  start_time = time.time()
  
  train_loss = train(model, train_iterator, criterion, optimizer)
  valid_loss = evaluate(model, valid_iterator, criterion)
  
  end_time = time.time()
  
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  
  # Keep only the weights of the best epoch (by validation loss) on disk.
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), dir + 'best_model.pt')
  
  # PPL (perplexity) is reported as exp of the mean loss.
  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 01 | Time: 1m 35s
	Train Loss: 3.029 | Train PPL:  20.681
	 Val. Loss: 2.132 |  Val. PPL:   8.435


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 02 | Time: 1m 35s
	Train Loss: 1.853 | Train PPL:   6.381
	 Val. Loss: 1.640 |  Val. PPL:   5.157


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 03 | Time: 1m 35s
	Train Loss: 1.485 | Train PPL:   4.415
	 Val. Loss: 1.446 |  Val. PPL:   4.244


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 04 | Time: 1m 35s
	Train Loss: 1.301 | Train PPL:   3.673
	 Val. Loss: 1.348 |  Val. PPL:   3.852


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 05 | Time: 1m 35s
	Train Loss: 1.186 | Train PPL:   3.275
	 Val. Loss: 1.299 |  Val. PPL:   3.664


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 06 | Time: 1m 36s
	Train Loss: 1.109 | Train PPL:   3.033
	 Val. Loss: 1.268 |  Val. PPL:   3.554


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 07 | Time: 1m 38s
	Train Loss: 1.051 | Train PPL:   2.861
	 Val. Loss: 1.247 |  Val. PPL:   3.479


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 08 | Time: 1m 36s
	Train Loss: 1.005 | Train PPL:   2.733
	 Val. Loss: 1.233 |  Val. PPL:   3.433


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 09 | Time: 1m 38s
	Train Loss: 0.967 | Train PPL:   2.629
	 Val. Loss: 1.223 |  Val. PPL:   3.397


HBox(children=(FloatProgress(value=0.0, max=2339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=585.0), HTML(value='')))


Epoch: 10 | Time: 1m 36s
	Train Loss: 0.935 | Train PPL:   2.546
	 Val. Loss: 1.216 |  Val. PPL:   3.374


# INFERENCE

In [60]:
# load saved model weights
# map_location remaps the stored tensors onto the current `device`, so a checkpoint
# written on a GPU runtime can still be restored on a CPU-only one.
path = dir + 'best_model.pt'
model.load_state_dict(torch.load(path, map_location = device))

<All keys matched successfully>

In [61]:
def translate_sentence(model, sentence):
    """Translate one Russian sentence to English with greedy decoding.

    Args:
        model: trained seq2seq model; its `device` attribute decides where the
            input tensors are created.
        sentence: raw Russian text.

    Returns:
        The predicted English tokens joined by single spaces. An empty string
        is returned when the sentence yields no tokens.
    """
    model.eval()

    # Same preprocessing as the SRC field: spaCy tokens, lower-cased.
    tokenized = nlp_ru(sentence)
    tokenized = [t.lower_ for t in tokenized]
    if not tokenized:
        # Nothing to encode; a zero-length sequence cannot be fed to the encoder.
        return ""
    int_tokenized = [SRC.vocab.stoi[t] for t in tokenized]

    # Batch of one, laid out (seq_len, 1), plus its length.
    sentence_length = torch.LongTensor([len(int_tokenized)]).to(model.device)
    tensor = torch.LongTensor(int_tokenized).unsqueeze(1).to(model.device)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        translation_tensor_logits = model((tensor, sentence_length), None)

    # Drop the batch axis and pick the most likely token at every step.
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    translation = [TRG.vocab.itos[i] for i in translation_tensor.tolist()]

    # Row 0 of the logits is the unused <sos> slot, so skip its token.
    translation = translation[1:]
    return " ".join(translation)

In [62]:
# Sanity check on a short phrase.
sentence = "это новый"
response = translate_sentence(model, sentence)
print(response)

is this new


In [63]:
# Sanity check on a full sentence.
sentence = "том прислал мне подарок"
response = translate_sentence(model, sentence)
print(response)

tom sent me a present
