#Table of Contents

1. Import Libraries
2. Load Dataset
3. Create Field Objects
4. Data Preparation
  - Build Vocabulary
  - Create Dataloaders
  
5. Define Model Architecture
  - Encoder Architecture
  - Decoder Architecture
  - Sequence-to-Sequence Architecture
6. Train Sequence-to-Sequence Model
7. Model Inference 
  - Build Inference Function
  - Translate Russian Sentences in the Test Dataset

#1. Import Libraries

In [2]:
import re
import time
import math
import random

import numpy as np
import pandas as pd
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from tqdm import notebook
pd.set_option('display.max_colwidth', 200)

In [3]:
#from torchtext import data
#https://stackoverflow.com/questions/51452412/cant-import-torchtext-module-in-jupyter-notebook-while-using-pytorch

# You have to use PyTorch 0.4.x.
# torch.legacy was removed in PyTorch 1.x.
import sys
sys.path.append("C:/Users/czwea/anaconda3/bin/")
import torchtext
#from torchtext import data
# from torchtext.legacy import data



In [52]:
# check GPU availability: use CUDA when torch reports it, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


  return torch._C._cuda_getDeviceCount() > 0


#2. Load Dataset

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# # extract the zip file from your Google Drive
# ! unzip '/content/drive/My Drive/Course_Notes/NLP using PyTorch/Seq2Seq/nmt_data.zip'

In [2]:
# read the training and test datasets from CSV files on the local disk
# NOTE(review): these are absolute, machine-specific paths
df = pd.read_csv("D:/LargeData/Analytics_Vidhya/NLP_Deep/nmt_data.csv")
test_df = pd.read_csv("D:/LargeData/Analytics_Vidhya/NLP_Deep/nmt_data_test.csv")

# shape of datasets
df.shape, test_df.shape

((187053, 2), (46668, 2))

In [7]:
# preview 10 random rows of the training data
df.sample(10)

Unnamed: 0,rus,eng
52154,он не поймал сигнал,he didn't catch the signal
107508,том не двоюродный брат мэри,tom isn't mary's cousin
23618,у вас есть безалкогольные напитки,do you have any non-alcoholic drinks
16741,этот банан зелёный,this banana is green
117491,том сказал что мэри наивна,tom said that mary was naive
130574,ты ужасный водитель,you're a terrible driver
104004,это был зимний вечер,this was a winter evening
99069,полагаю вы том джексон,i presume you're tom jackson
143942,мэри привлекательнее элис,mary is more attractive than alice
37336,это кошка тома,this is tom's cat


In [8]:
# preview 10 random rows of the test data
test_df.sample(10)

Unnamed: 0,rus,eng
43084,это кратковременное решение,this is a short term solution
43352,том слегка придирчив,tom is a bit of a nitpicker
975,я рис не заказывал,i didn't order rice
29080,решайте сами,that's up to you
43661,какую поисковую систему вы используете,which search engine do you use
14322,том мог нам соврать,tom may have lied to us
21,том не мог слышать мэри,tom couldn't hear mary
4398,увидимся вечером,see you tonight
29066,сколько масла покупать,how much butter should i buy
31034,тома зверски избили,tom was beaten to a pulp


#3. Create Field Objects

In [4]:
# import Russian spacy model to tokenize Russian text
from spacy.lang.ru import Russian

In [None]:
# dependency for spaCy Russian tokenizer
# https://github.com/conda-forge/pymorphy2-feedstock#:~:text=Installing%20pymorphy2%20Installing%20pymorphy2%20from%20the%20conda-forge%20channel,has%20been%20enabled%2C%20pymorphy2%20can%20be%20installed%20with%3A
# Installing pymorphy2 from the conda-forge channel can be achieved by adding conda-forge to your channels with:

# conda config --add channels conda-forge
# conda config --set channel_priority strict
# Once the conda-forge channel has been enabled, pymorphy2 can be installed with:

# conda install pymorphy2

Collecting pymorphy2
[?25l  Downloading https://files.pythonhosted.org/packages/a3/33/fff9675c68b5f6c63ec8c6e6ff57827dda28a1fa5b2c2d727dffff92dd47/pymorphy2-0.8-py2.py3-none-any.whl (46kB)
[K     |████████████████████████████████| 51kB 3.0MB/s eta 0:00:011
[?25hCollecting dawg-python>=0.7
  Downloading https://files.pythonhosted.org/packages/6a/84/ff1ce2071d4c650ec85745766c0047ccc3b5036f1d03559fd46bb38b5eeb/DAWG_Python-0.7.2-py2.py3-none-any.whl
Collecting pymorphy2-dicts<3.0,>=2.4
[?25l  Downloading https://files.pythonhosted.org/packages/02/51/2465fd4f72328ab50877b54777764d928da8cb15b74e2680fc1bd8cb3173/pymorphy2_dicts-2.4.393442.3710985-py2.py3-none-any.whl (7.1MB)
[K     |████████████████████████████████| 7.1MB 6.4MB/s 
Installing collected packages: dawg-python, pymorphy2-dicts, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.8 pymorphy2-dicts-2.4.393442.3710985


In [8]:
# spacy object for Russian
nlp_ru = Russian()

# spacy object for English
# python -m spacy download en_core_web_sm # see https://spacy.io/models
nlp_en = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "ner"])
# disabling the parser, tagger and NER components helps speed up processing

In [9]:
## functions to perform tokenization

def tokenize_ru(text):
  """Split a Russian string into a list of token strings using the tokenizer of `nlp_ru`."""
  tokens = []
  for token in nlp_ru.tokenizer(text):
    tokens.append(token.text)
  return tokens

def tokenize_en(text):
  """Split an English string into a list of token strings using the tokenizer of `nlp_en`."""
  tokens = []
  for token in nlp_en.tokenizer(text):
    tokens.append(token.text)
  return tokens

In [10]:
## Create Field objects using torchtext

# https://stackoverflow.com/questions/63539809/torchtext-0-7-shows-field-is-being-deprecated-what-is-the-alternative

# Field object for Russian (the source language)
SRC = torchtext.data.Field(tokenize = tokenize_ru, 
                 include_lengths = True, # keeps track of the lengths
                 lower = True)

# Field object for English.  These tokens are used on the decoder end
TRG = torchtext.data.Field(tokenize = tokenize_en, 
                 init_token = '<sos>', # "start of sequence" token
                 eos_token = '<eos>', # "end of sequence" token
                 include_lengths = True, 
                 lower = True)

# (name, Field) pairs: 'rus' is processed by SRC and 'eng' by TRG
fields = [('rus', SRC), ('eng', TRG)]



* Refer to the video "Text preprocessing in PyTorch" in the course "Fundamentals of Deep Learning" to learn more about TorchText's Field objects.

#4. Data Preparation

###4.1 Build Vocabulary


In [42]:
# import the data from csv
# - `torchtext.data` is used (as for the Field objects) because a bare `data` name is never imported
# - skip_header=True drops the header row so the column names are not treated as a sentence pair
nmt_data = torchtext.data.TabularDataset(path="../../../../../LargeData/Analytics_Vidhya/NLP_Deep/nmt_data.csv", format='csv', skip_header=True, fields=fields)



In [43]:
# build vocabulary for Russian sequences
SRC.build_vocab(nmt_data, max_size=4000) # only the 4000 most used tokens will be used

# build vocabulary for English sequences (same 4000-token cap)
TRG.build_vocab(nmt_data, max_size=4000)

In [44]:
# check size of vocabulary
# The special tokens are added on top of the 4000 max_size
len(SRC.vocab), len(TRG.vocab)

(4002, 4004)

In [48]:
# special tokens in input sequences (Russian): the first two vocabulary entries
SRC.vocab.itos[0], SRC.vocab.itos[1]

('<unk>', '<pad>')

In [49]:
# special tokens in output sequences (English): the first four vocabulary entries
TRG.vocab.itos[0], TRG.vocab.itos[1], TRG.vocab.itos[2], TRG.vocab.itos[3]

('<unk>', '<pad>', '<sos>', '<eos>')

###4.2 Create Dataloaders

In [50]:
# Split the translation data into training and validation sets (split_ratio=0.8)
train_data, val_data = nmt_data.split(split_ratio=0.8)

In [53]:
# Create a set of iterators for each split
# `torchtext.data` is used (as for the Field objects) because a bare `data` name is never imported
train_iterator, valid_iterator = torchtext.data.BucketIterator.splits(
    (train_data, val_data),
    batch_size = 64,
    sort_within_batch = True,          # order the examples inside each batch by sort_key
    sort_key = lambda x:len(x.rus),    # sort on the number of Russian tokens
    device = device)



#5. Define Model Architecture

###5.1 Encoder Architecture

In [5]:
## embedding layer: 
##    input dimensions = input_dim (size of Russian vocabulary), 
##    ouput dimensions = emb_dim

## GRU layer:
##    input dimensions = emb_dim
##    hidden units = hid_dim
##    layers = n_layers
##    output dim = hid_dim

class Encoder(nn.Module):
  """GRU encoder: embeds the source token indices and returns the final GRU hidden state.

  Args:
    input_dim: number of rows in the embedding table (source vocabulary size).
    emb_dim: size of each embedding vector.
    hid_dim: number of hidden units in every GRU layer.
    n_layers: number of stacked GRU layers.
    dropout: dropout probability passed to the GRU.
  """

  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    self.hid_dim = hid_dim
    self.n_layers = n_layers

    # token index -> dense vector of size emb_dim
    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)

    # stacked GRU that reads the embedded source sequence
    self.gru = nn.GRU(input_size=emb_dim,
                      hidden_size=hid_dim,
                      num_layers=n_layers,
                      dropout=dropout)

  def forward(self, src):
    """Encode `src` ([src len, batch size]) into a hidden state ([n layers, batch size, hid dim])."""
    # src_vectors: [src len, batch size, emb dim]
    src_vectors = self.embedding(src)

    # the per-step outputs are not needed, only the final hidden state
    _, final_hidden = self.gru(src_vectors)

    return final_hidden

###5.2 Decoder Architecture

In [6]:
## embedding layer: 
##    input dimensions = output_dim (size of English vocabulary), 
##    ouput dimensions = emb_dim

## GRU layer:
##    input dimensions = emb_dim
##    hidden units = hid_dim
##    layers = n_layers
##    output dim = hid_dim

## Fully Connected layer:
##    input dimensions = hid_dim, 
##    ouput dimensions = output_dim (size of English vocabulary)

class Decoder(nn.Module):
  """GRU decoder that predicts one target token per call.

  Args:
    output_dim: target vocabulary size (embedding rows and width of the prediction).
    emb_dim: size of each embedding vector.
    hid_dim: number of hidden units in every GRU layer.
    n_layers: number of stacked GRU layers.
    dropout: dropout probability passed to the GRU.
  """

  def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    self.output_dim = output_dim
    self.hid_dim = hid_dim
    self.n_layers = n_layers

    # token index -> dense vector of size emb_dim
    self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)

    # stacked GRU that advances the hidden state by one step
    self.gru = nn.GRU(input_size=emb_dim,
                      hidden_size=hid_dim,
                      num_layers=n_layers,
                      dropout=dropout)

    # projects the GRU output onto the target vocabulary
    self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)

  def forward(self, input, hidden):
    """Run a single decoding step.

    Args:
      input: token indices for the current step, shape [batch size].
      hidden: previous hidden state, shape [n layers, batch size, hid dim].

    Returns:
      (prediction, hidden): scores of shape [batch size, output dim] and the
      updated hidden state of shape [n layers, batch size, hid dim].
    """
    # add a sequence dimension of length 1 -> [1, batch size]
    step_tokens = input.unsqueeze(0)

    # step_vectors: [1, batch size, emb dim]
    step_vectors = self.embedding(step_tokens)

    # gru_out: [1, batch size, hid dim] because exactly one step is fed
    gru_out, hidden = self.gru(step_vectors, hidden)

    # drop the length-1 sequence dimension before the linear projection
    prediction = self.fc_out(gru_out.squeeze(0))

    return prediction, hidden

###5.3 Sequence-to-Sequence Architecture

In [7]:
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that decodes the target sequence one step at a time.

  Args:
    encoder: module mapping `src` to the initial decoder hidden state.
    decoder: module mapping (token indices, hidden) to (scores, hidden); must
      expose `output_dim` (size of the score vector).
    device: device on which the output tensor is allocated.
  """

  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg, teacher_forcing_ratio=1.0):
    """Compute decoder scores for every target position.

    Args:
      src: source token indices, shape [src len, batch size].
      trg: target token indices, shape [trg len, batch size]; row 0 is fed to
        the decoder as the first input.
      teacher_forcing_ratio: probability, per step, of feeding the ground-truth
        token as the next decoder input instead of the decoder's own argmax
        prediction. The default of 1.0 always uses the ground truth, which is
        the behaviour of the original implementation.

    Returns:
      Tensor of shape [trg len, batch size, output dim]. Row 0 is left as
      zeros because no prediction is made for the first target position.
    """
    trg_len, batch_size = trg.shape[0], trg.shape[1]
    trg_vocab_size = self.decoder.output_dim

    # tensor to store decoder outputs, allocated directly on the target device
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

    # last hidden state of the encoder is used as the initial hidden state of the decoder
    hidden = self.encoder(src)

    # first input to the decoder is the first row of the target batch
    dec_input = trg[0, :]

    for t in range(1, trg_len):
      # feed the current tokens and the previous hidden state,
      # receive the scores for position t and the new hidden state
      output, hidden = self.decoder(dec_input, hidden)
      outputs[t] = output

      # short-circuit at ratio >= 1.0 so the default path never touches the RNG
      use_teacher = teacher_forcing_ratio >= 1.0 or random.random() < teacher_forcing_ratio
      dec_input = trg[t, :] if use_teacher else output.argmax(1)

    return outputs

#6. Train Seq2Seq Model

In [11]:
# set hyperparameters
INPUT_DIM = len(SRC.vocab)    # size of the Russian (source) vocabulary
OUTPUT_DIM = len(TRG.vocab)   # size of the English (target) vocabulary
ENC_EMB_DIM = 100             # encoder embedding size
DEC_EMB_DIM = 100             # decoder embedding size
HID_DIM = 256                 # GRU hidden units (shared by encoder and decoder)
N_LAYERS = 2                  # GRU layers (shared by encoder and decoder)
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

# instantiate Encoder and Decoder
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# instantiate Sequence-to-Sequence Model and move it to the selected device
model = Seq2Seq(enc, dec, device).to(device)

AttributeError: 'Field' object has no attribute 'vocab'

In [63]:
# display the model architecture (the bare expression renders the module repr)
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4002, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(4004, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
    (fc_out): Linear(in_features=256, out_features=4004, bias=True)
  )
)

In [64]:
def count_parameters(model):
  """Return the total number of elements across the model's trainable parameters."""
  total = 0
  for param in model.parameters():
    # parameters with requires_grad=False are excluded from the count
    if param.requires_grad:
      total += param.numel()
  return total

# report the count with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,169,020 trainable parameters


In [65]:
# Adam optimizer over all model parameters (no extra optimizer arguments are passed)
optimizer = optim.Adam(model.parameters())

# pad token index
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# cross entropy loss with softmax; targets equal to the pad index are ignored
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [66]:
def train(model, iterator, optimizer, criterion, clip=None):
  """Train `model` for one pass over `iterator` and return the mean batch loss.

  Args:
    model: callable as `model(src, trg)` returning scores of shape
      [trg len, batch size, output dim].
    iterator: sized iterable of batches; each batch exposes `rus` and `eng`
      whose first element is the tensor of token indices.
    optimizer: optimizer updating the model parameters.
    criterion: loss called as `criterion(scores, targets)`.
    clip: optional maximum gradient norm; when given, gradients are clipped
      before each optimizer step. The default (None) performs no clipping.

  Returns:
    Sum of the per-batch losses divided by `len(iterator)`.
  """
  model.train()

  epoch_loss = 0

  # wrap the iterator itself (not enumerate(iterator)) so tqdm can read
  # len(iterator) and show real progress instead of "0it [00:00, ?it/s]"
  for batch in notebook.tqdm(iterator):

    # reset the gradients accumulated by the previous step
    optimizer.zero_grad()

    # get integer sequences (tensors); element [1] of each pair is not used here
    src = batch.rus[0]
    trg = batch.eng[0]

    # pass the source and target batches to the sequence-to-sequence model
    output = model(src, trg)

    #trg = [trg len, batch size]
    #output = [trg len, batch size, output dim]

    output_dim = output.shape[-1]

    # drop position 0 (no prediction is made for it) and flatten for the loss
    output = output[1:].view(-1, output_dim)
    #output = [(trg len - 1) * batch size, output dim]

    trg = trg[1:].view(-1)
    #trg = [(trg len - 1) * batch size]

    # compute loss
    loss = criterion(output, trg)

    # backpropagate loss
    loss.backward()

    # optionally limit the gradient norm before the update
    if clip is not None:
      torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    # update weights
    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [67]:
def evaluate(model, iterator, criterion):
  """Return the mean batch loss of `model` over `iterator` without updating weights.

  The model is switched to evaluation mode and every batch is processed with
  gradient tracking disabled. Each batch exposes `rus` and `eng`, whose first
  element is the tensor of token indices.
  """
  model.eval()

  running_loss = 0

  with torch.no_grad():
    for batch in iterator:
      src, trg = batch.rus[0], batch.eng[0]

      # scores: [trg len, batch size, output dim]
      scores = model(src, trg)
      vocab_size = scores.shape[-1]

      # skip position 0 and flatten:
      #   flat_scores:  [(trg len - 1) * batch size, output dim]
      #   flat_targets: [(trg len - 1) * batch size]
      flat_scores = scores[1:].view(-1, vocab_size)
      flat_targets = trg[1:].view(-1)

      running_loss += criterion(flat_scores, flat_targets).item()

  return running_loss / len(iterator)

In [68]:
def epoch_time(start_time, end_time):
  """Convert the span between two timestamps (in seconds) into (whole minutes, remaining whole seconds)."""
  total_seconds = end_time - start_time
  whole_minutes = int(total_seconds / 60)
  leftover_seconds = int(total_seconds - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [70]:
# number of full passes over the training iterator
N_EPOCHS = 10

# lowest validation loss seen so far; starts at infinity so the first epoch always saves
best_valid_loss = float('inf')

# start model training
for epoch in range(N_EPOCHS):
    
  start_time = time.time()
  
  # one training pass followed by one validation pass
  train_loss = train(model, train_iterator, optimizer, criterion)
  valid_loss = evaluate(model, valid_iterator, criterion)
  
  end_time = time.time()
  
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  
  # compare validation loss; keep only the weights of the best epoch on disk
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), '../../../../../LargeData/Analytics_Vidhya/NLP_Deep/best_model_russian_gru.pt')
  
  # report timing, loss and perplexity (exp of the loss) for this epoch
  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

0it [00:00, ?it/s]



Epoch: 01 | Time: 12m 0s
	Train Loss: 2.388 | Train PPL:  10.890
	 Val. Loss: 1.885 |  Val. PPL:   6.586


0it [00:00, ?it/s]

Epoch: 02 | Time: 11m 20s
	Train Loss: 1.685 | Train PPL:   5.393
	 Val. Loss: 1.535 |  Val. PPL:   4.643


0it [00:00, ?it/s]

Epoch: 03 | Time: 11m 13s
	Train Loss: 1.390 | Train PPL:   4.014
	 Val. Loss: 1.386 |  Val. PPL:   3.997


0it [00:00, ?it/s]

Epoch: 04 | Time: 11m 14s
	Train Loss: 1.229 | Train PPL:   3.418
	 Val. Loss: 1.311 |  Val. PPL:   3.711


0it [00:00, ?it/s]

Epoch: 05 | Time: 10m 54s
	Train Loss: 1.122 | Train PPL:   3.072
	 Val. Loss: 1.267 |  Val. PPL:   3.549


0it [00:00, ?it/s]

Epoch: 06 | Time: 10m 54s
	Train Loss: 1.046 | Train PPL:   2.846
	 Val. Loss: 1.238 |  Val. PPL:   3.449


0it [00:00, ?it/s]

Epoch: 07 | Time: 11m 4s
	Train Loss: 0.987 | Train PPL:   2.683
	 Val. Loss: 1.222 |  Val. PPL:   3.392


0it [00:00, ?it/s]

Epoch: 08 | Time: 10m 45s
	Train Loss: 0.941 | Train PPL:   2.563
	 Val. Loss: 1.209 |  Val. PPL:   3.350


0it [00:00, ?it/s]

Epoch: 09 | Time: 10m 59s
	Train Loss: 0.904 | Train PPL:   2.470
	 Val. Loss: 1.205 |  Val. PPL:   3.335


0it [00:00, ?it/s]

Epoch: 10 | Time: 10m 56s
	Train Loss: 0.872 | Train PPL:   2.391
	 Val. Loss: 1.199 |  Val. PPL:   3.317


#7. Model Inference

In [4]:
# load saved model weights
# map_location remaps tensors saved on another device (e.g. a GPU) onto the current `device`,
# so the checkpoint also loads on a CPU-only machine
path = '../../../../../LargeData/Analytics_Vidhya/NLP_Deep/best_model_russian_gru.pt'
model.load_state_dict(torch.load(path, map_location=device))

NameError: name 'model' is not defined

###7.1 Build Inference Function

In [72]:
# function to perform translation
def translate_sentence(sentence, model, max_len=50):
  """Greedily translate one Russian sentence into English.

  Args:
    sentence: Russian text to translate.
    model: sequence-to-sequence model exposing `encoder` and `decoder`.
    max_len: maximum number of decoding steps. Decoding stops earlier when the
      end-of-sequence token is predicted; the cap guarantees termination when
      it never is.

  Returns:
    The predicted English tokens joined by single spaces, without the start
    and end tokens. An input that yields no tokens returns an empty string.
  """
  # set model to evaluation mode
  model.eval()

  # tokenize with the same tokenizer as `tokenize_ru` and lowercase each token,
  # because the SRC field was created with lower=True
  src_indexes = [SRC.vocab.stoi[tok.text.lower()] for tok in nlp_ru.tokenizer(sentence)]

  # nothing to encode: avoid feeding an empty sequence to the GRU
  if not src_indexes:
    return ""

  sos_idx = TRG.vocab.stoi[TRG.init_token]
  eos_idx = TRG.vocab.stoi[TRG.eos_token]

  # column tensor of shape [src len, 1] (a batch holding one sentence)
  src_tensor = torch.tensor(src_indexes, dtype=torch.long, device=device).unsqueeze(1)

  trg_indexes = []
  prev_token = sos_idx

  # no gradients are needed for the encoder or the decoder at inference time
  with torch.no_grad():
    # pass the tensor to the encoder and get the context vector (hidden)
    hidden = model.encoder(src_tensor)

    for _ in range(max_len):
      trg_tensor = torch.tensor([prev_token], dtype=torch.long, device=device)

      # feed the previous token and hidden state to the decoder
      output, hidden = model.decoder(trg_tensor, hidden)

      # greedy choice: index of the largest score
      prev_token = output.argmax(1).item()
      if prev_token == eos_idx:
        break
      trg_indexes.append(prev_token)

  # convert integers to tokens
  return " ".join(TRG.vocab.itos[i] for i in trg_indexes)

In [73]:
# sample sentence (author's reference translation: "is it working")
sent = "это работает"
translate_sentence(sent, model)

'it works'

###7.2 Translate Russian Sentences in the Test Dataset

In [74]:
# translate every Russian sentence in the test dataframe, one at a time, with a progress bar
translations = [translate_sentence(sent, model) for sent in notebook.tqdm(test_df["rus"])]

  0%|          | 0/46668 [00:00<?, ?it/s]

In [None]:
# add translations to the test dataframe as a new "translations" column (modifies test_df in place)
test_df["translations"] = translations

In [None]:
# compare source, reference and predicted translations on 20 random rows
test_df.sample(20)

Unnamed: 0,rus,eng,translations
42227,это совершенно секретно,this is top secret,this is totally <unk>
40378,он принял решение повторить попытку,he made up his mind to try again,he <unk> the importance of <unk>
37795,это была долгая неделя,this has been a long week,it was a long week
42201,том бы тобой очень гордился,tom would be very proud of you,tom would 've been very proud of you
13958,я с трудом нашёл тома,i had a hard time finding tom,i 've met tom 's cat
38281,это очень необычно,this is really unusual,this is very thin
41192,я рада что ты согласна,i'm glad you agree,i 'm glad you agree
26739,смех заразителен,laughter is infectious,the <unk> is <unk>
32373,мы провели день на пляже,we spent the day at the beach,we took a meeting at all
26020,почему ты не здесь,why aren't you here,why are n't you here


In [75]:
# save the test dataframe to a CSV file on the local disk (the index is not written)
test_df.to_csv("../../../../../LargeData/Analytics_Vidhya/NLP_Deep/nmt_test_translations.csv", index=False)