# Machine translation Pytorch

In [1]:
import numpy as np
import pandas as pd
import csv
import re
import imblearn
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.parsing.porter import PorterStemmer
from gensim.utils import tokenize
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch
from torchtext.datasets import IWSLT2017
from torchtext.legacy import data
from torchtext.vocab import Vocab
from torchtext.data.metrics import bleu_score
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary
# from torchtext import data
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.metrics import roc_curve,auc
from numpy import interp
from itertools import cycle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from collections import Counter
from functools import reduce
# ! pip install captum bokeh spacy emot parameter-sherpa
import sherpa
import captum
import random
import spacy
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization, IntegratedGradients, LayerConductance
from captum.attr import visualization as viz
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer
import emot
from bokeh.io import output_notebook
output_notebook()
# ! python -m spacy download en_core_web_sm
# !python -m spacy download de
%matplotlib inline

## Text Preprocessing

In [2]:
spacy_german = spacy.load("de")
spacy_english = spacy.load("en")

def tokenize_german(text):
    return [token.text.lower() for token in spacy_german.tokenizer(text)]

def tokenize_english(text):
    return [token.text.lower() for token in spacy_english.tokenizer(text)]

train_iter, valid_iter, test_iter, = IWSLT2017(root='.pytorch/.data/', language_pair=('en','de'))
train_data = list(train_iter)
valid_data = list(valid_iter)
test_data = list(test_iter)
en_counter = Counter()
de_counter = Counter()

for (en, de) in train_data:
    en_counter.update(tokenize_english(en))
    de_counter.update(tokenize_german(de))

german = Vocab(de_counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))
english = Vocab(en_counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))


INFO:root:File C:\Users\User\YandexDisk\myAI\myNLP\deepLearningRecipes\.pytorch\.data\2017-01-trnmted.tgz already exists.
INFO:root:Validating hash aca701032b1c4411afc4d9fa367796ba matches hash of C:\Users\User\YandexDisk\myAI\myNLP\deepLearningRecipes\.pytorch\.data\2017-01-trnmted.tgz
INFO:root:Opening tar file C:\Users\User\YandexDisk\myAI\myNLP\deepLearningRecipes\.pytorch\.data\2017-01-trnmted.tgz.
INFO:root:C:\Users\User\YandexDisk\myAI\myNLP\deepLearningRecipes\.pytorch\.data\2017-01-trnmted/._texts.html already extracted.
INFO:root:C:\Users\User\YandexDisk\myAI\myNLP\deepLearningRecipes\.pytorch\.data\2017-01-trnmted/texts.html already extracted.
INFO:root:C:\Users\User\YandexDisk\myAI\myNLP\deepLearningRecipes\.pytorch\.data\2017-01-trnmted/texts/DeEnItNlRo/DeEnItNlRo/._.eval already extracted.
INFO:root:C:\Users\User\YandexDisk\myAI\myNLP\deepLearningRecipes\.pytorch\.data\2017-01-trnmted/texts/DeEnItNlRo/DeEnItNlRo/.eval already extracted.
INFO:root:C:\Users\User\YandexDisk\

In [3]:
print(f'Unique tokens in source (de) vocabs {len(german)}')
print(f'Unique tokens in source (en) vocabs {len(english)}')

Unique tokens in source (de) vocabs 15927
Unique tokens in source (en) vocabs 13190


In [4]:
english_transform = lambda x: [english['<BOS>']] + [english[token] for token in tokenize_english(x)] + [english['<EOS>']]
german_transform = lambda x: [german['<BOS>']] + [german[token] for token in tokenize_german(x)] + [german['<EOS>']]
print("output of the text_transform:", english_transform("here is an example"))

output of the text_transform: [1, 78, 18, 54, 225, 2]


## Generating Batch iterator

In [5]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
BATCH_SIZE=32
def collate_batch(batch):
    de_list, en_list = [], []
    for (_en, _de) in batch:
        en_list.append(torch.tensor(english_transform(_en)))
        de_list.append(torch.tensor(english_transform(_de)))
    return pad_sequence(en_list, padding_value=3.0), pad_sequence(de_list, padding_value=3.0)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
next(iter(train_dataloader))

(tensor([[   1,    1,    1,  ...,    1,    1,    1],
         [  15,   14,    7,  ...,   13,   16,   23],
         [  18,    7, 3794,  ..., 3303,  184, 4037],
         ...,
         [   3,    3,    3,  ...,    3,    3,    3],
         [   3,    3,    3,  ...,    3,    3,    3],
         [   3,    3,    3,  ...,    3,    3,    3]]),
 tensor([[    1,     1,     1,  ...,     1,     1,     1],
         [11412,     0,   712,  ...,     0,   272,     0],
         [    0,   609,     0,  ...,     0,     0,     0],
         ...,
         [    3,     3,     3,  ...,     3,     3,     3],
         [    3,     3,     3,  ...,     3,     3,     3],
         [    3,     3,     3,  ...,     3,     3,     3]]))

## Model Implementation

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Encoder

In [18]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(EncoderLSTM, self).__init__()
        self.input_size = input_size # size of input one-hot vector
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size # output size of embedding NN
        self.hidden_size = hidden_size # Dimension of NN's inside lstm cell
        num_layers = num_layers # no of stacked lstm
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(self.input_size, self.embedding_size) # [input size, embedding dims]
        self.LSTM = nn.LSTM(self.embedding_size, hidden_size, num_layers, dropout=p) # [embedding dims, hidden size, num layers]

    def forward(self, x):
        embd = self.dropout(self.embedding(x)) # x: [Sequence_length, batch_size] emd: [Sequence_length , batch_size , embedding dims]
        outputs, (hidden_state, cell_state) = self.LSTM(embd) # output: [Sequence_length , batch_size , hidden_size] (hs, cs): [num_layers, batch_size size, hidden_size]
        return hidden_state, cell_state

input_size_encoder = len(english)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = float(0.5)

encoder_lstm = EncoderLSTM(input_size_encoder, encoder_embedding_size,
                           hidden_size, num_layers, encoder_dropout).to(device)
print(encoder_lstm)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(13190, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
)


### Decoder

In [19]:
class DecoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
        super(DecoderLSTM, self).__init__()

        self.input_size = input_size # size of one-hot input vector
        self.embedding_size = embedding_size # output size embedding layer
        self.hidden_size = hidden_size # dim of NN inside lstm memory
        self.num_layers = num_layers # no of stacked lstm
        self.output_size = output_size # size of one-hot vector (target language)
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(self.input_size, self.embedding_size) # [input size, embedding dims]
        self.LSTM = nn.LSTM(self.embedding_size, hidden_size, num_layers, dropout=p) # [embedding dims, hidden size, num layers]

        self.fc1 = nn.Linear(self.hidden_size, self.hidden_size*2)
        self.fc2 = nn.Linear(self.hidden_size*2, self.output_size)

    def forward(self, x, hidden_state, cell_state):
        x = x.unsqueeze(0) # [1, batch_size]

        embd = self.dropout(self.embedding(x)) # [1, batch_size, embedding dims]
        outputs, (hidden_state, cell_state) = self.LSTM(embd, (hidden_state, cell_state)) # [num_layers, batch_size size, hidden_size]
        fc = self.fc1(outputs)
        pred = self.fc2(self.dropout(fc))

        pred = pred.squeeze(0)
        return  pred, hidden_state, cell_state


input_size_decoder = len(german)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = float(0.5)
output_size = len(german)

decoder_lstm = DecoderLSTM(input_size_decoder, decoder_embedding_size,
                           hidden_size, num_layers, decoder_dropout, output_size).to(device)
print(decoder_lstm)



DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(15927, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc1): Linear(in_features=1024, out_features=2048, bias=True)
  (fc2): Linear(in_features=2048, out_features=15927, bias=True)
)


In [26]:
class Seq2Seq(nn.Module):
    def __init__(self, Encoder_LSTM, Decoder_LSTM):
        super(Seq2Seq, self).__init__()
        self.Encoder_LSTM = Encoder_LSTM
        self.Decoder_LSTM = Decoder_LSTM

    def forward(self, source, target, tfr=0.5):
        batch_size = source.shape[1] # [ seq length, No. of seq]

        target_len = target.shape[0] # [ seq length, No. of seq]
        target_vocab_size = len(german)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        hidden_state_encoder, cell_state_encoder = self.Encoder_LSTM(source)

        x = target[0] # trigger <BOS>

        for i in range(1, target_len):
            output, hidden_state_decoder, cell_state_decoder = self.Decoder_LSTM(x, hidden_state_encoder, cell_state_encoder)
            outputs[i] = output
            pred = output.argmax(1)
            x = target[i] if np.random.rand() < tfr else pred
        return outputs

In [27]:
learning_rate = 0.001
writer = SummaryWriter(f".pytorch/runs/loss_plot")
step = 0

model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.stoi["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [28]:
print(model)

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(13190, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(15927, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc1): Linear(in_features=1024, out_features=2048, bias=True)
    (fc2): Linear(in_features=2048, out_features=15927, bias=True)
  )
)


In [29]:
def translate_sentenece(model, sentence, english, german, device, max_length=50):
    sentence_tensor = torch.LongTensor(english_transform(sentence)).unsqueeze(1).to(device)

    with torch.no_grad():
        hid, cell = model.Encoder_LSTM(sentence_tensor)

    outputs = [german.stoi["<BOS>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hid, cell = model.Decoder_LSTM(previous_word, hid, cell)
            pred = output.argmax(1).item()

        outputs.append(pred)

        if output.argmax(1).item() == german.stoi['<EOS>']:
            break
    res = [german.itos[i] for i in outputs]
    return res[1:]

def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for e in data:
        src = e[0]
        trg = e[1]
        pred = translate_sentenece(model, src, english, german, device)
        pred = pred[:-1]
        targets.append([trg])
        outputs.append(pred)
    return  bleu_score(outputs, targets)

def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    print('saving')
    print()
    state = {'model': model,'best_loss': best_loss,'epoch': epoch,'rng_state': torch.get_rng_state(), 'optimizer': optimizer.state_dict(),}
    torch.save(state, '/content/checkpoint-NMT')
    torch.save(model.state_dict(),'checkpoint/eng_ger')

In [30]:
epoch_loss = 0.0
num_epochs = 100
best_loss = 999999
best_epoch = -1
sentence1 = "output of the text_transform"
ts1s = []

for epoch in range(num_epochs):
    print(f'Epoch - {epoch+1} / {num_epochs}')
    model.eval()
    ts1 = translate_sentenece(model, sentence1, english, german, device)
    print(f"Translated example sentence 1: \n {ts1}")
    ts1s.append(ts1)

    model.train()
    for i, batch in enumerate(train_dataloader):
        input = batch[0].to(device)
        target = batch[0].to(device)

        # pass the input to the model
        output = model(input, target)
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad() # clear out the accumulating grads

        loss = criterion(output, target) # calculate the loss for every epoch

        loss.backward()

        # clip the gradient if it exceeds 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()
        step += 1
        epoch_loss += loss.item()
        writer.add_scalar("Training loss", loss, global_step=step)

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_epoch = epoch
        checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss)

        if (epoch - best_epoch) >= 10:
            print('Model not converging in the last 10 epochs')
            break
    print(f'Epoch loss - {loss.item()}')
    print()

print(epoch_loss / len(train_dataloader))

score = bleu(test_data[1:100], model, german, english, device)
print(f'Bleu score {.2:score*100}')

Epoch - 1 / 100


KeyboardInterrupt: 