<a href="https://colab.research.google.com/github/Joel-Vijo/Neural-Machine-Translation/blob/main/Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torchtext==0.8.0

Collecting torchtext==0.8.0
  Downloading torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 4.9 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
Successfully installed torchtext-0.8.0


In [None]:
from google.colab import drive
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import nltk
import numpy as np
import random
import spacy
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

In [None]:
# Single seed shared by every random number generator used in this notebook.
SEED = 1234

# Seed Python's `random`, NumPy, PyTorch (CPU) and PyTorch (CUDA), in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)
del _seed_fn

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Download the German and English spaCy models through their shortcut names,
# then load them. Per the recorded output, 'de' resolves to
# de_core_news_sm 2.2.5 and is linked under the shortcut after installation.
# NOTE(review): these shortcut names depend on the spaCy version installed in
# the runtime — confirm before running on a different image.
!python -m spacy download de
!python -m spacy download en
spacy_de = spacy.load('de')  # German pipeline; its tokenizer is used by tokenize_de
spacy_en = spacy.load('en')  # English pipeline; its tokenizer is used by tokenize_en

Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 4.5 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=7dfa72f07d9686aa16e3eeec4c79b399424d1ca5623eb9ccd29d18cbcd5b18e7
  Stored in directory: /tmp/pip-ephem-wheel-cache-u5vc2shi/wheels/00/66/69/cb6c921610087d2cab339062345098e30a5ceb665360e7b32a
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/

In [None]:
def tokenize_de(text):
    """
    Split a German string into tokens with the spaCy German tokenizer and
    return the token texts in reversed order (last token first).
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """
    Split an English string into tokens with the spaCy English tokenizer and
    return the token texts in their original order.
    """
    words = []
    for token in spacy_en.tokenizer(text):
        words.append(token.text)
    return words

In [None]:
# Options shared by both fields: wrap every sequence in <sos>/<eos> and lowercase it.
_FIELD_OPTIONS = dict(init_token='<sos>', eos_token='<eos>', lower=True)

# Source side: German, tokenized (and reversed) by tokenize_de.
SRC = Field(tokenize=tokenize_de, **_FIELD_OPTIONS)

# Target side: English, tokenized by tokenize_en.
TRG = Field(tokenize=tokenize_en, **_FIELD_OPTIONS)



In [None]:
# Load the Multi30k train/validation/test splits, pairing the '.de' files with
# SRC and the '.en' files with TRG.
_splits = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))
train_data, valid_data, test_data = _splits


downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 701kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 230kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 214kB/s]


In [None]:
# Build both vocabularies from the training split only, with min_freq=2
# for the source and the target field alike.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=2)
del _field

In [None]:
class Encoder(nn.Module):
    """Two-layer LSTM encoder that turns a batch of token ids into a final LSTM state.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        hidden_size: size of the LSTM hidden and cell states.
        embedding_dim: size of each token embedding.
        output_size: stored on the instance; not used by the encoder itself.
        dropout: dropout probability applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, hidden_size, embedding_dim, output_size, dropout):
        super(Encoder, self).__init__()
        self.embedding_dim=embedding_dim
        self.hidden_size = hidden_size 
        self.output_size = output_size 
        self.embed=nn.Embedding(input_size,embedding_dim)
        self.rnn=nn.LSTM(embedding_dim,hidden_size,2,dropout=dropout,batch_first=False)
        # NOTE(review): this projection is never used in forward(); it is kept
        # only so the module's attributes and state_dict keys stay unchanged.
        self.h2o=nn.Linear(hidden_size,input_size)
        self.dropout=nn.Dropout(dropout)

    def forward(self, input, hidden=None):
        """Encode a batch of source sequences.

        Args:
            input: integer token ids laid out as (seq_len, batch), since the
                LSTM is built with batch_first=False.
            hidden: optional (h_0, c_0) tuple for the LSTM. When omitted,
                nn.LSTM starts from an all-zero state created on the same
                device as the input, so callers no longer have to build it.

        Returns:
            The final (h_n, c_n) tuple of the LSTM; the per-step outputs are discarded.
        """
        embedded=self.dropout(self.embed(input))
        _,hidden=self.rnn(embedded,hidden)
        return hidden

In [None]:
class Decoder(nn.Module):
    """Two-layer LSTM decoder that scores the next target token for one time step.

    The constructor takes the same arguments as the encoder; `input_size` is
    accepted but not used here. The embedding table and the output projection
    are both sized by `output_size`.
    """

    def __init__(self, input_size, hidden_size, embedding_dim, output_size, dropout):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embed = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=2,
            dropout=dropout,
            batch_first=False,
        )
        self.h2o = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden):
        """Run the decoder on `input` and return `(new_hidden, scores)`.

        `input` is cast to long before the embedding lookup. The leading
        dimension of the LSTM output is squeezed away before the projection,
        so a single-step input of shape (1, batch) yields scores of shape
        (batch, output_size).
        """
        token_ids = input.to(torch.long)
        embedded = self.dropout(self.embed(token_ids))
        rnn_out, hidden = self.rnn(embedded, hidden)
        scores = self.h2o(rnn_out.squeeze(0))
        return hidden, scores

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing."""

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.enc=encoder
        self.dec=decoder

    def forward(self, hidden_size, source, target, teacher_forcing_ratio=0.5):
        """Produce decoder scores for every target position.

        Args:
            hidden_size: size of the initial zero hidden/cell states passed to the encoder.
            source: source token ids, laid out as (src_len, batch).
            target: target token ids, laid out as (trg_len, batch); row 0 is
                fed to the decoder as the first input.
            teacher_forcing_ratio: probability, drawn once per step via
                random.random(), of feeding the ground-truth token instead of
                the decoder's own argmax as the next input.

        Returns:
            Tensor of shape (trg_len, batch, vocab_size). Row 0 is never
            written and stays all zeros.
        """
        batch_size=target.shape[1]
        target_length=target.shape[0]
        vocab_size=self.dec.output_size
        # Allocate on the input's device so the model also works when the
        # batch and parameters live on GPU (previously these were CPU-only).
        device=source.device
        outputs=torch.zeros(target_length,batch_size,vocab_size,device=device)
        inputs=target[0,:].unsqueeze(0)
        hidden=(torch.zeros((2, batch_size, hidden_size),device=device),
                torch.zeros((2, batch_size, hidden_size),device=device))
        hidden=self.enc(source,hidden)
        for i in range(1,target_length):
            hidden,output=self.dec(inputs,hidden)
            outputs[i]=output
            # One random draw per step, exactly as before, so seeded runs
            # consume the RNG in the same sequence.
            teacher_force=random.random()<teacher_forcing_ratio
            next_tokens=target[i] if teacher_force else output.argmax(1)
            inputs=next_tokens.unsqueeze(0)
        return outputs

In [None]:
# Hyper-parameters and vocabulary sizes.
i_size=len(SRC.vocab)   # source vocabulary size
o_size=len(TRG.vocab)   # target vocabulary size
e_dim=256
h_size=100
dropout=0.5
learning_rate=0.001
# The decoder is constructed before the encoder; the order is kept so the
# seeded parameter initialisation consumes the RNG in the same sequence.
dec=Decoder(i_size,h_size,e_dim,o_size,dropout)
enc=Encoder(i_size,h_size,e_dim,o_size,dropout)
model=Seq2Seq(enc,dec)
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Exclude padding positions from the loss. TRG_PAD_IDX was computed above but
# never used, so padded target positions were being averaged into the loss.
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(),learning_rate)
BATCH_SIZE = 128
print(o_size)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE)


5893




In [None]:
def train(model,hidden_size,iterator,criterion,optimiser):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: callable as model(hidden_size, src, trg), returning scores
            whose last dimension is the vocabulary size.
        hidden_size: forwarded unchanged to the model.
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        criterion: loss applied to the flattened scores and targets.
        optimiser: optimizer that is zeroed and stepped for every batch.
            Previously this argument was ignored and the global `optimizer`
            was used instead.

    Returns:
        Sum of the per-batch losses divided by len(iterator).
    """
    model.train()
    epoch_loss=0
    for batch in iterator:
        src = batch.src
        trg = batch.trg
        optimiser.zero_grad()
        output = model(hidden_size, src, trg)
        vocab_size=output.shape[-1]
        # Drop position 0 of both scores and targets before flattening.
        output = output[1:].view(-1, vocab_size)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        # Clip the global gradient norm to 1 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimiser.step()
        epoch_loss += loss.item()
    return epoch_loss/len(iterator)
        
        

In [None]:
# Train for ten epochs, printing the mean per-batch loss after each one.
N_EPOCHS = 10
for _epoch in range(N_EPOCHS):
    train_loss = train(
        model=model,
        hidden_size=h_size,
        iterator=train_iterator,
        criterion=criterion,
        optimiser=optimizer,
    )
    print("Training", train_loss)



Training 2.1222821426811724
Training 2.087526580310603
Training 2.0410292216859727
Training 1.998980354107424
Training 1.976734046368872
Training 1.9289034531505098
Training 1.9148473450790944
Training 1.8828066534933015
Training 1.8540002653777337
Training 1.8321876357830569


In [None]:
def ipTensor(sentence, src_field):
    """Convert a source sentence into a column tensor of vocabulary indices.

    Args:
        sentence: either a list of tokens, used as given, or a raw string,
            which is tokenized with tokenize_de.
        src_field: object exposing `init_token`, `eos_token` and `vocab.stoi`.

    Returns:
        LongTensor of shape (seq_len, 1), where seq_len counts the tokens
        plus the init and eos markers.

    Tokens are lowercased in both branches. Previously only the list branch
    lowercased, so capitalised words in a raw string missed the lowercased
    vocabulary.
    """
    raw_tokens = sentence if isinstance(sentence, list) else tokenize_de(sentence)
    tokens = [src_field.init_token] + [token.lower() for token in raw_tokens] + [src_field.eos_token]
    ip_tensor = torch.LongTensor([src_field.vocab.stoi[token] for token in tokens])
    return ip_tensor.view(len(tokens), 1)

In [None]:
def Evaluate(iterator, model, criterion):
    """Return the mean per-batch loss of `model` over `iterator` without updating it.

    Args:
        iterator: sized iterable of batches exposing `.src` and `.trg`.
        model: callable as model(hidden_size, src, trg, teacher_forcing_ratio=...).
        criterion: loss applied to the flattened scores and targets.

    Changes from the previous version:
      * the debugging `print(target)` is gone, so tensors no longer flood the output;
      * teacher forcing is switched off (ratio 0), which makes the loss
        deterministic and reflects the model's own predictions;
      * the vocabulary size comes from the model output, not the global `o_size`;
      * the `model.zero_grad()` call is dropped, as no gradients are produced
        under `torch.no_grad()`.

    The hidden size is still read from the module-level `h_size`.
    """
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in iterator:
            source = batch.src
            target = batch.trg
            outputs = model(h_size, source, target, teacher_forcing_ratio=0)
            # Drop position 0 of both scores and targets before flattening.
            outputs = outputs[1:].view(-1, outputs.shape[-1])
            targets = target[1:].view(-1)
            eval_loss += criterion(outputs, targets).item()

    return eval_loss/len(iterator)

In [None]:
# Put the model in inference mode, then report the mean per-batch loss on the test split.
model.eval()
test_loss = Evaluate(iterator=test_iterator, model=model, criterion=criterion)
print(test_loss)



tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [  16,  110,    4,  ...,    4,   24,   16],
        [1909,   19,   34,  ...,   14,   14,   30],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [ 14,   4,   4,  ...,   4,  63, 795],
        [395,  35,   9,  ...,  14, 270,   6],
        ...,
        [  3,   3,   3,  ...,   3,   3,   3],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([[   2,    2,    2,  ...,    2,    2,    2],
        [   4,    4,   63,  ...,    7,   25,    7],
        [   9,   38,  929,  ...,   34, 1547, 2222],
        ...,
        [   5,    5,    5,  ...,    1,    1,    1],
        [   3,    3,    3,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([[   2,    2,    2,  ...,    2,    2,    2],
      

In [None]:
def Translate(src_sentence, src_field, trg_field, model):
    """Greedily decode a translation for one source sentence.

    Args:
        src_sentence: token list or raw string, converted by ipTensor.
        src_field: source field passed through to ipTensor.
        trg_field: object exposing `init_token`, `eos_token`, `vocab.stoi`
            and `vocab.itos` for the target language.
        model: object with `enc` (providing `hidden_size` and
            `rnn.num_layers`) and `dec` (providing `output_size`).

    Returns:
        List of predicted target tokens without the leading init token. The
        eos token is included when it was produced before the length limit
        of four times the source length.

    The initial state is now sized from the encoder itself, no longer from the
    global `h_size` and a hard-coded layer count. The model is switched to
    eval mode for decoding, so dropout cannot perturb the output, and its
    previous mode is restored afterwards. Tensors are created on the model's
    device.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    ip_tensor = ipTensor(src_sentence, src_field).to(device)
    max_len = 4*ip_tensor.shape[0]
    state_shape = (model.enc.rnn.num_layers, 1, model.enc.hidden_size)
    hidden = (torch.zeros(state_shape, device=device), torch.zeros(state_shape, device=device))

    sos_id = trg_field.vocab.stoi[trg_field.init_token]
    eos_id = trg_field.vocab.stoi[trg_field.eos_token]
    predicts = [sos_id]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            dec_states = model.enc(ip_tensor, hidden)
            while len(predicts) < max_len:
                # Feed the most recent prediction back in as a (1, 1) batch.
                step_input = torch.tensor([[predicts[-1]]], dtype=torch.long, device=device)
                dec_states, output = model.dec(step_input, dec_states)
                output = output.view(-1, model.dec.output_size)
                predicts.append(output.argmax(-1).item())
                if predicts[-1] == eos_id:
                    break
    finally:
        model.train(was_training)

    return [trg_field.vocab.itos[token_id] for token_id in predicts[1:]]

In [None]:
# Pick one test example at random and compare the model's output with the reference.
ind = int(len(test_data.examples) * random.random())
example = test_data.examples[ind]
src_sentence, trg_sentence = example.src, example.trg
print("German: ", ' '.join(src_sentence))
translation = Translate(src_sentence, SRC, TRG, model)
# The final predicted token is left out of the printed translation.
print("English: ", ' '.join(translation[:-1]))
print("Actual Translation: ", ' '.join(trg_sentence))

German:  . kamera die in blickt und gehsteig dem auf steht jeans und hemd schwarzen einem in mann ein
English:  a man in a black shirt and a pants is standing on the street and the other .
Actual Translation:  a man in a black shirt and jeans standing on the sidewalk looking at the camera .


In [None]:
def Calculate_BLEU(data, src_field, trg_field, model):
    """Translate every example in `data` and score the predictions with bleu_score.

    Args:
        data: dataset exposing `.examples`, each with `src` and `trg` token lists.
        src_field, trg_field, model: forwarded unchanged to Translate.

    Returns:
        Whatever bleu_score returns for the collected predictions and references.

    Examples whose translation raises an Exception are still skipped, as a
    best effort, but the skips are now counted and reported through a warning
    and no longer silently discarded. KeyboardInterrupt and SystemExit are no
    longer swallowed.
    """
    import warnings

    trgs = []
    predicted_trgs = []
    skipped = 0
    for example in data.examples:
        try:
            predicted_trg = Translate(example.src, src_field, trg_field, model)
        except Exception:
            skipped += 1
            continue
        # Drop the final predicted token before scoring.
        predicted_trgs.append(predicted_trg[:-1])
        # bleu_score is given a list of references per candidate; here there is one.
        trgs.append([example.trg])
    if skipped:
        warnings.warn(f"Calculate_BLEU skipped {skipped} example(s) that failed to translate")
    return bleu_score(predicted_trgs, trgs)

In [None]:
# Score the model on the test split and print the BLEU value scaled to 0-100.
bleu_score_test = Calculate_BLEU(
    data=test_data,
    src_field=SRC,
    trg_field=TRG,
    model=model,
)
print(f"BLEU score on Testing Data: {bleu_score_test*100:.2f}")

[2, 4, 9, 6, 4, 29, 23, 10, 45, 44, 5, 5, 3]
[2, 4, 154, 10, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 3]
[2, 4, 33, 6, 4, 4, 197, 10, 4, 4, 4, 4, 4, 5, 3]
[2, 110, 19, 6, 6, 25, 338, 17, 36, 6, 4, 4, 6, 7, 98, 5, 3]
[2, 19, 17, 4, 4, 4, 4, 5, 3]
[2, 4, 115, 9, 13, 4, 97, 11, 4, 14, 15, 6, 4, 6, 4, 6, 4, 98, 15, 6, 4, 6, 4, 98, 6, 4, 98, 5, 3]
[2, 4, 38, 12, 19, 17, 36, 6, 43, 12, 4, 77, 5, 3]
[2, 4, 34, 6, 4, 31, 23, 10, 7, 7, 103, 18, 4, 7, 68, 18, 7, 7, 7, 7, 7, 7, 7, 5, 3]
[2, 4, 9, 10, 4, 4, 4, 5, 3]
[2, 4, 9, 6, 4, 29, 23, 10, 32, 8, 4, 144, 11, 4, 5, 3]
[2, 4, 14, 11, 4, 17, 17, 4, 4, 4, 4, 4, 4, 5, 3]
[2, 16, 30, 17, 4, 4, 4, 4, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 3]
[2, 4, 14, 6, 4, 4, 4, 4, 4, 4, 4, 5, 3]
[2, 4, 9, 10, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 3]
[2, 16, 19, 17, 36, 6, 4, 5, 5, 3]
[2, 4, 33, 6, 4, 31, 23, 10, 4, 4, 4, 4, 5, 3]
[2, 4, 14, 6, 4, 4, 4, 4, 4, 6, 7, 98, 5, 3]
[2, 4, 14, 6, 4, 29, 23, 11, 4, 26, 23, 10, 36, 6, 4, 4, 6, 4, 4, 5, 3]
[2, 4, 64, 6, 4, 31, 23, 10, 8, 