<a href="https://colab.research.google.com/github/Joel-Vijo/Neural-Machine-Translation/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install torchtext==0.8.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.8.0
  Downloading torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 5.3 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
Successfully installed torchtext-0.8.0


In [3]:
from google.colab import drive
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import nltk
import numpy as np
import random
import spacy
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score

In [4]:
device = torch.device("cuda")
print("Notebook is running on", device)
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


Notebook is running on cuda


In [5]:
!python -m spacy download de
!python -m spacy download en
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 17.4 MB/s 
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spa

In [6]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [7]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True,
            batch_first=True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True,
            batch_first=True)



In [9]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG),root='data')



In [10]:
L9\lok.j;k'
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [11]:
class self_attention(nn.Module):
  def __init__(self,input_dim,q_dim,k_dim,v_dim,heads):
    super().__init__()
    self.k_dim=k_dim
    self.query=nn.Linear(input_dim,q_dim)
    self.key=nn.Linear(input_dim,k_dim)
    self.value=nn.Linear(input_dim,v_dim)
    self.softmax=nn.Softmax()
    self.head=heads
    self.final=nn.Linear(q_dim,input_dim)
  def forward(self,query,key,value,mask=None):
    #print("Query",query.size())
    #print("Key",key.size())
    batch_size=query.shape[0]
    q=self.query(query)
    k=self.key(key)
    v=self.value(value)
    #print(q.size())
    q=q.view(batch_size,-1,self.head,q.shape[2]//self.head).permute(0,2,1,3)
    k=k.view(batch_size,-1,self.head,k.shape[2]//self.head).permute(0,2,1,3)
    v=v.view(batch_size,-1,self.head,v.shape[2]//self.head).permute(0,2,1,3)
    #print("Query2",q.size())
    #print("k",k.size())
    k=k.permute(0,1,3,2)
    m=torch.matmul(q,k)
    m=m/self.k_dim
    #print("Mask",mask.size())
    #print("M",m.size())

    #mask=mask.permute(2,1,0)
    if(mask is not None):
      m=m.masked_fill(mask==0,-1e10)
    #print("M",m.size())
    m=torch.softmax(m,dim=-1)
    #print(" v ",v.size())
    z=torch.matmul(m,v)
    z=z.permute(0,2,1,3).contiguous()
    z=z.view(batch_size,-1,self.k_dim)
    z=self.final(z)
    return z



In [12]:
class positional_feed_forward(nn.Module):
  def __init__(self,input_dim,layer_dim):
    super().__init__()
    self.linear1=nn.Linear(input_dim,layer_dim)
    self.linear2=nn.Linear(layer_dim,input_dim)
    self.dropout=nn.Dropout(0.1)
  def forward(self,input):
    input=self.dropout(torch.relu(self.linear1(input)))
    input=self.linear2(input)
    return input

In [13]:
class Encoder_layer(nn.Module):
  def __init__(self,input_dim,q_dim,v_dim,k_dim,pff_dim,heads):
    super().__init__()
    self.attention=self_attention(input_dim,q_dim,k_dim,v_dim,heads)
    self.pff=positional_feed_forward(input_dim,pff_dim)
    self.norm1=nn.LayerNorm(input_dim)
    self.norm2=nn.LayerNorm(input_dim)
  def forward(self,input,mask):
    attention=self.attention(input,input,input,mask)
    #print("Attention",attention.size())
    output=self.norm1(attention+input)
    feed=self.pff(output)
    #print("Output and feed",output.size())
    output=self.norm2(feed+output)
    #print("Output",output.size())
    return output


In [14]:
class Encoder(nn.Module):
  def __init__(self,input_dim,max_len,embedding_dim,q_dim,v_dim,k_dim,pff_dim,heads):
    super().__init__()
    self.embedding=nn.Embedding(input_dim,embedding_dim)
    self.pos_embedding=nn.Embedding(max_len,embedding_dim)
    self.layers=nn.ModuleList([Encoder_layer(embedding_dim,q_dim,v_dim,k_dim,pff_dim,heads) for i in range(3)])
  def forward(self,input,mask):
    pos_tensor=torch.arange(0,input.shape[1]).unsqueeze(0).repeat(input.shape[0],1).to(device)
    embed=self.embedding(input)+self.pos_embedding(pos_tensor)
    for layer in self.layers:
      embed=layer(embed,mask)
    return embed

In [15]:
class Decoder_layer(nn.Module):
  def __init__(self,input_dim,q_dim,k_dim,v_dim,pff_dim,heads):
    super().__init__()
    self.attention1=self_attention(input_dim,q_dim,k_dim,v_dim,heads)
    self.attention2=self_attention(input_dim,q_dim,k_dim,v_dim,heads)
    self.pff=positional_feed_forward(input_dim,pff_dim)
    self.norm1=nn.LayerNorm(input_dim)
    self.norm2=nn.LayerNorm(input_dim)
    self.norm3=nn.LayerNorm(input_dim)
  def forward(self,input,enc_output,mask1,mask2):
    #print("Decoder layer input",input.size())
    #print("Attention 1")
    attention1=self.attention1(input,input,input,mask1)
    output=self.norm1(attention1+input)
    #print("Encoder output",enc_output.size())
  
    #print("Attention 2")
    attention2=self.attention2(output,enc_output,enc_output,mask2)
    output=self.norm2(attention2+output)
    feed=self.pff(output)
    output=self.norm3(feed+output)
    return output

In [16]:
class Decoder(nn.Module):
  def __init__(self,input_dim,max_len,embedding_dim,q_dim,k_dim,v_dim,pff_dim,heads):
    super().__init__()
    self.embedding=nn.Embedding(input_dim,embedding_dim)
    self.pos_embedding=nn.Embedding(max_len,embedding_dim)
    self.linear=nn.Linear(embedding_dim,input_dim)
    self.softmax=nn.Softmax(input_dim)
    self.layers=nn.ModuleList([Decoder_layer(embedding_dim,q_dim,k_dim,v_dim,pff_dim,heads) for i in range(3)])
  def forward(self,input,enc_output,mask1,mask2):
    pos_tensor=torch.arange(0,input.shape[1]).unsqueeze(0).repeat(input.shape[0],1).to(device)
    embed=self.embedding(input)+self.pos_embedding(pos_tensor)
    for layer in self.layers:
      #print("1")
      embed=layer(embed,enc_output,mask1,mask2)
    output=self.linear(embed)
    return output



In [17]:
class Model(nn.Module):
  def __init__(self,encoder,decoder,src_pad_ix,trg_pad_ix):
    super().__init__()
    self.enc=encoder
    self.dec=decoder
    self.src_pad=src_pad_ix
    self.trg_pad=trg_pad_ix
  def make_src_mask(self,m):
    mask=(m!=self.src_pad).unsqueeze(1).unsqueeze(2)
    return mask
  def make_trg_mask(self,m):
    mask=(m!=self.trg_pad).unsqueeze(1).unsqueeze(2)
    len=m.shape[1]
    sub_mask=torch.tril(torch.ones((len,len),device=device)).bool()
    #print("MAIN MASK",mask.size())
    #print("SUB MASK",sub_mask.size())
    #mask=mask.permute(2,1,0)
    mask=mask & sub_mask
    #mask=mask.permute(2,1,0)
    return mask
  def forward(self,src,trg):
    #print("SRC",src.size())
    src_mask=self.make_src_mask(src)
    trg_mask=self.make_trg_mask(trg)
    enc_outputs=self.enc(src,src_mask)
    outputs=self.dec(trg,enc_outputs,trg_mask,src_mask)
    return outputs

In [18]:
input_dim=len(SRC.vocab)
output_dim=len(TRG.vocab)
embedding_dim=256
pf_dim=512
heads=8
max_len=100
enc=Encoder(input_dim,max_len,embedding_dim,256,256,256,pf_dim,heads)
dec=Decoder(output_dim,max_len,embedding_dim,256,256,256,pf_dim,heads)
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
model=Model(enc,dec,SRC_PAD_IDX,TRG_PAD_IDX).to(device)
BATCH_SIZE=128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,device=device)





In [19]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,038,341 trainable parameters


In [20]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
def train(model,iterator,criterion,optimiser):
  model.train()
  epoch_loss=0
  for i,batch in enumerate(iterator):
    src = batch.src
    trg = batch.trg    
    optimizer.zero_grad()
    #print(trg.size())
    s=trg[1:,:].contiguous().view(-1)
    #print(s.size())
    output = model(src, trg[:,:-1])
    #print("Output",output.size())
    #print("Finall output",output.size())
    #print("Trg",trg.size())
    #print(output)
    output_size=output.shape[-1]
    output=output.contiguous().view(-1,output_size)
    trg=trg[:,1:].contiguous().view(-1)
    #print(output.size())
    #print(trg.size())
    loss = criterion(output, trg)   
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()
    epoch_loss += loss.item()
  return epoch_loss/len(iterator)

In [21]:
def Evaluate(iterator, model, criterion):
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            source = batch.src
            target = batch.trg
            #print(target)
            outputs = model(source, target[:,:-1])
            #print(outputs)
            #print("Trg",trg.size())
            output_size=outputs.shape[-1]
            outputs=outputs.contiguous().view(-1,output_size)
            target=target[:,1:].contiguous().view(-1)
            batch_loss = criterion(outputs, target)
            #print(outputs)
            eval_loss += batch_loss.item()
    return eval_loss/len(iterator)

In [22]:
for i in range(10):
  train_loss=train(model,train_iterator,criterion,optimizer)
  valid_loss=Evaluate(valid_iterator,model,criterion)
  print("Valid loss",valid_loss)
  print("  ")
  print("TRAINING LOSS= ",train_loss)
  print("  ")



Valid loss 3.0734723806381226
  
TRAINING LOSS=  4.122138601042626
  
Valid loss 2.481746196746826
  
TRAINING LOSS=  2.87184216587554
  
Valid loss 2.1850149780511856
  
TRAINING LOSS=  2.3583249111007487
  
Valid loss 2.0122684985399246
  
TRAINING LOSS=  2.028018735578932
  
Valid loss 1.9131564944982529
  
TRAINING LOSS=  1.777159020764187
  
Valid loss 1.8293492496013641
  
TRAINING LOSS=  1.5744880821211222
  
Valid loss 1.8067896664142609
  
TRAINING LOSS=  1.3979041429343202
  
Valid loss 1.772386059165001
  
TRAINING LOSS=  1.2432931766636046
  
Valid loss 1.7795377969741821
  
TRAINING LOSS=  1.1058371915166074
  
Valid loss 1.7851138412952423
  
TRAINING LOSS=  0.9834409004266041
  


In [23]:
model.eval()
test_loss=Evaluate(test_iterator,model,criterion)
print(test_loss)

1.8343684524297714




In [24]:
def translate_sentence(sentence, src_field, trg_field, model, device, max_len = 50):
    model.eval()
    print(sentence)
    if isinstance(sentence, str):
        nlp = spacy.load('de_core_news_sm')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
        
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)
    #print(src_tensor)
    with torch.no_grad():
        enc_src = model.enc(src_tensor, src_mask)
    print("Encoder output",enc_src.size())
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
      
        with torch.no_grad():
            output = model.dec(trg_tensor, enc_src, trg_mask, src_mask)
        #print(output)
        pred_token = output.argmax(2)[:,-1].item()
        print(pred_token)
        trg_indexes.append(pred_token)
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break
    
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
    
    return trg_tokens[1:]

In [25]:
example_idx=3
src = vars(test_data.examples[example_idx])['src']
trg = vars(test_data.examples[example_idx])['trg']
translation = translate_sentence(src, SRC, TRG, model, device)


print("German:",' '.join(src))
print("English:",' '.join(trg))
print("Prediction:",' '.join(translation[:-1]))

['fünf', 'leute', 'in', 'winterjacken', 'und', 'mit', 'helmen', 'stehen', 'im', 'schnee', 'mit', 'schneemobilen', 'im', 'hintergrund', '.']
Encoder output torch.Size([1, 17, 256])
251
19
22
446
555
11
628
17
36
6
7
95
13
1577
250
5
3
German: fünf leute in winterjacken und mit helmen stehen im schnee mit schneemobilen im hintergrund .
English: five people wearing winter jackets and helmets stand in the snow , with snowmobiles in the background .
Prediction: five people wearing winter jackets and helmets are standing in the snow with pine trees .


In [26]:
translation = translate_sentence(src, SRC, TRG, model, device)

print(f'predicted trg = {translation}')

['fünf', 'leute', 'in', 'winterjacken', 'und', 'mit', 'helmen', 'stehen', 'im', 'schnee', 'mit', 'schneemobilen', 'im', 'hintergrund', '.']
Encoder output torch.Size([1, 17, 256])
251
19
22
446
555
11
628
17
36
6
7
95
13
1577
250
5
3
predicted trg = ['five', 'people', 'wearing', 'winter', 'jackets', 'and', 'helmets', 'are', 'standing', 'in', 'the', 'snow', 'with', 'pine', 'trees', '.', '<eos>']


In [27]:
print(bleu_score)

<function bleu_score at 0x7f12ffcf9b90>


In [28]:
def calculate_bleu(data, src_field, trg_field, model, device, max_len = 50):
    
    trgs = []
    pred_trgs = []
    
    for datum in data:
        
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        
        pred_trg = translate_sentence(src, src_field, trg_field, model, device, max_len)
        
        #cut off <eos> token
        pred_trg = pred_trg[:-1]
        
        pred_trgs.append(pred_trg)
        trgs.append([trg])
        
    return bleu_score(pred_trgs, trgs)
bleu= calculate_bleu(test_data, SRC, TRG, model, device)

print(f'BLEU score = {bleu*100:.2f}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
14
36
8
4
101
39
5
3
['feuerwehrmänner', 'kommen', 'aus', 'einer', 'u-bahnstation', '.']
Encoder output torch.Size([1, 8, 256])
1227
0
75
12
4
354
5
3
['vier', 'männer', ',', 'drei', 'von', 'ihnen', 'mit', 'gebetskappen', ',', 'sitzen', 'auf', 'einer', 'blau', 'und', 'olivgrün', 'gemusterten', 'matte', '.']
Encoder output torch.Size([1, 20, 256])
110
30
15
48
12
155
32
8
4
444
13
29
11
620
327
543
5
3
['das', 'ist', 'eine', 'große', 'menschengruppe', ',', 'die', 'im', 'freien', 'auf', 'bänken', 'sitzt', '.']
Encoder output torch.Size([1, 15, 256])
209
10
4
59
38
12
19
32
57
5
3
['ein', 'mann', 'in', 'einem', 'roten', 'shirt', 'geht', 'an', 'einem', 'türkis', 'und', 'weiß', 'karierten', 'imbissladen', 'namens', '"', '32', 'de', 'neude', '"', 'vorbei', '.']
Encoder output torch.Size([1, 24, 256])
4
9
6
4
31
23
41
232
4
384
114
136
160
0
160
0
160
0
160
5
160
5
160
5
3
['ärzte', 'bei', 'einer', 'art', 'operation', '.']
Encod