In [1]:
!pip install torchtext==0.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 4.7 MB/s 
Collecting torch==1.9.0
  Downloading torch-1.9.0-cp37-cp37m-manylinux1_x86_64.whl (831.4 MB)
[K     |████████████████████████████████| 831.4 MB 2.7 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+

 import all the required modules

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as f
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field,BucketIterator
import spacy
import random
import math
import numpy as np
import time

Set the random seeds for reproducability

In [82]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

download the german model

In [4]:
import spacy.cli 
spacy.cli.download("de_core_news_sm")


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


Load the German and English spaCy models

In [5]:
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')


create the tokenizers

In [6]:
def tokenize_de(text):
  return [t.text for t in spacy_de.tokenizer(text)]

def tokenize_en(text):
  return [t.text for t in spacy_en.tokenizer(text)]

create the source and target fields

In [7]:
SRC = Field(tokenize = tokenize_de,init_token = '<sos>',eos_token='<eos>',lower = True)
TRG = Field(tokenize = tokenize_en,init_token = '<sos>',eos_token='<eos>',lower = True)



Load the data

In [8]:
train_data,valid_data,test_data=Multi30k.splits(exts=('.de','.en'),fields=(SRC,TRG))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 866kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 235kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 219kB/s]


Build the vocabulary and Define the device

In [9]:
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Create the iterators

In [68]:
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

create the encoder class

In [69]:
from thinc.layers.bidirectional import bidirectional
class Encoder(nn.Module):
  def __init__(self,input_dim,emb_dim,enc_hid_dim,dec_hid_dim,dropout):

    super().__init__()

    self.embedding= nn.Embedding(input_dim,emb_dim)

    self.rnn = nn.GRU(emb_dim,enc_hid_dim,bidirectional = True )

    self.fc = nn.Linear(enc_hid_dim*2,dec_hid_dim)
    
    self.dropout = nn.Dropout(dropout)


  def forward(self,src):

    embedded = self.dropout(self.embedding(src))

    outputs,hidden = self.rnn(embedded)

    hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1)))  

    return outputs,hidden

create the attention class

In [70]:
class Attention(nn.Module):
  def __init__(self,enc_hid_dim,dec_hid_dim):

    super().__init__()

    self.attn = nn.Linear((enc_hid_dim*2)+dec_hid_dim , dec_hid_dim)
    self.v = nn.Linear(dec_hid_dim,1,bias = False)

  def forward(self,hidden,encoder_outputs):

    batch_size = encoder_outputs.shape[1]
    src_len = encoder_outputs.shape[0]
    hidden = hidden.unsqueeze(1).repeat(1,src_len,1)
    encoder_outputs = encoder_outputs.permute(1,0,2)
    energy = torch.tanh(self.attn(torch.cat((hidden,encoder_outputs),dim=2)))
    attention = self.v(energy).squeeze(2)

    return f.softmax(attention,dim=1)





create the decoder class

In [71]:
class Decoder(nn.Module):
  def __init__(self,output_dim,emb_dim,enc_hid_dim,dec_hid_dim,dropout,attention):

    super().__init__()

    
    self.output_dim = output_dim
    self.attention = attention
    self.embedding = nn.Embedding(output_dim,emb_dim)
    self.rnn = nn.GRU((enc_hid_dim*2)+emb_dim,dec_hid_dim)
    self.fc_out = nn.Linear((enc_hid_dim*2)+dec_hid_dim+emb_dim,output_dim)
    self.dropout = nn.Dropout(dropout)


  def forward(self,input,hidden,encoder_outputs):

    input = input.unsqueeze(0)
    embedded = self.dropout(self.embedding(input))
    a = self.attention(hidden,encoder_outputs)
    a = a.unsqueeze(1)
    encoder_outputs = encoder_outputs.permute(1,0,2)
    weighted = torch.bmm(a,encoder_outputs)
    weighted = weighted.permute(1,0,2)
    rnn_input = torch.cat((embedded,weighted),dim=2)
    output,hidden = self.rnn(rnn_input,hidden.unsqueeze(0))

    assert (output == hidden).all()
    embedded = embedded.squeeze(0)
    weighted = weighted.squeeze(0)
    output = output.squeeze(0)
    prediction = self.fc_out(torch.cat((output,weighted,embedded),dim=1))

    return prediction,hidden.squeeze(0)


Building the Seq2Seq Model

In [72]:
class Seq2Seq(nn.Module):
  def __init__(self,encoder,decoder,device):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.device = device


  def forward(self,src,trg,teacher_forcing_ratio= 0.5):

    
    batch_size = src.shape[1]
    trg_len =   trg.shape[0]
    trg_vocab_size = self.decoder.output_dim
    outputs = torch.zeros(trg_len,batch_size,trg_vocab_size).to(self.device)
    encoder_outputs,hidden = self.encoder(src)
    input = trg[0,:]


    for t in range(1,trg_len):
      output,hidden = self.decoder(input,hidden,encoder_outputs)
      outputs[t]=output
      teacher_force = random.random() < teacher_forcing_ratio
      top1 = output.argmax(1)
      input = trg[t] if teacher_force else top1
    return outputs

initialize the parameters, encoder, decoder and seq2seq model

In [73]:
Input_dim = len(SRC.vocab)
Output_dim = len(TRG.vocab)
Enc_emb_dim = 256
Dec_emb_dim = 256
Enc_hid_dim = 512
Dec_hid_dim = 512
Enc_dropout = 0.5
Dec_dropout = 0.5

attn = Attention(Enc_hid_dim,Dec_hid_dim)
enc = Encoder(Input_dim,Enc_emb_dim,Enc_hid_dim,Dec_hid_dim,Enc_dropout)
dec = Decoder(Output_dim,Dec_emb_dim,Enc_hid_dim,Dec_hid_dim,Dec_dropout,attn)

model = Seq2Seq(enc,dec,device).to(device)

we will initialize all biases to zero and all weights from  N(0,0.01)


In [74]:
def init_weights(m):
  for name , param in m.named_parameters():
    if 'weight' in name:
      nn.init.normal_(param.data,mean=0,std=0.01)
    else :
      nn.init.constant_(param.data,0)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(5893, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

create an optimizer

In [75]:
optimizer = optim.Adam(model.parameters())


initialize the loss function

In [78]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

create the training function

In [85]:
def train(model,iterator,optimizer,criterion,clip):
  model.train()
  epoch_loss = 0
  for i,batch in enumerate(iterator):
    src = batch.src
    trg = batch.trg
    optimizer.zero_grad()
    output = model(src,trg)
    output_dim = output.shape[-1]
    output = output[1:].view(-1,output_dim)
    trg = trg[1:].view(-1)
    loss = criterion(output,trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(),clip)
    optimizer.step()
    epoch_loss += loss.item()
  return epoch_loss / len(iterator)

create the evaluation function

In [86]:
def evaluate(model,iterator,criterion):
  model.eval()
  epoch_loss = 0
  for i ,batch in enumerate(iterator):
    src = batch.src
    trg = batch.trg
    output = model(src,trg)
    output_dim = output.shape[-1]
    output = output[1:].view(-1,output_dim)
    trg = trg[1:].view(-1)
    loss = criterion(output,trg)
    epoch_loss += loss.item()
  return epoch_loss/len(iterator)  

define a timing function

In [87]:
def epoch_time(start_time,end_time):
  el_time = end_time - start_time
  el_mins = int(el_time/60)
  el_secs = int(el_time - el_mins *60)
  return el_mins , el_secs

we train our model and saving the parameters that give us the best validation loss

In [88]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


Epoch: 01 | Time: 1m 19s
	Train Loss: 5.067 | Train PPL: 158.673
	 Val. Loss: 4.461 |  Val. PPL:  86.605
Epoch: 02 | Time: 1m 21s
	Train Loss: 4.197 | Train PPL:  66.471
	 Val. Loss: 3.816 |  Val. PPL:  45.412
Epoch: 03 | Time: 1m 22s
	Train Loss: 3.474 | Train PPL:  32.258
	 Val. Loss: 2.957 |  Val. PPL:  19.234
Epoch: 04 | Time: 1m 22s
	Train Loss: 2.923 | Train PPL:  18.605
	 Val. Loss: 2.758 |  Val. PPL:  15.773
Epoch: 05 | Time: 1m 22s
	Train Loss: 2.537 | Train PPL:  12.642
	 Val. Loss: 2.504 |  Val. PPL:  12.227
Epoch: 06 | Time: 1m 22s
	Train Loss: 2.226 | Train PPL:   9.263
	 Val. Loss: 2.408 |  Val. PPL:  11.112
Epoch: 07 | Time: 1m 23s
	Train Loss: 2.002 | Train PPL:   7.402
	 Val. Loss: 2.448 |  Val. PPL:  11.567
Epoch: 08 | Time: 1m 22s
	Train Loss: 1.778 | Train PPL:   5.919
	 Val. Loss: 2.463 |  Val. PPL:  11.738
Epoch: 09 | Time: 1m 22s
	Train Loss: 1.633 | Train PPL:   5.120
	 Val. Loss: 2.374 |  Val. PPL:  10.745
Epoch: 10 | Time: 1m 23s
	Train Loss: 1.503 | Train PPL

we test the model on the test set using these best parameters

In [89]:
model.load_state_dict(torch.load('tut3-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 2.375 | Test PPL:  10.752 |
