# <div align="center">Sequence to Sequence + Attention</div>

---
###### Author     : alvinwatner
###### Inspired by : Aladdin Persson (https://www.youtube.com/watch?v=EoGUlvhRYpk&t=2146s)
---

## To reproduce:

1.   Make sure **utils.py** is in your Google Drive. If you don't have it yet, there are two ways to get it:

  *   Open and Run the [utils.ipynb](https://colab.research.google.com/drive/1tkh9NyoKLfl4gk8L3_JKVGfCMIXDLy37?usp=sharing).
  *   Download from this [link](https://drive.google.com/file/d/18RMp42VvfMSdUfxxYKR_1QiA5fWcbgus/view?usp=sharing).

  Also make sure that **utils.py** is inside the **main_path** directory, which is specified in the *second cell*.


2. Create "**Checkpoint**" folder inside the **main_path**.
3. Create a "**plot**" folder inside the **main_path**.






<br/>

---

###### *Note: Don't forget to change the runtime type to **GPU** before pressing **CTRL + F9** to run all cells; otherwise training will be very slow. Enjoy~*

In [2]:
# Attach Google Drive to the Colab runtime under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Root folder on Google Drive. The training loop writes checkpoints to its
# "Checkpoint/" subfolder and loss curves to its "plot/" subfolder.
# The trailing slash matters: paths are built as f"{main_path}Checkpoint/...".
main_path = '/content/drive/MyDrive/Colab Notebooks/NLP/Pytorch/Seq2seq/'

In [4]:
# Change the working directory to the folder that contains utils.py so the
# `from utils import ...` cell below can find it.
# NOTE(review): this repeats the literal path stored in main_path; keep the two in sync.
%cd /content/drive/MyDrive/Colab Notebooks/NLP/Pytorch/Seq2seq

/content/drive/MyDrive/Colab Notebooks/NLP/Pytorch/Seq2seq


In [5]:
# Sanity check: list utils.py to confirm it is present in the current directory.
!ls -l utils.py

-rw------- 1 root root 2415 Dec 23 03:36 utils.py


In [6]:
#import utils.py as module
from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import numpy as np 
import random
import spacy
import matplotlib.pyplot as plt

import os

In [8]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

print(f"device = {device}")

device = cuda


In [None]:
# Install torchtext 0.6.0 and download the spaCy 'de' and 'en' models that the
# next cell loads with spacy.load.
# NOTE(review): this cell sits after the cell that imports torchtext; presumably
# it has to be run first on a fresh runtime. Confirm.
!pip install torchtext==0.6.0 
!python -m spacy download de #Language: German
!python -m spacy download en #Language: English

In [9]:
# Load the German and English spaCy pipelines used by the tokenizer functions below.
spacy_ger, spacy_en = spacy.load('de'), spacy.load('en')

In [10]:
def tokenizer_ger(text):
  """Split a German sentence into a list of token strings using `spacy_ger`."""
  tokens = []
  for token in spacy_ger.tokenizer(text):
    tokens.append(token.text)
  return tokens

def tokenizer_en(text):
  """Split an English sentence into a list of token strings using `spacy_en`."""
  tokens = []
  for token in spacy_en.tokenizer(text):
    tokens.append(token.text)
  return tokens

In [11]:
# Text-processing pipelines for the source (German) and target (English) side.
# Both lowercase the text and wrap every sentence in <SOS> ... <EOS>.
gerField = Field(
    tokenize   = tokenizer_ger,
    lower      = True,
    init_token = '<SOS>',
    eos_token  = '<EOS>',
)
enField = Field(
    tokenize   = tokenizer_en,
    lower      = True,
    init_token = '<SOS>',
    eos_token  = '<EOS>',
)

In [12]:
# Multi30k German -> English splits; `.de` files are processed with gerField
# and `.en` files with enField.
train_data, valid_data, test_data = Multi30k.splits(
    exts   = ('.de', '.en'),
    fields = (gerField, enField),
)

In [13]:
# Build both vocabularies from the training split only, with identical limits.
for field in (gerField, enField):
  field.build_vocab(train_data, max_size = 10000, min_freq = 2)

In [14]:
# Vocabulary sizes (encoder reads German, decoder emits English)
encoder_input_size = len(gerField.vocab)
decoder_input_size = len(enField.vocab)

# Model architecture
embedding_dim = 300
hidden_size   = 1024
num_layers    = 2
dropout       = 0.5

# Training schedule
batch_size = 32
num_epochs = 10

In [15]:
def _source_length(example):
  """Sort key for BucketIterator: number of tokens in the source sentence."""
  return len(example.src)

# One iterator per split; batches are placed directly on `device`.
data_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    data_splits,
    batch_size        = batch_size,
    sort_within_batch = True,
    sort_key          = _source_length,
    device            = device,
)

In [16]:
# Encoder version 1.5
class Encoder(nn.Module):
  """Multi-layer (bi)directional LSTM encoder.

  Args:
    input_size      : size of the source vocabulary.
    embedding_dim   : dimension of the token embeddings.
    hidden_size     : LSTM hidden size (per direction).
    num_layers      : number of stacked LSTM layers.
    dropout         : dropout probability for the embeddings and between LSTM layers.
    isBidirectional : run the LSTM in both directions (default True).

  forward(inp) takes `inp` of shape (seq_length, batch_size) and returns
    hj     : (seq_length, batch_size, hidden_size)  per-position encoder outputs
    hidden : (num_layers, batch_size, hidden_size)  initial decoder hidden state
    cell   : (num_layers, batch_size, hidden_size)  initial decoder cell state
  """

  def __init__(self, input_size, embedding_dim, hidden_size, num_layers, dropout, isBidirectional = True):
    super().__init__()
    self.num_direction = 2 if isBidirectional else 1
    self.num_layers = num_layers

    self.dropout = nn.Dropout(dropout)
    self.embedding = nn.Embedding(input_size, embedding_dim)
    # Honour the constructor flag; the projections below are sized from
    # num_direction, so the LSTM must agree with it.
    self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers,
                       bidirectional = isBidirectional, dropout = dropout)
    self.fc_hc = nn.Linear(hidden_size * self.num_direction, hidden_size)
    self.fc_out = nn.Linear(hidden_size * self.num_direction, hidden_size)

  def _merge_directions(self, state):
    """Concatenate the per-direction states of each layer along the feature axis.

    state : (num_layers * num_direction, batch_size, hidden_size)
    returns (num_layers, batch_size, hidden_size * num_direction)
    """
    batch_size  = state.shape[1]
    hidden_size = state.shape[2]

    # The first axis is laid out as (layer, direction). Split it explicitly and
    # move `direction` next to the feature axis before flattening. A plain
    # .view(num_layers, batch, hidden * num_direction) would instead glue
    # together the states of *different batch elements*.
    state = state.view(self.num_layers, self.num_direction, batch_size, hidden_size)
    state = state.permute(0, 2, 1, 3)
    return state.reshape(self.num_layers, batch_size, hidden_size * self.num_direction)

  def forward(self, inp):
    #inp.shape = (seq_length, batch_size)

    embedded = self.dropout(self.embedding(inp))
    #embedded.shape = (seq_length, batch_size, embedding_dim)

    hj, (hidden, cell) = self.rnn(embedded)
    #hj           = (seq_length, batch_size, hidden_size * num_direction)
    #hidden.shape = (num_layers * num_direction, batch_size, hidden_size)
    #cell.shape   = (num_layers * num_direction, batch_size, hidden_size)

    hj = self.fc_out(hj)
    #hj.shape = (seq_length, batch_size, hidden_size)

    hidden = self._merge_directions(hidden)
    cell   = self._merge_directions(cell)
    #hidden.shape = (num_layers, batch_size, hidden_size * num_direction)
    #cell.shape   = (num_layers, batch_size, hidden_size * num_direction)

    # The same projection is shared by the hidden and the cell state.
    hidden = self.fc_hc(hidden)
    cell   = self.fc_hc(cell)
    #hidden.shape = (num_layers, batch_size, hidden_size)
    #cell.shape   = (num_layers, batch_size, hidden_size)

    return hj, hidden, cell

In [17]:
# Decoder version 1.5
class Decoder(nn.Module):
  def __init__(self, input_size, embedding_dim, hidden_size, num_layers, dropout):
    super().__init__()
    vocab_size      = input_size
    self.num_layers = num_layers
    self.dropout    = nn.Dropout(dropout)
    self.rnn        = nn.LSTM(embedding_dim, hidden_size, num_layers, dropout = dropout)
    self.embedding  = nn.Embedding(input_size, embedding_dim)
    self.fc_decoder = nn.Linear(hidden_size, vocab_size)
    self.fc_energy  = nn.Linear(hidden_size * num_layers, 1)
    self.fc_si      = nn.Linear(hidden_size * num_layers, hidden_size)
    self.softmax    = nn.Softmax(dim = 0)

  '''hj are the encoder output'''
  '''s_i_minOne is the decoder hidden_state at time step i_minONE'''
  def alignment(self, s_i_minOne, hj):
    #hj.shape         = (seq_length, batch_size, hidden_size)
    #s_i_minOne.shape = (num_layers, batch_size, hidden_size)
    
    seq_length  = hj.shape[0] 
    
    num_layers  = s_i_minOne.shape[0]
    batch_size  = s_i_minOne.shape[1]
    hidden_size = s_i_minOne.shape[2]
    
    s_i_minOne = s_i_minOne.view(1, batch_size, hidden_size * num_layers)
    #s_i_minOne.shape = (1, batch_size, hidden_size * num_layers)
    s_i_minOne = self.fc_si(s_i_minOne)
    #s_i_minOne.shape = (1, batch_size, hidden_size)
    
    s_i_minOne = s_i_minOne.repeat(seq_length, 1, 1)
    #s_i_minOne.shape = (seq_length, batch_size, hidden_size)

    energy = torch.cat((s_i_minOne, hj), dim = 2)
    #energy.shape = (seq_length, batch_size, hidden_size * 2)
    energy = self.fc_energy(energy)
    #energy.shape = (seq_length, batch_size, 1)
    return energy
  
  def context_ops(self, energy, h_j):
    '''c = ∑ α.h'''
    α = self.softmax(energy)
    # α.shape = (seq_length, batch_size, 1)
    # h_j     = (seq_length, batch_size, hidden_size) 

    seq_length  = α.shape[0]
    batch_size  = α.shape[1]
    hidden_size = h_j.shape[2]

    context_vector = torch.empty(seq_length, batch_size, hidden_size).to(device)

    ''' c = ∑ α.h Linear combination between α_ij * h_j where, α_ij = scalar and h_j = vector with size = 'hidden_size'  '''
    for j in range(seq_length):
      for batch_idx in range(batch_size):
        context_vector[j, batch_idx] = α[j, batch_idx] * h_j[j, batch_idx]

    '''
    = iteration sample =
    /word-1 ; sentence-1/ probability = 0.7  *  /word-1 ; sentence-1/ hidden_size = h1_1 |  
    /word-1 ; sentence-2/ probability = 0.2  *  /word-1 ; sentence-2/ hidden_size = h1_2 | => Where h_ij = vector with size 'hidden_size'    
    /word-1 ; sentence-3/ probability = 0.1  *  /word-1 ; sentence-3/ hidden_size = h1_3 |   
    '''    

    context_vector = torch.sum(context_vector, dim = 0)
    #context_vector.shape = (batch_size, hidden_size)
    context_vector = context_vector.unsqueeze(0)
    #context_vector.shape = (1, batch_size, hidden_size)
    context_vector = context_vector.repeat(2, 1, 1)
    #context_vector.shape = (2, batch_size, hidden_size) 
    
    return context_vector
  
  # si_negOne.shape = (2, 32, 1024)
  # cell.shape      = (2, 32, 1024)
  def forward(self, hj, s_i_minOne, cell, inp):
    #inp.shape = (batch_size,)
    inp = inp.unsqueeze(0)
    #inp.shape = (1, batch_size)
    embedded = self.dropout(self.embedding(inp))
    #embedding.shape =  (1, batch_size, embedding_dim)
    energy = self.alignment(s_i_minOne, hj)
    #energy.shape = (seq_length, batch_size, 1)
    
    context_vector = self.context_ops(energy, hj)
    #context_vector.shape = (2, batch_size, hidden_size)

    de_out, (hidden, cell) = self.rnn(embedded, (context_vector, cell))

    #de_out.shape = (1, batch_size, hidden_size)
    #hidden.shape = (2, batch_size, hidden_size)
    #cell.shape   = (2, batch_size, hidden_size)

    de_out = self.fc_decoder(de_out)
    #de_out.shape = (1, batch_size, vocab_size)
    de_out = de_out.squeeze(0) 
    #de_out.shape = (batch_size, vocab_size)

    return de_out, hidden, cell

In [18]:
class Seq2Seq(nn.Module):
  """Ties an encoder and an attention decoder together for training.

  Args:
    encoder : module mapping `source` to (hj, hidden, cell).
    decoder : module called as decoder(hj, hidden, cell, inp) and returning
              (logits, hidden, cell); it must expose `fc_decoder`, the final
              linear layer whose `out_features` is the target vocabulary size.
  """

  def __init__(self, encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, tfr = 0.5):
    """Decode `target` step by step.

    source : (source_len, batch_size) token ids
    target : (target_len, batch_size) token ids, target[0] being <SOS>
    tfr    : teacher-forcing ratio, the probability of feeding the ground-truth
             token (rather than the model's own prediction) as the next input.
             1.0 always teacher-forces, 0.0 never does.
    returns logits of shape (target_len, batch_size, vocab_size); row 0 is
    left at zero because nothing is predicted for <SOS>.
    """
    target_len = target.shape[0]
    batch_size = target.shape[1]

    hj, s_i_minOne, cell = self.encoder(source)
    #hj.shape         = (seq_length, batch_size, hidden_size)
    #s_i_minOne.shape = (num_layers, batch_size, hidden_size)
    #cell.shape       = (num_layers, batch_size, hidden_size)

    # Take the output size and device from the model / inputs instead of
    # notebook-level globals, so the module works on its own.
    vocab_size = self.decoder.fc_decoder.out_features
    outputs = torch.zeros(target_len, batch_size, vocab_size, device = target.device)

    # <SOS>
    inp = target[0]
    # inp.shape = (batch_size,)

    for t in range(1, target_len):
      output, s_i_minOne, cell = self.decoder(hj, s_i_minOne, cell, inp)
      #output.shape     = (batch_size, vocab_size)
      #s_i_minOne.shape = (num_layers, batch_size, hidden_size)
      #cell.shape       = (num_layers, batch_size, hidden_size)

      outputs[t] = output

      best_guess = output.argmax(1)
      # Teacher-force with probability `tfr`.
      inp = target[t] if random.random() < tfr else best_guess

    return outputs
      

In [19]:
# Build the encoder and decoder from the shared hyperparameters, then wrap them
# in the Seq2Seq container; everything is moved to `device`.
encoder = Encoder(encoder_input_size, embedding_dim, hidden_size, num_layers, dropout)
decoder = Decoder(decoder_input_size, embedding_dim, hidden_size, num_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)

seq2seq = Seq2Seq(encoder, decoder)
seq2seq = seq2seq.to(device)

In [20]:
# Adam over all model parameters.
optimizer = optim.Adam(seq2seq.parameters(), lr = 0.001)

# Cross-entropy that skips target positions holding the <pad> token.
pad_idx   = enField.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [21]:
# Loss history: one average-loss value and one epoch index appended per epoch.
plot_ = dict(avg_loss = [], epoch = [])

In [22]:
# Set to True to resume from a checkpoint previously written to
# "<main_path>Checkpoint/"; adjust model_name / current_epoch to the file to load.
load_model = False

if load_model:
  model_name    = 'AlvinSeq2Seq_attn'
  current_epoch = 0
  # Same path layout as the one used when saving in the training loop
  # (main_path already ends with a slash).
  path = f"{main_path}Checkpoint/|Model_{model_name}|Epoch_{current_epoch}.pth.tar"
  # map_location lets a checkpoint saved on GPU be restored on a CPU-only
  # runtime (and vice versa).
  load_checkpoint(torch.load(path, map_location = device), seq2seq, optimizer)
else:
  model_name = 'Seq2SeqAttn'
  current_epoch = 0

In [23]:
import datetime
import pytz

# Time zone used for the timestamps taken at the start and end of training.
time_zone = pytz.timezone('Asia/Jakarta')

def delta_time(start_time, stop_time):
  """Return the elapsed time between two datetimes as whole (hours, minutes, seconds).

  Hours are not capped at 24, and fractional seconds are truncated.
  """
  elapsed_seconds = (stop_time - start_time).total_seconds()

  whole_minutes, seconds = divmod(elapsed_seconds, 60)
  hours, minutes         = divmod(whole_minutes, 60)

  return int(hours), int(minutes), int(seconds)

In [None]:
# Training loop.
# For every epoch: run one pass over train_iterator, record the average loss,
# write a checkpoint plus the loss history to Drive, and delete the files
# written by the previous epoch so only the latest ones are kept.
start_time = datetime.datetime.now(time_zone)

for epoch in range(num_epochs):
  epoch_loss     = 0
  iteration_loss = [] 

  # Make sure dropout is active even if an earlier cell left the model in eval mode.
  seq2seq.train()

  for batch_idx, batch in enumerate(train_iterator):
    source = batch.src.to(device)
    target = batch.trg.to(device)

    outputs = seq2seq(source, target)
    # Drop the <SOS> position and flatten (time, batch) so that
    # CrossEntropyLoss sees (N, vocab_size) logits against (N,) targets.
    outputs = outputs[1:].reshape(-1, outputs.shape[2])
    ground_Truth = target[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(outputs, ground_Truth)
    loss.backward()
    # In-place gradient clipping; `clip_grad_norm` (no underscore) is deprecated.
    torch.nn.utils.clip_grad_norm_(seq2seq.parameters(), max_norm = 1)
    optimizer.step()
    loss_val = loss.item()
    iteration_loss.append(loss_val)

    epoch_loss += loss_val

    # Average loss for the last 100 iteration 
    if not batch_idx % 100 and batch_idx != 0:
      print(f"EPOCH : {epoch}/{num_epochs - 1} || Batch Iteration : {batch_idx}/{len(train_iterator)} || Avg_Loss from {batch_idx - 100} to {batch_idx} : {sum(iteration_loss[-100:])/100}") 
  
  
  avg_loss = epoch_loss/len(train_iterator)
  plot_['avg_loss'].append(avg_loss)
  plot_['epoch'].append(epoch)

  # Save Current Checkpoint and Plot
  checkpoint = {'state_dict' : seq2seq.state_dict(), 'optimizer': optimizer.state_dict()}
  save_checkpoint(checkpoint, 
                  filename=f"{main_path}Checkpoint/|Model_{model_name}|Epoch_{current_epoch}.pth.tar")  
  
  # np.save appends ".npy" to these names, which is why the removal below
  # spells the extension out.
  np.save(f"{main_path}plot/AvgLoss_Model_{model_name}|Epoch_{current_epoch}", \
                              np.array(plot_['avg_loss']))  
  np.save(f"{main_path}plot/Epoch_Model_{model_name}|Epoch_{current_epoch}", \
                              np.array(plot_['epoch']))  

  # Remove Previous Checkpoint and Plot
  # (skipped on the first epoch of this run, when nothing was written yet)
  if epoch > 0:
    os.remove(f"{main_path}Checkpoint/|Model_{model_name}|Epoch_{current_epoch - 1}.pth.tar")    
    os.remove(f"{main_path}plot/AvgLoss_Model_{model_name}|Epoch_{current_epoch - 1}.npy")
    os.remove(f"{main_path}plot/Epoch_Model_{model_name}|Epoch_{current_epoch - 1}.npy")

  if epoch == num_epochs - 1:
    stop_time = datetime.datetime.now(time_zone)
    hour, minute, second = delta_time(start_time, stop_time)

    print("")
    print(f" **************  TRAINING DONE {num_epochs} EPOCHS  ************")
    print(f" **********  {hour} Hours, {minute} Minutes, {second} Seconds  ********")        
    print(" ====================================================")  
    print(f"|              Average Loss : {avg_loss}     |")   
    print(" ====================================================")  

  else:
    print(f" ************** EPOCH - {epoch}/{num_epochs - 1} **************")
    print(f"|       Average Loss : {avg_loss}           |")   
    print(" ********************************************")         
    print("")

  # current_epoch keeps counting across resumed runs, unlike `epoch`.
  current_epoch += 1



EPOCH : 0/9 || Batch Iteration : 100/907 || Avg_Loss from 0 to 100 : 5.285993185043335
EPOCH : 0/9 || Batch Iteration : 200/907 || Avg_Loss from 100 to 200 : 4.8126041173934935
EPOCH : 0/9 || Batch Iteration : 300/907 || Avg_Loss from 200 to 300 : 4.6542918229103085
EPOCH : 0/9 || Batch Iteration : 400/907 || Avg_Loss from 300 to 400 : 4.5080348944664
EPOCH : 0/9 || Batch Iteration : 500/907 || Avg_Loss from 400 to 500 : 4.455879228115082
EPOCH : 0/9 || Batch Iteration : 600/907 || Avg_Loss from 500 to 600 : 4.315963315963745
EPOCH : 0/9 || Batch Iteration : 700/907 || Avg_Loss from 600 to 700 : 4.27121946811676
EPOCH : 0/9 || Batch Iteration : 800/907 || Avg_Loss from 700 to 800 : 4.2329288721084595
EPOCH : 0/9 || Batch Iteration : 900/907 || Avg_Loss from 800 to 900 : 4.177702927589417
=> Saving checkpoint
 ************** EPOCH - 0/9 **************
|       Average Loss : 4.525826414469979           |
 ********************************************

EPOCH : 1/9 || Batch Iteration : 100/

In [None]:
# Training curve: average loss per epoch
fig, ax = plt.subplots()
ax.plot(plot_['epoch'], plot_['avg_loss'])
ax.set_xlabel('epoch')
ax.set_ylabel('avg_loss')
plt.show()

In [None]:
# Persist the loss history next to the files written by the training loop.
# The loop increments current_epoch after each save, so the most recent files
# carry index current_epoch - 1.
last_epoch = max(current_epoch - 1, 0)
Avgloss_plot_path = f"{main_path}plot/AvgLoss_Model_{model_name}|Epoch_{last_epoch}"
Epoch_plot_path   = f"{main_path}plot/Epoch_Model_{model_name}|Epoch_{last_epoch}"

# np.save returns None, so its result must not be assigned back into plot_
# (that would wipe the in-memory history).
np.save(Avgloss_plot_path, np.array(plot_['avg_loss']))
np.save(Epoch_plot_path, np.array(plot_['epoch']))