# **TUBES NLP**
### **Neural Machine Translation with Seq2Seq Architecture (Eng→Ina)**
### **Menggunakan Transformer**

#### Ruhiyah Faradishi Widiaputri
#### 13519034


# IMPORT NEEDED LIBRARIES

In [None]:
import json
import re
import random
import numpy as np

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from timeit import default_timer as timer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be accessed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# LOAD DATA

This NMT model is trained on the TED Multi (English–Indonesian) machine translation dataset from IndoNLG ([https://github.com/IndoNLP/indonlg](https://github.com/IndoNLP/indonlg)).

In [None]:
# Work from the dataset folder on Drive so the relative paths below resolve.
%cd /content/drive/My Drive/Tahun 4/NLP/tubes-mt/MT_TED_MULTI/
# Dataset splits (JSON files), relative to the folder above.
train_data_dir = "train_preprocess.json"
val_data_dir = 'valid_preprocess.json'
test_data_dir = 'test_preprocess.json'
# Path prefix for saved checkpoints; the training cell appends "_epoch:<n>.pt" to it.
trained_model_path = 'transformer/trained_model_transformer'

# Maximum number of space-separated tokens per sentence; pairs with a longer side are filtered out.
MAX_LENGTH = 15

/content/drive/My Drive/Tahun 4/NLP/tubes-mt/MT_TED_MULTI


In [None]:
# Reserved vocabulary indices: unknown word, padding, beginning-of-sentence, end-of-sentence.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

class Lang:
    """Vocabulary of one language: word <-> index lookups plus word frequencies.

    Indices 0-3 are reserved for the UNK, PAD, SOS and EOS symbols, so the
    first word that is added receives index 4.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # The four special symbols occupy the first slots of the reverse map.
        self.index2word = dict(enumerate(("UNK", "PAD", "SOS", "EOS")))
        # Next free index; starts after the special symbols.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

def normalize_string(s):
  """Lower-case `s`, split off sentence punctuation and drop everything else.

  Steps: lower-case; put a space before each '.', '!' or '?'; replace every
  run of characters other than ASCII letters and those three marks (spaces
  included) with a single space; strip the ends.

  The final strip matters because callers tokenise with ``split(' ')``:
  a leading or trailing space would otherwise produce an empty-string token
  that is counted as a word and added to the vocabulary.
  """
  s = s.lower()
  s = re.sub(r"([.!?])", r" \1", s)
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  return s.strip()

def filterPair(p):
    """Return True when both sentences of pair `p` have at most MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces; `p[0]` is checked first
    and `p[1]` is only examined when `p[0]` is short enough.
    """
    if len(p[0].split(' ')) > MAX_LENGTH:
        return False
    return len(p[1].split(' ')) <= MAX_LENGTH

def filterPairs(pairs):
    """Return a new list with only the pairs accepted by `filterPair`, in order."""
    return list(filter(filterPair, pairs))

def load_data(filename):
  """Read a JSON dataset file and return normalised ``[text, label]`` pairs.

  The file must contain a JSON array of objects with "text" and "label"
  keys; both values are passed through `normalize_string`.

  Args:
    filename: path of the JSON file to read.

  Returns:
    A list of two-element lists ``[normalised text, normalised label]``,
    in file order.
  """
  # The context manager closes the file even if parsing fails.
  with open(filename, encoding="utf-8") as f:
    json_data = json.load(f)

  return [
      [normalize_string(record["text"]), normalize_string(record["label"])]
      for record in json_data
  ]

# Vocabularies for the source (English) and target (Indonesian) language
input_lang = Lang("en")
output_lang = Lang("ina")

# Load and normalise the training pairs
train_data = load_data(train_data_dir)

# Report how many sentence pairs were read
print("Read %s sentence pairs" % len(train_data))

# Keep only pairs whose two sentences have at most MAX_LENGTH tokens each
train_data = filterPairs(train_data)
print("Trimmed to %s sentence pairs" % len(train_data))
  
# Build both vocabularies from the filtered training pairs only
for tr in train_data:
  input_lang.addSentence(tr[0])
  output_lang.addSentence(tr[1])

print("Counted words:")
print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)

Read 87406 sentence pairs
Trimmed to 40063 sentence pairs
Counted words:
en 17727
ina 16808


In [None]:
# Load and normalise the validation pairs
# (the vocabularies are not extended here; they come from the training data only)
val_data = load_data(val_data_dir)

# Report how many sentence pairs were read
print("Read %s sentence pairs" % len(val_data))

# Keep only pairs whose two sentences have at most MAX_LENGTH tokens each
val_data = filterPairs(val_data)
print("Trimmed to %s sentence pairs" % len(val_data))

Read 2677 sentence pairs
Trimmed to 1304 sentence pairs


# SEQ2SEQ MODEL DENGAN TRANSFORMERS

In [None]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    For position ``pos`` and pair index ``i``:

        PE(pos, 2i)   = sin(pos / 10000^(2i / emb_size))
        PE(pos, 2i+1) = cos(pos / 10000^(2i / emb_size))

    The table is stored in the ``pos_encoding`` buffer with shape
    ``(maxlen, 1, emb_size)``. It is saved with the module but not trained.

    Args:
        emb_size: embedding dimension (even or odd).
        dropout: dropout probability applied after the positions are added.
        maxlen: number of positions precomputed in the table.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()

        self.dropout = nn.Dropout(dropout)

        # den[i] = 1 / 10000^(2i / emb_size): one frequency per sin/cos pair
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)

        # Column vector of positions 0 .. maxlen-1
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)

        # angles[pos, i] = pos * den[i], shape (maxlen, ceil(emb_size / 2))
        angles = pos * den

        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one cosine column fewer than sine
        # columns, so the last frequency is only used for the sine part.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])

        # Buffer: stored in the state dict and moved with .to(), no gradient.
        # The middle axis of size 1 broadcasts over the batch dimension.
        self.register_buffer("pos_encoding", pos_embedding.unsqueeze(1))

    def forward(self, token_embedding: Tensor):
        """Add the encodings of positions ``0 .. L-1`` and apply dropout.

        Args:
            token_embedding: sequence-first tensor of shape
                ``(L, batch, emb_size)``. The table is sliced by ``size(0)``,
                so dimension 0 must be the sequence axis.

        Returns:
            A tensor with the same shape as ``token_embedding``.
        """
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])

In [None]:
# Seq2Seq Network 
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built on ``torch.nn.Transformer``.

    Token ids are embedded, scaled by ``sqrt(emb_size)``, given sinusoidal
    position information, passed through the Transformer and projected onto
    the target vocabulary.

    Args:
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        emb_size: embedding / model dimension (``d_model``).
        nhead: number of attention heads.
        src_vocab_size: size of the source vocabulary.
        tgt_vocab_size: size of the target vocabulary (output dimension).
        dim_feedforward: hidden size of the feed-forward sub-layers.
        dropout: dropout probability.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()

        self.emb_size = emb_size

        # One positional-encoding module serves both source and target.
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

        # A single embedding table is looked up with source AND target ids,
        # so it has to cover the larger of the two vocabularies; otherwise a
        # target id beyond the source vocabulary raises an index error.
        # NOTE(review): the two vocabularies are independent, so one row
        # serves unrelated source/target words. Separate tables would be
        # cleaner but would change the checkpoint layout.
        self.embedding = nn.Embedding(max(src_vocab_size, tgt_vocab_size), emb_size)

        # Transformer provided by PyTorch (sequence-first tensors)
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)

        # Projects decoder outputs onto target-vocabulary logits
        self.out = nn.Linear(emb_size, tgt_vocab_size)

    def forward(self,
                src: Tensor,
                tgt: Tensor,
                tgt_mask=None,
                src_padding_mask= None,
                tgt_padding_mask= None):
        """Compute target-vocabulary logits.

        Args:
            src: source token ids, shape ``(batch, src_len)``.
            tgt: decoder input token ids, shape ``(batch, tgt_len)``.
            tgt_mask: optional attention mask for the decoder self-attention
                (e.g. from `get_tgt_mask`). A mask larger than ``tgt_len`` is
                cropped to its top-left ``tgt_len x tgt_len`` block.
            src_padding_mask: optional ``(batch, src_len)`` boolean mask,
                True at padding positions of ``src``.
            tgt_padding_mask: optional ``(batch, tgt_len)`` boolean mask,
                True at padding positions of ``tgt``.

        Returns:
            Logits of shape ``(tgt_len, batch, tgt_vocab_size)``.
        """
        scale = math.sqrt(self.emb_size)

        # The positional encoding and nn.Transformer both treat dimension 0
        # as the sequence axis, so switch to (seq_len, batch, emb_size)
        # BEFORE adding positions. Doing it afterwards would index the
        # position table by batch row instead of by token position.
        src_emb = self.positional_encoding(self.embedding(src).permute(1, 0, 2) * scale)
        tgt_emb = self.positional_encoding(self.embedding(tgt).permute(1, 0, 2) * scale)

        if tgt_mask is not None:
            tgt_len = tgt_emb.size(0)
            # The training/validation loops in this notebook build the mask
            # one step longer than the decoder input. The top-left block of a
            # causal mask is the causal mask of the shorter length, so crop
            # it rather than fail on the size mismatch.
            if tgt_mask.size(-1) > tgt_len:
                tgt_mask = tgt_mask[..., :tgt_len, :tgt_len]

        # Keyword arguments are essential: nn.Transformer.forward takes
        # (src, tgt, src_mask, tgt_mask, memory_mask, src_key_padding_mask,
        # tgt_key_padding_mask, memory_key_padding_mask). Passing the masks
        # positionally put each one in the wrong slot and left the decoder
        # without a causal mask.
        outs = self.transformer(src_emb,
                                tgt_emb,
                                tgt_mask=tgt_mask,
                                src_key_padding_mask=src_padding_mask,
                                tgt_key_padding_mask=tgt_padding_mask,
                                memory_key_padding_mask=src_padding_mask)

        return self.out(outs)

    def get_tgt_mask(self, size) -> Tensor:
        """Return a ``size x size`` causal mask.

        Entry ``[i, j]`` is 0.0 for ``j <= i`` (visible) and ``-inf`` for
        ``j > i`` (future position, blocked).
        """
        return torch.triu(torch.full((size, size), float('-inf')), diagonal=1)

    def create_pad_mask(self, matrix: Tensor, pad_token: int) -> Tensor:
        """Return a boolean tensor that is True where ``matrix == pad_token``."""
        return (matrix == pad_token)

In [None]:
# Model hyperparameters; the seed is fixed before any weights are created
torch.manual_seed(0)

SRC_VOCAB_SIZE = input_lang.n_words
TGT_VOCAB_SIZE = output_lang.n_words
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

In [None]:
# Define the model, the loss function and the optimizer
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# Xavier-initialise every parameter with more than one dimension (weight matrices)
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

# Every target is padded with PAD_IDX up to a fixed length, so most target
# positions of a short sentence are padding. Ignoring them keeps the loss
# (and the gradients) about real tokens instead of about predicting PAD.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# TRAINING

In [None]:
# mengambil indeks dari setiap kata di sentence --> hasilnya list of indeks kata
def arrFromSentence(lang, sentence):
    """Encode `sentence` as a fixed-length array of vocabulary indices.

    Layout (length ``MAX_LENGTH + 2``):
    ``[BOS, w_1, ..., w_n, EOS, PAD, ..., PAD]``.

    Args:
        lang: vocabulary object exposing a ``word2index`` dict.
        sentence: text whose tokens are separated by single spaces.

    Returns:
        A float64 NumPy array of length ``MAX_LENGTH + 2``. Words missing
        from ``lang.word2index`` are encoded as UNK_IDX; sentences longer
        than MAX_LENGTH tokens are truncated so BOS and EOS always fit.
    """
    words = sentence.split(' ')[:MAX_LENGTH]

    res = np.full(MAX_LENGTH + 2, PAD_IDX, dtype=np.float64)
    res[0] = BOS_IDX
    for position, word in enumerate(words, start=1):
        res[position] = lang.word2index.get(word, UNK_IDX)

    # Words occupy slots 1..n, so EOS goes in slot n + 1.
    # (Writing it to slot n overwrote the last word of every sentence.)
    res[len(words) + 1] = EOS_IDX

    return res

def tensorFromSentence(lang, sentence):
  """Encode `sentence` with `arrFromSentence` and wrap the array in a tensor."""
  encoded = arrFromSentence(lang, sentence)
  return torch.from_numpy(encoded)

# One-element arrays holding the start-of-sentence (2) and end-of-sentence (3) indices.
SOS_token = np.array([2])
EOS_token = np.array([3])

#UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

# mengembalikan tuple (input, target)
def arrFromPair(pair):
    """Encode a ``[source, target]`` sentence pair.

    Returns a two-element list: the source encoded with `input_lang`
    followed by the target encoded with `output_lang`.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    encoded_source = arrFromSentence(input_lang, source_sentence)
    encoded_target = arrFromSentence(output_lang, target_sentence)
    return [encoded_source, encoded_target]

def batchify_data(data, padding=False, padding_token=-1, batch_size=None):
    """Split `data` into consecutive full batches of int64 arrays.

    Args:
        data: sequence of samples; each batch is built with ``np.array`` from
            a slice of it.
        padding: when True, every sample of a batch is right-padded with
            `padding_token` to the length of the longest sample in that batch.
        padding_token: value used for padding.
        batch_size: samples per batch; defaults to the global BATCH_SIZE.

    Returns:
        A list of ``np.int64`` arrays, one per batch. A trailing group with
        fewer than `batch_size` samples is dropped; a final group of exactly
        `batch_size` samples is kept. `data` is never modified.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    batches = []
    # The range stops at the last index from which a complete batch still
    # fits, including the case where len(data) is an exact multiple.
    for idx in range(0, len(data) - batch_size + 1, batch_size):
        chunk = data[idx : idx + batch_size]

        if padding:
            # Pad to the longest sample of THIS batch, building new lists so
            # the caller's data is left untouched.
            max_batch_length = max(len(seq) for seq in chunk)
            chunk = [
                list(seq) + [padding_token] * (max_batch_length - len(seq))
                for seq in chunk
            ]

        batches.append(np.array(chunk).astype(np.int64))

    print(f"{len(batches)} batches of size {batch_size}")

    return batches

In [None]:
# Training set: encode every pair as index arrays, then group them into batches
training_pairs = [arrFromPair(td) for td in train_data]
train_dataloader = batchify_data(training_pairs)

# Validation set: same encoding and batching
val_pairs = [arrFromPair(td) for td in val_data]
val_dataloader = batchify_data(val_pairs)

312 batches of size 128
10 batches of size 128


In [None]:
def train_loop(model, opt, loss_fn, dataloader):
    """Train `model` for one pass over `dataloader`.

    Each batch is an array indexed as ``batch[:, 0]`` (source ids) and
    ``batch[:, 1]`` (target ids). The decoder is fed the target without its
    last token and is scored against the target without its first token.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        source_ids = torch.tensor(batch[:, 0]).to(DEVICE)
        target_ids = torch.tensor(batch[:, 1]).to(DEVICE)

        # Teacher forcing: the input drops the last token, the expected output drops the first
        decoder_input = target_ids[:, :-1]
        decoder_expected = target_ids[:, 1:]

        # NOTE(review): the mask is sized decoder_input length + 1, i.e. the
        # full length of target_ids rather than of decoder_input. Confirm this
        # matches how the model's forward consumes the mask.
        mask_size = decoder_input.size(1) + 1
        tgt_mask = model.get_tgt_mask(mask_size).to(DEVICE)

        logits = model(source_ids, decoder_input, tgt_mask)

        # Reorder the axes with (1, 2, 0) before scoring against decoder_expected
        logits = logits.permute(1, 2, 0)
        batch_loss = loss_fn(logits, decoder_expected)

        opt.zero_grad()
        batch_loss.backward()
        opt.step()

        running_loss += batch_loss.detach().item()

    return running_loss / len(dataloader)

In [None]:
def validation_loop(model, loss_fn, dataloader):
    """Evaluate `model` on `dataloader` without updating any weights.

    Batches are read the same way as in training: ``batch[:, 0]`` holds the
    source ids and ``batch[:, 1]`` the target ids. The model is switched to
    eval mode and gradients are disabled.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            source_ids = torch.tensor(batch[:, 0], dtype=torch.long, device=DEVICE)
            target_ids = torch.tensor(batch[:, 1], dtype=torch.long, device=DEVICE)

            # The input drops the last token, the expected output drops the first
            decoder_input = target_ids[:, :-1]
            decoder_expected = target_ids[:, 1:]

            # NOTE(review): mask sized decoder_input length + 1, as in train_loop.
            # Confirm this matches how the model's forward consumes the mask.
            mask_size = decoder_input.size(1) + 1
            tgt_mask = model.get_tgt_mask(mask_size).to(DEVICE)

            logits = model(source_ids, decoder_input, tgt_mask)

            # Reorder the axes with (1, 2, 0) before scoring against decoder_expected
            logits = logits.permute(1, 2, 0)
            batch_loss = loss_fn(logits, decoder_expected)
            running_loss += batch_loss.detach().item()

    return running_loss / len(dataloader)

In [None]:
# Number of passes over the training data
NUM_EPOCHS = 6

def fit(model, opt, loss_fn, train_dataloader, val_dataloader, epochs):
    """Train and validate `model` for `epochs` epochs, saving a checkpoint after each.

    After every epoch the model's state dict is written to
    ``f"{trained_model_path}_epoch:{epoch}.pt"``, where ``epoch`` is the
    zero-based epoch index.

    Args:
        model: the network to train.
        opt: optimizer updating `model`'s parameters.
        loss_fn: loss function passed to the training and validation loops.
        train_dataloader: batches used for training.
        val_dataloader: batches used for validation.
        epochs: number of epochs to run.

    Returns:
        ``(train_loss_list, validation_loss_list)``: one mean loss per epoch each.
    """
    import os

    # torch.save does not create missing folders. Create the checkpoint
    # folder up front so a whole epoch is not lost to a failed save.
    checkpoint_dir = os.path.dirname(trained_model_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Used for plotting later on
    train_loss_list, validation_loss_list = [], []

    print("Training and validating model")
    for epoch in range(epochs):
        print("-"*25, f"Epoch {epoch + 1}","-"*25)

        train_loss = train_loop(model, opt, loss_fn, train_dataloader)
        train_loss_list += [train_loss]

        validation_loss = validation_loop(model, loss_fn, val_dataloader)
        validation_loss_list += [validation_loss]

        print(f"Training loss: {train_loss:.4f}")
        print(f"Validation loss: {validation_loss:.4f}")
        print()

        # Save a checkpoint for this epoch
        torch.save(model.state_dict(), f"{trained_model_path}_epoch:{epoch}.pt")

    return train_loss_list, validation_loss_list
    
# Run training; keeps the per-epoch training and validation losses
train_loss_list, validation_loss_list = fit(transformer, optimizer, loss_fn, train_dataloader, val_dataloader, NUM_EPOCHS)

Training and validating model
------------------------- Epoch 1 -------------------------
Training loss: 2.8021
Validation loss: 2.5928

------------------------- Epoch 2 -------------------------
Training loss: 2.4014
Validation loss: 2.3377

------------------------- Epoch 3 -------------------------
Training loss: 2.1299
Validation loss: 2.1552

------------------------- Epoch 4 -------------------------
Training loss: 1.9162
Validation loss: 2.0075

------------------------- Epoch 5 -------------------------


# LOAD THE MODEL

In [None]:
# Rebuild the architecture with the same hyperparameters used for training
transformer_trained = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# Restore the weights saved after the epoch with index 3 (the fourth epoch).
# map_location lets a checkpoint written on a GPU runtime load on a CPU-only one as well.
transformer_trained.load_state_dict(
    torch.load(f"{trained_model_path}_epoch:3.pt", map_location=DEVICE))
transformer_trained.eval()

Seq2SeqTransformer(
  (positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (embedding): Embedding(17727, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj