# Transformer

### Sentence Tokenization
Dataset from: https://airesearch.in.th/releases/machine-translation-datasets/

In [1]:
import pandas as pd
import numpy as np
import math
import os
import glob

import torch
from torch import nn
import torch.nn.functional as F

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
START_TOKEN = ''
PADDING_TOKEN = ''
END_TOKEN = '<END>'

thai_character = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')',
                  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                  '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', '๐', ':', '<',
                  '=', '>', '[', '\\', ']', '^', '_', '`', '{', '|', '}',
                  'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ซ', 'ฌ',
                  'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ',	'ณ', 'ด',	'ต', 'ถ', 'ท', 'ธ',
                  'น', 'บ', 'ป', 'ผ',	'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย',	'ร', 'ล',
                  'ว', 'ศ',	'ษ', 'ส', 'ห', 'ฬ',	'อ', 'ฮ',
                  'ะ', 'ั', '็', 'า', 'ิ', 'ี', 'ุ', 'ู', 'เ', 'ใ', 'ไ', 'โ', '์',
                  '่',  '้', '๊', '๋', PADDING_TOKEN, END_TOKEN]

english_character = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')',
                    '*', '+', ',', '’', '-', '.', '/', ';',
                    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                    ':', '<', '=', '>', '?', '@',
                    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z',
                    '[', '\\', ']', '^', '_', '`', '{', '|', '}',
                    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                    'y', 'z',
                    PADDING_TOKEN, END_TOKEN]

In [3]:
index_to_thai = {k:v for k,v in enumerate(thai_character)}
thai_to_index = {v:k for k,v in enumerate(thai_character)}
index_to_english = {k:v for k,v in enumerate(english_character)}
english_to_index = {v:k for k,v in enumerate(english_character)}

In [4]:
path = '/content/drive/MyDrive/Colab Notebooks/Transformer/datasets/*.csv'
all_files = glob.glob(path)

def sample(file):
  nth_row = 1
  df = pd.read_csv(file)
  df['index'] = df.index
  df = df[df['index'] % nth_row == 0]
  del df['index']
  return df

df = pd.concat((sample(f) for f in all_files), ignore_index=True)

In [5]:
TOTAL_SENTENCES = 1000000
df = df[:TOTAL_SENTENCES]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   en_text  1000000 non-null  object
 1   th_text  1000000 non-null  object
dtypes: object(2)
memory usage: 15.3+ MB


In [6]:
# Max charactor in each sentence
display(max(df['en_text'].str.len()))
display(max(df['th_text'].str.len()))

3023

2218

In [7]:
# Percentile of length of charactor in each sentence
PERCENTILE = 85
display(np.percentile([df['en_text'].str.len()], PERCENTILE))
display(np.percentile([df['th_text'].str.len()], PERCENTILE))

210.0

195.0

In [8]:
max_sequence_length = 220

def is_valid_tokens(sentence, vocab):
  for token in list(set(sentence)):
      if token not in vocab:
          return False
  return True

def is_valid_length(sentence, max_sequence_length):
  return len(list(sentence)) < (max_sequence_length - 1)

valid_sentence_indicies = []
for index in range(len(df['th_text'])):
  thai_sentence, english_sentence = df['th_text'][index], df['en_text'][index]
  if is_valid_length(thai_sentence, max_sequence_length) \
    and is_valid_length(english_sentence, max_sequence_length) \
    and is_valid_tokens(english_sentence, english_character) \
    and is_valid_tokens(thai_sentence, thai_character):
      valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(df['th_text'])}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 1000000
Number of valid sentences: 165423


In [9]:
thai_sentences = [df['th_text'][i] for i in valid_sentence_indicies]
english_sentences = [df['en_text'][i] for i in valid_sentence_indicies]

### Tokenize

In [None]:
# %%capture
# ! pip install pythainlp

In [None]:
# import pythainlp
# from pythainlp import subword_tokenize, word_tokenize

# thai_subword = list(set().union(*[word_tokenize(s) for s in thai_sentences]))
# thai_subword.insert(0, START_TOKEN)
# thai_subword.append(PADDING_TOKEN)
# thai_subword.append(END_TOKEN)

In [None]:
# english_subword = list(set().union(*[s.split() for s in english_sentences]))
# english_subword.insert(0, START_TOKEN)
# english_subword.append(PADDING_TOKEN)
# english_subword.append(END_TOKEN)

In [None]:
# index_to_thai = {k:v for k,v in enumerate(thai_subword)}
# thai_to_index = {v:k for k,v in enumerate(thai_subword)}
# index_to_english = {k:v for k,v in enumerate(english_subword)}
# english_to_index = {v:k for k,v in enumerate(english_subword)}

### Prepare Dataloader

In [10]:
from torch.utils.data import Dataset, DataLoader

class TextDS(Dataset):
  def __init__(self, english_sentences, thai_sentences):
    self.english_sentences = english_sentences
    self.thai_sentences = thai_sentences

  def __len__(self):
    return len(self.english_sentences)

  def __getitem__(self, idx):
    return self.english_sentences[idx], self.thai_sentences[idx]

In [11]:
dataset = TextDS(english_sentences, thai_sentences)

In [12]:
batch_size = 32
train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('The fool wanders, the wise man travels.', 'Facts do not cease to exist because they are ignored.', 'Pride cometh before a fall.', 'Use not today what tomorrow will need.', 'See nothing, say nothing, know nothing.', 'The proof of the pudding is in the eating.', "You can't unscramble a scrambled egg.", 'Bad news travels fast.', 'Nothing is ill said if it is not ill taken.', 'You may read it in the whole room.', 'Yes, I concur.', 'he asked me with a keen glance.', 'he cried in amazement.', 'whispered Richard, not thinking she could hear him.', 'Turn off the lights.', 'Turn up the bass.', 'Turning towards the door, he now caught sight of us.', 'Tutankhamen was an Egyptian pharaoh.', 'Tweedledee began instantly.', 'Twenty ways with a kumquat.', 'Uncle Tom said we must get in through a window.', 'Too much, he told me, with ominous rolling head.', 'Trial and error. The only way.', 'Three, four, five, guineas, you would consider handsome, I dare say.', 'This fiscal year.', 'This held her.',

### Utility functions

In [13]:
def create_masks(src_batch, targ_batch, max_sequence_length):
  NEG_INFTY = -1e9
  num_sentences = len(src_batch)
  look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
  look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
  encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
  decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
  decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)

  for idx in range(num_sentences):
    src_sentence_length, targ_sentence_length = len(src_batch[idx]), len(targ_batch[idx])
    src_chars_to_padding_mask = np.arange(src_sentence_length + 1, max_sequence_length)
    targ_chars_to_padding_mask = np.arange(targ_sentence_length + 1, max_sequence_length)
    encoder_padding_mask[idx, :, src_chars_to_padding_mask] = True
    encoder_padding_mask[idx, src_chars_to_padding_mask, :] = True
    decoder_padding_mask_self_attention[idx, :, targ_chars_to_padding_mask] = True
    decoder_padding_mask_self_attention[idx, targ_chars_to_padding_mask, :] = True
    decoder_padding_mask_cross_attention[idx, :, src_chars_to_padding_mask] = True
    decoder_padding_mask_cross_attention[idx, targ_chars_to_padding_mask, :] = True

  encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
  decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
  decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)

  # print(f"encoder_self_attention_mask {encoder_self_attention_mask.size()}: \n{encoder_self_attention_mask[0, :5, :5]}")
  # print(f"decoder_self_attention_mask {decoder_self_attention_mask.size()}: \n{decoder_self_attention_mask[0, :5, :5]}")
  # print(f"decoder_cross_attention_mask {decoder_cross_attention_mask.size()}: \n{decoder_cross_attention_mask[0, :5, :5]}")
  return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [14]:
def get_device():
  return torch.device('cuda') if torch.cuda.is_available() \
    else torch.device('cpu')

class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_seq_len):
    super().__init__()
    self.max_seq_len = max_seq_len
    self.d_model = d_model

  def forward(self):
    even_i = torch.arange(0, self.d_model, 2).float()
    denominator = torch.pow(10000, even_i/self.d_model)
    position = (torch.arange(self.max_seq_len)
                .reshape(self.max_seq_len, 1))  # (650, 1)
    even_PE = torch.sin(position / denominator) # (650, 256)
    odd_PE = torch.cos(position / denominator)  # (650, 256)
    stacked = torch.stack([even_PE, odd_PE], dim=2)
    PE = torch.flatten(stacked, start_dim=1, end_dim=2)
    return PE # (650, 512)

class SentenceEmbedding(nn.Module):
  def __init__(self, max_seq_len, d_model, language_to_index,
               START_TOKEN, END_TOKEN, PADDING_TOKEN):
    super().__init__()
    self.char_size = len(language_to_index)
    self.max_seq_len = max_seq_len
    self.embedding = nn.Embedding(self.max_seq_len, d_model)
    self.language_to_index = language_to_index
    self.position_encoder = PositionalEncoding(d_model, max_seq_len)
    self.dropout = nn.Dropout(p=0.1)
    self.START_TOKEN = START_TOKEN
    self.END_TOKEN = END_TOKEN
    self.PADDING_TOKEN = PADDING_TOKEN

  def batch_tokenize(self, batch, start_token, end_token):
    def tokenize(sentence, start_token, end_token):
      sentence_word_indictes = [self.language_to_index[token]
                                for token in list(sentence)]
      if start_token:
        sentence_word_indictes.insert(0, self.language_to_index[self.START_TOKEN])
      if end_token:
        sentence_word_indictes.append(self.language_to_index[self.END_TOKEN])
      for _ in range(len(sentence_word_indictes), self.max_seq_len):
        sentence_word_indictes.append(self.language_to_index[self.PADDING_TOKEN])
      return torch.tensor(sentence_word_indictes)

    tokenized = []
    for sentence_num in range(len(batch)):
      tokenized.append(tokenize(batch[sentence_num], start_token, end_token))
    tokenized = torch.stack(tokenized)
    return tokenized.to(get_device()) # (30, 650)

  def forward(self, x, start_token, end_token):
    x = self.batch_tokenize(x, start_token, end_token)  # (30, 650)
    x = self.embedding(x) # (650, 512)
    pos = self.position_encoder().to(get_device())  # (650, 512)
    x = self.dropout(x + pos) # (30, 650, 512)
    return x

In [15]:
def scaled_dot_product(q, k, v, mask=None):
  d_k = q.size()[-1]
  scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k) # (1, 8, 650, 650)
  if mask is not None:
    scaled = scaled.permute(1, 0, 2, 3) + mask  # (8, 1, 650, 650)
    scaled = scaled.permute(1, 0, 2, 3)         # (1, 8, 650, 650)
  attention = F.softmax(scaled, dim=-1)
  values = torch.matmul(attention, v)
  return values, attention

In [16]:
class PositionwiseFeedForward(nn.Module):
  def __init__(self, d_model, f_hidden, drop_prob=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.linear1 = nn.Linear(d_model, f_hidden)
    self.linear2 = nn.Linear(f_hidden, d_model)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_prob)

  def forward(self, x):
    x = self.linear1(x)
    x = self.relu(x)
    x = self.dropout(x)
    x = self.linear2(x)
    return x

class LayerNormalization(nn.Module):
  def __init__(self, params_shape, eps=1e-5):
    super().__init__()
    self.params_shape = params_shape
    self.eps = eps
    self.gamma = nn.Parameter(torch.ones(params_shape))
    self.beta = nn.Parameter(torch.zeros(params_shape))

  def forward(self, x):
    dims = [-(i+1) for i in range(len(self.params_shape))]
    mean = x.mean(dim=dims, keepdim=True)
    var = ((x - mean) ** 2).mean(dim=dims, keepdim=True)
    std = (var + self.eps).sqrt()
    y = (x - mean) / std
    out = self.gamma * y + self.beta
    return out

class MultiheadAttention(nn.Module):
  def __init__(self, d_model, n_heads):
    super().__init__()
    self.d_model = d_model              # 512
    self.n_heads = n_heads              # 8
    self.head_dim = d_model // n_heads  # 64
    self.qkv_layer = nn.Linear(d_model, 3 * d_model)  # 512 -> 1536
    self.linear_layer = nn.Linear(d_model, d_model)   # 512 -> 512

  def forward(self, x, mask=None):
    batch_size, max_seq_len, d_model = x.size() # (1, 650, 512)
    qkv = self.qkv_layer(x) # (1, 650, 1536)
    qkv = qkv.reshape(batch_size, max_seq_len, self.n_heads,
                      3 * self.head_dim)  # (1, 650, 8, 192)
    qkv = qkv.permute(0, 2, 1, 3)   # (1, 8, 650, 192)
    q, k, v = qkv.chunk(3, dim=-1)  # (1, 8, 650, 64) x 3
    values, attention = scaled_dot_product(q, k, v, mask)  # (1, 8, 650, 64), (650, 650)
    values = values.permute(0, 2, 1, 3) # (1, 650, 8, 64)
    values = values.reshape(batch_size, max_seq_len, d_model) # (1, 650, 512)
    out = self.linear_layer(values) # (1, 650, 512)
    return out

class MultiHeadCrossAttention(nn.Module):
  def __init__(self, d_model, n_heads):
    super().__init__()
    self.d_model = d_model              # 512
    self.n_heads = n_heads              # 8
    self.head_dim = d_model // n_heads  # 64
    self.kv_layer = nn.Linear(d_model, 2 * d_model) # 512 -> 1024
    self.q_layer = nn.Linear(d_model, d_model)      # 512 -> 512
    self.linear_layer = nn.Linear(d_model, d_model) # 512 -> 512

  def forward(self, x, y, mask=None):
    batch_size, max_seq_len, d_model = x.size() # (1, 650, 512)
    kv = self.kv_layer(x) # (1, 650, 1024)
    kv = kv.reshape(batch_size, max_seq_len, self.n_heads,
                    2 * self.head_dim)  # (1, 650, 8, 128)
    kv = kv.permute(0, 2, 1, 3) # (1, 8, 650, 128)
    k, v = kv.chunk(2, dim=-1)  # (1, 8, 650, 64) x 2
    q = self.q_layer(y)           # (1, 650, 512)
    q = q.reshape(batch_size, max_seq_len, self.n_heads,
                  self.head_dim)  # (1, 650, 8, 64)
    q = q.permute(0, 2, 1, 3)     # (1, 8, 650, 64)
    values, attention = scaled_dot_product(q, k, v, mask) # (1, 8, 650, 64), (650, 650)
    values = values.permute(0, 2, 1, 3) # (1, 650, 8, 64)
    values = values.reshape(batch_size, max_seq_len, d_model) # (1, 650, 512)
    out = self.linear_layer(values) # (1, 650, 512)
    return out

### Encoder

In [17]:
class EncoderLayer(nn.Module):
  def __init__(self, d_model, f_hidden, n_heads, drop_prob):
    super(EncoderLayer, self).__init__()
    self.attention = MultiheadAttention(d_model, n_heads)
    self.dropout1 = nn.Dropout(p=drop_prob)
    self.norm1 = LayerNormalization(params_shape=[d_model])

    self.f = PositionwiseFeedForward(d_model=d_model, f_hidden=f_hidden,
                                     drop_prob=drop_prob)
    self.dropout2 = nn.Dropout(p=drop_prob)
    self.norm2 = LayerNormalization(params_shape=[d_model])

  def forward(self, x, self_attention_mask):
    # Self Attention
    residual_x = x[:]
    x = self.attention(x, mask=self_attention_mask)
    x = self.dropout1(x)
    x = self.norm1(x + residual_x)

    # Feed forward
    residual_x = x[:]
    x = self.f(x)
    x = self.dropout2(x)
    x = self.norm2(x + residual_x)
    return x

class SequentialEncoder(nn.Sequential):
  def forward(self, *inputs):
    x, self_attention_mask = inputs
    for module in self._modules.values():
      x = module(x, self_attention_mask)
    return x

class Encoder(nn.Module):
  def __init__(self, d_model, f_hidden, n_heads, drop_prob, n_layers,
               max_seq_len, lang_to_idx,
               START_TOKEN, END_TOKEN, PADDING_TOKEN):
    super().__init__()
    self.sentence_embedding = SentenceEmbedding(max_seq_len, d_model, lang_to_idx,
                                                START_TOKEN, END_TOKEN, PADDING_TOKEN)
    self.layers = SequentialEncoder(*[EncoderLayer(d_model, f_hidden, n_heads, drop_prob)
                                      for _ in range(n_layers)])

  def forward(self, x, self_attention_mask,
              start_token, end_token):
    x = self.sentence_embedding(x, start_token, end_token)
    x = self.layers(x, self_attention_mask)
    return x

### Decoder

In [18]:
class DecoderLayer(nn.Module):
  def __init__(self, d_model, f_hidden, n_heads, drop_prob):
    super(DecoderLayer, self).__init__()
    self.attention = MultiheadAttention(d_model, n_heads)
    self.dropout1 = nn.Dropout(p=drop_prob)
    self.norm1 = LayerNormalization(params_shape=[d_model])

    self.cross_attention = MultiHeadCrossAttention(d_model, n_heads)
    self.dropout2 = nn.Dropout(p=drop_prob)
    self.norm2 = LayerNormalization(params_shape=[d_model])

    self.f = PositionwiseFeedForward(d_model=d_model, f_hidden=f_hidden,
                                     drop_prob=drop_prob)
    self.dropout3 = nn.Dropout(p=drop_prob)
    self.norm3 = LayerNormalization(params_shape=[d_model])

  def forward(self, x, y, self_attention_mask, cross_attention_mask):
    # Self Mask Attention
    residual_y = y[:]
    y = self.attention(y, mask=self_attention_mask)
    y = self.dropout1(y)
    y = self.norm1(y + residual_y)

    # Cross Attention
    residual_y = y[:]
    y = self.cross_attention(x, y, mask=cross_attention_mask)
    y = self.dropout2(y)
    y = self.norm2(y + residual_y)

    # Feed forward
    residual_y = y[:]
    y = self.f(y)
    y = self.dropout3(y)
    y = self.norm3(y + residual_y)
    return y

class SequentialDecoder(nn.Sequential):
  def forward(self, *inputs):
    x, y, self_attention_mask, cross_attention_mask = inputs
    for module in self._modules.values():
      y = module(x, y, self_attention_mask, cross_attention_mask)
    return y

class Decoder(nn.Module):
  def __init__(self, d_model, f_hidden, n_heads, drop_prob, n_layers,
               max_seq_len, lang_to_idx,
               START_TOKEN, END_TOKEN, PADDING_TOKEN):
    super().__init__()
    self.sentence_embedding = SentenceEmbedding(max_seq_len, d_model, lang_to_idx,
                                                START_TOKEN, END_TOKEN, PADDING_TOKEN)
    self.layers = SequentialDecoder(*[DecoderLayer(d_model, f_hidden, n_heads, drop_prob)
                                      for _ in range(n_layers)])

  def forward(self, x, y, self_attention_mask, cross_attention_mask,
              start_token, end_token):
    y = self.sentence_embedding(y, start_token, end_token)
    y = self.layers(x, y, self_attention_mask, cross_attention_mask)
    return y

### Transformer

In [19]:
class Transformer(nn.Module):
  def __init__(self, d_model, f_hidden, n_heads, drop_prob, n_layers,
               max_seq_len, targ_char_size, src_lang_to_index, targ_lang_to_index,
               START_TOKEN, END_TOKEN, PADDING_TOKEN):
    super().__init__()
    self.encoder = Encoder(d_model, f_hidden, n_heads, drop_prob, n_layers,
                           max_seq_len, src_lang_to_index,
                           START_TOKEN, END_TOKEN, PADDING_TOKEN)
    self.decoder = Decoder(d_model, f_hidden, n_heads, drop_prob, n_layers,
                           max_seq_len, targ_lang_to_index,
                           START_TOKEN, END_TOKEN, PADDING_TOKEN)
    self.linear = nn.Linear(d_model, targ_char_size)
    self.device = torch.device('cuda') if torch.cuda.is_available() else \
                  torch.device('cpu')

  def forward(self, x, y,
              encoder_self_attention_mask=None,
              decoder_self_attention_mask=None,
              decoder_cross_attention_mask=None,
              enc_start_token=False,
              enc_end_token=False,
              dec_start_token=False,
              dec_end_token=False):
    x = self.encoder(x,
                     encoder_self_attention_mask,
                     enc_start_token, enc_end_token)
    out = self.decoder(x, y,
                       decoder_self_attention_mask,
                       decoder_cross_attention_mask,
                       dec_start_token, dec_end_token)
    out = self.linear(out)
    return out

### Training TH-EN

In [None]:
d_model = 512
f_hidden = 1024
n_heads = 8
drop_prob = 0.1
n_layers = 1

max_sequence_length = 256
en_char_size = len(english_character)

In [None]:
model = Transformer(d_model, f_hidden, n_heads, drop_prob, n_layers,
                    max_sequence_length, en_char_size, thai_to_index, english_to_index,
                    START_TOKEN, END_TOKEN, PADDING_TOKEN)

In [None]:
model

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(256, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiheadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (norm1): LayerNormalization()
        (f): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=1024, bias=True)
          (linear2): Linear(in_features=1024, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout2): Dropout(p=0.1, inplace=False)
        (norm2): LayerNormalization()
      )
    )
  )
  (decoder): Decoder(
    (sentence_embedding): 

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=english_to_index[PADDING_TOKEN],
                                reduction='none')

# initial weights
for params in model.parameters():
  if params.dim() > 1:
    nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(model.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else \
         torch.device('cpu')

In [None]:
model.train()
model.to(device)
n_epochs = 5

for epoch in range(n_epochs):
  print(f'Epoch {epoch + 1}')
  iterator = iter(train_loader)
  for batch_num, batch in enumerate(iterator):
    model.train()
    eng_batch, th_batch = batch
    encoder_self_attention_mask, \
    decoder_self_attention_mask, \
    decoder_cross_attention_mask = create_masks(th_batch, eng_batch,
                                                max_sequence_length)
    optim.zero_grad()
    th_predictions = model(th_batch, eng_batch,
                           encoder_self_attention_mask.to(device),
                           decoder_self_attention_mask.to(device),
                           decoder_cross_attention_mask.to(device),
                           enc_start_token=False,
                           enc_end_token=False,
                           dec_start_token=True,
                           dec_end_token=True)
    labels = model.decoder.sentence_embedding.batch_tokenize(eng_batch,
                                                             start_token=False,
                                                             end_token=True)

    loss = criterion(th_predictions.view(-1, en_char_size).to(device),
                     labels.view(-1).to(device)).to(device)
    valid_indices = torch.where(labels.view(-1) == english_to_index[PADDING_TOKEN], False, True)
    loss = loss.sum() / valid_indices.sum()
    loss.backward()
    optim.step()

    if batch_num % 500 == 0:
      print(f'Iteration {batch_num}: {loss.item()}')
      print(f'Thai: {th_batch[0]}')
      print(f'English: {eng_batch[0]}')
      en_sentence_predicted = torch.argmax(th_predictions[0], axis=1)
      predicted_sentence = ""
      for idx in en_sentence_predicted:
        if idx == english_to_index[END_TOKEN]:
          break
        predicted_sentence += index_to_english[idx.item()]
      print(f'English Prediction: {predicted_sentence}')

      model.eval()
      eng_sentence = ('',)
      th_sentence = ('ฝนตกไหม',)
      for word_counter in range(max_sequence_length):
        encoder_self_attention_mask, \
        decoder_self_attention_mask, \
        decoder_cross_attention_mask = create_masks(th_sentence, eng_sentence,
                                                    max_sequence_length)
        predictions = model(th_sentence, eng_sentence,
                            encoder_self_attention_mask.to(device),
                            decoder_self_attention_mask.to(device),
                            decoder_cross_attention_mask.to(device),
                            enc_start_token=False,
                            enc_end_token=False,
                            dec_start_token=True,
                            dec_end_token=False)
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_english[next_token_index]
        eng_sentence = (eng_sentence[0] + next_token, )
        if next_token == END_TOKEN:
          break

      print(f'Evaluation translation (ฝนตกไหม): {eng_sentence}')
      print('-------------------------------------------')

Epoch 1
Iteration 0: 2.2142350673675537
Thai: คนโง่พเนจร คนฉลาดท่องเที่ยว
English: The fool wanders, the wise man travels.
English Prediction: Yha trrr bint     the ailh tektthobe y 
Evaluation translation (ฝนตกไหม): ('What is theating to theation?<END>',)
-------------------------------------------
Iteration 500: 1.6587803363800049
Thai: ช่วยสั่งพิซซ่าหน้ามีทเลิฟเวอร์ส์ถาดใหญ่ขอบมีไส้ให้ทีได้มั้ย
English: Can you order us a Large meat lovers pizza with stuffed crust?
English Prediction: Ion aou wrder apeanrarte ae siticar etizzalcithetear  r taest 
Evaluation translation (ฝนตกไหม): ('What what so the please?<END>',)
-------------------------------------------
Iteration 1000: 1.6174896955490112
Thai: อยากให้เอาไปร้านไหนคะ
English: Which vehicle repair shop would you like to have it repaired at?
English Prediction: Whach werinhe aespin teeustiuld you like to teve tn tes in?? t ?
Evaluation translation (ฝนตกไหม): ('What size size?<END>',)
-------------------------------------------
Itera

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/Transformer/transformer_TH-EN.pt')

### Inference

In [None]:
model.eval()
def translate(th_sentence):
  eng_sentence = ('',)
  th_sentence = (th_sentence,)
  for word_counter in range(max_sequence_length):
    encoder_self_attention_mask, \
    decoder_self_attention_mask, \
    decoder_cross_attention_mask = create_masks(th_sentence, eng_sentence,
                                                max_sequence_length)
    predictions = model(th_sentence, eng_sentence,
                        encoder_self_attention_mask.to(device),
                        decoder_self_attention_mask.to(device),
                        decoder_cross_attention_mask.to(device),
                        enc_start_token=False,
                        enc_end_token=False,
                        dec_start_token=True,
                        dec_end_token=False)
    next_token_prob_distribution = predictions[0][word_counter]
    next_token_index = torch.argmax(next_token_prob_distribution).item()
    next_token = index_to_english[next_token_index]
    eng_sentence = (eng_sentence[0] + next_token, )
    if next_token == END_TOKEN:
      break
  return eng_sentence[0]

In [None]:
translate("ไม่มีปัญหาค่ะ ดูหนังให้สนุกนะคะ")

'No problem seems to seen the problem.<END>'

In [None]:
translate("ฉันจะพร้อมในอีก 5 นาที")

'I will be reading 5 minutes.<END>'

In [None]:
translate("ไม่มีปัญหาค่ะ")

'No problems problems<END>'

In [None]:
translate("วันเสาร์นี้ว่างไหม")

'How the products the product.<END>'

In [None]:
translate("ช่วงนี้เป็นอย่างไรบ้าง")

'How how the stated to seems.<END>'

In [None]:
translate("3 คน")

'3<END>'

In [None]:
translate("ราคาเท่าไหร่")

'What the price the price<END>'

### Training EN-TH

In [20]:
d_model = 512
f_hidden = 1024
n_heads = 8
drop_prob = 0.1
n_layers = 1

max_sequence_length = 256
th_char_size = len(thai_character)

In [21]:
model = Transformer(d_model, f_hidden, n_heads, drop_prob, n_layers,
                    max_sequence_length, th_char_size, english_to_index, thai_to_index,
                    START_TOKEN, END_TOKEN, PADDING_TOKEN)

In [22]:
model

Transformer(
  (encoder): Encoder(
    (sentence_embedding): SentenceEmbedding(
      (embedding): Embedding(256, 512)
      (position_encoder): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): SequentialEncoder(
      (0): EncoderLayer(
        (attention): MultiheadAttention(
          (qkv_layer): Linear(in_features=512, out_features=1536, bias=True)
          (linear_layer): Linear(in_features=512, out_features=512, bias=True)
        )
        (dropout1): Dropout(p=0.1, inplace=False)
        (norm1): LayerNormalization()
        (f): PositionwiseFeedForward(
          (linear1): Linear(in_features=512, out_features=1024, bias=True)
          (linear2): Linear(in_features=1024, out_features=512, bias=True)
          (relu): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout2): Dropout(p=0.1, inplace=False)
        (norm2): LayerNormalization()
      )
    )
  )
  (decoder): Decoder(
    (sentence_embedding): 

In [23]:
criterion = nn.CrossEntropyLoss(ignore_index=thai_to_index[PADDING_TOKEN],
                                reduction='none')

# initial weights
for params in model.parameters():
  if params.dim() > 1:
    nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(model.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else \
         torch.device('cpu')

In [26]:
model.train()
model.to(device)
n_epochs = 5

for epoch in range(n_epochs):
  print(f'Epoch {epoch + 1}')
  iterator = iter(train_loader)
  for batch_num, batch in enumerate(iterator):
    model.train()
    eng_batch, th_batch = batch
    encoder_self_attention_mask, \
    decoder_self_attention_mask, \
    decoder_cross_attention_mask = create_masks(eng_batch, th_batch,
                                                max_sequence_length)
    optim.zero_grad()
    th_predictions = model(eng_batch, th_batch,
                           encoder_self_attention_mask.to(device),
                           decoder_self_attention_mask.to(device),
                           decoder_cross_attention_mask.to(device),
                           enc_start_token=False,
                           enc_end_token=False,
                           dec_start_token=True,
                           dec_end_token=True)
    labels = model.decoder.sentence_embedding.batch_tokenize(th_batch,
                                                             start_token=False,
                                                             end_token=True)

    loss = criterion(th_predictions.view(-1, th_char_size).to(device),
                     labels.view(-1).to(device)).to(device)
    valid_indices = torch.where(labels.view(-1) == thai_to_index[PADDING_TOKEN], False, True)
    loss = loss.sum() / valid_indices.sum()
    loss.backward()
    optim.step()

    if batch_num % 500 == 0:
      print(f'Iteration {batch_num}: {loss.item()}')
      print(f'English: {eng_batch[0]}')
      print(f'Thai: {th_batch[0]}')
      th_sentence_predicted = torch.argmax(th_predictions[0], axis=1)
      predicted_sentence = ""
      for idx in th_sentence_predicted:
        if idx == thai_to_index[END_TOKEN]:
          break
        predicted_sentence += index_to_thai[idx.item()]
      print(f'Thai Prediction: {predicted_sentence}')

      model.eval()
      th_sentence = ('',)
      eng_sentence = ('Choose someone who loves us',)
      for word_counter in range(max_sequence_length):
        encoder_self_attention_mask, \
        decoder_self_attention_mask, \
        decoder_cross_attention_mask = create_masks(eng_sentence, th_sentence,
                                                    max_sequence_length)
        predictions = model(eng_sentence, th_sentence,
                            encoder_self_attention_mask.to(device),
                            decoder_self_attention_mask.to(device),
                            decoder_cross_attention_mask.to(device),
                            enc_start_token=False,
                            enc_end_token=False,
                            dec_start_token=True,
                            dec_end_token=False)
        next_token_prob_distribution = predictions[0][word_counter]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        next_token = index_to_thai[next_token_index]
        th_sentence = (th_sentence[0] + next_token, )
        if next_token == END_TOKEN:
          break

      print(f'Evaluation translation (Choose someone who loves us): {th_sentence}')
      print('-------------------------------------------')

Epoch 1
Iteration 0: 3.6236155033111572
English: The fool wanders, the wise man travels.
Thai: คนโง่พเนจร คนฉลาดท่องเที่ยว
Thai Prediction: เเนน้านอ่า่เ่น่นนเน่้่่นนนนนนนนนนเ้นย
Evaluation translation (Choose someone who loves us): ('เันนนน่่่่นนนนนนนนนนนนน<END>',)
-------------------------------------------
Iteration 500: 2.9923176765441895
English: Can you order us a Large meat lovers pizza with stuffed crust?
Thai: ช่วยสั่งพิซซ่าหน้ามีทเลิฟเวอร์ส์ถาดใหญ่ขอบมีไส้ให้ทีได้มั้ย
Thai Prediction: โ่าเ่ับงเัรัีา ้ีอกั่ีลันนปายะ ันคน้ห้่งาะะะ่หาะหนาี่ด่ยะ้ยนอออออวอวอวจอจววอวออเอไรจอออจจจจจรจวจอนรวจอนรมอรนออดออวววดรอวดรวทมนวสวววลรวววทมอไอวกออวกรรรรออนออวรรรรรรรรรรรรอรรรนอรมรซรรรวรอรรรรรรวอนจจรจอทรรรงงรรลทรจจจจจจรยรลรจรรรรหรรตหรรตตตหรรวรรรรรรรทซทรรซรวล
Evaluation translation (Choose someone who loves us): ('โอเค่ะ เลยคะ<END>',)
-------------------------------------------
Iteration 1000: 2.4191062450408936
English: Which vehicle repair shop would you like to have it repaired at?
Thai: อยากให้เ

In [27]:
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/Transformer/transformer_EN-TH.pt')

### Inference

In [30]:
model.eval()
def translate(eng_sentence):
  eng_sentence = (eng_sentence,)
  th_sentence = ('',)
  for word_counter in range(max_sequence_length):
    encoder_self_attention_mask, \
    decoder_self_attention_mask, \
    decoder_cross_attention_mask = create_masks(eng_sentence, th_sentence,
                                                max_sequence_length)
    predictions = model(eng_sentence, th_sentence,
                        encoder_self_attention_mask.to(device),
                        decoder_self_attention_mask.to(device),
                        decoder_cross_attention_mask.to(device),
                        enc_start_token=False,
                        enc_end_token=False,
                        dec_start_token=True,
                        dec_end_token=False)
    next_token_prob_distribution = predictions[0][word_counter]
    next_token_index = torch.argmax(next_token_prob_distribution).item()
    next_token = index_to_thai[next_token_index]
    th_sentence = (th_sentence[0] + next_token, )
    if next_token == END_TOKEN:
      break
  return th_sentence[0]

In [31]:
translate("Who am I")

'เขาไม่มี<END>'

In [32]:
translate("my name is pluem")

'เขาเป็นเพิ่มเติม<END>'

In [33]:
translate("hello, monday")

'เขาไม่มีปัญหา<END>'

In [34]:
translate("I am here")

'ฉันมี<END>'

In [35]:
translate("click this")

'ข้าวเขาความสาย<END>'

In [36]:
translate("today, what should we do")

'อย่างไรก็ตามเราเลย<END>'

In [37]:
translate("should we go to the mall?")

'ดูการเป็นเหล่านี้<END>'

In [38]:
translate("noodles are the best")

'เท่าที่สุดเกี่ยวกับเลย<END>'

In [40]:
translate("what should we do")

'เขาเป็นสิ่งที่ดี<END>'

In [41]:
translate("why did they do this?")

'เขาไม่เห็นอะไรเหรอ<END>'

In [42]:
translate("I will give you something")

'ฉันจะเป๋าเห็นเพียง<END>'