In [None]:
# Colab-only: mount Google Drive at /content/drive. Later cells read the corpus
# and read/write the model checkpoint under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the parallel corpus: one sentence per line. The two files are paired by
# line index further down, so they must have the same number of lines.
# The encoding is passed explicitly because the Nepali text is non-ASCII and the
# platform default encoding is not guaranteed to be UTF-8.
# NOTE(review): [:-2] drops the last TWO split entries of each file; if a file
# ends with a single trailing newline this also discards one real sentence --
# confirm this is intended.
with open("/content/drive/MyDrive/train.en.en", encoding="utf-8") as f:
  english_text = f.read().split("\n")[:-2]

with open('/content/drive/MyDrive/train.ne', encoding="utf-8") as f:
  nepali_text = f.read().split('\n')[:-2]

len(english_text),len(nepali_text)

(1780247, 1780247)

In [None]:
# Lower-case every English sentence; the Nepali side is left as-is.
english_sentences = [raw_sentence.lower() for raw_sentence in english_text]

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# A single BPE tokenizer/trainer pair; the cells below train it once per language
# and save the result after each run.
tokenizer = Tokenizer(BPE(unk_token='[unk]'))
tokenizer.pre_tokenizer = Whitespace()

# The sample batch printed later in the notebook shows '[start]' encoded as 2 and
# '[end]' as 3, with 0 used as the padding value.
trainer = BpeTrainer(
    vocab_size=50000,
    special_tokens=['[pad]', '[unk]', '[start]', '[end]'],
)

In [None]:
# Dump both sides of the corpus to plain-text files (one sentence per line) so
# the tokenizer trainer can read them from disk.
with open('english_sentences.txt','w',encoding = 'utf-8')as f:
  f.writelines(line+'\n' for line in english_sentences)

with open('nepali_sentences.txt','w',encoding ='utf-8')as f:
  f.writelines(line+'\n' for line in nepali_text)

In [None]:
# Train the shared `tokenizer` on the lower-cased English sentences and save it.
tokenizer.train(['english_sentences.txt'],trainer)
tokenizer.save('/content/english_tokenizer.json')

In [None]:
# Train on the Nepali sentences and save under a separate file name.
# NOTE(review): this reuses the same `tokenizer`/`trainer` objects that were just
# trained on English; presumably train() replaces the earlier vocabulary rather
# than extending it -- confirm, or build a fresh Tokenizer per language.
tokenizer.train(['nepali_sentences.txt'],trainer)
tokenizer.save('/content/nepali_tokenizer.json')

In [None]:
# Reload the two trained tokenizers from the exact files written by the training
# cells above, so the notebook survives Restart & Run All. `from_file` is a
# constructor on the Tokenizer class, so it is called on the class rather than
# on the (Nepali-trained) `tokenizer` instance.
source_tokenizer = Tokenizer.from_file('/content/english_tokenizer.json')
target_tokenizer = Tokenizer.from_file('/content/nepali_tokenizer.json')

In [None]:
# Round-trip sanity check for the Nepali tokenizer: encode a sentence, then decode
# the ids that were actually produced (not a pasted id list, which would silently
# go stale if the tokenizer is retrained).
output = target_tokenizer.encode("नेपालसँग जोडिएको राज्य जसले मोदीलाई पुनः प्रधानमन्त्री बनाउने वा नबनाउने निर्णय गर्छ |")
print(output.ids,target_tokenizer.decode(output.ids))

[12502, 5913, 1462, 1763, 22607, 2610, 1797, 2257, 1020, 23711, 1737, 2519, 95] नेपालसँग जोडिएको राज्य जसले मोदीलाई पुनः प्रधानमन्त्री बनाउने वा नबनाउने निर्णय गर्छ |


In [None]:
# Round-trip sanity check for the English tokenizer: decode the ids produced by
# this very encode call so the check reflects the tokenizer currently loaded.
output = source_tokenizer.encode('schedule for budget speech postponed by two hours')
print(output.ids,source_tokenizer.decode(output.ids))

[3786, 447, 2023, 3308, 6918, 505, 709, 2517] schedule for budget speech postponed by two hours


In [None]:
import numpy as np

# Space-delimited word count per sentence: splitting on " " yields one more piece
# than there are spaces, so count(" ") + 1 equals len(split(" ")).
eng_sen_len = [text.count(" ") + 1 for text in english_sentences]
nep_sen_len = [text.count(" ") + 1 for text in nepali_text]

# 95% of the English sentences have 38 or fewer words, and 95% of the Nepali
# sentences have 30 or fewer.
np.percentile(eng_sen_len,95),np.percentile(nep_sen_len,95)

(38.0, 30.0)

In [None]:
# Sequence-length budget, taken from the 95th-percentile English length above.
max_seq_len = 38

# Wrap every target sentence in the decoder's start/end markers.
nepali_sentences = [f"[start] {raw_nepali} [end]" for raw_nepali in nepali_text]

# Keep only pairs where both sides have fewer than max_seq_len space-delimited
# words (the Nepali count includes the two marker tokens).
text_pairs = [
    (english, nepali_sentences[idx])
    for idx, english in enumerate(english_sentences)
    if len(english.split(" ")) < max_seq_len
    and len(nepali_sentences[idx].split(" ")) < max_seq_len
]

In [None]:
# Shuffle the sentence pairs in place with a fixed seed so the train/validation
# split below is reproducible on a fresh kernel. Re-running this cell alone
# shuffles the already-shuffled list and therefore changes the split.
import random
random.seed(42)
random.shuffle(text_pairs)

In [None]:
# 90/10 train/validation split over the shuffled pairs.
len_train_pairs = int(0.90*len(text_pairs))
train_pairs = text_pairs[:len_train_pairs]
val_pairs = text_pairs[len_train_pairs:]

In [None]:

# Re-seed and shuffle each split in place, then take a fixed-size subsample:
# 500,000 training pairs and 30,000 validation pairs. The two shuffles consume
# the global RNG in this order, so the statement order determines the samples.
import random
random.seed(42)
random.shuffle(train_pairs)
random.shuffle(val_pairs)
sample_train_pairs = train_pairs[:500000]
sample_val_pairs = val_pairs[:30000]
# Unzip the (english, nepali) pairs into two parallel tuples per split.
train_eng_sen,train_nep_sen = zip(*sample_train_pairs)
val_eng_sen,val_nep_sen = zip(*sample_val_pairs)

In [None]:
# Eyeball the last sampled training pair to confirm both sides are still aligned.
train_eng_sen[-1],train_nep_sen[-1]

('that is why he maintained open business freedom within his empire',
 '[start] त्यस कारण उसले आफ्नो साम्राज्य भित्र खुल्ला ब्यापारिक स्वतन्त्रता कायम गरेको थियो [end]')

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def tokenize (sentences,max_len,value,tokenizer):
  """Encode sentences to token ids and pad/truncate them to a fixed length.

  Args:
    sentences: iterable of strings to encode.
    max_len: length of every returned row.
    value: id used to fill positions after the end of a sentence.
    tokenizer: object whose `encode(text).ids` gives the token ids.

  Returns:
    The result of `pad_sequences`: one row of `max_len` ids per sentence,
    padded and truncated at the end ('post').
  """
  token_id_rows = [tokenizer.encode(text).ids for text in sentences]
  return pad_sequences(
      token_id_rows,
      maxlen=max_len,
      padding='post',
      truncating='post',
      value=value,
  )

# Encode both splits, padding with id 0. The Nepali side is one token longer
# (max_seq_len+1) because format_dataset later slices it into a decoder input
# ([:, :-1]) and a target ([:, 1:]), each of length max_seq_len.
train_en_sen =tokenize(train_eng_sen,max_seq_len,0,source_tokenizer)
train_ne_sen = tokenize(train_nep_sen,max_seq_len+1,0,target_tokenizer)
val_en_sen = tokenize(val_eng_sen,max_seq_len,0,source_tokenizer)
val_ne_sen = tokenize(val_nep_sen,max_seq_len+1,0,target_tokenizer)

In [None]:
import tensorflow as tf
def format_dataset(eng_sen,nep_sen):
  """Turn a batch of (english, nepali) id matrices into (features, target).

  The Nepali batch is split along its second axis: all but the last column
  feed the decoder, all but the first column are the prediction target.

  Returns:
    A tuple `({'english': eng_sen, 'nepali': decoder_input}, target)`.
  """
  decoder_input = nep_sen[:, :-1]
  decoder_target = nep_sen[:, 1:]
  features = {'english': eng_sen, 'nepali': decoder_input}
  return features, decoder_target

def make_dataset(eng_sen,nep_sen,batch_size=64,shuffle_buffer=2048):
  """Build the batched tf.data pipeline consumed by `fit`.

  Args:
    eng_sen: padded English id matrix, one row per sentence.
    nep_sen: padded Nepali id matrix, aligned row-by-row with `eng_sen`.
    batch_size: number of sentence pairs per batch.
    shuffle_buffer: shuffle buffer size, counted in batches.

  Returns:
    A dataset yielding `({'english', 'nepali'}, target)` batches.
  """
  dataset = tf.data.Dataset.from_tensor_slices((eng_sen,nep_sen))
  dataset = dataset.batch(batch_size)
  # format_dataset slices along axis 1, so it must run on batched tensors.
  dataset = dataset.map(format_dataset,num_parallel_calls=4)
  # cache() must come BEFORE shuffle(): caching a shuffled dataset stores the
  # first epoch's order and replays it on every later epoch. prefetch() goes
  # last so it overlaps the whole upstream pipeline with training.
  return dataset.cache().shuffle(shuffle_buffer).prefetch(16)

# Build the training and validation pipelines from the padded id matrices.
train_ds = make_dataset(train_en_sen,train_ne_sen)
val_ds = make_dataset(val_en_sen,val_ne_sen)

In [None]:
# Inspect one batch: print the tensor shapes, the decoded first sentence pair and
# the raw decoder-input / target ids. The target is the decoder input shifted
# left by one position.
for inputs,target in train_ds.take(1):
  print(f"encoder input shape:{inputs['english'].shape}")
  print(f"decoder input shape: {inputs['nepali'].shape}")
  print(f"target shape:{target.shape}")
  print(source_tokenizer.decode(inputs['english'][0]),target_tokenizer.decode(inputs['nepali'][0]))
  print(inputs['nepali'][0],target[0])

encoder input shape:(64, 38)
decoder input shape: (64, 38)
target shape:(64, 38)
liberation is a genre of literature मुक्तक साहित्यको एउटा बिधा हो
tf.Tensor(
[    2 34841  9215  1473 16563  1052     3     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0], shape=(38,), dtype=int32) tf.Tensor(
[34841  9215  1473 16563  1052     3     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0], shape=(38,), dtype=int32)


# Model building

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow import keras

In [None]:
# positional encoding
def positional_encoding(length, depth):
  """Build the sinusoidal positional-encoding table.

  Args:
    length: number of positions (rows) in the table.
    depth: embedding width; the first half of each row holds sines and the
      second half cosines.

  Returns:
    A float32 tensor of shape `(length, depth)` (for even `depth`).
  """
  half_depth = depth/2

  position_column = np.arange(length).reshape(-1, 1)                    # (seq, 1)
  scaled_depths = (np.arange(half_depth)/half_depth).reshape(1, -1)     # (1, depth/2)

  # One angular rate per sine/cosine column, applied to every position.
  rates = 1 / np.power(10000, scaled_depths)                            # (1, depth/2)
  angles = position_column * rates                                      # (seq, depth/2)

  table = np.hstack([np.sin(angles), np.cos(angles)])
  return tf.cast(table, dtype=tf.float32)

In [None]:
# positional_encoding + embeddings
class PositionalEmbedding(tf.keras.layers.Layer):
  """Token embedding scaled by sqrt(d_model) plus sinusoidal positional encoding.

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: embedding width.
    max_length: number of positions in the precomputed encoding table. Inputs
      longer than this cannot be encoded. Defaults to 38, the sequence length
      used for training in this notebook.
  """
  def __init__(self, vocab_size, d_model, max_length=38):
    super().__init__()
    self.d_model = d_model
    self.max_length = max_length
    # mask_zero=True makes id 0 (the padding value) produce a mask.
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    # The table holds no trainable weights, so its length can change without
    # affecting saved checkpoints.
    self.pos_encoding = positional_encoding(length=max_length, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    """Expose the embedding layer's padding mask as this layer's mask."""
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    """Embed token ids `x` and add the encoding for the first `seq_len` positions."""
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positional_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [None]:
class BaseAttention(tf.keras.layers.Layer):
  """Shared building blocks for the attention sub-layers.

  Holds a MultiHeadAttention layer (configured from `**kwargs`), a
  LayerNormalization and an Add layer; subclasses supply `call`.
  """
  def __init__(self, **kwargs):
    super().__init__()
    # All keyword arguments are forwarded unchanged to MultiHeadAttention.
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class CrossAttention(BaseAttention):
  """Attention from the decoder sequence `x` over the encoder output `context`."""
  def call(self, x, context):
    # Queries come from x; keys and values both come from the context.
    attended = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=False)

    # Residual connection followed by layer normalization.
    return self.layernorm(self.add([x, attended]))

In [None]:
class GlobalSelfAttention(BaseAttention):
  """Self-attention without a causal mask: query, key and value are all `x`."""
  def call(self, x):
    attended = self.mha(
        query=x,
        value=x,
        key=x)
    # Residual connection followed by layer normalization.
    return self.layernorm(self.add([x, attended]))

In [None]:
class CausalSelfAttention(BaseAttention):
  """Self-attention over `x` with `use_causal_mask=True`."""
  def call(self, x):
    attended = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    # Residual connection followed by layer normalization.
    return self.layernorm(self.add([x, attended]))

In [None]:
class FeedForward(tf.keras.layers.Layer):
  """Position-wise feed-forward sub-layer with residual connection and layer norm.

  Args:
    d_model: output width of the second Dense layer.
    dff: width of the hidden (ReLU) Dense layer.
    dropout_rate: rate of the Dropout applied after the second Dense layer.
  """
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    # Sub-layers are created in the same order as before so weight tracking
    # is unchanged.
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    transformed = self.seq(x)
    return self.layer_norm(self.add([x, transformed]))

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: global self-attention followed by a feed-forward sub-layer.

  Args:
    d_model: model width; also passed as `key_dim` to the attention layer.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward sub-layer.
    dropout_rate: dropout rate used by both the attention and the feed-forward
      sub-layers.
  """
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward the configured rate; previously FeedForward silently fell back to
    # its own default regardless of what the caller asked for.
    self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Encoder stack: positional embedding, dropout, then `num_layers` EncoderLayers.

  `call` maps token ids to a tensor of shape `(batch_size, seq_len, d_model)`.
  """
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    hidden = self.dropout(self.pos_embedding(x))

    # Apply the encoder layers in the order they were created.
    for layer_index in range(self.num_layers):
      hidden = self.enc_layers[layer_index](hidden)

    return hidden


In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: causal self-attention, cross-attention, feed-forward.

  Args:
    d_model: model width; also passed as `key_dim` to both attention layers.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward sub-layer.
    dropout_rate: dropout rate used by the attention and feed-forward sub-layers.
  """
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super().__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward the configured rate; previously FeedForward silently fell back to
    # its own default regardless of what the caller asked for.
    self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [None]:
class Decoder(tf.keras.layers.Layer):
  """Decoder stack: positional embedding, dropout, then `num_layers` DecoderLayers.

  `call` takes the target token ids `x` and the encoder output `context`.
  """
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]


  def call(self, x, context):
    hidden = self.dropout(self.pos_embedding(x))

    # Every decoder layer attends to the same encoder output.
    for layer_index in range(self.num_layers):
      hidden = self.dec_layers[layer_index](hidden, context)
    return hidden

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder Transformer producing per-position logits over the target vocabulary.

  `call` expects a dict with keys 'english' (source ids) and 'nepali'
  (decoder-input ids) and returns logits of shape
  `(batch_size, target_len, target_vocab_size)`.
  """
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    source_ids = inputs['english']
    target_ids = inputs['nepali']

    encoded_source = self.encoder(source_ids)
    decoded = self.decoder(target_ids, encoded_source)

    # Project every decoder position onto the target vocabulary.
    return self.final_layer(decoded)


In [None]:
# Model hyperparameters passed to the Transformer constructor below.
num_layers = 4       # encoder layers and decoder layers (same count for both)
d_model = 256        # embedding / model width
dff = 1024           # hidden width of each feed-forward sub-layer
num_heads = 8        # attention heads per attention layer
dropout_rate = 0.1

In [None]:
# Instantiate the model. Both vocabulary sizes are 50000, the same value given
# as `vocab_size` to the BPE trainer used for the two tokenizers.
# NOTE(review): a later cell rebinds `transformer` to a model loaded from the
# checkpoint, so this fresh instance is only used when training from scratch.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=50000,
    target_vocab_size=50000,
    dropout_rate=dropout_rate)

In [None]:
def masked_loss(label, pred):
  """Sparse categorical cross-entropy averaged over non-padding positions.

  Args:
    label: integer target ids; positions equal to 0 are ignored.
    pred: logits with one score per vocabulary entry at every position.

  Returns:
    Scalar: the summed loss of the non-zero-label positions divided by their count.
  """
  is_real_token = label != 0
  per_token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')(label, pred)

  weights = tf.cast(is_real_token, dtype=per_token_loss.dtype)
  return tf.reduce_sum(per_token_loss * weights)/tf.reduce_sum(weights)


def masked_accuracy(label, pred):
  """Fraction of non-padding positions whose argmax prediction equals the label.

  Args:
    label: integer target ids; positions equal to 0 are ignored.
    pred: logits; the predicted id is the argmax over axis 2.

  Returns:
    Scalar float32: correct non-padding positions divided by non-padding positions.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  label = tf.cast(label, predicted_ids.dtype)

  not_padding = label != 0
  correct = (label == predicted_ids) & not_padding

  correct_count = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
  token_count = tf.reduce_sum(tf.cast(not_padding, dtype=tf.float32))
  return correct_count/token_count

In [None]:
# Resume from the checkpoint on Drive. The custom class, loss and metric are
# passed as custom_objects so Keras can resolve them by name.
# This rebinds `transformer`, replacing the freshly constructed model above.
custom_objects = {'Transformer':Transformer,'masked_loss':masked_loss,'masked_accuracy':masked_accuracy}
transformer = keras.models.load_model("/content/drive/MyDrive/transformer_model_latest_final.keras",custom_objects = custom_objects)

In [None]:
# NOTE(review): compiling with the string 'rmsprop' creates a new optimizer;
# presumably any optimizer state restored by load_model is discarded when
# resuming -- confirm whether that is intended.
transformer.compile(
    loss=masked_loss,
    optimizer='rmsprop',
    metrics=[masked_accuracy])

from tensorflow import keras
# save_best_only=False: the checkpoint file is overwritten after every epoch.
callbacks = [keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/transformer_model_latest_final.keras',save_best_only = False,verbose = 1)]

# initial_epoch=8 with epochs=10 continues the epoch numbering, so this run
# covers epochs 9 and 10 (see the log below).
transformer.fit(train_ds,
                epochs=10,
                initial_epoch=8,
                validation_data=val_ds,callbacks  = callbacks)

Epoch 9/10
Epoch 9: saving model to /content/drive/MyDrive/transformer_model_latest_final.keras
Epoch 10/10

In [None]:
# Reload the latest checkpoint as `model`, the global used by decode_sequence
# for inference, and print its layer/parameter summary.
custom_objects = {'Transformer':Transformer,'masked_loss':masked_loss,'masked_accuracy':masked_accuracy}
model = keras.models.load_model("/content/drive/MyDrive/transformer_model_latest_final.keras",custom_objects = custom_objects)

model.summary()

Model: "transformer_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_1 (Encoder)         multiple                  23320576  
                                                                 
 decoder_1 (Decoder)         multiple                  31736832  
                                                                 
 dense_33 (Dense)            multiple                  12850000  
                                                                 
Total params: 67907408 (259.05 MB)
Trainable params: 67907408 (259.05 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
import numpy as np
import tensorflow as tf

# Vocabulary of the Nepali tokenizer (not referenced by decode_sequence below).
nep_vocab = target_tokenizer.get_vocab()
# Maximum number of decoding steps; same value as max_seq_len used for training.
max_decoded_sentence_length = 38

def decode_sequence(input_sentence):
    """Greedily translate one English sentence into Nepali.

    Starting from the '[start]' token, the model is called repeatedly; at each
    step the highest-scoring next token is appended, until '[end]' is produced
    or `max_decoded_sentence_length` steps have run.

    Args:
        input_sentence: English text. It is encoded as given, so lower-case it
            first to match the training data.

    Returns:
        The decoded token sequence as a space-joined string, including the
        '[start]' marker and, when generated, the '[end]' marker.
    """
    start_id = target_tokenizer.token_to_id('[start]')
    end_id = target_tokenizer.token_to_id('[end]')

    # Truncate the source to the training length; the model's positional
    # encoding table has no entries beyond it.
    source_ids = source_tokenizer.encode(input_sentence).ids[:max_decoded_sentence_length]
    encoder_input = tf.convert_to_tensor([source_ids], dtype=tf.int32)

    # Work on token ids throughout. Decoding each id to text and re-encoding the
    # growing string is lossy, and decode() drops special tokens by default, so
    # a text comparison against '[end]' can never succeed.
    decoded_ids = [start_id]

    for _ in range(max_decoded_sentence_length):
        inputs = {
            'english': encoder_input,
            'nepali': tf.convert_to_tensor([decoded_ids], dtype=tf.int32)
        }

        predictions = model(inputs, training=False)

        # Logits for the last position only. Shape `(batch_size, vocab_size)`.
        next_token_logits = predictions[:, -1, :]

        # Greedy choice: id of the highest-scoring token.
        sampled_token_index = tf.argmax(next_token_logits, axis=-1).numpy().item()
        decoded_ids.append(sampled_token_index)

        if sampled_token_index == end_id:
            break

    # Keep the special tokens so the result has the '[start] ... [end]' form.
    return target_tokenizer.decode(decoded_ids, skip_special_tokens=False)

In [None]:
# Translate one example. The input is lower-cased because the English training
# sentences were lower-cased before tokenization.
sentence = "although a large number of Indian and American tourists entered the country, the number of tourists from China has not increased as per expectation."
decoded_sentence = decode_sequence(sentence.lower())
print(decoded_sentence)

[start] ठूलो संख्यामा भारतीय र अमेरिकी पर्यटक देश भित्र पस्यो तापनि चीनबाट पर्यटकहरूको सङ्ख्या अपेक्षा अनुसार बढेको छैन ।                    
