## English to Nepali Translator

### Setup

In [2]:
import pathlib
import random
import tensorflow as tf
import tensorrt
from tensorflow import keras
import keras_nlp
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset as bert_vocab
)

Using TensorFlow backend


In [3]:
# Training hyperparameters.
BATCH_SIZE = 16
EPOCHS = 10
# Fixed token length used when packing encoder/decoder sequences and as the
# maximum position count of the embedding layers.
MAX_SEQUENCE_LENGTH = 40
# Target WordPiece vocabulary sizes (also the embedding / output-layer sizes).
ENG_VOCAB_SIZE = 15000
NEP_VOCAB_SIZE = 15000

# Transformer dimensions: embedding width, feed-forward width, attention heads.
EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8

# Sentinel that lets tf.data choose the parallelism level itself.
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
# Tab-separated English/Nepali sentence-pair file, relative to the working directory.
text_file = pathlib.Path('npi-eng/npi.txt')

In [5]:
# Load the parallel corpus: one sentence pair per line, fields separated by tabs
# (English first, Nepali second; any further fields are ignored).
#
# The encoding is passed explicitly because the Nepali side is Devanagari text:
# relying on the platform default (e.g. cp1252 on Windows) would fail or
# mis-decode the file.
with open(text_file, encoding="utf-8") as f:
    lines = f.read().split("\n")

text_pairs = []
for line in lines:
    # Skip blank lines (including the empty string left by a trailing newline)
    # instead of unconditionally dropping the last line, which would lose a
    # real sentence pair when the file does not end with a newline.
    if not line.strip():
        continue
    eng, nep = line.split("\t")[:2]
    # Lower-case here once; the tokenizers are created with lowercase=False.
    text_pairs.append((eng.lower(), nep.lower()))

Below, I've printed some example sentence pairs.

In [6]:
# Show five randomly drawn (English, Nepali) pairs as a sanity check.
NUM_EXAMPLES_SHOWN = 5
for _example_idx in range(NUM_EXAMPLES_SHOWN):
    example_pair = random.choice(text_pairs)
    print(example_pair)

("tom called his dad to wish him a happy father's day.", 'टमले आफ्नो बुबालाई बुबाको दिनको शुभकामना दिन फोन गरे।')
('tom thinks highly of himself.', 'टमले आफूलाई धेरै ठान्छ।')
('tom is at school right now.', 'टम अहिले स्कूलमा छ।')
('he is foolish.', 'ऊ मूर्ख छ।')
('tom and mary are having a little financial trouble.', 'टम र मेरी अलि आर्थिक समस्यामा छन्।')


Now we can split the sentence pairs into training (70%), validation (15%), and test (15%) sets. Notice that this dataset is quite small. This is one of the main challenges of building models for global minority languages (often called low-resource languages): there is substantially less parallel data available than for widely spoken languages such as Spanish or French.

In [7]:
# Split the corpus into train / validation / test sets.
#
# The shuffle uses a dedicated, seeded generator and works on a copy, so that
#   * a fresh "Restart & Run All" always produces the same split (and therefore
#     the same vocabularies and comparable metrics), and
#   * re-running this cell does not reshuffle an already shuffled list.
SPLIT_SEED = 42
VAL_FRACTION = 0.15  # the test set gets the same share

shuffled_pairs = list(text_pairs)
random.Random(SPLIT_SEED).shuffle(shuffled_pairs)

num_val_samples = int(VAL_FRACTION * len(shuffled_pairs))
num_train_samples = len(shuffled_pairs) - 2 * num_val_samples
train_pairs = shuffled_pairs[:num_train_samples]
val_pairs = shuffled_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = shuffled_pairs[num_train_samples + num_val_samples:]

# The three subsets must partition the corpus exactly.
assert len(train_pairs) + len(val_pairs) + len(test_pairs) == len(text_pairs)

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

1574 total pairs
1102 training pairs
236 validation pairs
236 test pairs


### Tokenizer for Translator

In [8]:
def train_word_piece(text_samples, vocab_size, reserved_tokens):
    """Learn a WordPiece vocabulary from a collection of sentences.

    Args:
        text_samples: Sentences to learn the vocabulary from.
        vocab_size: Value forwarded as `vocabulary_size` to the vocabulary
            builder.
        reserved_tokens: Special tokens forwarded as `reserved_tokens` to the
            vocabulary builder.

    Returns:
        Whatever `keras_nlp.tokenizers.compute_word_piece_vocabulary` returns
        for the batched samples (the learned vocabulary).
    """
    batched_samples = (
        tf.data.Dataset.from_tensor_slices(text_samples).batch(1000).prefetch(2)
    )
    return keras_nlp.tokenizers.compute_word_piece_vocabulary(
        batched_samples,
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )

In [9]:
# Special tokens shared by both vocabularies; "[PAD]" is listed first.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Vocabularies are learned from the training split only.
eng_samples = [eng_sentence for eng_sentence, _ in train_pairs]
nep_samples = [nep_sentence for _, nep_sentence in train_pairs]

eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens)
nep_vocab = train_word_piece(nep_samples, NEP_VOCAB_SIZE, reserved_tokens)


2023-08-08 21:14:06.076895: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-08 21:14:07.663219: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-08 21:14:07.663538: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [10]:
# Peek at ten entries from the middle of each learned vocabulary.
vocab_preview = slice(100, 110)
for label, vocab in (("English Tokens: ", eng_vocab), ("Nepali Tokens: ", nep_vocab)):
    print(label, vocab[vocab_preview])

English Tokens:  ['didn', 'll', '##ad', '##ome', '##day', '##le', '##o', '##p', 'about', 'and']
Nepali Tokens:  ['##ु', '##ेको', 'थाहा', '##ल', '##हरू', 'छु', '##ँ', '##स', 'हुँ', '##सँग']


In [11]:
# One WordPiece tokenizer per language, built from the learned vocabularies.
# lowercase=False: the sentences were already lower-cased when the corpus was loaded.
eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=eng_vocab, lowercase=False
)

nep_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=nep_vocab, lowercase=False
)

### Format the Text Datasets

In [12]:
def preprocess_batch(eng, nep):
    """Tokenize and pack one batch of sentence pairs for teacher-forced training.

    Args:
        eng: Batch of English sentence strings.
        nep: Batch of Nepali sentence strings.

    Returns:
        A `(features, labels)` tuple. `features` is a dict with
        `"encoder_inputs"` (English token ids packed to `MAX_SEQUENCE_LENGTH`)
        and `"decoder_inputs"` (the packed Nepali sequence without its last
        position). `labels` is the same packed Nepali sequence without its
        first position, i.e. the decoder inputs shifted left by one token.
    """
    eng = eng_tokenizer(eng)
    nep = nep_tokenizer(nep)

    # English side: no start/end markers, only padding up to the fixed length.
    eng_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=eng_tokenizer.token_to_id("[PAD]"),
    )
    eng = eng_start_end_packer(eng)

    # Nepali side: wrap in [START] ... [END] and pad. One extra position is
    # packed so that both one-token-shifted slices below have exactly
    # MAX_SEQUENCE_LENGTH positions.
    nep_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=nep_tokenizer.token_to_id("[START]"),
        end_value=nep_tokenizer.token_to_id("[END]"),
        pad_value=nep_tokenizer.token_to_id("[PAD]"),
    )
    nep = nep_start_end_packer(nep)

    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": nep[:, :-1],
        },
        nep[:, 1:],
    )

def make_dataset(pairs):
    """Build a batched, tokenized `tf.data.Dataset` from sentence pairs.

    Args:
        pairs: Sequence of `(english, nepali)` string tuples.

    Returns:
        A dataset yielding the `(features, labels)` batches produced by
        `preprocess_batch`, cached after tokenization, reshuffled on every
        iteration and prefetched.
    """
    eng_texts, nep_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(nep_texts)))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=AUTOTUNE)
    # Order matters here:
    #   cache    - store the tokenized batches so tokenization runs only once;
    #   shuffle  - placed AFTER the cache so every epoch sees a new batch order
    #              (caching a shuffled dataset would replay the first epoch's
    #              order forever);
    #   prefetch - last, so input preparation overlaps with training.
    # Note: shuffling happens at batch granularity because it follows batch().
    return dataset.cache().shuffle(2048).prefetch(AUTOTUNE)

# Batched, tokenized datasets for the training and validation splits.
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

### Building the Translator Model

In [13]:
# --- Encoder: English token ids -> contextual representations ---------------
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

source_embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=ENG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
source_embeddings = source_embedding_layer(encoder_inputs)

encoder_layer = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM,
    num_heads=NUM_HEADS,
)
encoder_outputs = encoder_layer(inputs=source_embeddings)
encoder = keras.Model(encoder_inputs, encoder_outputs)

# --- Decoder: Nepali token ids + encoder output -> next-token distribution --
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(
    shape=(None, EMBED_DIM), name="decoder_state_inputs"
)

target_embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=NEP_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
target_embeddings = target_embedding_layer(decoder_inputs)

decoder_layer = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM,
    num_heads=NUM_HEADS,
)
decoded = decoder_layer(
    decoder_sequence=target_embeddings,
    encoder_sequence=encoded_seq_inputs,
)
decoded = keras.layers.Dropout(0.5)(decoded)
# Softmax head: the model emits probabilities over the Nepali vocabulary.
decoder_outputs = keras.layers.Dense(NEP_VOCAB_SIZE, activation="softmax")(decoded)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

# --- End-to-end model: feed the encoder output into the decoder -------------
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

In [14]:
# Print the layer-by-layer summary of the combined encoder-decoder model.
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 token_and_position_embeddi  (None, None, 256)            3850240   ['encoder_inputs[0][0]']      
 ng (TokenAndPositionEmbedd                                                                       
 ing)                                                                                             
                                                                                                  
 decoder_inputs (InputLayer  [(None, None)]               0         []                  

In [15]:
# The output layer already applies softmax and the labels are integer token
# ids, hence the sparse categorical cross-entropy loss.
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [16]:
# Train for EPOCHS epochs, using the constant from the configuration cell
# instead of a second hard-coded literal that could drift out of sync with it.
transformer.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
)

Epoch 1/10


2023-08-08 21:14:41.059528: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-08 21:14:42.006355: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x94b0690 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-08-08 21:14:42.006420: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4070, Compute Capability 8.9
2023-08-08 21:14:42.449517: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-08-08 21:14:43.171079: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8902
2023-08-08 21:14:43.999180: W tensorflow/compiler/xla/stream_executor/gpu/asm_compiler.cc:231] Falling back to the CUDA driver for PTX compilation; ptxas does not support 

 8/69 [==>...........................] - ETA: 16s - loss: 8.6083 - accuracy: 0.0838

KeyboardInterrupt: 

In [57]:
# Persist the model to 'translator.keras' in the current working directory.
transformer.save('translator.keras')

In [62]:
def decode_sequences(input_sentences):
    """Greedily translate a batch of English sentences into Nepali.

    Args:
        input_sentences: String tensor of English sentences, one per batch
            entry.

    Returns:
        The detokenized output of the sampler, one entry per input sentence.
        Special tokens such as "[START]", "[END]" and "[PAD]" are not removed
        here; callers strip them afterwards.
    """
    batch_size = tf.shape(input_sentences)[0]

    # Densify the ragged token ids to the fixed encoder length. The default
    # fill value 0 is the id of "[PAD]", the first reserved token.
    encoder_input_tokens = eng_tokenizer(input_sentences).to_tensor(
        shape=(None, MAX_SEQUENCE_LENGTH)
    )

    # Named `next_token` rather than `next` to avoid shadowing the builtin.
    def next_token(prompt, cache, index):
        # Scores for the position being generated come from the previous
        # position's output. The model ends in a softmax, so these are
        # probabilities rather than raw logits; greedy selection picks the
        # same token either way.
        scores = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
        hidden_states = None  # not needed by the greedy sampler
        return scores, hidden_states, cache

    # Prompt: "[START]" followed by padding up to the decoder's fixed length
    # (tied to MAX_SEQUENCE_LENGTH instead of a separate literal 40).
    start = tf.fill((batch_size, 1), nep_tokenizer.token_to_id("[START]"))
    pad = tf.fill(
        (batch_size, MAX_SEQUENCE_LENGTH - 1),
        nep_tokenizer.token_to_id("[PAD]"),
    )
    prompt = tf.concat((start, pad), axis=-1)

    generated_tokens = keras_nlp.samplers.GreedySampler()(
        next_token,
        prompt,
        end_token_id=nep_tokenizer.token_to_id("[END]"),
        index=1,  # position 0 already holds "[START]"
    )
    generated_sentences = nep_tokenizer.detokenize(generated_tokens)
    return generated_sentences

# Translate a few random held-out English sentences and print the results.
test_eng_texts = [eng_sentence for eng_sentence, _ in test_pairs]
for i in range(5):
    input_sentence = random.choice(test_eng_texts)

    decoded_batch = decode_sequences(tf.constant([input_sentence]))
    translated = decoded_batch.numpy()[0].decode("utf-8")

    # Remove the special tokens the sampler leaves in the detokenized text.
    for special_token in ("[PAD]", "[START]", "[END]"):
        translated = translated.replace(special_token, "")
    translated = translated.strip()

    print(f"** Example {i} **")
    print(input_sentence)
    print(translated)
    print()

** Example 0 **
i'll call you right back.
म स्क्वाइनँ ।

** Example 1 **
the last time i saw tom was in october.
स्बाम्रेलनुबा बुराहनन् ।

** Example 2 **
who are you calling for?
यो स्वाताइनुभर ?

** Example 3 **
i think i'd like to do that.
म मानेर्कनुँ ।

** Example 4 **
i got used to wearing a mask.
म स्क्वासनुर्कनुरको छ ।

