# English-to-Bengali Transformer Translation

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from models.transformer import Transformer
import numpy as np
import string
import re
import pandas as pd
import random

## Dataset Preprocessing & Vectorization

In [None]:
# Characters removed from the Bengali target text before tokenization.
# string.punctuation only covers ASCII, so the Bengali sentence terminator
# (danda, "।") is added explicitly; otherwise it stays attached to the last
# word of a sentence and the same word occupies two vocabulary slots.
# "[" and "]" are kept so the "[start]" / "[end]" markers survive.
strip_chars = string.punctuation + "।"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    """Lowercase the input and delete every character listed in ``strip_chars``.

    Args:
        input_string: String tensor handed in by the ``TextVectorization``
            layer that uses this function as its ``standardize`` callable.

    Returns:
        A string tensor with the characters in ``strip_chars`` removed.
    """
    lowercase = tf.strings.lower(input_string)
    # re.escape keeps regex metacharacters in the set literal inside [...].
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

In [None]:
# Shared settings for the text pipeline.
vocab_size = 9000       # max_tokens for each vectorization layer
sequence_length = 20    # tokens per vectorized source sentence
batch_size = 64

# English side: no `standardize` argument is passed, so the layer's default
# standardization applies.
source_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length,
    output_mode="int",
    max_tokens=vocab_size,
)

# Bengali side: one position longer than the source, and standardized with
# custom_standardization.
target_vectorization = layers.TextVectorization(
    standardize=custom_standardization,
    output_sequence_length=sequence_length + 1,
    output_mode="int",
    max_tokens=vocab_size,
)

## Data loading function

In [None]:
def load_data(path="/content/ben.txt"):
    """Read tab-separated English/Bengali sentence pairs from ``path``.

    Each line is split on tabs; the first field is taken as English and the
    second as Bengali. Lines with fewer than two fields are skipped and any
    further fields are ignored. The Bengali sentence is wrapped in
    ``[start]`` / ``[end]`` markers.

    Exact duplicate pairs are removed (first occurrence kept, file order
    preserved) so the same pair cannot end up in more than one split.

    Args:
        path: Location of the UTF-8 encoded text file. Defaults to the
            original hard-coded location.

    Returns:
        A list of ``(english, "[start] <bengali> [end]")`` tuples.
    """
    text_pairs = []
    with open(path, encoding='utf-8') as f:
        # Iterate the file directly instead of materializing every line first.
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                en, bn = parts[0], parts[1]
                text_pairs.append((en, '[start] '+ bn + ' [end]'))

    # Preview a few rows of the de-duplicated data.
    df = pd.DataFrame(text_pairs, columns=["en", "bn"]).drop_duplicates()
    print(df[4000:6000:400])

    # Return the de-duplicated pairs, keeping the first occurrence of each,
    # which matches the rows kept by drop_duplicates() above.
    return list(dict.fromkeys(text_pairs))

In [None]:
# Load the sentence pairs and carve out train / validation / test splits.
text_pairs = load_data()

# Shuffle with a dedicated, seeded RNG so the split is the same on every
# Restart & Run All; otherwise the test sentences change between sessions.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(text_pairs)

# 15% validation, 15% test, remainder for training.
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]


train_eng = [pair[0] for pair in train_pairs]
train_ben = [pair[1] for pair in train_pairs]

# Build both vocabularies from the training split only.
source_vectorization.adapt(train_eng)
target_vectorization.adapt(train_ben)

                                        en  \
4000            Would you like to come in?   
4400          Tom has never been so happy.   
4800        What time do you start school?   
5200     I've told you so a hundred times.   
5600  Tell me the reason why he was fired.   

                                                     bn  
4000                  [start] তুমি কি ভেতরে আসবে? [end]  
4400              [start] টম এত খুশি কখনো হয় নি। [end]  
4800    [start] কয়টার সময় তোমার স্কুল শুরু হয়? [end]  
5200         [start] আমি তোমাকে এটা একশবার বলেছি। [end]  
5600  [start] কেন তাকে চাকরিচ্যুত করা হয়েছে তার কার...  


In [None]:
def format_dataset(eng, ben):
    """Vectorize a batch of sentence pairs into model inputs and targets.

    The vectorized Bengali batch is sliced twice: all columns but the last
    go into the inputs under "bengla", and all columns but the first become
    the targets, so each input position is paired with the following token.

    Returns:
        A ``(inputs, targets)`` tuple where ``inputs`` is a dict with the
        keys "english" and "bengla".
    """
    source_ids = source_vectorization(eng)
    target_ids = target_vectorization(ben)
    decoder_inputs = target_ids[:, :-1]
    decoder_targets = target_ids[:, 1:]
    features = {"english": source_ids, "bengla": decoder_inputs}
    return (features, decoder_targets)

def make_dataset(pairs):
    """Build a batched, vectorized ``tf.data`` pipeline from sentence pairs.

    Args:
        pairs: Non-empty sequence of ``(english, bengali)`` string tuples.

    Returns:
        A dataset of ``(inputs, targets)`` batches as produced by
        ``format_dataset``.
    """
    eng_texts, ben_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(ben_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # Order matters: cache the vectorized batches first, then shuffle, then
    # prefetch. Caching after shuffle freezes the first epoch's order and
    # replays it on every later epoch. Shuffling here reorders whole batches.
    return dataset.cache().shuffle(2048).prefetch(16)


# Build the training and validation pipelines (in that order).
train_ds, val_ds = map(make_dataset, (train_pairs, val_pairs))

## Compile and Train

In [None]:
# Per-token cross-entropy on raw logits; reduction is left to masked_loss.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def masked_loss(y_true, y_pred):
    """Mean cross-entropy over the positions where ``y_true`` is not 0.

    Positions whose target id is 0 are excluded from both the sum and the
    count. If every position is 0 the result is 0 rather than NaN.

    Args:
        y_true: Integer target token ids.
        y_pred: Predictions, interpreted as logits (``from_logits=True``).

    Returns:
        A scalar tensor: summed loss over kept positions divided by their
        count.
    """
    loss = loss_fn(y_true, y_pred)
    # Cast the mask to the loss dtype so the product never mixes dtypes.
    mask = tf.cast(tf.not_equal(y_true, 0), loss.dtype)
    # divide_no_nan avoids a 0/0 NaN for a batch with no kept positions.
    return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

In [None]:
# Model hyperparameters.
embed_dim = 256
num_heads = 8
num_layers = 1

# The language names double as the input dict keys used by the datasets.
transformer_kwargs = dict(
    num_layers=num_layers,
    embed_size=embed_dim,
    num_heads=num_heads,
    ff_expansion=8,
    input_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    max_seq_length=sequence_length,
    source_language='english',
    target_language='bengla',
)
model = Transformer(**transformer_kwargs)

# NOTE(review): the built-in string loss and metric are used here, not the
# masked_loss defined earlier in this notebook. Whether padded positions are
# excluded from loss/accuracy depends on the Transformer implementation
# (e.g. mask propagation), which is not visible here. TODO confirm.
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.fit(train_ds, validation_data=val_ds, epochs=30)

Epoch 1/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 280ms/step - accuracy: 0.6959 - loss: 3.7077 - val_accuracy: 0.7776 - val_loss: 1.7337
Epoch 2/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.7835 - loss: 1.6110 - val_accuracy: 0.7880 - val_loss: 1.5180
Epoch 3/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 57ms/step - accuracy: 0.7908 - loss: 1.4509 - val_accuracy: 0.7937 - val_loss: 1.4335
Epoch 4/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 55ms/step - accuracy: 0.7996 - loss: 1.3149 - val_accuracy: 0.7956 - val_loss: 1.3970
Epoch 5/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 55ms/step - accuracy: 0.8059 - loss: 1.2032 - val_accuracy: 0.8000 - val_loss: 1.3473
Epoch 6/30
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - accuracy: 0.8167 - loss: 1.0950 - val_accuracy: 0.8102 - val_loss: 1.2959
Epoch 7/30
[1m72/72[0m [32m━━

<keras.src.callbacks.history.History at 0x7cf2c951f150>

In [None]:
import numpy as np
# Index -> token table for mapping predicted ids back to Bengali tokens.
ben_vocab = target_vectorization.get_vocabulary()
ben_index_lookup = dict(enumerate(ben_vocab))
# Upper bound on the number of decoding steps per sentence.
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    """Translate one English sentence by greedy, token-by-token decoding.

    Starting from "[start]", each step re-vectorizes the text produced so
    far, runs the model, and appends the highest-scoring token at the
    current position. Decoding stops at "[end]" or after
    ``max_decoded_sentence_length`` steps.

    Args:
        input_sentence: The English sentence as a plain string.

    Returns:
        The decoded tokens joined by single spaces, with the "[start]" and
        "[end]" markers removed.
    """
    source_ids = source_vectorization([input_sentence])
    pieces = ["[start]"]

    for step in range(max_decoded_sentence_length):
        # Drop the final column, matching how the decoder input is sliced
        # in format_dataset.
        target_ids = target_vectorization([" ".join(pieces)])[:, :-1]
        predictions = model({"english": source_ids, "bengla": target_ids})

        # Greedy choice: highest-scoring vocabulary entry at this position.
        next_index = np.argmax(predictions[0, step, :])
        next_token = ben_index_lookup[next_index]
        pieces.append(next_token)

        if next_token == "[end]":
            break

    markers = ('[start]', '[end]')
    return ' '.join(tok for tok in " ".join(pieces).split() if tok not in markers)


In [None]:
# Spot-check the model on 20 test sentences drawn at random (with replacement).
test_eng_texts = [english for english, _bengali in test_pairs]
for input_sentence in (random.choice(test_eng_texts) for _ in range(20)):
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))


-
How old is Tom?
টমের বয়স কত
-
I like him.
আমার ওকে ভালো লাগে।
-
It looked fresh.
এটাকে দেখে আমেরিকান বলে মনে হচ্ছে।
-
You look very pale.
আপনাকে দেখে খুব ফ্যাকাশে লাগছে।
-
Who are those guys?
ওরা কারা
-
We were here about three years ago.
আমরা এখানে তিন বছর আগে পর্যন্ত আমরা এখানে আছি।
-
Where can I catch the bus to the Tokyo station?
আমি স্ট্যাম্প কোথা থেকে অনেক বছর অস্ট্রেলিয়া থেকে কিনতে পারবে।
-
How could I be a robot? Robots can't dream.
আমি এটাকে কিভাবে যাব
-
Don't come in.
ভেতরে আসবেন না।
-
I don't want you to leave.
আমি আপনাকে কথা বলতে চাই না।
-
We're going.
আমরা যাবো।
-
I suspect Tom isn't in Australia.
আমার সন্দেহ হচ্ছে যে টম অস্ট্রেলিয়ায় নেই।
-
Something has happened to this clock.
এই চিড়িয়াখানাটার বয়স কত
-
Tom was utterly humiliated.
টম পাতিলেবুটাকে চিপরালো।
-
I don't like to drink coffee.
আমি নিউইয়র্কে থাকতে চাই।
-
I've never smoked a cigarette in my life.
আমি আমার ছেলেকে দেখতে চাই।
-
Your hat is somewhere around here.
তোমার এই আশেপাশেই আছে।
-
I know why you're ups