In [2]:
#text_file = "eng-ban.txt"
text_file = "ban-eng.txt"
# To open the text file we need to encode the text. Here, we use 'utf8' encoding
with open(text_file, encoding="utf8") as f:
    lines = f.read().split("\n")[:-1]
text_pairs = []
for line in lines:
    bangla, english = line.split(" >>> ")
    bangla = "[start] " + bangla + " [end]"
    text_pairs.append((english, bangla))

In [3]:
print("Total Sentences:", len(text_pairs))
#text_pairs

Total Sentences: 20


In [4]:
import random
for i in range(5):
    print(random.choice(text_pairs));

('Bangladesh is a south-asian country.', '[start] বাংলাদেশ দক্ষিন এশিয়ার একটি দেশ. [end]')
('I know Bangla.', '[start] আমি বাংলা জানি. [end]')
('No.', '[start] না. [end]')
('Are you mad!', '[start] আপনি কি পাগল! [end]')
('Bangla is my mother tounge.', '[start] বাংলা আমার মাতৃভাষা. [end]')


In [5]:
import random
random.shuffle(text_pairs)
total_data_length = len(text_pairs)
num_val_samples = int(0.20 * total_data_length)

In [6]:
print(total_data_length)
print(num_val_samples)

20
4


In [7]:
num_train_samples = (total_data_length - 2 * num_val_samples)
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [8]:
print(f"Number of Train Pairs: {len(train_pairs)}")
print(f"Number of Val Pairs: {len(val_pairs)}")
print(f"Number of Test Pairs: {len(test_pairs)}")

Number of Train Pairs: 12
Number of Val Pairs: 4
Number of Test Pairs: 4


In [31]:
import tensorflow as tf
import string
import re

strip_chars = string.punctuation + ""
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

In [21]:
f"[{re.escape(strip_chars)}]"

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\\\\\^_`\\{\\|\\}\\~]'

In [32]:
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
from tensorflow.keras import layers

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return(tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", ""
    ))

vocab_size = 10000
sequence_length = 20
source_vectorization = layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length
)
target_vectorization = layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = "int",
    output_sequence_length = sequence_length + 1,
    standardize = custom_standardization
)

In [33]:
train_english_texts = [pair[0] for pair in train_pairs]
train_bangla_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_bangla_texts)

In [34]:
batch_size = 64

def format_dataset(eng, ban):
    eng = source_vectorization(eng)
    ban = source_vectorization(ban)
    return({"english": eng, "bangla": ban[:, :-1]}, ban[:, 1:])

def make_dataset(pairs):
    eng_texts, ban_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    ban_texts = list(ban_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, ban_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls = 4)
    return(dataset.shuffle(2048).prefetch(16).cache())

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [22]:
#print(list(train_ds.as_numpy_iterator()));

In [35]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['bangla'].shape: {inputs['bangla'].shape}")
    print(f"target.shape: {targets.shape}")

inputs['english'].shape: (12, 20)
inputs['bangla'].shape: (12, 19)
target.shape: (12, 19)


In [36]:
from tensorflow import keras

embed_dim = 256
latent_dim = 1024

In [37]:
source = keras.Input(shape=(None,), dtype="int64", name="english")
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
encoded_source = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")(x)

In [38]:
past_target = keras.Input(shape=(None,), dtype="int64", name="bangla")
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)
x = layers.Dropout(0.5)(x)
target_next_step = layers.Dense(vocab_size, activation="softmax")(x)
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

In [43]:
#seq2seq_rnn.summary()

In [39]:
seq2seq_rnn.compile(
    optimizer = "rmsprop",
    loss = "sparse_categorical_crossentropy",
    metrics = ["accuracy"]
)
seq2seq_rnn.fit(train_ds, epochs=100, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a822938610>

In [40]:
import numpy as np
ban_vocab = target_vectorization.get_vocabulary()
ban_index_lookup = dict(zip(range(len(ban_vocab)), ban_vocab))
max_decoded_sentence_length = 20

In [53]:
print(ban_index_lookup)

{0: '', 1: '[UNK]', 2: '[start]', 3: '[end]', 4: 'বাংলা', 5: 'কি', 6: 'পাগল', 7: 'না', 8: 'জানি', 9: 'আমি', 10: 'আপনি', 11: 'হ্যাঁ', 12: 'হেরেছি', 13: 'হায়', 14: 'মানুষ', 15: 'মাতৃভাষা', 16: 'বাংলায়', 17: 'বাংলাদেশে', 18: 'বলে', 19: 'ফেব্রুয়ারি', 20: 'প্রায়', 21: 'নথি', 22: 'দুঃখিত', 23: 'দিবস', 24: 'থেকে', 25: 'তুমি', 26: 'গিয়েছেন', 27: 'খেলায়', 28: 'কোটি', 29: 'কথা', 30: 'কখনো', 31: 'এটা', 32: 'একুশে', 33: 'এক', 34: 'ইংরেজি', 35: 'আমরা', 36: 'আন্তর্জাতিক', 37: 'অনুবাদের'}


In [41]:
# English texts
train_eng_texts = [pair[0] for pair in train_pairs]
print(f"Train Eng Text: {len(train_eng_texts)}")

test_eng_texts = [pair[0] for pair in test_pairs]
print(f"Test Eng Text: {len(test_eng_texts)}")

val_eng_texts = [pair[0] for pair in val_pairs]
print(f"Val Eng Text: {len(val_eng_texts)}")

# Bangla texts
train_ban_texts = [pair[1] for pair in train_pairs]
print(f"\nTrain Ban Text: {len(train_ban_texts)}")

test_ban_texts = [pair[1] for pair in test_pairs]
print(f"Test Ban Text: {len(test_ban_texts)}")

val_ban_texts = [pair[1] for pair in val_pairs]
print(f"Val Ban Text: {len(val_ban_texts)}")

Train Eng Text: 12
Test Eng Text: 4
Val Eng Text: 4

Train Ban Text: 12
Test Ban Text: 4
Val Ban Text: 4


In [42]:
input_sentence = random.choice(test_eng_texts)
print(input_sentence)
tokenized_input_sentence = source_vectorization([input_sentence])
#print(tokenized_input_sentence)

I do not know him.


In [43]:
'''# Test purpose
input_sentence1 = random.choice(test_pairs)
print(input_sentence1[0])
print(input_sentence1[1])
tokenized_input_sentence10 = source_vectorization([input_sentence1[0]])
print(tokenized_input_sentence0)
tokenized_input_sentence11 = source_vectorization([input_sentence1[1]])
print(tokenized_input_sentence1)'''

'# Test purpose\ninput_sentence1 = random.choice(test_pairs)\nprint(input_sentence1[0])\nprint(input_sentence1[1])\ntokenized_input_sentence10 = source_vectorization([input_sentence1[0]])\nprint(tokenized_input_sentence0)\ntokenized_input_sentence11 = source_vectorization([input_sentence1[1]])\nprint(tokenized_input_sentence1)'

In [58]:
decoded_sentence = "[start]"    # The decoded sentence (bangla) that begins with "start"
tokenized_target_sentence = target_vectorization([decoded_sentence])    # It will vectorize the decoded sentence
print(tokenized_target_sentence)
next_token_predictions = seq2seq_rnn.predict([tokenized_input_sentence, tokenized_target_sentence]) # We use seqRNN -
# to predict the next word. It calculates the probability of all vocabs, the highest probability holder will be the next word.
print(next_token_predictions[0, 0, :].shape)
print(next_token_predictions[0, 0, :])
sampled_token_index = np.argmax(next_token_predictions[0, 0, :]);   # The index of the highest probability
print(sampled_token_index)
sampled_token = ban_index_lookup[sampled_token_index]   # The sample_token_index will retrive the corresponding word from 
# the dictionary we created earlier as "ban_index_lookup"
print(sampled_token)
decoded_sentence += " " + sampled_token     # Here, we cancatenated the retrived word with the decoded sentence "[start]"
print(decoded_sentence);

print("\nNext word prediction...\n")
# Below, we did the same thing to predict the next word depending on the "start" token and the word predicted earlier.
tokenized_target_sentence = target_vectorization([decoded_sentence])
print(tokenized_target_sentence)
next_token_predictions = seq2seq_rnn.predict([tokenized_input_sentence, tokenized_target_sentence])
print(next_token_predictions[0, 1, :].shape)
print(next_token_predictions[0, 1, :])
sampled_token_index = np.argmax(next_token_predictions[0, 1, :]);
print(sampled_token_index)
sampled_token = ban_index_lookup[sampled_token_index]
print(sampled_token)
decoded_sentence += " " + sampled_token
print(decoded_sentence);

print("\nNext word prediction...\n")
# We can do it again and again ...
tokenized_target_sentence = target_vectorization([decoded_sentence])
print(tokenized_target_sentence)
next_token_predictions = seq2seq_rnn.predict([tokenized_input_sentence, tokenized_target_sentence])
print(next_token_predictions[0, 1, :].shape)
print(next_token_predictions[0, 1, :])
sampled_token_index = np.argmax(next_token_predictions[0, 2, :]);
print(sampled_token_index)
sampled_token = ban_index_lookup[sampled_token_index]
print(sampled_token)
decoded_sentence += " " + sampled_token
print(decoded_sentence);

tf.Tensor([[2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 21), dtype=int64)
(10000,)
[7.4464520e-03 9.9248588e-01 7.2881812e-09 ... 6.1897896e-09 5.3716822e-09
 1.0599247e-08]
1
[UNK]
[start] [UNK]

Next word prediction...

tf.Tensor([[2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 21), dtype=int64)
(10000,)
[2.9025363e-02 9.7093630e-01 4.2610129e-09 ... 3.4295125e-09 3.2348813e-09
 5.9960077e-09]
1
[UNK]
[start] [UNK] [UNK]

Next word prediction...

tf.Tensor([[2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 21), dtype=int64)
(10000,)
[2.9025363e-02 9.7093630e-01 4.2610129e-09 ... 3.4295125e-09 3.2348813e-09
 5.9960077e-09]
1
[UNK]
[start] [UNK] [UNK] [UNK]


In [59]:
# Lets define a decode_sequence function
def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        next_token_predictions = seq2seq_rnn.predict([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = ban_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]": break
    return(decoded_sentence)

In [61]:
# Lets test the decoder
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(len(test_eng_texts)):
    input_sentence = random.choice(test_eng_texts)
    print("---")
    print(input_sentence)
    print(decode_sequence(input_sentence))

---
About seventy percentage of people are literate.
[start] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
---
Bangladesh is a south-asian country.
[start] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
---
I do not know him.
[start] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
---
I do not know him.
[start] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
