<a href="https://colab.research.google.com/github/baixabhi/Machine-transliteration-English-to-Bengali-/blob/main/machine_transliteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
!pip install fasttext



In [None]:
import json
from tqdm import tqdm
import pandas as pd
tqdm.pandas()
import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding,Bidirectional,Concatenate
from keras.optimizers import *

from tensorflow.keras.utils import to_categorical

import pickle
from datasets import load_metric

from nltk.translate.bleu_score import corpus_bleu
import fasttext
import tqdm as notebook_tqdm

In [None]:
# Read the train and test splits as JSON Lines (one JSON object per line).
# NOTE(review): train_data / test_data are not referenced again in the visible
# notebook; the frames used downstream are df_train / df_test, loaded from the
# same two files through correct_json_format().
train_data = pd.read_json("/kaggle/input/ben-machine/ben_train.json", lines=True)
test_data=pd.read_json("/kaggle/input/ben-machine/ben_test.json", lines=True)

In [None]:
# Converts a file of concatenated JSON objects into a Pandas DataFrame
def correct_json_format(json_file_path):
    """Load a file holding a sequence of JSON objects into a flat DataFrame.

    The file is expected to contain JSON objects written one after another
    (for example one object per line). Objects are decoded one at a time, so
    any amount of whitespace between them (blank lines, trailing spaces,
    indentation) is tolerated, as are separating commas.

    Args:
        json_file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A DataFrame built with ``pd.json_normalize`` from the decoded objects
        (nested keys become dotted column names). An empty file yields an
        empty DataFrame.

    Raises:
        json.JSONDecodeError: If something that is not valid JSON is found
            where an object should start.
    """
    with open(json_file_path, 'r', encoding='utf-8') as file:
        json_data = file.read()

    decoder = json.JSONDecoder()
    records = []
    position, end = 0, len(json_data)
    while position < end:
        # raw_decode() does not skip leading whitespace itself, so step over
        # whitespace and separating commas before each object.
        if json_data[position].isspace() or json_data[position] == ',':
            position += 1
            continue
        record, position = decoder.raw_decode(json_data, position)
        records.append(record)

    return pd.json_normalize(records)


In [None]:
df_train = correct_json_format("/kaggle/input/ben-machine/ben_train.json")
df_test = correct_json_format("/kaggle/input/ben-machine/ben_test.json")


In [None]:
df_train


Unnamed: 0,unique_identifier,native word,english word,source,score
0,ben1,ভীষ্ম,vismo,Dakshina,
1,ben2,চরিত্র,choritra,Dakshina,
2,ben3,অটোরিক্সা,autorickshawa,Dakshina,
3,ben4,জ্যোতির্বিদ্যা,jotirvidya,Dakshina,
4,ben5,ভৌগলিকভাবে,bhougleekbhabe,Dakshina,
...,...,...,...,...,...
1231423,ben1231424,আদমগড়,adamgarh,IndicCorp,-0.070839
1231424,ben1231425,মনাভাই,manabhai,IndicCorp,-0.253134
1231425,ben1231426,সোনালীর,sonalir,IndicCorp,-0.212031
1231426,ben1231427,প্রসেক্কো,prosekko,IndicCorp,-0.194534


In [None]:
df_test

Unnamed: 0,unique_identifier,native word,english word,source
0,ben1,প্রতিসংখ্যায়,pratisangkhyaay,AK-Freq
1,ben2,অতিষ্ঠ,atishth,AK-Freq
2,ben3,অকল্পনীয়,akalpaneey,AK-Freq
3,ben4,অ্যান্টের,anter,AK-Freq
4,ben5,ছাইখুল,chhaikhul,AK-Freq
...,...,...,...,...
14162,ben14163,দেবদেবীদের,debdebeeder,Dakshina
14163,ben14164,যক্ষার,jakkhar,Dakshina
14164,ben14165,গোষ্ঠীগত,gasthigato,Dakshina
14165,ben14166,অক্ষরগুলো,akkhargulo,Dakshina


In [None]:
# Work on a 30% random subset of the training frame. The seed is fixed so that
# the sampled rows (and therefore the fitted vocabulary, the padded lengths and
# the trained model) are the same on every Restart & Run All.
subset_df_train=df_train.sample(frac=0.3, random_state=42).reset_index(drop=True)
subset_df_train.shape

(369428, 5)

In [None]:
# From here on df_train refers to the sampled subset, not the full training frame.
df_train=subset_df_train
df_train.shape

(369428, 5)

In [None]:
 #start symbol "\t" at the beginning of the text and an end symbol "\n" at the end of the text(target_text)
def add_start_end(target_text):
  """Return ``target_text`` wrapped in the start marker "\\t" and end marker "\\n"."""
  return "".join(("\t", target_text, "\n"))


In [None]:
# Add a 'target_ben' column to df_train: each 'native word' value passed through
# add_start_end, i.e. with the start symbol "\t" prepended and the end symbol "\n" appended.
df_train['target_ben'] = df_train['native word'].progress_apply(add_start_end)
df_train.head()

100%|██████████| 369428/369428 [00:00<00:00, 556246.21it/s]


Unnamed: 0,unique_identifier,native word,english word,source,score,target_ben
0,ben1187853,মুদিগিরি,mudigiri,IndicCorp,-0.05673,\tমুদিগিরি\n
1,ben593607,অসম্মতিসত্ত্বেও,osommotisotteo,IndicCorp,-0.145597,\tঅসম্মতিসত্ত্বেও\n
2,ben1003218,আর্মান্ডা,armanda,IndicCorp,-0.144504,\tআর্মান্ডা\n
3,ben34892,সর্বোৎকৃষ্ট,sorbotkrishto,Dakshina,,\tসর্বোৎকৃষ্ট\n
4,ben550722,মাজেনি,mageni,IndicCorp,-0.341199,\tমাজেনি\n


In [None]:
# Variant used for the decoder labels: only the end marker is attached.
def add_end(target_text):
  """Return ``target_text`` followed by the end marker "\\n"."""
  return "".join((target_text, "\n"))

In [None]:
 # Add a 'decoder_target' column to df_train: each 'native word' value passed through
 # add_end, i.e. with only the end symbol "\n" appended.
df_train['decoder_target'] = df_train['native word'].progress_apply(add_end)
df_train.head()


100%|██████████| 369428/369428 [00:00<00:00, 591066.09it/s]


Unnamed: 0,unique_identifier,native word,english word,source,score,target_ben,decoder_target
0,ben1187853,মুদিগিরি,mudigiri,IndicCorp,-0.05673,\tমুদিগিরি\n,মুদিগিরি\n
1,ben593607,অসম্মতিসত্ত্বেও,osommotisotteo,IndicCorp,-0.145597,\tঅসম্মতিসত্ত্বেও\n,অসম্মতিসত্ত্বেও\n
2,ben1003218,আর্মান্ডা,armanda,IndicCorp,-0.144504,\tআর্মান্ডা\n,আর্মান্ডা\n
3,ben34892,সর্বোৎকৃষ্ট,sorbotkrishto,Dakshina,,\tসর্বোৎকৃষ্ট\n,সর্বোৎকৃষ্ট\n
4,ben550722,মাজেনি,mageni,IndicCorp,-0.341199,\tমাজেনি\n,মাজেনি\n


In [None]:
def tokenize(lang):
    """Fit a character-level tokenizer on a collection of texts.

    ``filters=''`` disables character filtering and ``char_level=True`` makes
    every character its own token.

    Returns:
        A ``(tokenizer, word_index)`` pair, where ``word_index`` is the fitted
        tokenizer's character -> integer id mapping.
    """
    char_tokenizer = Tokenizer(char_level=True, filters='')
    # Fitting builds the vocabulary from the characters present in `lang`.
    char_tokenizer.fit_on_texts(lang)
    return char_tokenizer, char_tokenizer.word_index

In [None]:
# Tokenize words: fit one character-level tokenizer per script.
# The Bengali tokenizer is fitted on 'target_ben', so its vocabulary also
# contains the "\t" start and "\n" end markers.
roman_words = df_train['english word']
ben_words = df_train['target_ben']

roman_tokenizer, roman_tokens = tokenize(roman_words)
ben_tokenizer, ben_tokens = tokenize(ben_words)


print(roman_tokens)

print("No.of unique input tokens:", len(roman_tokens))


{'a': 1, 'r': 2, 'i': 3, 'o': 4, 'e': 5, 'h': 6, 'n': 7, 't': 8, 's': 9, 'k': 10, 'u': 11, 'd': 12, 'b': 13, 'l': 14, 'm': 15, 'p': 16, 'g': 17, 'c': 18, 'j': 19, 'y': 20, 'v': 21, 'f': 22, 'z': 23, 'w': 24, 'x': 25, 'q': 26}
No.of unique input tokens: 26


In [None]:
ben_words

0                \tমুদিগিরি\n
1         \tঅসম্মতিসত্ত্বেও\n
2               \tআর্মান্ডা\n
3             \tসর্বোৎকৃষ্ট\n
4                  \tমাজেনি\n
                 ...         
369423           \tসরযন্ত্র\n
369424         \tসুন্দরবনির\n
369425             \tকারাবখ\n
369426       \tধোনিপ্রেমীরা\n
369427            \tস্যাডার\n
Name: target_ben, Length: 369428, dtype: object

In [None]:
roman_words[15]

'nirbahokolpe'

In [None]:
print(ben_tokens )
print("No.of unique output tokens:", len(ben_tokens))


{'\t': 1, '\n': 2, 'া': 3, 'র': 4, '্': 5, 'ি': 6, 'ে': 7, 'ন': 8, 'ক': 9, 'স': 10, 'ব': 11, 'ল': 12, 'ত': 13, 'ম': 14, 'ু': 15, 'প': 16, 'দ': 17, 'য': 18, 'ট': 19, 'গ': 20, 'ো': 21, 'জ': 22, 'হ': 23, 'ী': 24, 'শ': 25, 'ই': 26, 'ভ': 27, 'চ': 28, 'ড': 29, '়': 30, 'ও': 31, 'ধ': 32, 'ষ': 33, 'ফ': 34, 'ণ': 35, 'অ': 36, 'আ': 37, 'থ': 38, 'খ': 39, 'ং': 40, 'উ': 41, 'এ': 42, 'ছ': 43, 'ূ': 44, 'ৃ': 45, 'ঙ': 46, 'ঞ': 47, 'ঘ': 48, 'ঠ': 49, 'ৌ': 50, 'ঁ': 51, 'ৈ': 52, 'ঝ': 53, 'ৎ': 54, 'ঃ': 55, 'ঢ': 56, 'ঐ': 57, 'ঋ': 58, 'ঈ': 59, 'ঔ': 60, 'ঊ': 61, '।': 62, '২': 63}
No.of unique output tokens: 63


In [None]:
# vocab size (no. of unique characters for each script)
# The +1 leaves room for id 0: the token ids printed above start at 1, and 0 is
# the value the padded sequences are filled with (and masked by the embeddings).

roman_vocab = len(roman_tokens)+1
ben_vocab = len(ben_tokens)+1
roman_vocab,ben_vocab

(27, 64)

In [None]:
roman_vocab

27

In [None]:
roman_words[45],ben_words[45]

('muthukumaran', '\tমুথুকুমারন\n')

In [None]:
# convert words to int sequence
# Each word becomes a list of character ids; the Bengali sequences are built
# from 'target_ben', so they start with the "\t" id and end with the "\n" id.

roman_words_in_ids = roman_tokenizer.texts_to_sequences(roman_words)
ben_words_in_ids = ben_tokenizer.texts_to_sequences(ben_words)

In [None]:
roman_words_in_ids[45],ben_words_in_ids[45]

([15, 11, 8, 6, 11, 10, 11, 15, 1, 2, 1, 7],
 [1, 14, 15, 38, 15, 9, 15, 14, 3, 4, 8, 2])

In [None]:
# Preview only the first few sequences instead of dumping the whole list into the cell output.
roman_words_in_ids[:5]

[[15, 11, 12, 3, 17, 3, 2, 3],
 [4, 9, 4, 15, 15, 4, 8, 3, 9, 4, 8, 8, 5, 4],
 [1, 2, 15, 1, 7, 12, 1],
 [9, 4, 2, 13, 4, 8, 10, 2, 3, 9, 6, 8, 4],
 [15, 1, 17, 5, 7, 3],
 [10, 5, 8, 5, 2],
 [12, 4, 8, 8, 4, 7, 18, 6, 4],
 [1, 7, 8, 4, 2, 19, 1, 8, 3, 10, 15, 1, 7, 5, 2],
 [13, 6, 3, 8, 4, 2, 17, 11, 14, 4, 8, 5],
 [6, 3, 9, 1, 2, 1, 10],
 [10, 4, 13, 3, 8, 1, 9, 4, 6, 4],
 [13, 11, 12, 12, 6, 1, 9, 20, 1],
 [11, 3, 2, 1, 10, 11, 7],
 [15, 4, 6, 1, 13, 3, 9, 24, 4, 10, 5],
 [12, 6, 3, 14, 3, 10, 1],
 [7, 3, 2, 13, 1, 6, 4, 10, 4, 14, 16, 5],
 [13, 11, 2, 11, 7, 19, 3],
 [1, 6, 4, 15, 5, 12, 5, 2, 5],
 [4, 21, 3, 13, 1, 13, 4, 10, 12, 5, 2, 4],
 [16, 1, 12, 1, 15, 9, 5, 5],
 [6, 1, 21, 5, 21, 1, 13, 5],
 [19, 1, 19, 17, 1, 7, 5, 2],
 [13, 20, 1, 8, 3, 10, 2, 1, 15, 3, 10, 5],
 [10, 1, 15, 2, 1, 15, 5],
 [10, 6, 20, 1, 8, 18, 6, 3, 14, 4],
 [13, 20, 1, 10, 8, 3, 7, 3, 2, 13, 6, 1, 2, 1, 8, 1],
 [7, 4, 8, 6, 3, 10, 5, 3],
 [7, 3, 2, 19, 1, 9, 17, 11, 14, 4],
 [21, 5, 7, 8, 11, 2],
 [4, 2,

In [None]:
roman_words[5]

'keter'

In [None]:
ben_words_in_ids[5]


[1, 9, 7, 13, 7, 4, 2]

In [None]:
roman_words_in_ids[5]

[10, 5, 8, 5, 2]

In [None]:
label_words = df_train["decoder_target"]
print(label_words[6])
# Convert the label texts (native word + "\n") into integer sequences using the
# Bengali tokenizer fitted above, so labels and decoder inputs share one id space.
y_train = ben_tokenizer.texts_to_sequences(label_words)
# Preview a few rows only; the full list has one entry per training word.
y_train[:5]

দত্তঞ্চ



[[14, 15, 17, 6, 21, 6, 4, 6, 2],
 [36, 10, 14, 5, 14, 13, 6, 10, 13, 5, 13, 5, 11, 7, 31, 2],
 [37, 4, 5, 14, 3, 8, 5, 29, 3, 2],
 [10, 4, 5, 11, 20, 54, 9, 45, 33, 5, 19, 2],
 [14, 3, 22, 7, 8, 6, 2],
 [9, 7, 13, 7, 4, 2],
 [17, 13, 5, 13, 47, 5, 28, 2],
 [37, 8, 5, 13, 4, 5, 22, 3, 13, 6, 9, 14, 3, 8, 7, 4, 2],
 [27, 6, 13, 4, 21, 15, 12, 20, 13, 7, 2],
 [23, 6, 10, 3, 4, 9, 2],
 [9, 11, 6, 13, 3, 10, 23, 2],
 [11, 15, 17, 5, 32, 10, 5, 18, 2],
 [41, 26, 4, 3, 9, 15, 8, 2],
 [14, 23, 3, 11, 6, 25, 5, 11, 9, 7, 2],
 [32, 6, 12, 6, 9, 3, 2],
 [8, 6, 4, 5, 11, 3, 23, 9, 12, 5, 16, 7, 2],
 [11, 15, 4, 15, 47, 5, 22, 6, 2],
 [37, 23, 14, 7, 17, 7, 4, 7, 2],
 [36, 27, 6, 11, 3, 11, 9, 17, 7, 4, 31, 2],
 [16, 3, 17, 3, 14, 10, 6, 2],
 [23, 3, 27, 7, 27, 3, 11, 7, 2],
 [22, 22, 21, 35, 7, 4, 2],
 [11, 5, 18, 13, 6, 9, 5, 4, 14, 24, 9, 7, 2],
 [9, 3, 14, 4, 3, 14, 7, 2],
 [38, 5, 18, 3, 13, 43, 6, 12, 2],
 [11, 5, 18, 9, 5, 13, 6, 8, 6, 4, 5, 27, 4, 13, 3, 2],
 [8, 40, 6, 9, 7, 26, 2],
 [8, 

In [None]:
y_train[9]

[23, 6, 10, 3, 4, 9, 2]

In [None]:
print(label_words[9])

হিসারক



In [None]:
# max sequence length
# Longest romanized input word and longest Bengali target (counted on
# 'target_ben', i.e. including its "\t" and "\n" markers). These are the
# widths later handed to post_padding.

max_encoder_seq_length_roman = df_train['english word'].str.len().max()
max_decoder_seq_length_ben = df_train['target_ben'].str.len().max()

max_encoder_seq_length_roman, max_decoder_seq_length_ben

(28, 31)

In [None]:
def post_padding(data_in_int_seq, max_seq_len):
  """Pad every integer sequence at its end up to ``max_seq_len`` entries."""
  padded = pad_sequences(data_in_int_seq, maxlen=max_seq_len, padding="post")
  return padded
# padding
# Encoder inputs are padded to the longest romanized word; decoder inputs and
# labels are both padded to the longest Bengali target so they line up step by step.

roman_padded = post_padding(roman_words_in_ids, max_encoder_seq_length_roman)
ben_padded = post_padding(ben_words_in_ids, max_decoder_seq_length_ben)
y_train_padded = post_padding(y_train, max_decoder_seq_length_ben)

# Spot-check one example: encoder input, decoder input, decoder label.
print(roman_padded[16], len(roman_padded[16]))
print(ben_padded[16], len(ben_padded[16]))
print(y_train_padded[16],len(y_train_padded[16]))

[13 11  2 11  7 19  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0] 28
[ 1 11 15  4 15 47  5 22  6  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0] 31
[11 15  4 15 47  5 22  6  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0] 31


In [None]:
# Model hyperparameters: LSTM units per encoder direction, and the embedding
# width used by both the encoder and the decoder embedding layers.
latent_dim = 64
emb_dim = 100

In [None]:
# Define the encoder model: embedding (id 0 masked) followed by a bidirectional LSTM.
encoder_inputs = Input(shape=(max_encoder_seq_length_roman,), name="encoder_input")
emb_encoder = Embedding(roman_vocab, emb_dim, mask_zero=True, name="encoder_embedding")(encoder_inputs)

# return_state=True yields the final hidden/cell state of each direction.
# NOTE(review): encoder_outputs (the merge_mode="mul" result) is not referenced
# again in the visible notebook; only the four states are used.
encoder_outputs, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(latent_dim, return_state=True, return_sequences=False),name="BiLSTM",merge_mode="mul")(emb_encoder)


In [None]:
# concatenate along the last dimension
# Forward and backward states are joined into one hidden state and one cell
# state, each twice as wide as a single direction.
encoder_h = Concatenate(axis=-1,name="concat_h")([forward_h, backward_h])
encoder_c = Concatenate(axis=-1,name="concat_c")([forward_c, backward_c])
encoder_h, encoder_c


(<KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'concat_h')>,
 <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'concat_c')>)

In [None]:
# [hidden, cell] pair used as the decoder LSTM's initial state.
encoder_states = [encoder_h,encoder_c]
encoder_states

[<KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'concat_h')>,
 <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'concat_c')>]

In [None]:
# Decoder: embedding -> a single LSTM -> per-step softmax over the Bengali vocabulary.

decoder_inputs = keras.Input(shape=(None,),name="decoder_input")

decoder_emb = Embedding(ben_vocab, emb_dim, mask_zero=True, name="decoder_embedding")(decoder_inputs)
# The decoder is initialised with the concatenated forward/backward encoder
# states, so its unit count must be twice the per-direction encoder size.
# Deriving it from latent_dim keeps the two in sync if latent_dim is changed.
d_lstm_out,d_h,d_c = LSTM(2 * latent_dim, return_state=True, return_sequences=True, name="decoder_lstm_1")(decoder_emb, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(ben_vocab, activation="softmax")
decoder_outputs = decoder_dense(d_lstm_out)

In [None]:
# Training model: (encoder input, decoder input) -> per-step character distribution.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Sparse loss: the labels are integer character ids, not one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.01, rho=0.9)

model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, 28)]                 0         []                            
                                                                                                  
 encoder_embedding (Embeddi  (None, 28, 100)              2700      ['encoder_input[0][0]']       
 ng)                                                                                              
                                                                                                  
 decoder_input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 BiLSTM (Bidirectional)      [(None, 64),                 84480     ['encoder_embedding[0][0

In [None]:
roman_padded[45]

array([15, 11,  8,  6, 11, 10, 11, 15,  1,  2,  1,  7,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [None]:
roman_words[45]

'muthukumaran'

In [None]:

ben_padded[45]

array([ 1, 14, 15, 38, 15,  9, 15, 14,  3,  4,  8,  2,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [None]:
y_train_padded[53587]

array([26, 46,  5, 21,  6, 13, 21,  4,  5, 27,  2,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [None]:
# Training settings.
batch_size = 64
epochs = 50




# Teacher forcing: the decoder is fed the start-prefixed target (ben_padded)
# and is trained against the same word without the start marker
# (y_train_padded), i.e. the next character at every step.
model.fit(
    [roman_padded, ben_padded],
    y_train_padded,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
# Persist the trained model so the inference cells can reload it.
model.save("/kaggle/working/mt_bilstm_1l.h5")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


In [None]:
#Load saved model
# The inference encoder/decoder built in the following cells reuse this model's layers.
saved_model = tf.keras.models.load_model('/kaggle/working/mt_bilstm_1l.h5')
saved_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, 28)]                 0         []                            
                                                                                                  
 encoder_embedding (Embeddi  (None, 28, 100)              2700      ['encoder_input[0][0]']       
 ng)                                                                                              
                                                                                                  
 decoder_input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 BiLSTM (Bidirectional)      [(None, 64),                 84480     ['encoder_embedding[0][0

In [None]:
# Pick the encoder-side tensors out of the loaded model by layer name.
inputs = saved_model.get_layer('encoder_input').output                    #Encoder input
bi_lstm_out,f_h,f_c,b_h,b_c = saved_model.get_layer('BiLSTM').output   #Encoder LSTM output

# Concatenated forward/backward hidden and cell states (the decoder's initial state).
concate_h = saved_model.get_layer('concat_h').output
concate_c = saved_model.get_layer('concat_c').output

In [None]:
bi_lstm_out

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'BiLSTM')>

In [None]:
targets = saved_model.get_layer('decoder_input').output           # Decoder input
embedding_layer = saved_model.get_layer('decoder_embedding')      # Decoder Embedding layer
decoder_lstm1 = saved_model.get_layer('decoder_lstm_1')          # Decoder LSTM layer
# The output Dense layer was created without an explicit name, so Keras gives it
# an auto-numbered one ('dense', 'dense_1', ...) that depends on how many Dense
# layers were built earlier in the session. Look it up by type instead; it is
# the only Dense layer in the model.
dense1 = next(layer for layer in saved_model.layers if isinstance(layer, keras.layers.Dense))

In [None]:
#Encoder
# Inference encoder: padded romanized ids -> [hidden state, cell state].
encoder = keras.Model(inputs, [concate_h, concate_c])
encoder.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, 28)]                 0         []                            
                                                                                                  
 encoder_embedding (Embeddi  (None, 28, 100)              2700      ['encoder_input[0][0]']       
 ng)                                                                                              
                                                                                                  
 BiLSTM (Bidirectional)      [(None, 64),                 84480     ['encoder_embedding[0][0]']   
                              (None, 64),                                                         
                              (None, 64),                                                   

In [None]:
#Decoder
# At inference time the decoder state is supplied from outside, one step at a time.
# 128 matches the unit count of decoder_lstm_1 and the width of the
# concatenated encoder states.

decoder_input_h1 = Input(shape=(128,))
decoder_input_c1 = Input(shape=(128,))

# Embed the decoder input ids with the trained embedding layer.
x = embedding_layer(targets)

In [None]:
x

<KerasTensor: shape=(None, None, 100) dtype=float32 (created by layer 'decoder_embedding')>

In [None]:
x,d_out_h,d_out_c = decoder_lstm1(x, initial_state=[decoder_input_h1, decoder_input_c1])

In [None]:
d_out_h

<KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'decoder_lstm_1')>

In [None]:
# Project every decoder LSTM step through the trained output layer.
x = dense1(x)
x.shape

TensorShape([None, None, 64])

In [None]:
# Inference decoder: (previous token ids, hidden state, cell state) ->
# (per-step output of dense1, new hidden state, new cell state).
decoder = keras.Model([targets, decoder_input_h1, decoder_input_c1], [x, d_out_h, d_out_c])
decoder.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 decoder_input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 decoder_embedding (Embeddi  (None, None, 100)            6400      ['decoder_input[0][0]']       
 ng)                                                                                              
                                                                                                  
 input_3 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 128)]                0         []                      

In [None]:
# Invert the char -> id vocabularies so that ids can be mapped back to characters.
reverse_input_char_index = {token_id: char for char, token_id in roman_tokens.items()}
reverse_target_char_index = {token_id: char for char, token_id in ben_tokens.items()}

In [None]:
reverse_input_char_index

{1: 'a',
 2: 'r',
 3: 'i',
 4: 'o',
 5: 'e',
 6: 'h',
 7: 'n',
 8: 't',
 9: 's',
 10: 'k',
 11: 'u',
 12: 'd',
 13: 'b',
 14: 'l',
 15: 'm',
 16: 'p',
 17: 'g',
 18: 'c',
 19: 'j',
 20: 'y',
 21: 'v',
 22: 'f',
 23: 'z',
 24: 'w',
 25: 'x',
 26: 'q'}

In [None]:
reverse_target_char_index

{1: '\t',
 2: '\n',
 3: 'া',
 4: 'র',
 5: '্',
 6: 'ি',
 7: 'ে',
 8: 'ন',
 9: 'ক',
 10: 'স',
 11: 'ব',
 12: 'ল',
 13: 'ত',
 14: 'ম',
 15: 'ু',
 16: 'প',
 17: 'দ',
 18: 'য',
 19: 'ট',
 20: 'গ',
 21: 'ো',
 22: 'জ',
 23: 'হ',
 24: 'ী',
 25: 'শ',
 26: 'ই',
 27: 'ভ',
 28: 'চ',
 29: 'ড',
 30: '়',
 31: 'ও',
 32: 'ধ',
 33: 'ষ',
 34: 'ফ',
 35: 'ণ',
 36: 'অ',
 37: 'আ',
 38: 'থ',
 39: 'খ',
 40: 'ং',
 41: 'উ',
 42: 'এ',
 43: 'ছ',
 44: 'ূ',
 45: 'ৃ',
 46: 'ঙ',
 47: 'ঞ',
 48: 'ঘ',
 49: 'ঠ',
 50: 'ৌ',
 51: 'ঁ',
 52: 'ৈ',
 53: 'ঝ',
 54: 'ৎ',
 55: 'ঃ',
 56: 'ঢ',
 57: 'ঐ',
 58: 'ঋ',
 59: 'ঈ',
 60: 'ঔ',
 61: 'ঊ',
 62: '।',
 63: '২'}

In [None]:
v = np.reshape(roman_padded[6], (1, -1))


In [None]:
v.shape

(1, 28)

In [None]:
encoder.predict(v,verbose=0)

[array([[ 8.22370887e-01,  2.31054429e-11,  2.63541686e-08,
          2.53154917e-06, -2.89788120e-04, -3.72258434e-03,
          9.64171529e-01, -6.20303126e-06,  3.06345499e-03,
          2.18851983e-05, -3.91420326e-05, -9.44051027e-01,
         -4.39377964e-01,  3.32376640e-03, -1.99740706e-03,
          9.57324207e-01,  3.72104277e-03,  1.44721300e-04,
         -6.05136214e-04,  8.91537726e-01,  3.68310253e-08,
          3.87517692e-11,  5.02334103e-07, -7.82405823e-06,
          1.16324172e-05,  9.30604152e-03,  2.00181294e-05,
         -1.80713433e-09,  5.11930346e-01,  2.12721425e-04,
          9.99231458e-01,  4.93230313e-01, -2.58472021e-04,
          3.05923801e-02, -7.81142712e-01, -3.56496871e-01,
          1.37937960e-07, -4.24176551e-06,  3.00967167e-06,
         -9.87526476e-01, -1.88572589e-08, -2.56394942e-05,
         -1.18201569e-06, -7.61394560e-01, -1.23111764e-04,
         -9.74524438e-01,  4.96900976e-02,  1.67166558e-03,
          1.60590619e-09, -9.93969381e-0

In [None]:
def predict_list_of_words(list_source_words_in_padded_int_seq):
  """Greedy-decode one Bengali word per padded romanized id sequence.

  For every source sequence the encoder produces the initial decoder state;
  the decoder is then run one character at a time, starting from the "\\t"
  start marker and feeding back the most probable character, until the "\\n"
  end marker is predicted or ``max_decoder_seq_length_ben`` steps were taken.

  Args:
    list_source_words_in_padded_int_seq: Indexable collection of padded
      romanized id sequences (e.g. rows of ``roman_padded``).

  Returns:
    A list with one predicted string per source sequence (markers excluded).
  """
  list_pred_words = []
  start_token = np.zeros((1, 1))
  # Use the Bengali tokenizer fitted in this notebook for the start-marker id.
  start_token[0] = ben_tokenizer.word_index['\t']
  print(start_token)

  for x in tqdm(range(len(list_source_words_in_padded_int_seq))):
    source_seq = start_token
    pred_word = ''
    # Add the batch axis expected by the encoder.
    v = np.reshape(list_source_words_in_padded_int_seq[x], (1, -1))
    next_h, next_c = encoder.predict(v,verbose=0)
    for _ in range(max_decoder_seq_length_ben):
      output, next_h, next_c = decoder.predict([source_seq] + [next_h, next_c],verbose=0)
      next_token = int(np.argmax(output[0, 0, :]))
      # Id 0 is the padding id and has no entry in the reverse index; treat it
      # like the end marker instead of raising KeyError.
      next_char = reverse_target_char_index.get(next_token)
      if next_char is None or next_char == '\n':
        break
      pred_word += next_char
      # Feed the predicted character back in as the next decoder input.
      source_seq = np.zeros((1, 1))
      source_seq[0] = next_token
    list_pred_words.append(pred_word)
  return list_pred_words

In [None]:
predicted = predict_list_of_words(roman_padded[:20])

[[1.]]


100%|██████████| 20/20 [00:15<00:00,  1.29it/s]


In [None]:
# Side-by-side view of source word, prediction and ground truth for the first 20 training words.
df_analyse = pd.DataFrame()
df_analyse["source"] = roman_words[:20]
df_analyse["predicted"] = predicted
# The full column is assigned here; pandas aligns it on df_analyse's existing index.
df_analyse["ground_truth"] = df_train["native word"]
df_analyse

Unnamed: 0,source,predicted,ground_truth
0,mudigiri,মুদিোোোোো,মুদিগিরি
1,osommotisotteo,অসম্মতিষ্টতেও,অসম্মতিসত্ত্বেও
2,armanda,আরমান্দা,আর্মান্ডা
3,sorbotkrishto,সর্বতকৃষ্ট,সর্বোৎকৃষ্ট
4,mageni,মাজেনি,মাজেনি
5,keter,কেটের,কেতের
6,dottoncho,দত্তন্চ,দত্তঞ্চ
7,antorjatikmaner,আন্তর্জামিক্তানের,আন্তর্জাতিকমানের
8,bhitorgulote,ভিতরোোোোোলগে,ভিতরগুলোতে
9,hisarak,হিসারক,হিসারক


In [None]:
# Test split: romanized inputs and their Bengali references (no start/end markers added).
test_roman_words = df_test['english word']
test_target_words = df_test['native word']
test_roman_words[2],test_target_words[2]

('akalpaneey', 'অকল্পনীয়')

In [None]:
# Encode the test words with the tokenizers fitted on the training data.
test_roman_words_in_ids = roman_tokenizer.texts_to_sequences(test_roman_words)
test_target_words_in_ids = ben_tokenizer.texts_to_sequences(test_target_words)

print(test_roman_words_in_ids[4])
print(test_target_words_in_ids[4])

[18, 6, 6, 1, 3, 10, 6, 11, 14]
[43, 3, 26, 38, 15, 12]


In [None]:
#2. padding
# Pad the test sequences to the same widths that were used for training.

test_roman_padded = post_padding(test_roman_words_in_ids, max_encoder_seq_length_roman)
test_target_padded = post_padding(test_target_words_in_ids, max_decoder_seq_length_ben)

print(test_roman_padded[2])
print(test_target_padded[2])

[ 1 10  1 14 16  1  7  5  5 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0]
[36  9 12  5 16  8 24 18 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]


In [None]:
len(test_roman_padded[2]), len(test_target_padded[2])

(28, 32)

In [None]:
# prediction on test set
predicted_test = predict_list_of_words(test_roman_padded[:1500])

[[1.]]


100%|██████████| 1500/1500 [20:49<00:00,  1.20it/s]


In [None]:
# analyse
# Source word, prediction and reference for the first 1500 test words
# (the slice that was passed to predict_list_of_words).
df_analyse_test = pd.DataFrame()
df_analyse_test["source"] = test_roman_words[:1500]
df_analyse_test["predicted"] = predicted_test
df_analyse_test["ground_truth_label"] = df_test["native word"][:1500]

df_analyse_test.head(10)

Unnamed: 0,source,predicted,ground_truth_label
0,pratisangkhyaay,প্রতিসখ্যানওয়ায়,প্রতিসংখ্যায়
1,atishth,অতিষ্ঠ,অতিষ্ঠ
2,akalpaneey,আকালপনি,অকল্পনীয়
3,anter,অ্যান্টার,অ্যান্টের
4,chhaikhul,ছাইথুল,ছাইখুল
5,bipadaseemaar,বিপাদেসমূর,বিপদসীমার
6,janmaashtaamee,জন্মশতামি,জন্মাষ্টামী
7,naayikaader,নাইয়াকেদের,নায়িকাদের
8,pratishtaai,প্রতিষ্টায়,প্রতিষ্টাই
9,vyanililaamaaid,ভ্যালিনাইলামীদ,ভ্যনিলিলামাইড


In [None]:

def split_characters(text_string):
  """Break a string into a list of its individual characters."""
  return list(text_string)

split_characters('চ' 'তু' 'ৰ্থ')

['চ', 'ত', 'ু', 'ৰ', '্', 'থ']

In [None]:
# Character-level tokens of predictions and references, as input for the BLEU computation.
df_analyse_test["predicted_tokenized"] = df_analyse_test["predicted"].apply(split_characters)
df_analyse_test["label_tokenized"] = df_analyse_test["ground_truth_label"].apply(split_characters)

df_analyse_test.head()

Unnamed: 0,source,predicted,ground_truth_label,predicted_tokenized,label_tokenized
0,pratisangkhyaay,প্রতিসখ্যানওয়ায়,প্রতিসংখ্যায়,"[প, ্, র, ত, ি, স, খ, ্, য, া, ন, ও, য, ়, া, ...","[প, ্, র, ত, ি, স, ং, খ, ্, য, া, য, ়]"
1,atishth,অতিষ্ঠ,অতিষ্ঠ,"[অ, ত, ি, ষ, ্, ঠ]","[অ, ত, ি, ষ, ্, ঠ]"
2,akalpaneey,আকালপনি,অকল্পনীয়,"[আ, ক, া, ল, প, ন, ি]","[অ, ক, ল, ্, প, ন, ী, য, ়]"
3,anter,অ্যান্টার,অ্যান্টের,"[অ, ্, য, া, ন, ্, ট, া, র]","[অ, ্, য, া, ন, ্, ট, ে, র]"
4,chhaikhul,ছাইথুল,ছাইখুল,"[ছ, া, ই, থ, ু, ল]","[ছ, া, ই, খ, ু, ল]"


In [None]:
def create_reference(tokenized):
  """Wrap one tokenized label in a list, i.e. a reference set with a single entry."""
  references = list()
  references.append(tokenized)
  return references

df_analyse_test["reference"] = df_analyse_test["label_tokenized"].apply(create_reference)
df_analyse_test.head()

Unnamed: 0,source,predicted,ground_truth_label,predicted_tokenized,label_tokenized,reference
0,pratisangkhyaay,প্রতিসখ্যানওয়ায়,প্রতিসংখ্যায়,"[প, ্, র, ত, ি, স, খ, ্, য, া, ন, ও, য, ়, া, ...","[প, ্, র, ত, ি, স, ং, খ, ্, য, া, য, ়]","[[প, ্, র, ত, ি, স, ং, খ, ্, য, া, য, ়]]"
1,atishth,অতিষ্ঠ,অতিষ্ঠ,"[অ, ত, ি, ষ, ্, ঠ]","[অ, ত, ি, ষ, ্, ঠ]","[[অ, ত, ি, ষ, ্, ঠ]]"
2,akalpaneey,আকালপনি,অকল্পনীয়,"[আ, ক, া, ল, প, ন, ি]","[অ, ক, ল, ্, প, ন, ী, য, ়]","[[অ, ক, ল, ্, প, ন, ী, য, ়]]"
3,anter,অ্যান্টার,অ্যান্টের,"[অ, ্, য, া, ন, ্, ট, া, র]","[অ, ্, য, া, ন, ্, ট, ে, র]","[[অ, ্, য, া, ন, ্, ট, ে, র]]"
4,chhaikhul,ছাইথুল,ছাইখুল,"[ছ, া, ই, থ, ু, ল]","[ছ, া, ই, খ, ু, ল]","[[ছ, া, ই, খ, ু, ল]]"


In [None]:
# BLEU Score

# Convert DataFrame columns to lists
# predictions: one list of characters per word;
# true_labels: one single-entry list of references per word.
predictions = df_analyse_test['predicted_tokenized'].tolist()
true_labels = df_analyse_test['reference'].tolist()

In [None]:
# Preview only the first few references instead of dumping the whole list into the cell output.
true_labels[:5]

[[['প', '্', 'র', 'ত', 'ি', 'স', 'ং', 'খ', '্', 'য', 'া', 'য', '়']],
 [['অ', 'ত', 'ি', 'ষ', '্', 'ঠ']],
 [['অ', 'ক', 'ল', '্', 'প', 'ন', 'ী', 'য', '়']],
 [['অ', '্', 'য', 'া', 'ন', '্', 'ট', 'ে', 'র']],
 [['ছ', 'া', 'ই', 'খ', 'ু', 'ল']],
 [['ব', 'ি', 'প', 'দ', 'স', 'ী', 'ম', 'া', 'র']],
 [['জ', 'ন', '্', 'ম', 'া', 'ষ', '্', 'ট', 'া', 'ম', 'ী']],
 [['ন', 'া', 'য', '়', 'ি', 'ক', 'া', 'দ', 'ে', 'র']],
 [['প', '্', 'র', 'ত', 'ি', 'ষ', '্', 'ট', 'া', 'ই']],
 [['ভ', '্', 'য', 'ন', 'ি', 'ল', 'ি', 'ল', 'া', 'ম', 'া', 'ই', 'ড']],
 [['গ', 'ু', 'দ', 'ে', 'র']],
 [['ন', 'া', 'ম', 'জ', 'া', 'দ', 'া']],
 [['ই', 'ন', 'ড', 'ি', 'প', 'ে', 'ন', 'ড', 'ে', 'ন', '্', 'ট', 'ক', 'ে']],
 [['র', 'া', 'জ', 'প', 'র', 'ি', 'ব', 'া', 'র', 'ঃ']],
 [['স', '্', 'থ', 'া', 'ন', 'ী', 'য', '়', 'দ', 'ে', 'র']],
 [['প', 'ো', 'ল', 'া', 'র', 'স', '্', 'ট', 'া', 'র', '্', 'ন', 'ে']],
 [['প', '্', 'র', 'ত', '্', 'য', 'া', 'র', 'ে', 'র']],
 [['র', 'া', 'খ', 'ল']],
 [['স', '্', 'ব', 'া', 'ত', '্', 'ব', 'ি', 'ক']],
 [['ক', '্

In [None]:
# Preview only the first few predictions instead of dumping the whole list into the cell output.
predictions[:5]

[['প',
  '্',
  'র',
  'ত',
  'ি',
  'স',
  'খ',
  '্',
  'য',
  'া',
  'ন',
  'ও',
  'য',
  '়',
  'া',
  'য',
  '়'],
 ['অ', 'ত', 'ি', 'ষ', '্', 'ঠ'],
 ['আ', 'ক', 'া', 'ল', 'প', 'ন', 'ি'],
 ['অ', '্', 'য', 'া', 'ন', '্', 'ট', 'া', 'র'],
 ['ছ', 'া', 'ই', 'থ', 'ু', 'ল'],
 ['ব', 'ি', 'প', 'া', 'দ', 'ে', 'স', 'ম', 'ূ', 'র'],
 ['জ', 'ন', '্', 'ম', 'শ', 'ত', 'া', 'ম', 'ি'],
 ['ন', 'া', 'ই', 'য', '়', 'া', 'ক', 'ে', 'দ', 'ে', 'র'],
 ['প', '্', 'র', 'ত', 'ি', 'ষ', '্', 'ট', 'া', 'য', '়'],
 ['ভ', '্', 'য', 'া', 'ল', 'ি', 'ন', 'া', 'ই', 'ল', 'া', 'ম', 'ী', 'দ'],
 ['ো', 'ো', 'ো', 'ো', 'জ', 'া', 'র'],
 ['ন', 'া', 'ম', 'জ', 'া', 'দ', 'া'],
 ['ই', 'ন', 'া', 'ড', 'ি', 'ড', 'ে', 'ন', 'প', 'া', 'ন', '্', 'ট', 'ে', 'ট'],
 ['র', 'া', 'জ', 'প', 'ী', 'র', 'া', 'ব', 'র', 'া', 'হ'],
 ['স', '্', 'ং', 'ং', 'ী', 'ত', 'া', 'দ', 'ে', 'র'],
 ['প', 'গ', 'া', 'ল', 'া', 'র', '্', 'স', 'া', 'ত', '্', 'র', 'ে'],
 ['প', '্', 'র', 'ত', '্', 'য', 'া', 'র', 'ে', 'র'],
 ['র', 'া', 'থ', 'া', 'ল'],
 ['স', '্', 'ব', 'ত', '্

In [None]:
# Character-level corpus BLEU over the predicted test words.
# NOTE(review): `load_metric` is deprecated in the `datasets` library and, as far
# as I know, removed in newer releases; the unpinned `pip install datasets`
# above may therefore break this cell. Confirm the installed version.
bleu = load_metric("bleu")
bleu.compute(predictions = predictions, references = true_labels)


{'bleu': 0.4456544625954141,
 'precisions': [0.7042406437603781,
  0.49618590196355417,
  0.3798388370990678,
  0.29718587560494714],
 'brevity_penalty': 1.0,
 'length_ratio': 1.154123977297855,
 'translation_length': 15658,
 'reference_length': 13567}

In [None]:
def transliterate(source_word_in_padded_int_seq):
  """Greedy-decode the Bengali transliteration of one romanized word.

  The encoder turns the input into the initial decoder state; the decoder is
  then run one character at a time, starting from the "\\t" start marker and
  feeding back the most probable character, until the "\\n" end marker is
  predicted or ``max_decoder_seq_length_ben`` steps were taken.

  Args:
    source_word_in_padded_int_seq: Padded romanized id sequence that is passed
      to ``encoder.predict`` as-is, so it must already carry the batch axis
      (e.g. the result of ``post_padding`` on a one-element list).

  Returns:
    The predicted word as a string (markers excluded).
  """
  start_token = np.zeros((1, 1))
  # Use the Bengali tokenizer fitted in this notebook for the start-marker id.
  start_token[0] = ben_tokenizer.word_index['\t']

  source_seq = start_token
  pred_word = ''
  next_h, next_c = encoder.predict(source_word_in_padded_int_seq,verbose=0)
  for _ in range(max_decoder_seq_length_ben):
    output, next_h, next_c = decoder.predict([source_seq] + [next_h, next_c],verbose=0)
    next_token = int(np.argmax(output[0, 0, :]))
    # Id 0 is the padding id and has no entry in the reverse index; treat it
    # like the end marker instead of raising KeyError.
    next_char = reverse_target_char_index.get(next_token)
    if next_char is None or next_char == '\n':
      break
    pred_word += next_char
    # Feed the predicted character back in as the next decoder input.
    source_seq = np.zeros((1, 1))
    source_seq[0] = next_token
  return pred_word

# Interactive demo: read one romanized word, encode and pad it exactly like the
# training inputs, then print its predicted transliteration.
input_word = input("\nEnter a romanized word: ")
input_word_in_ids = roman_tokenizer.texts_to_sequences([input_word])

input_word_in_ids_padded = post_padding(input_word_in_ids, max_encoder_seq_length_roman)

print(f"\nTransliterated word : {transliterate(input_word_in_ids_padded)}")



Enter a romanized word:  Abhijit



Transliterated word : অভিজিৎ
