# **University of Madras, Guindy Campus**
# **Department of Computer Science**
**II M.Sc Computer Science**

Team-1:

Angel Sarah Josephine B 36822101

Deepika M 36822102

Rithish R 36822112

Singavarapu Rohit Roy 36822113

Sunil Kumar M 36822114

Syed Aljibre A 36822115


In [None]:
class Config:
    """Hyperparameters shared by every translation model in this notebook."""

    # --- Text vectorization ---
    vocab_size = 15000        # maximum vocabulary size per language
    sequence_length = 20      # tokens kept per sentence
    start_token = "[start]"   # marker prepended to every target sentence
    end_token = "[end]"       # marker appended to every target sentence

    # --- Data pipeline ---
    batch_size = 20
    validation_split = 0.3    # fraction of rows held out for validation

    # --- Model size ---
    embed_dim = 256
    latent_dim = 256
    num_heads = 2

    # --- Training ---
    epochs = 10               # number of epochs to train


# Single shared instance read by all later cells.
config = Config()

In [None]:
!pip install keras-nlp --upgrade
!pip install rouge-score



In [None]:
import keras_nlp
import pandas as pd
import tensorflow as tf
from keras.layers import TextVectorization
import pathlib
import random
import string
import re
import numpy as np
from tensorflow import keras
from keras import layers
import sklearn
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm



In [None]:
# Load the English/Tamil sentence pairs (columns "english" and "tamil").
# NOTE(review): absolute Colab path; the file must be uploaded to /content first.
data = pd.read_excel("/content/data - Copy.xlsx")
data.head()

Unnamed: 0,english,tamil
0,Go.,போ.
1,Go.,போ.
2,Go.,போ.
3,Go.,போ.
4,Hi.,வணக்கம்.


In [None]:
# Wrap every target sentence in the start/end markers the decoder relies on.
# The startswith guard keeps the cell idempotent: re-running it does not wrap
# an already wrapped sentence a second time.
data["tamil"] = data["tamil"].apply(
    lambda item: item
    if item.startswith(f"{config.start_token} ")
    else f"{config.start_token} " + item + f" {config.end_token}"
)

In [None]:
# Characters removed from target text: ASCII punctuation plus "¿", except the
# square brackets, which must survive so "[start]" / "[end]" stay intact.
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")


def tamil_standardize(input_string):
    """Lowercase the input and delete every character listed in strip_chars."""
    pattern = "[%s]"%re.escape(strip_chars)
    return tf.strings.regex_replace(tf.strings.lower(input_string), pattern, "")


# Source side: default standardization, sequences of sequence_length tokens.
english_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=config.vocab_size,
    output_sequence_length=config.sequence_length,
)
# Target side: one extra token so inputs/labels can be offset by one position.
tamil_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=config.vocab_size,
    output_sequence_length=config.sequence_length + 1,
    standardize=tamil_standardize,
)

# Build both vocabularies from the full data frame.
english_vectorization.adapt(data["english"].tolist())
tamil_vectorization.adapt(data["tamil"].tolist())


In [None]:
def preprocess(english, tamil):
    """Vectorize a batch of sentence pairs into model inputs and labels.

    Returns a (features, labels) tuple: the features dict holds the encoder
    token ids and the target ids without their last position; the labels are
    the target ids without their first position (i.e. shifted by one).
    """
    source_ids = english_vectorization(english)
    target_ids = tamil_vectorization(tamil)
    features = {
        "encoder_inputs": source_ids,
        "decoder_inputs": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]
    return (features, labels)
def make_dataset(df, batch_size, mode):
    """Build a batched, vectorized tf.data pipeline from a data frame.

    Args:
        df: frame with "english" and "tamil" columns.
        batch_size: number of sentence pairs per batch.
        mode: "train" enables shuffling; any other value leaves the order fixed.

    Returns:
        A tf.data.Dataset yielding the (features, labels) tuples of preprocess.
    """
    dataset = tf.data.Dataset.from_tensor_slices((list(df["english"]), list(df["tamil"])))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    # Cache the vectorized batches BEFORE shuffling: caching after the shuffle
    # would store the first epoch's order and replay it for every later epoch.
    dataset = dataset.cache()
    if mode == "train":
        # Shuffles whole batches, drawing a fresh order each epoch.
        # NOTE(review): batch membership is fixed; rows are presumably already
        # permuted by train_test_split before they get here.
        dataset = dataset.shuffle(batch_size * 4)
    # prefetch goes last so it overlaps input preparation with training steps.
    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Fixed random_state makes the train/validation split reproducible across runs.
train, valid = train_test_split(data, test_size=config.validation_split, random_state=42)
train.shape, valid.shape

((17500, 2), (7500, 2))

In [None]:
# Turn both splits into tf.data pipelines; only the training split is shuffled.
train_ds = make_dataset(train, config.batch_size, "train")
valid_ds = make_dataset(valid, config.batch_size, "valid")

In [None]:
def get_model(config):
    """Build and compile the encoder/decoder Transformer used for translation.

    Args:
        config: object providing vocab_size, sequence_length, embed_dim,
            latent_dim and num_heads.

    Returns:
        A compiled keras.Model named "transformer" that takes
        [encoder_inputs, decoder_inputs] and outputs a softmax over
        vocab_size for each decoder position.
    """
    # ---- Encoder: token + position embedding followed by one encoder block.
    encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
    source_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
        config.vocab_size, config.sequence_length, config.embed_dim, mask_zero=True
    )
    source_embedded = source_embedding(encoder_inputs)
    encoder_block = keras_nlp.layers.TransformerEncoder(
        intermediate_dim=config.embed_dim, num_heads=config.num_heads
    )
    encoder_outputs = encoder_block(source_embedded)
    # Stand-alone encoder model; it is not referenced again in this function.
    encoder = keras.Model(encoder_inputs, encoder_outputs)

    # ---- Decoder: its own embedding, one decoder block, dropout, softmax head.
    decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
    encoded_seq_inputs = keras.Input(shape=(None, config.embed_dim), name="decoder_state_inputs")
    target_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
        config.vocab_size, config.sequence_length, config.embed_dim, mask_zero=True
    )
    hidden = target_embedding(decoder_inputs)
    decoder_block = keras_nlp.layers.TransformerDecoder(config.latent_dim, config.num_heads)
    hidden = decoder_block(hidden, encoded_seq_inputs)
    hidden = layers.Dropout(0.1)(hidden)
    token_probs = layers.Dense(config.vocab_size, activation="softmax")(hidden)
    decoder = keras.Model([decoder_inputs, encoded_seq_inputs], token_probs)

    # ---- End-to-end model: feed the encoder output into the decoder.
    transformer = keras.Model(
        [encoder_inputs, decoder_inputs],
        decoder([decoder_inputs, encoder_outputs]),
        name="transformer",
    )
    transformer.compile(
        "adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return transformer

In [None]:
# Fresh, untrained Transformer for English -> Tamil.
model_ta = get_model(config)

In [None]:
# Keep only the checkpoint with the best validation accuracy. Accuracy is a
# higher-is-better metric, so mode must be "max"; with "min", save_best_only
# would retain the epoch with the LOWEST validation accuracy.
checkpoints = tf.keras.callbacks.ModelCheckpoint(
    "model_ta.tf",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True
)
# NOTE(review): with the 10 epochs set in Config, patience=10 can never trigger.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=10,
    monitor="val_loss",
    mode="min",
    restore_best_weights=True
)
history = model_ta.fit(train_ds, epochs=config.epochs, validation_data=valid_ds, callbacks=[checkpoints, early_stop])
# NOTE(review): this figure is measured on the training set, not on valid_ds.
accuracy = model_ta.evaluate(train_ds, return_dict=True)['accuracy']
print(f'Accuracy Of Tamil: {accuracy*100:.4f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy Of Tamil: 91.6260


In [None]:
# Reload the checkpoint written to "model_ta.tf" by the ModelCheckpoint callback.
# custom_objects maps the keras_nlp layer names found in the saved model back
# to their classes.
loaded_model_ta = tf.keras.models.load_model("model_ta.tf", custom_objects={
    "TokenAndPositionEmbedding": keras_nlp.layers.TokenAndPositionEmbedding,
    "TransformerEncoder": keras_nlp.layers.TransformerEncoder,
    "TransformerDecoder": keras_nlp.layers.TransformerDecoder
})
loaded_model_ta.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 token_and_position_embeddi  (None, None, 256)            3845120   ['encoder_inputs[0][0]']      
 ng (TokenAndPositionEmbedd                                                                       
 ing)                                                                                             
                                                                                                  
 decoder_inputs (InputLayer  [(None, None)]               0         []                  

In [None]:
tamil_vocab = tamil_vectorization.get_vocabulary()
tamil_index_lookup = dict(zip(range(len(tamil_vocab)), tamil_vocab))
start_index = tamil_vocab.index(config.start_token)
end_index = tamil_vocab.index(config.end_token)
unk_index = tamil_vocab.index("[UNK]")

# Tamil-specific snapshots. english_vectorization, start_index and end_index
# are rebound by the later language sections, and decode_sequence_ta is called
# again after that, so the function must not read those shared globals.
_ta_english_vectorization = english_vectorization
_ta_start_index = start_index
_ta_end_index = end_index


def decode_sequence_ta(model_ta, input_sentence, filtered_values = [start_index, end_index, unk_index]):
    """Greedily translate one English sentence into Tamil.

    Args:
        model_ta: model called as model([encoder_ids, decoder_ids]) that
            returns per-position scores over the Tamil vocabulary.
        input_sentence: English sentence as a Python string.
        filtered_values: token ids dropped from the returned text.

    Returns:
        The decoded Tamil words joined by single spaces.
    """
    tokenized_input_sentence = _ta_english_vectorization([input_sentence])
    # Position 0 holds the start token; the remaining slots are filled with 0.
    decoded_sentence = [_ta_start_index] + [0] * (config.sequence_length)
    for i in range(config.sequence_length):
        decoded_sentence_constant = tf.constant([decoded_sentence[:config.sequence_length]])
        predictions = model_ta([tokenized_input_sentence, decoded_sentence_constant])
        # Greedy choice: the highest-scoring token at the current position.
        sampled_token_index = np.argmax(predictions[0, i, :])
        decoded_sentence[i + 1] = sampled_token_index
        if sampled_token_index == _ta_end_index:
            break
    # Skip the 0-filled slots as well: they map to blank vocabulary entries and
    # would otherwise leave a run of trailing spaces in the result.
    components = [
        tamil_index_lookup[c]
        for c in decoded_sentence
        if c != 0 and c not in filtered_values
    ]
    return " ".join(components)

In [None]:
# Spot-check the reloaded model on 10 randomly chosen rows.
for row_idx in tqdm(np.random.choice(len(data), 10)):
    row = data.iloc[row_idx]
    # Reference text with the start/end markers removed again.
    reference = row["tamil"].replace("[start] ", "").replace(" [end]", "")
    prediction = decode_sequence_ta(loaded_model_ta, row["english"])
    print("English:", row["english"])
    print("tamil:", reference)
    print("Translated:", prediction)

  0%|          | 0/10 [00:00<?, ?it/s]

English: Help yourself.
tamil: உங்களுக்கு உதவுங்கள்.
Translated: நீங்கள் யாரை                 
English: Are you imitating me?
tamil: நீங்கள் என்னைப் பின்பற்றுகிறீர்களா?
Translated: நீங்கள் என்னை அழையுங்கள்                
English: He is at his office.
tamil: அவர் தனது அலுவலகத்தில் இருக்கிறார்.
Translated: அவர் தனது பெயரை இருக்கிறார்               
English: I fell asleep.
tamil: நான் தூங்கிவிட்டேன்.
Translated: நான் கணிதத்தை                 
English: You're mean.
tamil: நீங்கள் சராசரி.
Translated: நீங்கள் சராசரி                 
English: Even Tom smiled.
tamil: டாம் கூட சிரித்தார்.
Translated: டாம் கூறினார்                 
English: Stop right here.
tamil: இங்கேயே நிறுத்துங்கள்.
Translated: இங்கே வாருங்கள்                 
English: I left.
tamil: நான் வெளியேறினேன்.
Translated: நான் உன்னைப்                 
English: Let me call Tom.
tamil: நான் டாம் என்று அழைக்கிறேன்.
Translated: டாம் என்னை மறந்து விடுங்கள்               
English: Let's vote.
tamil: வாக்களிப்போம்.
Translated: டாம் அக்கறை

In [None]:
# Load the English/Hindi sentence pairs (columns "english" and "hindi").
# NOTE(review): absolute Colab path; the file must be uploaded to /content first.
data_hi = pd.read_csv("/content/hindi - data.csv")
data_hi.head()

Unnamed: 0,english,hindi
0,Go.,जाना।
1,Go.,जाना।
2,Go.,जाना।
3,Go.,जाना।
4,Hi.,नमस्ते।


In [None]:
# Wrap every target sentence in the start/end markers the decoder relies on.
# The startswith guard keeps the cell idempotent: re-running it does not wrap
# an already wrapped sentence a second time.
data_hi["hindi"] = data_hi["hindi"].apply(
    lambda item: item
    if item.startswith(f"{config.start_token} ")
    else f"{config.start_token} " + item + f" {config.end_token}"
)

In [None]:
# Characters removed from Hindi text. Defined here so this section no longer
# depends on strip_chars from the Tamil cell. Besides ASCII punctuation it
# includes the danda / double danda ("।", "॥"): Hindi sentences in the data end
# with "।", which would otherwise stay glued to the last word of each sentence.
# Square brackets are kept so "[start]" / "[end]" survive standardization.
hindi_strip_chars = string.punctuation + "¿" + "।॥"
hindi_strip_chars = hindi_strip_chars.replace("[", "")
hindi_strip_chars = hindi_strip_chars.replace("]", "")


def hindi_standardize(input_string):
    """Lowercase the input and delete every character in hindi_strip_chars."""
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]"%re.escape(hindi_strip_chars), "")


# Source side: default standardization, sequences of sequence_length tokens.
english_vectorization = TextVectorization(
    max_tokens=config.vocab_size,
    output_mode="int",
    output_sequence_length=config.sequence_length,
)
# Target side: one extra token so inputs/labels can be offset by one position.
hindi_vectorization = TextVectorization(
    max_tokens=config.vocab_size,
    output_mode="int",
    output_sequence_length=config.sequence_length + 1,
    standardize=hindi_standardize,
)
english_vectorization.adapt(list(data_hi["english"]))
hindi_vectorization.adapt(list(data_hi["hindi"]))

In [None]:
def preprocess(english, hindi):
    """Vectorize a batch of sentence pairs into model inputs and labels.

    Returns a (features, labels) tuple: the features dict holds the encoder
    token ids and the target ids without their last position; the labels are
    the target ids without their first position (i.e. shifted by one).
    """
    source_ids = english_vectorization(english)
    target_ids = hindi_vectorization(hindi)
    features = {
        "encoder_inputs": source_ids,
        "decoder_inputs": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]
    return (features, labels)
def make_dataset(df, batch_size, mode):
    """Build a batched, vectorized tf.data pipeline from a data frame.

    Args:
        df: frame with "english" and "hindi" columns.
        batch_size: number of sentence pairs per batch.
        mode: "train" enables shuffling; any other value leaves the order fixed.

    Returns:
        A tf.data.Dataset yielding the (features, labels) tuples of preprocess.
    """
    dataset = tf.data.Dataset.from_tensor_slices((list(df["english"]), list(df["hindi"])))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    # Cache the vectorized batches BEFORE shuffling: caching after the shuffle
    # would store the first epoch's order and replay it for every later epoch.
    dataset = dataset.cache()
    if mode == "train":
        # Shuffles whole batches, drawing a fresh order each epoch.
        # NOTE(review): batch membership is fixed; rows are presumably already
        # permuted by train_test_split before they get here.
        dataset = dataset.shuffle(batch_size * 4)
    # prefetch goes last so it overlaps input preparation with training steps.
    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Fixed random_state makes the train/validation split reproducible across runs.
train, valid = train_test_split(data_hi, test_size=config.validation_split, random_state=42)
train.shape, valid.shape

((17500, 2), (7500, 2))

In [None]:
# Turn both splits into tf.data pipelines; only the training split is shuffled.
train_ds = make_dataset(train, config.batch_size, "train")
valid_ds = make_dataset(valid, config.batch_size, "valid")

In [None]:
def get_model(config):
    """Build and compile the encoder/decoder Transformer used for translation.

    This cell redefines get_model with the same body as the earlier definition.

    Args:
        config: object providing vocab_size, sequence_length, embed_dim,
            latent_dim and num_heads.

    Returns:
        A compiled keras.Model named "transformer" that takes
        [encoder_inputs, decoder_inputs] and outputs a softmax over
        vocab_size for each decoder position.
    """
    # ---- Encoder: token + position embedding followed by one encoder block.
    encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
    source_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
        config.vocab_size, config.sequence_length, config.embed_dim, mask_zero=True
    )
    source_embedded = source_embedding(encoder_inputs)
    encoder_block = keras_nlp.layers.TransformerEncoder(
        intermediate_dim=config.embed_dim, num_heads=config.num_heads
    )
    encoder_outputs = encoder_block(source_embedded)
    # Stand-alone encoder model; it is not referenced again in this function.
    encoder = keras.Model(encoder_inputs, encoder_outputs)

    # ---- Decoder: its own embedding, one decoder block, dropout, softmax head.
    decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
    encoded_seq_inputs = keras.Input(shape=(None, config.embed_dim), name="decoder_state_inputs")
    target_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
        config.vocab_size, config.sequence_length, config.embed_dim, mask_zero=True
    )
    hidden = target_embedding(decoder_inputs)
    decoder_block = keras_nlp.layers.TransformerDecoder(config.latent_dim, config.num_heads)
    hidden = decoder_block(hidden, encoded_seq_inputs)
    hidden = layers.Dropout(0.1)(hidden)
    token_probs = layers.Dense(config.vocab_size, activation="softmax")(hidden)
    decoder = keras.Model([decoder_inputs, encoded_seq_inputs], token_probs)

    # ---- End-to-end model: feed the encoder output into the decoder.
    transformer = keras.Model(
        [encoder_inputs, decoder_inputs],
        decoder([decoder_inputs, encoder_outputs]),
        name="transformer",
    )
    transformer.compile(
        "adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return transformer

In [None]:
# Fresh, untrained Transformer for English -> Hindi.
model_hi = get_model(config)

In [None]:
# Keep only the checkpoint with the best validation accuracy. Accuracy is a
# higher-is-better metric, so mode must be "max"; with "min", save_best_only
# would retain the epoch with the LOWEST validation accuracy.
checkpoints = tf.keras.callbacks.ModelCheckpoint(
    "model_hi.tf",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True
)
# NOTE(review): with the 10 epochs set in Config, patience=10 can never trigger.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=10,
    monitor="val_loss",
    mode="min",
    restore_best_weights=True
)
history = model_hi.fit(train_ds, epochs=config.epochs, validation_data=valid_ds, callbacks=[checkpoints, early_stop])
# NOTE(review): this figure is measured on the training set, not on valid_ds.
accuracy = model_hi.evaluate(train_ds, return_dict=True)['accuracy']
print(f'Accuracy of Hindi: {accuracy*100:.4f}')

In [None]:
# Reload the checkpoint written to "model_hi.tf" by the ModelCheckpoint callback.
# custom_objects maps the keras_nlp layer names found in the saved model back
# to their classes.
loaded_model_hi = tf.keras.models.load_model("model_hi.tf", custom_objects={
    "TokenAndPositionEmbedding": keras_nlp.layers.TokenAndPositionEmbedding,
    "TransformerEncoder": keras_nlp.layers.TransformerEncoder,
    "TransformerDecoder": keras_nlp.layers.TransformerDecoder
})
loaded_model_hi.summary()

In [None]:
hindi_vocab = hindi_vectorization.get_vocabulary()
hindi_index_lookup = dict(zip(range(len(hindi_vocab)), hindi_vocab))
start_index = hindi_vocab.index(config.start_token)
end_index = hindi_vocab.index(config.end_token)
unk_index = hindi_vocab.index("[UNK]")

# Hindi-specific snapshots. english_vectorization, start_index and end_index
# are rebound by the later language sections, and decode_sequence_hi is called
# again after that, so the function must not read those shared globals.
_hi_english_vectorization = english_vectorization
_hi_start_index = start_index
_hi_end_index = end_index


def decode_sequence_hi(model_hi, input_sentence, filtered_values = [start_index, end_index, unk_index]):
    """Greedily translate one English sentence into Hindi.

    Args:
        model_hi: model called as model([encoder_ids, decoder_ids]) that
            returns per-position scores over the Hindi vocabulary.
        input_sentence: English sentence as a Python string.
        filtered_values: token ids dropped from the returned text.

    Returns:
        The decoded Hindi words joined by single spaces.
    """
    tokenized_input_sentence = _hi_english_vectorization([input_sentence])
    # Position 0 holds the start token; the remaining slots are filled with 0.
    decoded_sentence = [_hi_start_index] + [0] * (config.sequence_length)
    for i in range(config.sequence_length):
        decoded_sentence_constant = tf.constant([decoded_sentence[:config.sequence_length]])
        predictions = model_hi([tokenized_input_sentence, decoded_sentence_constant])
        # Greedy choice: the highest-scoring token at the current position.
        sampled_token_index = np.argmax(predictions[0, i, :])
        decoded_sentence[i + 1] = sampled_token_index
        if sampled_token_index == _hi_end_index:
            break
    # Skip the 0-filled slots as well: they map to blank vocabulary entries and
    # would otherwise leave a run of trailing spaces in the result.
    components = [
        hindi_index_lookup[c]
        for c in decoded_sentence
        if c != 0 and c not in filtered_values
    ]
    return " ".join(components)

In [None]:
# Spot-check the reloaded model on 10 randomly chosen rows.
for row_idx in tqdm(np.random.choice(len(data_hi), 10)):
    row = data_hi.iloc[row_idx]
    # Reference text with the start/end markers removed again.
    reference = row["hindi"].replace("[start] ", "").replace(" [end]", "")
    prediction = decode_sequence_hi(loaded_model_hi, row["english"])
    print("English:", row["english"])
    print("hindi:", reference)
    print("Translated:", prediction)

In [None]:
# Load the English/Malayalam sentence pairs (columns "english" and "malayalam").
# NOTE(review): absolute Colab path; the file must be uploaded to /content first.
data_ml = pd.read_csv("/content/Malayalam - data.csv")
data_ml.head()

In [None]:
# Wrap every target sentence in the start/end markers the decoder relies on.
# The startswith guard keeps the cell idempotent: re-running it does not wrap
# an already wrapped sentence a second time.
data_ml["malayalam"] = data_ml["malayalam"].apply(
    lambda item: item
    if item.startswith(f"{config.start_token} ")
    else f"{config.start_token} " + item + f" {config.end_token}"
)

In [None]:
# Characters removed from target text: ASCII punctuation plus "¿", except the
# square brackets, which must survive so "[start]" / "[end]" stay intact.
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")


def malayalam_standardize(input_string):
    """Lowercase the input and delete every character listed in strip_chars."""
    pattern = "[%s]"%re.escape(strip_chars)
    return tf.strings.regex_replace(tf.strings.lower(input_string), pattern, "")


# Source side: default standardization, sequences of sequence_length tokens.
english_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=config.vocab_size,
    output_sequence_length=config.sequence_length,
)
# Target side: one extra token so inputs/labels can be offset by one position.
malayalam_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=config.vocab_size,
    output_sequence_length=config.sequence_length + 1,
    standardize=malayalam_standardize,
)

# Build both vocabularies from the full data frame.
english_vectorization.adapt(data_ml["english"].tolist())
malayalam_vectorization.adapt(data_ml["malayalam"].tolist())


In [None]:
def preprocess(english, malayalam):
    """Vectorize a batch of sentence pairs into model inputs and labels.

    Returns a (features, labels) tuple: the features dict holds the encoder
    token ids and the target ids without their last position; the labels are
    the target ids without their first position (i.e. shifted by one).
    """
    source_ids = english_vectorization(english)
    target_ids = malayalam_vectorization(malayalam)
    features = {
        "encoder_inputs": source_ids,
        "decoder_inputs": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]
    return (features, labels)
def make_dataset(df, batch_size, mode):
    """Build a batched, vectorized tf.data pipeline from a data frame.

    Args:
        df: frame with "english" and "malayalam" columns.
        batch_size: number of sentence pairs per batch.
        mode: "train" enables shuffling; any other value leaves the order fixed.

    Returns:
        A tf.data.Dataset yielding the (features, labels) tuples of preprocess.
    """
    dataset = tf.data.Dataset.from_tensor_slices((list(df["english"]), list(df["malayalam"])))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    # Cache the vectorized batches BEFORE shuffling: caching after the shuffle
    # would store the first epoch's order and replay it for every later epoch.
    dataset = dataset.cache()
    if mode == "train":
        # Shuffles whole batches, drawing a fresh order each epoch.
        # NOTE(review): batch membership is fixed; rows are presumably already
        # permuted by train_test_split before they get here.
        dataset = dataset.shuffle(batch_size * 4)
    # prefetch goes last so it overlaps input preparation with training steps.
    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Fixed random_state makes the train/validation split reproducible across runs.
train, valid = train_test_split(data_ml, test_size=config.validation_split, random_state=42)
train.shape, valid.shape

In [None]:
# Turn both splits into tf.data pipelines; only the training split is shuffled.
train_ds = make_dataset(train, config.batch_size, "train")
valid_ds = make_dataset(valid, config.batch_size, "valid")

In [None]:
def get_model_ml(config):
    """Build and compile the English -> Malayalam Transformer.

    The architecture is the same for every language in this notebook, so this
    delegates to get_model (defined in the earlier Tamil/Hindi sections, which
    must have been run first) instead of repeating its body line for line.

    Args:
        config: object providing vocab_size, sequence_length, embed_dim,
            latent_dim and num_heads.

    Returns:
        The compiled keras.Model produced by get_model(config).
    """
    return get_model(config)

In [None]:
# Fresh, untrained Transformer for English -> Malayalam.
model_ml = get_model_ml(config)

In [None]:
# Keep only the checkpoint with the best validation accuracy. Accuracy is a
# higher-is-better metric, so mode must be "max"; with "min", save_best_only
# would retain the epoch with the LOWEST validation accuracy.
checkpoints = tf.keras.callbacks.ModelCheckpoint(
    "model_ml.tf",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True
)
# NOTE(review): with the 10 epochs set in Config, patience=10 can never trigger.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=10,
    monitor="val_loss",
    mode="min",
    restore_best_weights=True
)
history = model_ml.fit(train_ds, epochs=config.epochs, validation_data=valid_ds, callbacks=[checkpoints, early_stop])
# NOTE(review): this figure is measured on the training set, not on valid_ds.
accuracy = model_ml.evaluate(train_ds, return_dict=True)['accuracy']
print(f'Accuracy Of Malayalam: {accuracy*100:.4f}')

In [None]:
# Reload the checkpoint written to "model_ml.tf" by the ModelCheckpoint callback.
# custom_objects maps the keras_nlp layer names found in the saved model back
# to their classes.
loaded_model_ml = tf.keras.models.load_model("model_ml.tf", custom_objects={
    "TokenAndPositionEmbedding": keras_nlp.layers.TokenAndPositionEmbedding,
    "TransformerEncoder": keras_nlp.layers.TransformerEncoder,
    "TransformerDecoder": keras_nlp.layers.TransformerDecoder
})
loaded_model_ml.summary()

In [None]:
malayalam_vocab = malayalam_vectorization.get_vocabulary()
malayalam_index_lookup = dict(zip(range(len(malayalam_vocab)), malayalam_vocab))
start_index = malayalam_vocab.index(config.start_token)
end_index = malayalam_vocab.index(config.end_token)
unk_index = malayalam_vocab.index("[UNK]")

# Malayalam-specific snapshots. english_vectorization, start_index and
# end_index are rebound by the later Telugu section, and decode_sequence_ml is
# called again after that, so the function must not read those shared globals.
_ml_english_vectorization = english_vectorization
_ml_start_index = start_index
_ml_end_index = end_index


def decode_sequence_ml(model_ml, input_sentence, filtered_values = [start_index, end_index, unk_index]):
    """Greedily translate one English sentence into Malayalam.

    Args:
        model_ml: model called as model([encoder_ids, decoder_ids]) that
            returns per-position scores over the Malayalam vocabulary.
        input_sentence: English sentence as a Python string.
        filtered_values: token ids dropped from the returned text.

    Returns:
        The decoded Malayalam words joined by single spaces.
    """
    tokenized_input_sentence = _ml_english_vectorization([input_sentence])
    # Position 0 holds the start token; the remaining slots are filled with 0.
    decoded_sentence = [_ml_start_index] + [0] * (config.sequence_length)
    for i in range(config.sequence_length):
        decoded_sentence_constant = tf.constant([decoded_sentence[:config.sequence_length]])
        predictions = model_ml([tokenized_input_sentence, decoded_sentence_constant])
        # Greedy choice: the highest-scoring token at the current position.
        sampled_token_index = np.argmax(predictions[0, i, :])
        decoded_sentence[i + 1] = sampled_token_index
        if sampled_token_index == _ml_end_index:
            break
    # Skip the 0-filled slots as well: they map to blank vocabulary entries and
    # would otherwise leave a run of trailing spaces in the result.
    components = [
        malayalam_index_lookup[c]
        for c in decoded_sentence
        if c != 0 and c not in filtered_values
    ]
    return " ".join(components)

In [None]:
# Spot-check the reloaded model on 10 randomly chosen rows.
for row_idx in tqdm(np.random.choice(len(data_ml), 10)):
    row = data_ml.iloc[row_idx]
    # Reference text with the start/end markers removed again.
    reference = row["malayalam"].replace("[start] ", "").replace(" [end]", "")
    prediction = decode_sequence_ml(loaded_model_ml, row["english"])
    print("English:", row["english"])
    print("malayalam:", reference)
    print("Translated:", prediction)

In [None]:
# Load the English/Telugu sentence pairs (columns "english" and "telugu").
# NOTE(review): absolute Colab path; the file must be uploaded to /content first.
data_te = pd.read_csv("/content/telugu - data.csv")
data_te.head()

In [None]:
# Wrap every target sentence in the start/end markers the decoder relies on.
# The startswith guard keeps the cell idempotent: re-running it does not wrap
# an already wrapped sentence a second time.
data_te["telugu"] = data_te["telugu"].apply(
    lambda item: item
    if item.startswith(f"{config.start_token} ")
    else f"{config.start_token} " + item + f" {config.end_token}"
)

In [None]:
# Characters removed from target text: ASCII punctuation plus "¿", except the
# square brackets, which must survive so "[start]" / "[end]" stay intact.
strip_chars = "".join(ch for ch in string.punctuation + "¿" if ch not in "[]")


def telugu_standardize(input_string):
    """Lowercase the input and delete every character listed in strip_chars."""
    pattern = "[%s]"%re.escape(strip_chars)
    return tf.strings.regex_replace(tf.strings.lower(input_string), pattern, "")


# Source side: default standardization, sequences of sequence_length tokens.
english_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=config.vocab_size,
    output_sequence_length=config.sequence_length,
)
# Target side: one extra token so inputs/labels can be offset by one position.
telugu_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=config.vocab_size,
    output_sequence_length=config.sequence_length + 1,
    standardize=telugu_standardize,
)

# Build both vocabularies from the full data frame.
english_vectorization.adapt(data_te["english"].tolist())
telugu_vectorization.adapt(data_te["telugu"].tolist())


In [None]:
def preprocess(english, telugu):
    """Vectorize a batch of sentence pairs into model inputs and labels.

    Returns a (features, labels) tuple: the features dict holds the encoder
    token ids and the target ids without their last position; the labels are
    the target ids without their first position (i.e. shifted by one).
    """
    source_ids = english_vectorization(english)
    target_ids = telugu_vectorization(telugu)
    features = {
        "encoder_inputs": source_ids,
        "decoder_inputs": target_ids[:, :-1],
    }
    labels = target_ids[:, 1:]
    return (features, labels)
def make_dataset(df, batch_size, mode):
    """Build a batched, vectorized tf.data pipeline from a data frame.

    Args:
        df: frame with "english" and "telugu" columns.
        batch_size: number of sentence pairs per batch.
        mode: "train" enables shuffling; any other value leaves the order fixed.

    Returns:
        A tf.data.Dataset yielding the (features, labels) tuples of preprocess.
    """
    dataset = tf.data.Dataset.from_tensor_slices((list(df["english"]), list(df["telugu"])))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(preprocess)
    # Cache the vectorized batches BEFORE shuffling: caching after the shuffle
    # would store the first epoch's order and replay it for every later epoch.
    dataset = dataset.cache()
    if mode == "train":
        # Shuffles whole batches, drawing a fresh order each epoch.
        # NOTE(review): batch membership is fixed; rows are presumably already
        # permuted by train_test_split before they get here.
        dataset = dataset.shuffle(batch_size * 4)
    # prefetch goes last so it overlaps input preparation with training steps.
    return dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# Fixed random_state makes the train/validation split reproducible across runs.
train, valid = train_test_split(data_te, test_size=config.validation_split, random_state=42)
train.shape, valid.shape

In [None]:
# Turn both splits into tf.data pipelines; only the training split is shuffled.
train_ds = make_dataset(train, config.batch_size, "train")
valid_ds = make_dataset(valid, config.batch_size, "valid")

In [None]:
def get_model_te(config):
    """Build and compile the English -> Telugu Transformer.

    The architecture is the same for every language in this notebook, so this
    delegates to get_model (defined in the earlier Tamil/Hindi sections, which
    must have been run first) instead of repeating its body line for line.

    Args:
        config: object providing vocab_size, sequence_length, embed_dim,
            latent_dim and num_heads.

    Returns:
        The compiled keras.Model produced by get_model(config).
    """
    return get_model(config)

In [None]:
# Fresh, untrained Transformer for English -> Telugu.
model_te = get_model_te(config)

In [None]:
# Keep only the checkpoint with the best validation accuracy. Accuracy is a
# higher-is-better metric, so mode must be "max"; with "min", save_best_only
# would retain the epoch with the LOWEST validation accuracy.
checkpoints = tf.keras.callbacks.ModelCheckpoint(
    "model_te.tf",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True
)
# NOTE(review): with the 10 epochs set in Config, patience=10 can never trigger.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=10,
    monitor="val_loss",
    mode="min",
    restore_best_weights=True
)
history = model_te.fit(train_ds, epochs=config.epochs, validation_data=valid_ds, callbacks=[checkpoints, early_stop])
# NOTE(review): this figure is measured on the training set, not on valid_ds.
accuracy = model_te.evaluate(train_ds, return_dict=True)['accuracy']
print(f'Accuracy of Telugu: {accuracy*100:.4f}')

In [None]:
# Reload the checkpoint written to "model_te.tf" by the ModelCheckpoint callback.
# custom_objects maps the keras_nlp layer names found in the saved model back
# to their classes.
loaded_model_te = tf.keras.models.load_model("model_te.tf", custom_objects={
    "TokenAndPositionEmbedding": keras_nlp.layers.TokenAndPositionEmbedding,
    "TransformerEncoder": keras_nlp.layers.TransformerEncoder,
    "TransformerDecoder": keras_nlp.layers.TransformerDecoder
})
loaded_model_te.summary()

In [None]:
telugu_vocab = telugu_vectorization.get_vocabulary()
telugu_index_lookup = dict(zip(range(len(telugu_vocab)), telugu_vocab))
start_index = telugu_vocab.index(config.start_token)
end_index = telugu_vocab.index(config.end_token)
unk_index = telugu_vocab.index("[UNK]")

# Telugu-specific snapshots, so the function does not depend on the shared
# english_vectorization / start_index / end_index globals that every language
# section of this notebook rebinds.
_te_english_vectorization = english_vectorization
_te_start_index = start_index
_te_end_index = end_index


def decode_sequence_te(model_te, input_sentence, filtered_values = [start_index, end_index, unk_index]):
    """Greedily translate one English sentence into Telugu.

    Args:
        model_te: model called as model([encoder_ids, decoder_ids]) that
            returns per-position scores over the Telugu vocabulary.
        input_sentence: English sentence as a Python string.
        filtered_values: token ids dropped from the returned text.

    Returns:
        The decoded Telugu words joined by single spaces.
    """
    tokenized_input_sentence = _te_english_vectorization([input_sentence])
    # Position 0 holds the start token; the remaining slots are filled with 0.
    decoded_sentence = [_te_start_index] + [0] * (config.sequence_length)
    for i in range(config.sequence_length):
        decoded_sentence_constant = tf.constant([decoded_sentence[:config.sequence_length]])
        predictions = model_te([tokenized_input_sentence, decoded_sentence_constant])
        # Greedy choice: the highest-scoring token at the current position.
        sampled_token_index = np.argmax(predictions[0, i, :])
        decoded_sentence[i + 1] = sampled_token_index
        if sampled_token_index == _te_end_index:
            break
    # Skip the 0-filled slots as well: they map to blank vocabulary entries and
    # would otherwise leave a run of trailing spaces in the result.
    components = [
        telugu_index_lookup[c]
        for c in decoded_sentence
        if c != 0 and c not in filtered_values
    ]
    return " ".join(components)

In [None]:
# Spot-check the reloaded model on 10 randomly chosen rows.
for row_idx in tqdm(np.random.choice(len(data_te), 10)):
    row = data_te.iloc[row_idx]
    # Reference text with the start/end markers removed again.
    reference = row["telugu"].replace("[start] ", "").replace(" [end]", "")
    prediction = decode_sequence_te(loaded_model_te, row["english"])
    print("English:", row["english"])
    print("telugu:", reference)
    print("Translated:", prediction)

***TAMIL***

In [None]:
def translate_user_input(input_sentence):
    """Translate an English sentence to Tamil and print both versions."""
    tamil_text = decode_sequence_ta(loaded_model_ta, input_sentence)
    print("Input English:", input_sentence)
    print("Translated Tamil:", tamil_text)


# Example usage:
user_input_sentence = "Where is Tom?"
translate_user_input(user_input_sentence)

***HINDI***



In [None]:
def translate_user_input(input_sentence):
    """Translate an English sentence to Hindi and print both versions."""
    hindi_text = decode_sequence_hi(loaded_model_hi, input_sentence)
    print("Input English:", input_sentence)
    print("Translated hindi:", hindi_text)


# Example usage:
user_input_sentence = "Tom is happy"
translate_user_input(user_input_sentence)

***MALAYALAM***

In [None]:
def translate_user_input(input_sentence):
    """Translate an English sentence to Malayalam and print both versions."""
    malayalam_text = decode_sequence_ml(loaded_model_ml, input_sentence)
    print("Input English:", input_sentence)
    print("Translated malayalam:", malayalam_text)


# Example usage:
user_input_sentence = "We know her"
translate_user_input(user_input_sentence)

***TELUGU***

In [None]:
def translate_user_input(input_sentence):
    """Translate an English sentence to Telugu and print both versions."""
    telugu_text = decode_sequence_te(loaded_model_te, input_sentence)
    print("Input English:", input_sentence)
    print("Translated telugu:", telugu_text)


# Example usage:
user_input_sentence = "Tom is happy"
translate_user_input(user_input_sentence)