# 1. Importing Libraries

The necessary libraries are imported for data processing, model building, and evaluation.

In [1]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk.translate.bleu_score as bleu
import random
import string
from sklearn.model_selection import train_test_split
import os
import time

# 2. Loading and Preprocessing Data

The data from the CSV file (eng_tel.csv) is loaded into a pandas DataFrame and preprocessed to remove special characters, digits, and extra spaces.

In [2]:
eng_tel=pd.read_csv('eng_tel.csv')
eng_tel.head()

Unnamed: 0,English,Telugu
0,politicians do not have permission to do what ...,రాజకీయ నాయకులకు చేయవలసినది చేయడానికి అనుమతి లేదు.
1,"I'd like to tell you about one such child,",అలాంటి ఒక పిల్లల గురించి నేను మీకు చెప్పాలనుకు...
2,This percentage is even greater than the perce...,ఈ శాతం భారతదేశంలో ఉన్న శాతం కంటే ఎక్కువ.
3,what we really mean is that they're bad at not...,మేము నిజంగా అర్థం ఏమిటంటే వారు శ్రద్ధ చూపకపోవడ...
4,.The ending portion of these Vedas is called U...,.ఈ వేదాల ముగింపు భాగాన్ని ఉపనిషత్తు అంటారు.


In [3]:
eng_tel.dropna(inplace=True)
eng_tel.shape

(5615, 2)

In [4]:
exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

In [5]:
def preprocess(text):
    '''Function to preprocess English sentence'''
    text = text.lower() # lower casing
    text = re.sub("'", '', text) # remove the quotation marks if any
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.translate(remove_digits) # remove the digits
    text = text.strip()
    text = re.sub(" +", " ", text) # remove extra spaces
    text = '<start> ' + text + ' <end>'
    return text

In [6]:
def preprocess_tel(text):
    '''Function to preprocess Telugu sentence'''
    text = re.sub("'", '', text) # remove the quotation marks if any
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.strip()
    text = re.sub(" +", " ", text) # remove extra spaces
    text = '<start> ' + text + ' <end>'
    return text

In [7]:
eng_tel['English'] = eng_tel['English'].apply(preprocess)
eng_tel['Telugu'] = eng_tel['Telugu'].apply(preprocess_tel)


eng_tel.head()

Unnamed: 0,English,Telugu
0,<start> politicians do not have permission to ...,<start> రాజకీయ నాయకులకు చేయవలసినది చేయడానికి అ...
1,<start> id like to tell you about one such chi...,<start> అలాంటి ఒక పిల్లల గురించి నేను మీకు చెప...
2,<start> this percentage is even greater than t...,<start> ఈ శాతం భారతదేశంలో ఉన్న శాతం కంటే ఎక్కు...
3,<start> what we really mean is that theyre bad...,<start> మేము నిజంగా అర్థం ఏమిటంటే వారు శ్రద్ధ ...
4,<start> the ending portion of these vedas is c...,<start> ఈ వేదాల ముగింపు భాగాన్ని ఉపనిషత్తు అంట...


# 3. Tokenization and Dataset Preparation
The English and Telugu sentences are tokenized, and the dataset is prepared for training by splitting it into training and validation sets.

In [8]:
def tokenize(lang):

  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post',maxlen=20,dtype='int32')

  return tensor, lang_tokenizer

In [9]:
def load_dataset():

  input_tensor, inp_lang_tokenizer = tokenize(eng_tel['English'].values)
  target_tensor, targ_lang_tokenizer = tokenize(eng_tel['Telugu'].values)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [10]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()

In [11]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [12]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.15)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

4772 4772 843 843


# 4. Define Constants and Hyperparameters

Constants and hyperparameters such as batch size, embedding dimensions, and optimizer settings are defined.

In [13]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 16
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 128
units = 1024
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

vocab_inp_size =len(inp_lang.word_index.keys())
vocab_tar_size =len(targ_lang.word_index.keys())

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# 5. Load GloVe Embeddings and Create Embedding Matrix

GloVe embeddings are loaded from a text file (glove.6B.300d.txt), and an embedding matrix is created based on the words in the dataset.

In [23]:
import numpy as np

# Initialize the embeddings_index dictionary
embeddings_index = dict()

# Open the GloVe embeddings file
with open('glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    # Iterate through each line in the file
    for line in f:
        values = line.split()
        word = values[0]
        # Concatenate negative sign "-" with numeric values
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Close the file
f.close()

'''
embedding_matrix = np.zeros((vocab_inp_size+1, 300))
for word, i in inp_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector '''


embedding_matrix = np.zeros((vocab_inp_size + 1, 300))

for word, i in inp_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Check if the dimensions match, if not, pad or truncate
        if embedding_vector.shape[0] != 300:
            if embedding_vector.shape[0] < 300:
                # Pad the vector with zeros to match the expected shape
                padded_embedding = np.pad(embedding_vector, (0, 300 - embedding_vector.shape[0]), 'constant')
                embedding_matrix[i] = padded_embedding
            else:
                # Truncate the vector to match the expected shape
                truncated_embedding = embedding_vector[:300]
                embedding_matrix[i] = truncated_embedding
        else:
            # If the dimensions already match, just assign it directly
            embedding_matrix[i] = embedding_vector


# 6. Define Encoder and Decoder Classes

Classes for the encoder and decoder models are defined, including the attention mechanism.

In [24]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding_layer_encoder",trainable=False)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [25]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

                # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        output, state = self.gru(x)

        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [26]:
tf.keras.backend.clear_session()

encoder = Encoder(vocab_inp_size+1, 300, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size+1, embedding_dim, units, BATCH_SIZE)

# 7. Define Loss Function and Optimization Step

The loss function and optimization step for training the model are defined.

In [27]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')


def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

# 8. Training Loop

The model is trained over multiple epochs, and the loss is printed for each epoch.

In [28]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    encoder.get_layer('embedding_layer_encoder').set_weights([embedding_matrix])
    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [29]:
EPOCHS = 25

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')

  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 5.8198
Epoch 1 Batch 100 Loss 4.2694
Epoch 1 Batch 200 Loss 5.0857
Epoch 1 Loss 4.4449
Time taken for 1 epoch 65.49 sec

Epoch 2 Batch 0 Loss 5.1837
Epoch 2 Batch 100 Loss 3.3217
Epoch 2 Batch 200 Loss 3.2042
Epoch 2 Loss 4.1271
Time taken for 1 epoch 34.40 sec

Epoch 3 Batch 0 Loss 5.6114
Epoch 3 Batch 100 Loss 4.0349
Epoch 3 Batch 200 Loss 3.7618
Epoch 3 Loss 3.9874
Time taken for 1 epoch 34.05 sec

Epoch 4 Batch 0 Loss 3.8017
Epoch 4 Batch 100 Loss 3.4790
Epoch 4 Batch 200 Loss 3.6839
Epoch 4 Loss 3.8351
Time taken for 1 epoch 33.95 sec

Epoch 5 Batch 0 Loss 3.6902
Epoch 5 Batch 100 Loss 3.5116
Epoch 5 Batch 200 Loss 3.0642
Epoch 5 Loss 3.6073
Time taken for 1 epoch 33.92 sec

Epoch 6 Batch 0 Loss 3.0507
Epoch 6 Batch 100 Loss 2.9054
Epoch 6 Batch 200 Loss 3.8868
Epoch 6 Loss 3.2392
Time taken for 1 epoch 34.03 sec

Epoch 7 Batch 0 Loss 2.9565
Epoch 7 Batch 100 Loss 2.8350
Epoch 7 Batch 200 Loss 1.8837
Epoch 7 Loss 2.6675
Time taken for 1 epoch 34.05 sec

Epoch 

# 9. Evaluation Function

A function to evaluate input sentences and generate predictions in Telugu is defined.

In [30]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=20, padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result,attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result,attention_plot

# 10. Sample Input Sentences and Predictions

Two sample English sentences are provided, and their corresponding Telugu translations are generated using the trained model.



In [31]:
input_sentence= 'please ensure that you use the appropriate form '
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in telugu : ',predicted_output)

Input sentence in english :  please ensure that you use the appropriate form 
Predicted sentence in telugu :  దయచేసి మీరు తగిన ఫారమ్‌ను ఉపయోగిస్తున్నారని నిర్ధారించుకోండి <end> 


In [39]:
input_sentence="So I decided I'm going to sell this new machine"
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in telugu : ',predicted_output)

Input sentence in english :  So I decided I'm going to sell this new machine
Predicted sentence in telugu :  నేను ఈ కొత్త యంత్రాన్ని విక్రయించబోతున్నాను <end> 


In [43]:
input_sentence='due to this he refused to give in writing for his pardon ,even after his punishment was announced'
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in telugu : ',predicted_output)

Input sentence in english :  due to this he refused to give in writing for his pardon ,even after his punishment was announced
Predicted sentence in telugu :  ఈ కారణంగా అతను తన క్షమాపణ కోసం లిఖితపూర్వకంగా ఇవ్వడానికి నిరాకరించాడు <end> 


# 11. Saving the model i.e encoder and decoder designed

In [44]:
from google.colab import drive
drive.mount('/content/drive')

# Save the models
encoder.save('/content/drive/MyDrive/Eng_Tel_encoder_model')
decoder.save('/content/drive/MyDrive/Eng_Tel_decoder_model')

Mounted at /content/drive


# 12 . FUnction to Find Accuracy of the predictions

In [45]:
# Function to evaluate BLEU score
def calculate_bleu_score(reference, candidate):
    return bleu.sentence_bleu([reference.split()], candidate.split())

# Function to evaluate accuracy
def evaluate_accuracy(input_sentence, target_sentence):
    predicted_output, _ = evaluate(input_sentence)
    bleu_score = calculate_bleu_score(target_sentence, predicted_output)
    return bleu_score

# Example usage to evaluate accuracy
input_sentence = 'due to this he refused to give in writing for his pardon ,even after his punishment was announced'
target_sentence = '<start> ఈ కారణంగా అతను తన క్షమాపణ కోసం లిఖితపూర్వకంగా ఇవ్వడానికి నిరాకరించాడు <end> '
accuracy = evaluate_accuracy(input_sentence, target_sentence)
print("BLEU Score:", accuracy)

BLEU Score: 0.9048374180359595


In [46]:
# Example usage to evaluate accuracy
input_sentence = " So I decided I'm going to sell this new machine"
target_sentence = '<start> నేను ఈ కొత్త యంత్రాన్ని విక్రయించబోతున్నాను  <end> '
accuracy = evaluate_accuracy(input_sentence, target_sentence)
print("BLEU Score:", accuracy)

BLEU Score: 0.846481724890614


In [47]:
# Example usage to evaluate accuracy
input_sentence = " please ensure that you use the appropriate form"
target_sentence = '<start> దయచేసి మీరు తగిన ఫారమ్‌ను ఉపయోగిస్తున్నారని నిర్ధారించుకోండి <end> '
accuracy = evaluate_accuracy(input_sentence, target_sentence)
print("BLEU Score:", accuracy)

BLEU Score: 0.8668778997501817


In [51]:
input_sentence="We may need to ask you about your background and look at any official documents you have to support the information you give ."
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in telugu : ',predicted_output)
accuracy = evaluate_accuracy(input_sentence, predicted_output)
print("BLEU Score : ",accuracy)

Input sentence in english :  We may need to ask you about your background and look at any official documents you have to support the information you give .
Predicted sentence in telugu :  మీ నేపథ్యం గురించి మేము మిమ్మల్ని అడగాలి మరియు మేము ఇచ్చే సమాచారానికి మద్దతు ఇవ్వవలసిన అధికారిక పత్రాలను చూడండి <end> 
BLEU Score :  1.0
