# 1. Importing Libraries

The necessary libraries are imported for data processing, model building, and evaluation.

In [1]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import nltk.translate.bleu_score as bleu
import random
import string
from sklearn.model_selection import train_test_split
import os
import time

# 2. Loading and Preprocessing Data

The data from the CSV file (eng_tel.csv) is loaded into a pandas DataFrame and preprocessed to remove special characters, digits, and extra spaces.

In [2]:
eng_hin=pd.read_csv('Hindi_English_Truncated_Corpus.csv')
eng_hin.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [3]:
eng_hin.dropna(inplace=True)
eng_hin=eng_hin[:50000]
eng_hin.drop(['source'],axis=1,inplace=True)
eng_hin.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_hin.drop(['source'],axis=1,inplace=True)


(9654, 2)

In [4]:
exclude = set(string.punctuation) # Set of all special characters
remove_digits = str.maketrans('', '', string.digits) # Set of all digits

In [5]:
def preprocess(text):
    '''Function to preprocess English sentence'''
    text = text.lower() # lower casing
    text = re.sub("'", '', text) # remove the quotation marks if any
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.translate(remove_digits) # remove the digits
    text = text.strip()
    text = re.sub(" +", " ", text) # remove extra spaces
    text = '<start> ' + text + ' <end>'
    return text

In [6]:
def preprocess_hin(text):
    '''Function to preprocess Marathi sentence'''
    text = re.sub("'", '', text) # remove the quotation marks if any
    text = ''.join(ch for ch in text if ch not in exclude)
    text = re.sub("[२३०८१५७९४६]", "", text) # remove the digits
    text = text.strip()
    text = re.sub(" +", " ", text) # remove extra spaces
    text = '<start> ' + text + ' <end>'
    return text

In [7]:
eng_hin['english_sentence'] = eng_hin['english_sentence'].apply(preprocess)
eng_hin['hindi_sentence'] = eng_hin['hindi_sentence'].apply(preprocess_hin)

eng_hin.rename(columns={"english_sentence": "english", "hindi_sentence": "hindi"},inplace=True)

eng_hin.head()

Unnamed: 0,english,hindi
0,<start> politicians do not have permission to ...,<start> राजनीतिज्ञों के पास जो कार्य करना चाहि...
1,<start> id like to tell you about one such chi...,<start> मई आपको ऐसे ही एक बच्चे के बारे में बत...
2,<start> this percentage is even greater than t...,<start> यह प्रतिशत भारत में हिन्दुओं प्रतिशत स...
3,<start> what we really mean is that theyre bad...,<start> हम ये नहीं कहना चाहते कि वो ध्यान नहीं...
4,<start> the ending portion of these vedas is c...,<start> इन्हीं वेदों का अंतिम भाग उपनिषद कहलात...


# 3. Tokenization and Dataset Preparation

The English and Telugu sentences are tokenized, and the dataset is prepared for training by splitting it into training and validation sets.

In [8]:
def tokenize(lang):

  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post',maxlen=20,dtype='int32')

  return tensor, lang_tokenizer

In [9]:
def load_dataset():

  input_tensor, inp_lang_tokenizer = tokenize(eng_hin['english'].values)
  target_tensor, targ_lang_tokenizer = tokenize(eng_hin['hindi'].values)

  return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer

In [10]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset()

In [11]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [12]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

7723 7723 1931 1931


# 4. Define Constants and Hyperparameters

Constants and hyperparameters such as batch size, embedding dimensions, and optimizer settings are defined.

In [13]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE

vocab_inp_size =len(inp_lang.word_index.keys())
vocab_tar_size =len(targ_lang.word_index.keys())

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# 5. Load GloVe Embeddings and Create Embedding Matrix

GloVe embeddings are loaded from a text file (glove.6B.300d.txt), and an embedding matrix is created based on the words in the dataset.

In [16]:
embeddings_index = dict()
f = open('glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_inp_size + 1, 300))

for word, i in inp_lang.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Check if the dimensions match, if not, pad or truncate
        if embedding_vector.shape[0] != 300:
            if embedding_vector.shape[0] < 300:
                # Pad the vector with zeros to match the expected shape
                padded_embedding = np.pad(embedding_vector, (0, 300 - embedding_vector.shape[0]), 'constant')
                embedding_matrix[i] = padded_embedding
            else:
                # Truncate the vector to match the expected shape
                truncated_embedding = embedding_vector[:300]
                embedding_matrix[i] = truncated_embedding
        else:
            # If the dimensions already match, just assign it directly
            embedding_matrix[i] = embedding_vector


# 6. Define Encoder and Decoder Classes

Classes for the encoder and decoder models are defined, including the attention mechanism.

In [17]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="embedding_layer_encoder",trainable=False)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [18]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

                # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):

        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        output, state = self.gru(x)

        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [19]:
tf.keras.backend.clear_session()

encoder = Encoder(vocab_inp_size+1, 300, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size+1, embedding_dim, units, BATCH_SIZE)

# 7. Define Loss Function and Optimization Step

The loss function and optimization step for training the model are defined.

In [20]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')


def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [21]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# 8. Training Loop

The model is trained over multiple epochs, and the loss is printed for each epoch.

In [22]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    encoder.get_layer('embedding_layer_encoder').set_weights([embedding_matrix])
    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [23]:
EPOCHS = 15

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix=checkpoint_prefix)

  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 1 Batch 0 Loss 6.3610
Epoch 1 Batch 100 Loss 5.0771
Epoch 1 Loss 4.9001
Time taken for 1 epoch 54.49 sec

Epoch 2 Batch 0 Loss 4.2841
Epoch 2 Batch 100 Loss 4.5213
Epoch 2 Loss 4.5070
Time taken for 1 epoch 29.40 sec

Epoch 3 Batch 0 Loss 4.8070
Epoch 3 Batch 100 Loss 4.0473
Epoch 3 Loss 4.3789
Time taken for 1 epoch 22.69 sec

Epoch 4 Batch 0 Loss 4.1506
Epoch 4 Batch 100 Loss 4.2864
Epoch 4 Loss 4.2275
Time taken for 1 epoch 23.97 sec

Epoch 5 Batch 0 Loss 4.4798
Epoch 5 Batch 100 Loss 3.9106
Epoch 5 Loss 4.0594
Time taken for 1 epoch 22.61 sec

Epoch 6 Batch 0 Loss 3.8949
Epoch 6 Batch 100 Loss 3.6991
Epoch 6 Loss 3.8836
Time taken for 1 epoch 24.89 sec

Epoch 7 Batch 0 Loss 3.9689
Epoch 7 Batch 100 Loss 3.6945
Epoch 7 Loss 3.7129
Time taken for 1 epoch 22.60 sec

Epoch 8 Batch 0 Loss 3.1643
Epoch 8 Batch 100 Loss 3.8585
Epoch 8 Loss 3.5072
Time taken for 1 epoch 24.71 sec

Epoch 9 Batch 0 Loss 3.4841
Epoch 9 Batch 100 Loss 3.4347
Epoch 9 Loss 3.2769
Time taken for 1 epoch 22.

In [24]:
for epoch in range(EPOCHS,20):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix=checkpoint_prefix)

  print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

Epoch 16 Batch 0 Loss 0.9226
Epoch 16 Batch 100 Loss 1.0712
Epoch 16 Loss 1.0633
Time taken for 1 epoch 24.85 sec

Epoch 17 Batch 0 Loss 0.9146
Epoch 17 Batch 100 Loss 0.9474
Epoch 17 Loss 0.8563
Time taken for 1 epoch 22.92 sec

Epoch 18 Batch 0 Loss 0.6780
Epoch 18 Batch 100 Loss 0.6572
Epoch 18 Loss 0.6764
Time taken for 1 epoch 25.00 sec

Epoch 19 Batch 0 Loss 0.5077
Epoch 19 Batch 100 Loss 0.4994
Epoch 19 Loss 0.5257
Time taken for 1 epoch 22.89 sec

Epoch 20 Batch 0 Loss 0.3561
Epoch 20 Batch 100 Loss 0.4207
Epoch 20 Loss 0.4040
Time taken for 1 epoch 24.73 sec



# 9. Evaluation Function

A function to evaluate input sentences and generate predictions in Telugu is defined.

In [25]:
def evaluate(sentence):
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess(sentence)

  inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],maxlen=20, padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)
    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()
    predicted_id = tf.argmax(predictions[0]).numpy()

    result += targ_lang.index_word[predicted_id] + ' '

    if targ_lang.index_word[predicted_id] == '<end>':
      return result,attention_plot

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result,attention_plot

# 10. Function to check Accuracy


In [46]:
# Function to evaluate BLEU score
def calculate_bleu_score(reference, candidate):
    return bleu.sentence_bleu([reference.split()], candidate.split())

# Function to evaluate accuracy
def evaluate_accuracy(input_sentence, target_sentence):
    predicted_output, _ = evaluate(input_sentence)
    bleu_score = calculate_bleu_score(target_sentence, predicted_output)
    return bleu_score

# 11. Sample Input Sentences and Predictions and Checking for accuracy


Two sample English sentences are provided, and their corresponding Telugu translations are generated using the trained model.

In [47]:
input_sentence= 'please ensure that you use the appropriate form '
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)
accuracy = evaluate_accuracy(input_sentence,predicted_output )
print("BLEU Score:", accuracy)

Input sentence in english :  please ensure that you use the appropriate form 
Predicted sentence in hindi :  कृपया यह सुनिश्चित कर लें कि आप सही फॉर्म का प्रयोग कर रहें हैं <end> 
BLEU Score: 1.0


In [48]:
input_sentence='and do something with it to change the world '
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)
accuracy = evaluate_accuracy(input_sentence,predicted_output )
print("BLEU Score:", accuracy)

Input sentence in english :  and do something with it to change the world 
Predicted sentence in hindi :  और इस दुनिया को बेहतर बनाने के लिये कुछ करेंगे । <end> 
BLEU Score: 1.0


In [49]:
input_sentence="So I decided I'm going to sell this new machine "
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)
accuracy = evaluate_accuracy(input_sentence,predicted_output )
print("BLEU Score:", accuracy)

Input sentence in english :  So I decided I'm going to sell this new machine 
Predicted sentence in hindi :  तो मैने इस नए मशीन को <end> 
BLEU Score: 1.0


In [50]:
input_sentence="due to this he refused to give in writing for his pardon ,even after his punishment was announced "
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)
accuracy = evaluate_accuracy(input_sentence,predicted_output )
print("BLEU Score:", accuracy)

Input sentence in english :  due to this he refused to give in writing for his pardon ,even after his punishment was announced 
Predicted sentence in hindi :  इसी कारण उन्होंने सजा सुनाने के बाद भी माफ़ीनामा लिखने से मना कर दिया । <end> 
BLEU Score: 1.0


In [51]:
input_sentence="We may need to ask you about your background and look at any official documents you have to support the information you give . "
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)
accuracy = evaluate_accuracy(input_sentence,predicted_output )
print("BLEU Score:", accuracy)

Input sentence in english :  We may need to ask you about your background and look at any official documents you have to support the information you give . 
Predicted sentence in hindi :  आप का कोऋ ओङ्ङिचिअल् दस्तावेज आप का कोऋ ओङ्ङिचिअल् दस्तावेज आप का कोऋ ओङ्ङिचिअल् दस्तावेज आप का कोऋ ओङ्ङिचिअल् दस्तावेज 
BLEU Score: 1.0


In [52]:
input_sentence="What are you doing? "
print('Input sentence in english : ',input_sentence)
predicted_output,attention_plot=evaluate(input_sentence)
print('Predicted sentence in hindi : ',predicted_output)
accuracy = evaluate_accuracy(input_sentence,predicted_output )
print("BLEU Score:", accuracy)

Input sentence in english :  What are you doing? 
Predicted sentence in hindi :  क्या आप क्या करते हैं <end> 
BLEU Score: 1.0


# 11. Saving Model i.e encoder and de-coder



In [28]:
from google.colab import drive
drive.mount('/content/drive')

# Save the models
encoder.save('/content/drive/MyDrive/Eng_Hin_encoder_model')
decoder.save('/content/drive/MyDrive/Eng_Hin_decoder_model')

Mounted at /content/drive
