# Loading Libraries

In [None]:
import re
import pickle
import string
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout 
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.preprocessing.text import Tokenizer

# Custom Functions

In [None]:
def remove_punctuation(text):  
    """
    This function removes all punctuation characters from the input text.
    """
    return text.translate(str.maketrans('', '', string.punctuation))

def contains_hindi(text):  
    """
    Function that checks if the input string contains any Hindi characters.
    """
    if isinstance(text, str):
        return bool(re.search(r'[\u0900-\u097F]', text))
    return False

def contains_english(text):
    """
    Function that checks if the input string contains any English characters.
    """
    if isinstance(text, str):
        return bool(re.search(r'[a-zA-Z]', text))
    return False

def generate_model_input(X: pd.Series, y: pd.Series, encoder_word_index: dict, decoder_word_index: dict, 
                         max_encoder_sent_size: int = 20, max_decoder_sent_size: int = 20, decoder_vocab_size: int = 50000):
    """
    Function to prepare input and target data for sequence to sequence model training.
    """
    batch_size = len(X)  
    for j in range(0, len(X), batch_size):
        encoder_input_data = np.zeros((batch_size, max_encoder_sent_size), dtype = 'float64')
        decoder_input_data = np.zeros((batch_size, max_decoder_sent_size), dtype = 'float64')
        decoder_target_data = np.zeros((batch_size, max_decoder_sent_size, decoder_vocab_size), dtype = 'float64')
        for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
            for t, word in enumerate(input_text.split()):
                encoder_input_data[i, t] = encoder_word_index[word]
            for t, word in enumerate(target_text.split()):
                if t < len(target_text.split(' '))-1:
                    decoder_input_data[i, t] = decoder_word_index[word]
                if t > 0:
                    decoder_target_data[i, t-1, decoder_word_index[word]] = 1 
    return encoder_input_data, decoder_input_data, decoder_target_data

# Loading data

In [None]:
data = pd.read_csv('Hindi_English_Truncated_Corpus.csv')  

# Let's select 10,000 lines from the data due to computational constraints and the large dataset size
data = data.sample(n=10000,random_state=42).reset_index(drop=True)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   source            10000 non-null  object
 1   english_sentence  9999 non-null   object
 2   hindi_sentence    10000 non-null  object
dtypes: object(3)
memory usage: 234.5+ KB


# Pre-Process Data

In [None]:
# Eliminate rows where english_sentence is null
data = data[data['english_sentence'].notnull()].reset_index(drop=True) 

# Conver both the columns into lower case
data['english_sentence'] = data['english_sentence'].str.lower()
data['hindi_sentence'] = data['hindi_sentence'].str.lower() 

# Eliminate all punctuation
data['english_sentence'] = data['english_sentence'].apply(remove_punctuation)
data['hindi_sentence'] = data['hindi_sentence'].apply(remove_punctuation)

# Remove rows where the "english text" column contains Hindi text to prevent data noise, and vice versa
data = data[~ data['english_sentence'].apply(contains_hindi)].reset_index(drop=True)
data = data[~ data['hindi_sentence'].apply(contains_english)].reset_index(drop=True)

# Eliminate all numerical digits from the text
data['english_sentence'] = data['english_sentence'].str.replace(r"\d", "", regex=True)
data['hindi_sentence'] = data['hindi_sentence'].str.replace(r"[२३०८१५७९४६\da-zA-Z]", "", regex=True)

# Remove some additional special characters
special_chars_english = ['”“', '‘', 'বসু', 'ē', '””', 'ó', '“', '“”“”', 'ī', '€', 'á', 'é', '”', 'š', '“”', '♫', 'ś', 'ā', 'ō', 'চন্দ্র', '°', 'í', 'সুভাষ']
data["english_sentence"]  =  data["english_sentence"].str.replace('|'.join(map(re.escape,special_chars_english)), "", regex=True)

special_chars_hindi = ['ن', '™', 'ভ', '‘', 'ق', 'م', '্', 'র', 'ا', '†', '\u200c', '½', '¬', 'ل', 'ر', 'ٓ', '…', '\u200b', '“', 'া', 'ন', 'থ', '中', '⇒', 'º', 'ক', 'ষ', 'س', 'ٕ', '”', 'দ', '\u200e', '♫', '¼', '國', '\x14', 'ঠ', 'চ', 'স', '¥', 'ী', 'ু', '°', 'ব', "_", '।']
data["hindi_sentence"]  =  data["hindi_sentence"].str.replace('|'.join(map(re.escape,special_chars_hindi)), "", regex=True)

# Trim the sentences by removing extra spaces
data['english_sentence'] = data['english_sentence'].str.strip()
data['hindi_sentence'] = data['hindi_sentence'].str.strip()

# Add start and end tokens to the target text
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: 'START_ ' + str(x) + ' _END')

# Eliminate long sentences
data['english_sentence_len'] = data['english_sentence'].apply(lambda x: len(x.split(' ')))
data['hindi_sentence_len'] = data['hindi_sentence'].apply(lambda x: len(x.split(' '))) 

data = data[((data['english_sentence_len']<=20) & (data['hindi_sentence_len']<=20))].reset_index(drop=True)

In [6]:
print("max length of hindi sentence:{}".format(max(data['hindi_sentence_len'])))
print("max length of english sentence:{}".format(max(data['english_sentence_len'])))
print(data.shape)
data.head()


max length of hindi sentence:20
max length of english sentence:20
(6359, 5)


Unnamed: 0,source,english_sentence,hindi_sentence,english_sentence_len,hindi_sentence_len
0,ted,was a little uncomfortable for them,START_ थोडा कठिन था _END,6,5
1,indic2012,but mulla assamudin was proved to be not eligible,START_ मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए _END,9,8
2,ted,i would never have to make a book and then pre...,START_ मुझे कभी भी किताब बना कर किसी प्रदर्शनस...,15,16
3,tides,no other national leader except nehru shared t...,START_ नेहरू को छोड़कर और किसी भी राष्ट्र नेता...,9,17
4,tides,innocent people were shot arrested jailed an...,START_ अंग्रेज शासकों ने निर्दोष लोगों की जान ...,14,20


# Convert the Text Into Tokens

In [None]:
# Tokinize English Text
english_encoder = Tokenizer(filters='', lower=False)  
english_encoder.fit_on_texts(data.english_sentence)

# Tokinize Hindi Text
hindi_encoder = Tokenizer(filters='', lower=False)
hindi_encoder.fit_on_texts(data.hindi_sentence)

In [None]:
# Define English vocabulary size and word index, and vice versa
english_vocab_len = len(english_encoder.word_index) + 1 # Adding 1 for padding zero
english_word_index = english_encoder.word_index
english_index_word = english_encoder.index_word

# Define Hindi vocabulary size and word index, and vice versa
hindi_vocab_len = len(hindi_encoder.word_index) + 1 # Adding 1 for padding zero
hindi_word_index = hindi_encoder.word_index
hindi_index_word = hindi_encoder.index_word

# Max Size of sentence
max_eng_sen_len = 20
max_hindi_sen_len = 20

X = data['english_sentence']
y = data['hindi_sentence']

In [None]:
# Creating the input data for Seq2Seq modeling 
encoder_input_data, decoder_input_data, decoder_target_data = generate_model_input(X = X,
                                                                                    y = y,
                                                                                    encoder_word_index = english_word_index,
                                                                                    decoder_word_index = hindi_word_index,
                                                                                    max_encoder_sent_size = max_eng_sen_len,
                                                                                    max_decoder_sent_size = max_hindi_sen_len,
                                                                                    decoder_vocab_size = hindi_vocab_len)

* Encode all the inputs for Encoder and Decoder 
* decoder_input_data: This represents the input to the decoder during training
* decoder_target_data: This is the expected output from the decoder. (Teacher Forcing) 
* Teacher forcing is a training strategy used in sequence-to-sequence models where the actual target output from the training dataset is passed as the next input to the decoder, rather than using the decoder's own previous prediction. This approach helps the model converge faster and improves performance by providing correct context during training.

# Seq2Seq Model Building and Training 

![My Image](Encoder_and_Decoder.jpg)

In [None]:
# Encoder and Decoder Architecture
embedding_dim = 64  # Dimension of embedding vectors
lstm_dim = 64  # Number of LSTM units

# Encoder
encoder_inputs = Input(shape=(None,), name='encoder_inputs')  # Encoder input layer
encoder_emb = Embedding(input_dim=english_vocab_len, output_dim=embedding_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)  # Encoder embedding layer
encoder_lstm = LSTM(lstm_dim, name='encoder_lstm', return_state=True)  # Encoder LSTM layer
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_emb)  # Get encoder outputs and states
encoder_state = [encoder_state_h, encoder_state_c]  # Store encoder states

# Decoder
decoder_inputs = Input(shape=(None,), name='decoder_inputs')  # Decoder input layer
decoder_emb_layer = Embedding(input_dim=hindi_vocab_len, output_dim=embedding_dim, mask_zero=True, name='decoder_embedding')  # Decoder embedding layer
decoder_emb = decoder_emb_layer(decoder_inputs)  # Apply embedding to decoder inputs
decoder_lstm = LSTM(lstm_dim, name='decoder_lstm', return_state=True, return_sequences=True)  # Decoder LSTM layer
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_state)  # Connect decoder LSTM with encoder states

decoder_dense = Dense(hindi_vocab_len, activation='softmax')  # Dense layer with softmax activation
decoder_outputs = decoder_dense(decoder_outputs)  # Apply dense layer to decoder outputs

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)  # Define the model with encoder and decoder inputs

In [None]:
# Complie the model with optimizer rmsprop and loss categorical_crossentropy as we have multi-class classification problems 
model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 

In [12]:
model.summary()

In [None]:
# Training the model 
batch_size = 128
epochs = 500

history = model.fit([encoder_input_data,decoder_input_data],
                    decoder_target_data, 
                    epochs=epochs,
                    # batch_size = batch_size,
                    )

Epoch 1/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 89ms/step - loss: 8.1673
Epoch 2/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 94ms/step - loss: 6.6792
Epoch 3/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 84ms/step - loss: 6.5883
Epoch 4/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 100ms/step - loss: 6.5499
Epoch 5/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 90ms/step - loss: 6.5083
Epoch 6/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 76ms/step - loss: 6.4513
Epoch 7/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 77ms/step - loss: 6.4117
Epoch 8/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 76ms/step - loss: 6.3599
Epoch 9/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 81ms/step - loss: 6.3288
Epoch 10/500
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

# Testing and performing inference with the trained model

### Training vs. Inference in Language Translation
* Training Phase:
    * Objective: Teach the model to accurately translate sentences from the source language (e.g., English) to the target language (e.g., Hindi) by learning from paired examples.
    * Mechanism: Utilizes teacher forcing, where the model is guided using the actual target translations during training.
    * Data: Uses large datasets of aligned source-target sentence pairs.
    * Inference (Prediction) Phase:
* Inference Phase:
    * Objective: Translate new, unseen sentences from the source language to the target language using the trained model.
    * Mechanism: Relies on the model's own previous translations to generate the next word, as true target sentences aren't available.
    * Data: Processes individual source sentences to produce translations.

### Detailed Differences in Architecture and Workflow
* Training:
    * Teacher Forcing: The decoder receives the actual next word from the target sentence at each timestep.
    * Example: If the target sentence is "मैं ठीक हूँ" ("I am fine"), after processing "मैं" ("I"), the decoder is fed "ठीक" ("fine") as the next input.

* Inference:
    * Autoregressive Generation: The decoder uses its previously generated word to predict the next word.
    * Example: Starting with a start token <start>, the decoder predicts "मैं" ("I"), then uses "मैं" as input to predict the next word, and so on

## Model Structure Configuration
* Training:
    * Unified Encoder-Decoder Model: Combines both encoder and decoder into a single model that processes entire sentence pairs simultaneously.
    * Batch Processing: Handles multiple sentence pairs at once for efficient computation.

* Inference:
    * Separate Models: Often splits encoder and decoder into two distinct models to facilitate step-by-step translation.
    * Encoder Model: Encodes the source sentence and outputs the initial states.
    * Decoder Model: Generates the target sentence one word at a time using its own predictions.

In [35]:
# As Training and Predict is slightly Diffrent in terms of Architecture  
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_state)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(lstm_dim,))
decoder_state_input_c = Input(shape=(lstm_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= decoder_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [None]:
# Decode the input sequence 
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    print(states_value)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = hindi_word_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = hindi_index_word[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Prediction

In [46]:
testing_index = 22
input_seq, actual_output, _ = encoder_input_data[[testing_index]], decoder_input_data[[testing_index]], decoder_target_data[testing_index]
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X[testing_index:testing_index+1].values[0])
print('Actual Hindi Translation:', y[testing_index:testing_index+1].values[0])
print('Predicted Hindi Translation:', decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
[array([[ 0.6704298 , -0.4029944 , -0.33142275,  0.06034991,  0.15625395,
        -0.17619132,  0.52965045,  0.76254046,  0.03213897,  0.06815488,
        -0.06503742, -0.03291818,  0.25282866, -0.78122604, -0.24525733,
         0.5436336 ,  0.03457872,  0.7299581 ,  0.22003767,  0.18649064,
         0.47737852, -0.6951509 , -0.14749774, -0.8636037 , -0.6667455 ,
         0.68263155,  0.8621505 ,  0.14227512, -0.23429173, -0.02629138,
        -0.37613198,  0.80473554,  0.61116403,  0.5954827 ,  0.09045365,
        -0.6389973 , -0.03640006, -0.5198302 ,  0.40076995, -0.24318467,
         0.31379873, -0.37021825,  0.66576624,  0.6855959 ,  0.10738394,
        -0.06449943,  0.11774065, -0.3164354 ,  0.7613881 , -0.08142158,
        -0.12353835,  0.8132702 ,  0.6940669 ,  0.06921986, -0.02606781,
         0.17728792, -0.3205029 , -0.3554554 ,  0.6668752 ,  0.1727669 ,
         0.12468718, -0.18950728,  0.51169026, -0.1

# Save all the necessary Artifacts for future reference and the Streamlit app

In [None]:
# Save all the model 
model.save("encoder_decoder_model.keras")
encoder_model.save("encoder_model.keras")
decoder_model.save("decoder_model.keras")

In [None]:
# Save the English tokenizer
with open('english_tokenizer.pickle', 'wb') as f:
    pickle.dump(english_encoder, f)

# Save the Hindi tokenizer
with open('hindi_tokenizer.pickle', 'wb') as f:
    pickle.dump(hindi_encoder, f)