<a href="https://colab.research.google.com/github/akhilesh008/AIML-Training-Akhilesh/blob/main/English_to_Hindi_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Language_Translation/Hindi_English_Truncated_Corpus.csv', encoding='utf-8')
print(data.head())

      source                                   english_sentence  \
0        ted  politicians do not have permission to do what ...   
1        ted         I'd like to tell you about one such child,   
2  indic2012  This percentage is even greater than the perce...   
3        ted  what we really mean is that they're bad at not...   
4  indic2012  .The ending portion of these Vedas is called U...   

                                      hindi_sentence  
0  राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर...  
1  मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...  
2   यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।  
3     हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते  
4        इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।  


## Data Cleaning and preprocessing

In [4]:
pd.isnull(data).sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

#### There are some null values under english_sentences. so remove it and repopulate the data

In [5]:
data = data[~pd.isnull(data['english_sentence'])]
pd.isnull(data).sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

#### Remove duplicates

In [6]:
data.drop_duplicates(inplace=True)

### Experimenting with some subset of the data - 25000 samples

In [7]:
data = data.sample(n=25000)
data.shape

(25000, 3)

### Do some preprocessing to the dataset like converting to lowercase , remove quotes, punctuations, numbers . Add start and end token to the dataset

In [8]:
# lowercase conversion
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.lower())
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.lower())

In [9]:
# removing quotes
import re
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub("'",'',x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("'",'',x))

In [10]:
# remove punctuations
# get all the punctuations
import string
punc = set(string.punctuation)
data['english_sentence'] = data['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in punc))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if x not in punc))


In [11]:
# remove all numbers
from string import digits
remove_digits = str.maketrans('','',digits)
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.translate(remove_digits))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.translate(remove_digits))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

#remove extra space
data['english_sentence'] = data['english_sentence'].apply(lambda x: x.strip())
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.strip())
data['english_sentence'] = data['english_sentence'].apply(lambda x: re.sub(" +"," ",x))
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: re.sub(" +"," ",x))

In [12]:
#add start and end tokens to each target sentence , i.e hindi sentence
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [13]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

# print the pre-processed dataframe header

data.head()

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,source,english_sentence,hindi_sentence
32285,ted,and for the rest of your life every time you hear classical music,"START_ और अपने बाकी जीवन में, हर बार जब आप शास्त्रीय संगीत सुनें _END"
57936,ted,♫ the flies are the size of your head ♫,START_ ♫ मक्खियां आपके सिर के नाप की हैं♫ _END
69653,indic2012,rathore kalyandas threatened to kill both mota raja rao udaisingh and jahangir because udai singh had decided to marry his daughter jodha bai to akbars son jahangir,START_ गढ़ सिवान के राठौर कल्याणदास ने मोटा राजा राव उदयसिंह और जहांगीर को मारने की धमकी भी दी थी क्योंकि उदयसिंह ने अपनी पुत्री जोधाबाई का विवाह अकबर के पुत्र जहांगीर से करने का निश्चय किया था। _END
64448,indic2012,using it hindi language can be written on onlineoffline,START_ ( इनके उपयोग से आनलाइन/आफलाइन कहीं भी हिन्दी में लिखा जा सकता है) _END
110882,ted,wherever theres a wall to the south,START_ जब भी दक्षिण के तरफ दीवार होती है _END


### Get english and Hindi vocab

In [14]:
all_eng_words = set()
for eng in data['english_sentence']:
  for word in eng.split():
    if word not in all_eng_words:
      all_eng_words.add(word)

all_hindi_words = set()
for hindi in data['hindi_sentence']:
  for word in hindi.split():
    if word not in all_hindi_words:
      all_hindi_words.add(word)

print('English vocab length - {}'.format(len(all_eng_words)))
print('Hindi Vocab length - {}'.format(len(all_hindi_words)))

English vocab length - 30880
Hindi Vocab length - 39332


### Add length of each sentence as column to the dataset

In [15]:
data['length_english_sentence'] = data['english_sentence'].apply(lambda x:len(x.split(" ")))
data['length_hindi_sentence'] = data['hindi_sentence'].apply(lambda x:len(x.split(" ")))

data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_english_sentence,length_hindi_sentence
32285,ted,and for the rest of your life every time you hear classical music,"START_ और अपने बाकी जीवन में, हर बार जब आप शास्त्रीय संगीत सुनें _END",13,14
57936,ted,♫ the flies are the size of your head ♫,START_ ♫ मक्खियां आपके सिर के नाप की हैं♫ _END,10,10
69653,indic2012,rathore kalyandas threatened to kill both mota raja rao udaisingh and jahangir because udai singh had decided to marry his daughter jodha bai to akbars son jahangir,START_ गढ़ सिवान के राठौर कल्याणदास ने मोटा राजा राव उदयसिंह और जहांगीर को मारने की धमकी भी दी थी क्योंकि उदयसिंह ने अपनी पुत्री जोधाबाई का विवाह अकबर के पुत्र जहांगीर से करने का निश्चय किया था। _END,27,39
64448,indic2012,using it hindi language can be written on onlineoffline,START_ ( इनके उपयोग से आनलाइन/आफलाइन कहीं भी हिन्दी में लिखा जा सकता है) _END,9,15
110882,ted,wherever theres a wall to the south,START_ जब भी दक्षिण के तरफ दीवार होती है _END,7,10


In [16]:
data[data['length_english_sentence']>30].shape

(2451, 5)

In [17]:
data = data[data['length_english_sentence']<=20]
data = data[data['length_hindi_sentence']<=20]

In [18]:
data.shape

(16023, 5)

### Store the maximum length of english and hindi sentence

In [19]:
max_eng_sentence = max(data['length_english_sentence'])
max_hin_sentence = max(data['length_english_sentence'])

print('Max lenth of english sentence - {}'.format(max_eng_sentence))
print('Max length of hindi senetce - {}'.format(max_hin_sentence))

Max lenth of english sentence - 20
Max length of hindi senetce - 20


### Get the english and hindi words in sorted order
### Also get the encoder and decoder tokens

In [20]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_token = len(all_eng_words)
num_decoder_token = len(all_hindi_words)

print('Encoder tokens {} ; Decoder Tokens {}'.format(num_encoder_token, num_encoder_token))

Encoder tokens 30880 ; Decoder Tokens 30880


### add one extra token for zero padding

In [21]:
num_decoder_token +=1

In [22]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

from sklearn.utils import shuffle
data = shuffle(data)
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_english_sentence,length_hindi_sentence
116138,indic2012,soor has described well about the character and attitude of yashodha and others,START_ . सूर ने यशोदा आदि के शील गुण आदि का सुंदर चित्रण किया है। _END,13,16
61953,ted,has been the age of intervention,"START_ हस्तक्षेप के वर्ष रहे हैं, _END",6,7
108598,ted,because we dont just enjoy now,START_ क्योंकि हम सिर्फ़ मज़ा ही नहीं उठाते बल्कि _END,6,10
40960,ted,you dont even know you have a model,START_ आपको पता भी नहीं चलेगा की आपके पास एक नमूना है| _END,8,13
52780,ted,the third set of ideas are what i call as,START_ तीसरा समूह हैं _END,10,5


### Split the data in train and test

In [23]:
from sklearn.model_selection import train_test_split

X, y = data['english_sentence'], data['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
print('Train size - {}; Test Size - {}'.format(X_train.shape, X_test.shape))

Train size - (12818,); Test Size - (3205,)


### Save datasets

In [24]:
X_train.to_pickle('/content/drive/MyDrive/Language_Translation/X_train.pkl')
X_test.to_pickle('/content/drive/MyDrive/Language_Translation/X_test.pkl')

### Generate training and testing batch

In [25]:
import numpy as np

def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_eng_sentence),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_hin_sentence),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_hin_sentence, num_decoder_token),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [26]:
latent_dim =300

from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_token, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [27]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_token, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_token, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [28]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            9264000   ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            1179990   ['input_2[0][0]']             
                                                          0                                   

In [29]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [None]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

  model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100