In [134]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import string
from string import digits
import matplotlib.pyplot as plt
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

In [135]:
df = pd.read_csv("/kaggle/input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv",encoding='utf-8')
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [136]:
df.drop('source', axis=1, inplace=True)

In [137]:
df.isnull().sum()

english_sentence    2
hindi_sentence      0
dtype: int64

In [138]:
df.dropna(inplace=True)

In [139]:
def data_cleaning(text):

    # Convert to lower
    text = text.lower()

    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove digits
    text = re.sub(r"\d", '', text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text)
    
    return text

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 127605 entries, 0 to 127606
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   english_sentence  127605 non-null  object
 1   hindi_sentence    127605 non-null  object
dtypes: object(2)
memory usage: 2.9+ MB


In [141]:
df['cleaned_english_sentence'] = df['english_sentence'].apply(data_cleaning)
df['cleaned_hindi_sentence'] = df['hindi_sentence'].apply(data_cleaning)

In [142]:
df['length_eng_sentence']=df['cleaned_english_sentence'].apply(lambda x:len(x.split(" ")))
df['length_hin_sentence']=df['cleaned_hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [143]:
df = df[df['length_eng_sentence']< 4]

In [144]:
# Add start and end tokens to target sequences
df['cleaned_hindi_sentence'] = df['cleaned_hindi_sentence'].apply(lambda x : '<START> '+ x + ' <END>')

In [145]:
# Shuffle the dataframe
df = df.sample(n=1000, random_state=42).reset_index(drop=True)

In [146]:
df.head()

Unnamed: 0,english_sentence,hindi_sentence,cleaned_english_sentence,cleaned_hindi_sentence,length_eng_sentence,length_hin_sentence
0,(Laughter),(हंसी),laughter,<START> हंसी <END>,1,1
1,everything understood,मेरी बात मानो भाई,everything understood,<START> मेरी बात मानो भाई <END>,2,4
2,Nepali,नेपाली,nepali,<START> नेपाली <END>,1,1
3,Class:Rivers of northpole,श्रेणी:उत्तराखण्ड की नदियाँ,classrivers of northpole,<START> श्रेणीउत्तराखण्ड की नदियाँ <END>,3,3
4,External links,बाहरी कड़ियाँ,external links,<START> बाहरी कड़ियाँ <END>,2,2


In [147]:
df['hindi_sentence'][0], df['cleaned_hindi_sentence'][0]

('(हंसी)', '<START> हंसी <END>')

In [148]:
#Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
en_tokenizer.fit_on_texts(df['cleaned_english_sentence'])
en_sequences = en_tokenizer.texts_to_sequences(df['cleaned_english_sentence'])

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
hi_tokenizer.fit_on_texts(df['cleaned_hindi_sentence'])
hi_sequences = hi_tokenizer.texts_to_sequences(df['cleaned_hindi_sentence'])

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  1220
Hindi Vocab Size:  1391


In [149]:
en_max_length = df['cleaned_english_sentence'].apply(lambda x:len(str(x).split())).max()
hi_max_length = df['cleaned_hindi_sentence'].apply(lambda x:len(str(x).split())).max()

en_max_length, hi_max_length

(3, 57)

In [150]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=en_max_length, padding='post')

In [151]:
#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
  decoder_inputs.append(hi[:-1])
  decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=hi_max_length, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=hi_max_length, padding='post')

In [152]:
# Training and Testing split
# 90%, 10%
split = int(0.90 * df.shape[0])
print(split)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = [encoder_inputs[split:], decoder_inputs[split:]]
y_test = decoder_outputs[split:]

print(X_train[0].shape, X_train[1].shape, y_train.shape)
     
print(X_test[0].shape, X_test[1].shape, y_test.shape)

900
(900, 3) (900, 57) (900, 57)
(100, 3) (100, 57) (100, 57)


In [196]:
#Define LSTM model
d_model = 256

#Encoder
inputs = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
_,state_h,state_c = tf.keras.layers.LSTM(d_model,activation='relu',return_state=True)(x)

#Decoder
targets = tf.keras.layers.Input(shape=(None,))
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model,activation='relu',return_sequences=True, return_state=True)
x,_,_ = decoder_lstm(x, initial_state=[state_h, state_c])
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

model = tf.keras.models.Model(inputs=[inputs, targets],outputs=x)
model.summary()

loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

In [197]:
history = model.fit(X_train, y_train, epochs=70, validation_data=(X_test, y_test))

Epoch 1/70
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 349ms/step - accuracy: 0.0152 - loss: 2137.6064 - val_accuracy: 0.0175 - val_loss: 523.5079
Epoch 2/70
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 252ms/step - accuracy: 0.0175 - loss: 739460416.0000 - val_accuracy: 0.0175 - val_loss: 10.3030
Epoch 3/70
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 254ms/step - accuracy: 0.0175 - loss: 172469.6719 - val_accuracy: 0.0175 - val_loss: 11.5714
Epoch 4/70
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 274ms/step - accuracy: 0.0175 - loss: 18969.0449 - val_accuracy: 0.0175 - val_loss: 9.7297
Epoch 5/70
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 259ms/step - accuracy: 0.0175 - loss: 25565.8984 - val_accuracy: 0.0175 - val_loss: 10.6536
Epoch 6/70
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 262ms/step - accuracy: 0.0175 - loss: 2084.9958 - val_accuracy: 0.0175 - val_loss

In [198]:
#Inference Model
#Encoder
encoder = tf.keras.models.Model(inputs, [state_h, state_c])

#Decoder
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
x = embedding_layer(targets)
x, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=[decoder_input_h, decoder_input_c])
x = dense1(x)
decoder = tf.keras.models.Model([targets] + [decoder_input_h, decoder_input_c], 
                                [x] + [decoder_output_h, decoder_output_c])

In [55]:
# My indepth analysis about encoder and decoder outputs

# X_train[0][0]
a, b = encoder.predict(X_train[[0][0]])

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 137ms/step


In [56]:
en_tokenizer.sequences_to_texts([X_train[0][0]])


['he declares the result and reports it to the election commission and to the secretarygeneral of the concerned house                                                                                                                               ']

In [57]:
target_seq = np.zeros(1)
target_seq[0] = hi_tokenizer.word_index['start']

In [58]:
output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [a[0].reshape(1, -1), b[0].reshape(1, -1)])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


In [38]:
output_tokens[0][0]

array([-0.17923562, -0.12630852, -0.1736213 , ..., -0.06300774,
       -0.07561757, -0.05123287], dtype=float32)

In [59]:
np.argmax(output_tokens[0][0])

22

In [60]:
stop_condition = False
decoded_sentence = ''
# state_values = encoder_h, encoder_c
# h = a[0]
# c = b[0]
while not stop_condition:
    output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [h.reshape(1, -1), c.reshape(1, -1)])
#         output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [state_values])
    print(output_tokens.shape)

    # Sample a token
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         sampled_token_index = np.argmax(output_tokens[0][0])
    print(sampled_token_index)
#     sampled_char = reverse_target_char_index[sampled_token_index]
    sampled_char = hi_tokenizer.index_word[sampled_token_index]
    decoded_sentence += ' '+sampled_char

    # Exit condition: either hit max length
    # or find stop character.
    if (sampled_char == 'end' or
       len(decoded_sentence) > 50):
        stop_condition = True

    # Update the target sequence (of length 1).
    target_seq = np.zeros(1)
    target_seq[0] = sampled_token_index
    h = h
    c = c

#         states_value = [h, c]
# #     return decoded_sentence

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 1, 5080)
22
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
(1, 1, 5080)
59
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
(1, 1, 5080)
7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
(1, 1, 5080)
998
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
(1, 1, 5080)
7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 1, 5080)
197
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 1, 5080)
5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
(1, 1, 5080)
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
(1, 1, 5080)
654
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 1, 5080)
200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
(1, 1, 5080)
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━

In [61]:
decoded_sentence

' इस प्रकार की रूढ़ियों की अवधि में और तीसरी बार और वह'

In [199]:
def decode_sequence(input_seq):
    a, b = encoder.predict(input_seq)
    print(a.shape, b.shape)
    target_seq = np.zeros(1)
    target_seq[0] = hi_tokenizer.word_index['start']

    stop_condition = False
    decoded_sentence = ''
#     state_values = encoder_h, encoder_c
    h = a[0]
    c = b[0]
    # h = a[0]
    # c = b[0]
    while not stop_condition:
        output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [h.reshape(1, -1), c.reshape(1, -1)])
#         output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [state_values])
#         print(output_tokens.shape)

        # Sample a token
#         sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token_index = np.argmax(output_tokens[0][0])
#         print(sampled_token_index)
    #     sampled_char = reverse_target_char_index[sampled_token_index]
        sampled_char = hi_tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == 'end' or
           len(decoded_sentence) > 100):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros(1)
        target_seq[0] = sampled_token_index
        h = h
        c = c

#         states_value = [h, c]
    return decoded_sentence

In [None]:
# X_train[0][0].reshape(1,-1)

In [83]:
# X_train[0][0].reshape(1,-1)

array([[  24, 1434,    2,  519,    4, 1435,    9,    5,    2,  772,  386,
           4,    5,    2,  773,    3,    2, 1436,  138,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]], dtype=int3

In [86]:
# X_train[0][1].reshape(1, -1)

array([[  15,    7,  246, 1437,   11,   41,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]], dtype=int3

In [200]:
for i in range(20):
    print(en_tokenizer.sequences_to_texts(X_train[0][i].reshape(1, -1)))
    print(hi_tokenizer.sequences_to_texts([y_train[i]]))

    print(decode_sequence(X_train[0][i].reshape(1, -1)))
    print("----")

['laughter  ']
['हंसी end                                                       ']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 315ms/step
(1, 256) (1, 256)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
 ठहाका end
----
['everything understood ']
['मेरी बात मानो भाई end                                                    ']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
(1, 256) (1, 256)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
 मेरी बात मानो भाई end
----
['nepali  ']
['नेपाली end                                                       ']

In [201]:
for i in range(20):
    print(en_tokenizer.sequences_to_texts(X_test[0][i].reshape(1, -1)))
    print(hi_tokenizer.sequences_to_texts([y_test[i]]))

    print(decode_sequence(X_test[0][i].reshape(1, -1)))

['tourist places ']
['दर्शनीय स्थल end                                                      ']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
(1, 256) (1, 256)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
 जोश end
['prompt then ']
['शब्द सुझाएँ फिर end                                                     ']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 256) (1, 256)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
 सांध्य ज्योति दर्पण end
['category architecture ']
['श्रेणीस्थापत्य end                                                       ']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0

In [None]:
X_test.shape