In [216]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

# print(os.listdir("../input"))

# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
# pd.set_option('display.max_colwidth', -1)

# Any results you write to the current directory are saved as output.

In [217]:
df = pd.read_csv("/kaggle/input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv",encoding='utf-8')
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [218]:
df.drop('source', axis=1, inplace=True)

In [219]:
df.isnull().sum()

english_sentence    2
hindi_sentence      0
dtype: int64

In [220]:
df.dropna(inplace=True)

In [221]:
def data_cleaning(text):

    # Convert to lower
    text = text.lower()

    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove digits
    text = re.sub(r"\d", '', text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text)
    
    return text

In [222]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 127605 entries, 0 to 127606
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   english_sentence  127605 non-null  object
 1   hindi_sentence    127605 non-null  object
dtypes: object(2)
memory usage: 2.9+ MB


In [223]:
df['cleaned_english_sentence'] = df['english_sentence'].apply(data_cleaning)
df['cleaned_hindi_sentence'] = df['hindi_sentence'].apply(data_cleaning)

In [224]:
# Add start and end tokens to target sequences
df['cleaned_hindi_sentence'] = df['cleaned_hindi_sentence'].apply(lambda x : '<START> '+ x + ' <END>')

In [225]:
# Shuffle the dataframe
df = df.sample(n=100, random_state=42).reset_index(drop=True)

In [226]:
df.shape

(100, 4)

In [227]:
df.head()

Unnamed: 0,english_sentence,hindi_sentence,cleaned_english_sentence,cleaned_hindi_sentence
0,He declares the result and reports it to the E...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...,he declares the result and reports it to the e...,<START> वही परिणाम की घोषणा करता है और निर्वाच...
1,was a little uncomfortable for them.,थोडा कठिन था।,was a little uncomfortable for them,<START> थोडा कठिन था। <END>
2,"A multi-purpose auditorium , a branch of the S...","बहुउद्देशीय सभागार , भारतीय स्टेट बैंक की शाखा...",a multipurpose auditorium a branch of the stat...,<START> बहुउद्देशीय सभागार भारतीय स्टेट बैंक क...
3,No fees is to be paid for filing the appeal to...,अधिकरण में अपील करने के लिए कोई फीस नहीं देनी ...,no fees is to be paid for filing the appeal to...,<START> अधिकरण में अपील करने के लिए कोई फीस नह...
4,headind kaun banega crorepati,शीर्षक कौन बनेगा करोड़पति (Kaun Banega Crorepa...,headind kaun banega crorepati,<START> शीर्षक कौन बनेगा करोड़पति kaun banega ...


In [228]:
df['hindi_sentence'][0], df['cleaned_hindi_sentence'][0]

('वही परिणाम की घोषणा करता है और निर्वाचन आयोग को और संबद्ध सदन के महासचिव को उसकी सूचना देता है .',
 '<START> वही परिणाम की घोषणा करता है और निर्वाचन आयोग को और संबद्ध सदन के महासचिव को उसकी सूचना देता है  <END>')

In [229]:
#Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
en_tokenizer.fit_on_texts(df['cleaned_english_sentence'])
en_sequences = en_tokenizer.texts_to_sequences(df['cleaned_english_sentence'])

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='')
hi_tokenizer.fit_on_texts(df['cleaned_hindi_sentence'])
hi_sequences = hi_tokenizer.texts_to_sequences(df['cleaned_hindi_sentence'])

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1
print("English Vocab Size: ", english_vocab_size)
print("Hindi Vocab Size: ", hindi_vocab_size)

English Vocab Size:  798
Hindi Vocab Size:  907


In [230]:
en_max_length = df['cleaned_english_sentence'].apply(lambda x:len(str(x).split())).max()
hi_max_length = df['cleaned_hindi_sentence'].apply(lambda x:len(str(x).split())).max()

en_max_length, hi_max_length

(55, 65)

In [231]:
#Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=en_max_length, padding='post')

In [232]:
#Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
  decoder_inputs.append(hi[:-1])
  decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=hi_max_length, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=hi_max_length, padding='post')

In [233]:
encoder_inputs

array([[ 11, 142,   2, ...,   0,   0,   0],
       [ 10,   7, 149, ...,   0,   0,   0],
       [  7, 151, 152, ...,   0,   0,   0],
       ...,
       [ 14, 775,   8, ...,   0,   0,   0],
       [ 23,  16, 783, ...,   0,   0,   0],
       [793, 794,   7, ...,   0,   0,   0]], dtype=int32)

In [234]:
decoder_inputs, decoder_outputs

(array([[  2, 187, 188, ...,   0,   0,   0],
        [  2, 195,  41, ...,   0,   0,   0],
        [  2, 196, 197, ...,   0,   0,   0],
        ...,
        [  2,  16, 882, ...,   0,   0,   0],
        [  2, 892, 893, ...,   0,   0,   0],
        [  2, 904,  76, ...,   0,   0,   0]], dtype=int32),
 array([[187, 188,   8, ...,   0,   0,   0],
        [195,  41,  42, ...,   0,   0,   0],
        [196, 197,  65, ...,   0,   0,   0],
        ...,
        [ 16, 882,  44, ...,   0,   0,   0],
        [892, 893, 894, ...,   0,   0,   0],
        [904,  76,  14, ...,   0,   0,   0]], dtype=int32))

In [235]:
df.shape[0], en_max_length, hi_max_length

(100, 55, 65)

In [236]:
# Training and Testing split
# 90%, 10%
split = int(0.90 * df.shape[0])
print(split)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

# Test data to evaluate our NMT model using BLEU score
X_test = [encoder_inputs[split:], decoder_inputs[split:]]
y_test = decoder_outputs[split:]

print(X_train[0].shape, X_train[1].shape, y_train.shape)
     
print(X_test[0].shape, X_test[1].shape, y_test.shape)

90
(90, 55) (90, 65) (90, 65)
(10, 55) (10, 65) (10, 65)


In [306]:
hindi_vocab_size

907

In [245]:
#Define LSTM model
d_model = 256

#Encoder
inputs = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
_,state_h,state_c = tf.keras.layers.LSTM(d_model,activation='tanh',return_state=True)(x)

#Decoder
targets = tf.keras.layers.Input(shape=(None,))
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model,activation='tanh',return_sequences=True, return_state=True)
x,_,_ = decoder_lstm(x, initial_state=[state_h, state_c])
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

model = tf.keras.models.Model(inputs=[inputs, targets],outputs=x)
loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimize = tf.keras.optimizers.Adam(0.0001)
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

model.summary()

In [246]:
hindi_vocab_size

907

In [407]:
history = model.fit(X_train, y_train, epochs=100, validation_split=0.2)
     

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 464ms/step - accuracy: 0.2381 - loss: 1.4219 - val_accuracy: 0.0231 - val_loss: 9.0199
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 432ms/step - accuracy: 0.2318 - loss: 1.3720 - val_accuracy: 0.0231 - val_loss: 9.0526
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 431ms/step - accuracy: 0.2263 - loss: 1.3030 - val_accuracy: 0.0222 - val_loss: 9.0592
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 441ms/step - accuracy: 0.2332 - loss: 1.2967 - val_accuracy: 0.0214 - val_loss: 9.0736
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 429ms/step - accuracy: 0.2369 - loss: 1.2509 - val_accuracy: 0.0214 - val_loss: 9.0811
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 434ms/step - accuracy: 0.2291 - loss: 1.2291 - val_accuracy: 0.0231 - val_loss: 9.0948
Epoch 7/100
[1m3/3[0m [32m━━━━━

In [409]:

#Inference Model

#Encoder
encoder = tf.keras.models.Model(inputs, [state_h, state_c])

#Decoder
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
x = embedding_layer(targets)
x, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=[decoder_input_h, decoder_input_c])
x = dense1(x)
decoder = tf.keras.models.Model([targets] + [decoder_input_h, decoder_input_c], 
                                [x] + [decoder_output_h, decoder_output_c])

In [283]:
decoder.summary()

In [419]:
# My indepth analysis about encoder and decoder outputs

X_train[0][0]
a, b = encoder.predict(X_train[[0][0]])

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step


In [420]:
# X_train[0][0]

In [421]:
a, b = encoder.predict(X_train[[0][0]])

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step


In [422]:
X_train[0]

array([[ 11, 142,   2, ...,   0,   0,   0],
       [ 10,   7, 149, ...,   0,   0,   0],
       [  7, 151, 152, ...,   0,   0,   0],
       ...,
       [136,  61,  70, ...,  70,   0,   0],
       [136, 709,  17, ...,   4, 728, 729],
       [101,  22, 730, ...,   0,   0,   0]], dtype=int32)

In [423]:
X_train[[0][0]]

array([[ 11, 142,   2, ...,   0,   0,   0],
       [ 10,   7, 149, ...,   0,   0,   0],
       [  7, 151, 152, ...,   0,   0,   0],
       ...,
       [136,  61,  70, ...,  70,   0,   0],
       [136, 709,  17, ...,   4, 728, 729],
       [101,  22, 730, ...,   0,   0,   0]], dtype=int32)

In [355]:
X_train[[0][0]]

array([[ 11, 142,   2, ...,   0,   0,   0],
       [ 10,   7, 149, ...,   0,   0,   0],
       [  7, 151, 152, ...,   0,   0,   0],
       ...,
       [136,  61,  70, ...,  70,   0,   0],
       [136, 709,  17, ...,   4, 728, 729],
       [101,  22, 730, ...,   0,   0,   0]], dtype=int32)

In [424]:
# original_texts = []
# X_test = X_train[0][0]

en_tokenizer.sequences_to_texts([X_train[0][0]])

# for sequence in X_test:
#     original_text = en_tokenizer.sequences_to_texts(sequence)[0]
#     original_texts.append(original_text)
    
# original_texts

['he declares the result and reports it to the election commission and to the secretarygeneral of the concerned house                                    ']

In [365]:
# a[0]

In [425]:
target_seq = np.zeros(1)
target_seq[0] = hi_tokenizer.word_index['start']

In [344]:
# target_seq.shape, a[0].reshape(1, -1).shape, b[0].shape

In [426]:
output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [a[0].reshape(1, -1), b[0].reshape(1, -1)])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step


In [427]:
h.shape, c.shape, output_tokens.shape

((1, 256), (1, 256), (1, 1, 907))

In [428]:
output_tokens[0][0].shape

(907,)

In [430]:
np.argmax(output_tokens[0][0])

187

In [432]:
stop_condition = False
decoded_sentence = ''
# state_values = encoder_h, encoder_c
# h = a[0]
# c = b[0]
while not stop_condition:
    output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [h.reshape(1, -1), c.reshape(1, -1)])
#         output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [state_values])
    print(output_tokens.shape)

    # Sample a token
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         sampled_token_index = np.argmax(output_tokens[0][0])
    print(sampled_token_index)
#     sampled_char = reverse_target_char_index[sampled_token_index]
    sampled_char = hi_tokenizer.index_word[sampled_token_index]
    decoded_sentence += ' '+sampled_char

    # Exit condition: either hit max length
    # or find stop character.
    if (sampled_char == 'end' or
       len(decoded_sentence) > 50):
        stop_condition = True

    # Update the target sequence (of length 1).
    target_seq = np.zeros(1)
    target_seq[0] = sampled_token_index
    h = h
    c = c

#         states_value = [h, c]
#     return decoded_sentence

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
(1, 1, 907)
187
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 1, 907)
188
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 1, 907)
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
(1, 1, 907)
189
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
(1, 1, 907)
64
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
(1, 1, 907)
7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
(1, 1, 907)
6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
(1, 1, 907)
190
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
(1, 1, 907)
191
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
(1, 1, 907)
10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
(1, 1, 907)
6


In [433]:
decoded_sentence

' वही परिणाम की घोषणा करता है और निर्वाचन आयोग को और'

In [330]:
# hi_tokenizer.index_word

In [447]:
def decode_sequence(input_seq):
    a, b = encoder.predict(input_seq)
    print(a.shape, b.shape)
    target_seq = np.zeros(1)
    target_seq[0] = hi_tokenizer.word_index['start']

    stop_condition = False
    decoded_sentence = ''
#     state_values = encoder_h, encoder_c
    h = a[0]
    c = b[0]
    # h = a[0]
    # c = b[0]
    while not stop_condition:
        output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [h.reshape(1, -1), c.reshape(1, -1)])
#         output_tokens, h, c = decoder.predict([target_seq.reshape(1, -1)] + [state_values])
        print(output_tokens.shape)

        # Sample a token
#         sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token_index = np.argmax(output_tokens[0][0])
        print(sampled_token_index)
    #     sampled_char = reverse_target_char_index[sampled_token_index]
        sampled_char = hi_tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == 'end' or
           len(decoded_sentence) > 150):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros(1)
        target_seq[0] = sampled_token_index
        h = h
        c = c

#         states_value = [h, c]
    return decoded_sentence

In [448]:
X_train[0][0].reshape(1,-1)

array([[ 11, 142,   2, 143,   5, 144,   9,   6,   2, 145, 146,   5,   6,
          2, 147,   3,   2, 148,  47,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0]], dtype=int32)

In [449]:
y_train[0][0]

187

In [450]:
print(en_tokenizer.sequences_to_texts([X_train[0][0]]))
print(hi_tokenizer.sequences_to_texts([y_train[0]]))

decode_sequence(X_train[[0][0]])

['he declares the result and reports it to the election commission and to the secretarygeneral of the concerned house                                    ']
['वही परिणाम की घोषणा करता है और निर्वाचन आयोग को और संबद्ध सदन के महासचिव को उसकी सूचना देता है end                                            ']
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
(90, 256) (90, 256)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
(1, 1, 907)
187
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
(1, 1, 907)
188
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
(1, 1, 907)
8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
(1, 1, 907)
189
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
(1, 1, 907)
64
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
(1, 1, 907)
7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
(1, 1, 907

' वही परिणाम की घोषणा करता है और निर्वाचन आयोग को और संबद्ध सदन के महासचिव को उसकी सूचना देता है end'

In [452]:
print(en_tokenizer.sequences_to_texts([X_test[0][0]]))
print(hi_tokenizer.sequences_to_texts([y_test[0]]))

decode_sequence(X_test[[0][0]])

IndexError: invalid index to scalar variable.

In [453]:
X_test

array([ 11, 142,   2, 143,   5, 144,   9,   6,   2, 145, 146,   5,   6,
         2, 147,   3,   2, 148,  47,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0], dtype=int32)