In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from keras.layers import Dense, Input, LSTM, Embedding
from keras.models import Model
from string import digits
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [67]:
df = pd.read_csv('/content/drive/MyDrive/Hindi_English_Truncated_Corpus.csv')

In [68]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [69]:
df.shape

(127607, 3)

In [70]:
pd.isnull(df).sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

In [74]:
df.dropna(inplace=True)

In [72]:
df.drop_duplicates(inplace=True)

In [75]:
pd.isnull(df).sum()

source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [76]:
df = df.sample(n=25000, random_state=42)

In [77]:
Lemmatizer = WordNetLemmatizer()

In [78]:
def preprocessing_text(df):
    processed_text = list()
    lines = df['english_sentence'].values.tolist()
    for i in lines:
        text = i.lower()
        pattern = re.compile('http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub("", text)
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        tokens = nltk.word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
#         stop_word = set(stopwords.words('english'))
#         stop_word.discard("not")
        words = [nltk.WordNetLemmatizer().lemmatize(w) for w in words]
        words = ' '.join(words)
        processed_text.append(words)
    return(processed_text)
english_sentenced_text = preprocessing_text(df)

In [79]:
english_sentenced_text

['islam is word from arabic and it full word is salamaits definition peace surrender',
 'everything is reliant on these computer working',
 'parliament doe not control the government',
 'race equality new law',
 'the provision would not affect the power of parliament to make law in respect of income from profession etc lrb article rrb',
 'from the top of the adhishthana at about the middle of the northern side an immense and carved waterspout is projected which discharge the abhisheka water flowing out of the sanctum floor',
 'there wa lasagna there wa casserole',
 'super power india source google writer vedpratap vedik',
 'each wa a blow to conservatism',
 'if he is satisfied that the matter is urgent and of sufficient importance to be raised in the house at an early date and an early opportunity is otherwise not available for discussing the matter he may admit the notice',
 'the colour of the drake is black at the neck and back',
 'in the mathare valley slum',
 'the second time wa a 

In [80]:
df['english_sentenced_text'] = english_sentenced_text

In [81]:
exclude = set(string.punctuation)
remove_digits = str.maketrans("", "", digits)

In [82]:
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub("'", '', x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.translate(remove_digits))
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.strip())
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [83]:
df['hindi_sentence'][0]

'START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करने कि अनुमति नहीं है _END'

In [84]:
english_words = set()
for eng in df['english_sentenced_text']:
  for word in eng.split():
    if word not in english_words:
      english_words.add(word)

hindi_words = set()
for hindi in df['hindi_sentence']:
  for word in hindi.split():
    if word not in hindi_words:
      hindi_words.add(word)

In [85]:
print(english_words.__len__())
print(hindi_words.__len__())

27128
36975


In [86]:
df['len_english_words'] = df['english_sentenced_text'].apply(lambda x: len(x.split()))
df['len_hindi_words'] = df['hindi_sentence'].apply(lambda x: len(x.split()))

In [87]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence,english_sentenced_text,len_english_words,len_hindi_words
25520,indic2012,Islam is word from arabic and it full word is ...,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका ...,islam is word from arabic and it full word is ...,14,21
118633,ted,Everything is reliant on these computers working.,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END,everything is reliant on these computer working,7,9
113495,tides,Parliament does not control the government .,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END,parliament doe not control the government,6,9
29783,tides,Race equality New laws,START_ नये कानून नस्ली समानता _END,race equality new law,4,6
111804,tides,The provision would not affect the power of Pa...,START_ व्यवसायों आदि से होने वाली आय के बारे म...,the provision would not affect the power of pa...,22,24


In [88]:
# Taking only text which are less than or equal to 20
df = df[df['len_english_words']<=20]
df = df[df['len_hindi_words']<=20]

In [89]:
df.shape

(16514, 6)

In [90]:
input_words = sorted(list(english_words))
target_words = sorted(list(hindi_words))
num_encoder_tokens = len(english_words)+1
num_decoder_tokens = len(hindi_words)+1
num_encoder_tokens, num_decoder_tokens

(27129, 36976)

In [91]:
print(input_words.__len__())
print(target_words.__len__())

27128
36975


In [92]:
input_token_index = dict([ (word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])                         

In [93]:
input_token_index

{'a': 1,
 'aa': 2,
 'aabb': 3,
 'aac': 4,
 'aacharan': 5,
 'aadam': 6,
 'aadi': 7,
 'aadikavya': 8,
 'aadimindoeuropean': 9,
 'aadiparva': 10,
 'aae': 11,
 'aaen': 12,
 'aag': 13,
 'aagra': 14,
 'aah': 15,
 'aai': 16,
 'aainaeakbary': 17,
 'aaj': 18,
 'aakashdeep': 19,
 'aakhr': 20,
 'aakhus': 21,
 'aakrnsa': 22,
 'aakul': 23,
 'aamer': 24,
 'aankhen': 25,
 'aanre': 26,
 'aap': 27,
 'aapsall': 28,
 'aapsallits': 29,
 'aarac': 30,
 'aarthie': 31,
 'aarti': 32,
 'aarun': 33,
 'aasan': 34,
 'aashram': 35,
 'aathri': 36,
 'aatre': 37,
 'aatri': 38,
 'aauu': 39,
 'aayats': 40,
 'aaygarh': 41,
 'aayouva': 42,
 'aaytmuhakamaat': 43,
 'aaytmutshahibat': 44,
 'ab': 45,
 'abad': 46,
 'abandon': 47,
 'abandoned': 48,
 'abandoning': 49,
 'abani': 50,
 'abbas': 51,
 'abbasi': 52,
 'abbasioan': 53,
 'abbasis': 54,
 'abbaskingdom': 55,
 'abbot': 56,
 'abbottabad': 57,
 'abbreviated': 58,
 'abbreviation': 59,
 'abbullah': 60,
 'abc': 61,
 'abcl': 62,
 'abdel': 63,
 'abdicate': 64,
 'abdicted': 65,
 'a

In [94]:
target_token_index

{'A': 1,
 'AACप्ले': 2,
 'ABaavaaoM': 3,
 'ALDE': 4,
 'AMDmaana': 5,
 'AMtriBaia': 6,
 'ANC': 7,
 'ANCने': 8,
 'APaIla': 9,
 'AQI': 10,
 'AQyaxaaa': 11,
 'ASCII': 12,
 'ASCIIISCII': 13,
 'ATT': 14,
 'AVCस्तेरेओ': 15,
 'AVI': 16,
 'AXIS': 17,
 'Aa': 18,
 'AaD': 19,
 'AaEama': 20,
 'AaOVaoigak': 21,
 'AaOr': 22,
 'AaQaar': 23,
 'AaQyaaimak': 24,
 'Aakr': 25,
 'Aankhen': 26,
 'Aaor': 27,
 'Aap': 28,
 'Aapkao': 29,
 'Aapko': 30,
 'Aapsa': 31,
 'Aaraop': 32,
 'AavaXa': 33,
 'Aavaana': 34,
 'Aavaasa': 35,
 'Aayaao': 36,
 'Abbasके': 37,
 'Act': 38,
 'Adalat': 39,
 'Adobe': 40,
 'Advanced': 41,
 'Affairs': 42,
 'AfsaraoM': 43,
 'Agency': 44,
 'Agochar': 45,
 'Ahmad': 46,
 'AiBanaoI': 47,
 'AiBanaoaa': 48,
 'AiBava': 49,
 'AiBavaadna': 50,
 'AiQakar': 51,
 'AiQaktma': 52,
 'Ainala': 53,
 'Air': 54,
 'AirPix': 55,
 'AjaI': 56,
 'Aks': 57,
 'Alberto': 58,
 'Alex': 59,
 'Alignment': 60,
 'Allenके': 61,
 'Alvida': 62,
 'Am': 63,
 'Amar': 64,
 'Ambient': 65,
 'America': 66,
 'AnauBava': 67,
 'Androi

In [95]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [96]:
df = shuffle(df)
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence,english_sentenced_text,len_english_words,len_hindi_words
94847,ted,"So I said, “It's very topical, very important....",START_ तो मैंने कहा “यह बहुत ही सामयिक है बहुत...,so i said it s very topical very important but...,15,18
93266,indic2012,Kumbhakarna also gave the futile advice of sur...,START_ कुम्भकर्ण ने भी रावण के शरण में जाने की...,kumbhakarna also gave the futile advice of sur...,10,14
103991,ted,There's a little hump here. But there are peop...,START_ यहाँ थोड़ी सी कमी है। लेकिन सभी तरफ़ लो...,there s a little hump here but there are peopl...,13,14
49152,tides,Many aspects of our general health are linked ...,START_ हमारे सामान्य स्वास्थ्य की अनेक बातें ह...,many aspect of our general health are linked t...,12,16
89718,ted,"child kidnapping,",START_ बच्चो के अपहरण के नमूने को _END,child kidnapping,2,8


In [97]:
X, Y = df['english_sentenced_text'], df['hindi_sentence']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((13211,), (3303,))

In [98]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [99]:
def generate_batch(x = X_train, y=Y_train, batch_size=128):
  while True:
    for j in range(0, len(x), batch_size):
      encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')
      decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
      decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype='float32')
      for i, (input_text, target_text) in enumerate(zip(x[j:j+batch_size], y[j:j+batch_size])):
        for t, word in enumerate(input_text.split()):
          encoder_input_data[i,t] = input_token_index[word]
        for t, word in enumerate(target_text.split()):
          if t < len(target_text.split()) - 1:
            decoder_input_data[i, t] = target_token_index[word]
          if t > 0 :
            decoder_target_data[i, t-1, target_token_index[word]] = 1.
      yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [100]:
latent_dim = 256

In [101]:
encoder_inputs = Input(shape=(None,))
enc_emd = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(enc_emd)

encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
dec_emd_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emd = dec_emd_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs,_, _ = decoder_lstm(dec_emd, initial_state=encoder_states) 
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs=20
max_length_src = max(df['len_english_words'])
max_length_tar = max(df['len_hindi_words'])

model.fit_generator(generator = generate_batch(X_train, Y_train, batch_size=batch_size), steps_per_epoch=train_samples//batch_size,
                    epochs = epochs, validation_data =  generate_batch(X_test, Y_test, batch_size=batch_size),
                    validation_steps = val_samples// batch_size)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fc10ad20890>

In [102]:
model.save_weights('NMT_weights.h5')

In [103]:
model.load_weights('NMT_weights.h5')

In [104]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emd_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Decode sample sequeces

In [105]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

# Evaluation on Train Dataset

In [106]:
train_gen = generate_batch(X_train, Y_train, batch_size = 1)
k=-1

In [107]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', Y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-6])

Input English sentence: but though powerful the king who symbolised the state did not have the right to legislate
Actual Hindi Translation:  राजा शक्तिशाली था और राज्य का प्रतीक भी था किंतु उसे विधि निर्माण का अधिकार नहीं था 
Predicted Hindi Translation:  लेकिन इस दौरान अकबर के दौरान संसद के दौरान वे 
