In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

# Any results you write to the current directory are saved as output.



In [None]:
lines=pd.read_csv("../input/hindienglish-corpora/Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [None]:
lines['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [None]:
# extract indic2012 values only
lines=lines[lines['source']=='indic2012']

In [None]:
lines.head(20)

Unnamed: 0,source,english_sentence,hindi_sentence
2,indic2012,This percentage is even greater than the percentage in India.,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
4,indic2012,.The ending portion of these Vedas is called Upanishad.,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
6,indic2012,In this lies the circumstances of people before you.,इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं।
8,indic2012,“”Global Warming“” refer to warming caused in recent decades and probability of its continual presence and its indirect effect on human being.,ग्लोबल वॉर्मिंग से आशय हाल ही के दशकों में हुई वार्मिंग और इसके निरंतर बने रहने के अनुमान और इसके अप्रत्यक्ष रूप से मानव पर पड़ने वाले प्रभाव से है।
11,indic2012,Category: Religious Text,श्रेणी:धर्मग्रन्थ
12,indic2012,This period summarily is pepped up with devotion.,यह काल समग्रतः भक्ति भावना से ओतप्रोत काल है।
16,indic2012,"And now at present the naturecure, Ayurvedic and modern treatments are taking place through the government in Nepal.","हाल में नेपाल के हस्पताल सामन्यतया आयुर्वेद, प्राकृतिक चिकित्सा तथा आधुनिक चिकीत्सा करके सरकारी सेवा विद्यमान हे ।"
17,indic2012,Parliament time frame is 5 years and this will be dissolved before that.,लोकसभा की कार्यावधि 5 वर्ष है पर्ंतु इसे समय से पूर्व भंग किया जा सकता है
19,indic2012,"Extreme weather due to increased mortality; displacements and economic loss will be compounded through growing population. Although, temperate climate has some benefits out of it such as decreased mortality due to cold weather.",बढ़ती हुई मौतों displacements और आर्थिक नुकसान जो की अतिवादी मौसम (extreme weather)के कारण संभावित हैं बढती हुई जनसँख्या (growing population)के कारण और भी बदतर हो सकते हैं . हालांकि शीतोष्ण क्षेत्र में इसके कुछ फैदे भी हो सकते हैं जैसे की ठंड की वजह से कम मौतें होना .
22,indic2012,"Islam is the world's second-largest religion, after Christianity.",इस्लाम धर्म (الإسلام) ईसाई धर्म के बाद अनुयाइयों के आधार पर दुनिया का दूसरा सब से बड़ा धर्म है।


In [None]:
pd.isnull(lines).sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

In [None]:
# remove null values
lines=lines[~pd.isnull(lines['english_sentence'])]

In [None]:
lines.drop_duplicates(inplace=True)

#### We will pick any 25000 rows from the dataset.

In [None]:
lines=lines.sample(n=25000,random_state=42)
lines.shape

(25000, 3)

In [None]:
# Lowercase all characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.lower())

In [None]:
# Remove quotes
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [None]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines['english_sentence']=lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [None]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines['english_sentence']=lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence']=lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence']=lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))


In [None]:
# Add start and end tokens to target sequences
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [None]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
43137,indic2012,occurring in the environment recently increased carbon di oxide co monthly co measurements dershat that if you saw all year get to see the seasonal changes are small every year paryaverton northern hemisphere northern hemisphere seasons in the aswh at the end of the northern hemisphere crops have become more bjne is the time get at the change of environment because the plants remove some co charge,START_ वातावरण में कार्बन डाइऑक्साइड में हाल ही में होने वाली बढोतरी co मासिक co मापन यह दर्शाते हैं कि अगर सारे वर्ष को देखा जाए तो छोटेछोटे मौसमी परिवर्तन देखने को मिलते हैं हर साल यह परिवर्तन उत्तरी गोलार्ध northern hemisphere में वसव्त मौसम के आख़िर में अधिक हो जाते हैं और जब उत्तरी गोलार्ध में फसलें बीजने का समय होता है तो यह परिवर्तन कम हो जाते हैं क्यूंकि पौधे वातावरण में से कुछ co हटा लेते हैंi _END
107772,indic2012,her father aghoranath chattopadhyay was a wellknown scholar and mother was a poet who wrote in bengali,START_ इनके पिता अघोरनाथ चट्टोपाध्याय एक नामी विद्वान तथा माँ कवयित्री थीं और बांग्ला में लिखती थीं । _END
120507,indic2012,nishadraj guh served those three very well in ringverpur,START_ ऋंगवेरपुर में निषादराज गुह ने तीनों की बहुत सेवा की। _END
28755,indic2012,the ordinance that was rejected by loksabha can be passed again before six weeks period,START_ लोकसभा एक अध्यादेश को अस्वीकृत करने वाला प्रस्ताव सप्ताह की अवधि समाप्त होने से पूर्व पास कर सकती है _END
86303,indic2012,number of,START_ अठारह की संख्या _END


In [None]:
### Get English and Hindi Vocabulary
# split words and add to set
all_eng_words=set()
for eng in lines['english_sentence']:
    for word in eng.split():
        # ignore duplicate
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [None]:
len(all_eng_words)

30447

In [None]:
len(all_hindi_words)

20506

In [None]:
lines['length_eng_sentence']=lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hin_sentence']=lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [None]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
43137,indic2012,occurring in the environment recently increased carbon di oxide co monthly co measurements dershat that if you saw all year get to see the seasonal changes are small every year paryaverton northern hemisphere northern hemisphere seasons in the aswh at the end of the northern hemisphere crops have become more bjne is the time get at the change of environment because the plants remove some co charge,START_ वातावरण में कार्बन डाइऑक्साइड में हाल ही में होने वाली बढोतरी co मासिक co मापन यह दर्शाते हैं कि अगर सारे वर्ष को देखा जाए तो छोटेछोटे मौसमी परिवर्तन देखने को मिलते हैं हर साल यह परिवर्तन उत्तरी गोलार्ध northern hemisphere में वसव्त मौसम के आख़िर में अधिक हो जाते हैं और जब उत्तरी गोलार्ध में फसलें बीजने का समय होता है तो यह परिवर्तन कम हो जाते हैं क्यूंकि पौधे वातावरण में से कुछ co हटा लेते हैंi _END,67,81
107772,indic2012,her father aghoranath chattopadhyay was a wellknown scholar and mother was a poet who wrote in bengali,START_ इनके पिता अघोरनाथ चट्टोपाध्याय एक नामी विद्वान तथा माँ कवयित्री थीं और बांग्ला में लिखती थीं । _END,17,19
120507,indic2012,nishadraj guh served those three very well in ringverpur,START_ ऋंगवेरपुर में निषादराज गुह ने तीनों की बहुत सेवा की। _END,9,12
28755,indic2012,the ordinance that was rejected by loksabha can be passed again before six weeks period,START_ लोकसभा एक अध्यादेश को अस्वीकृत करने वाला प्रस्ताव सप्ताह की अवधि समाप्त होने से पूर्व पास कर सकती है _END,15,21
86303,indic2012,number of,START_ अठारह की संख्या _END,2,5


In [None]:
lines[lines['length_eng_sentence']>30].shape

(2030, 5)

In [None]:
lines=lines[lines['length_eng_sentence']<=20]
lines=lines[lines['length_hin_sentence']<=20]

In [None]:
lines.shape

(16970, 5)

In [None]:
print("maximum length of Hindi Sentence ",max(lines['length_hin_sentence']))
print("maximum length of English Sentence ",max(lines['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [None]:
# save max length of source and target sentences 
# example: 20
max_length_src=max(lines['length_hin_sentence'])
max_length_tar=max(lines['length_eng_sentence'])

In [None]:
# convert words into list for encoder-decoder

input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)

(30447, 20506)

In [None]:
print(input_words[:5])

['a', 'aa', 'aaber', 'aabu', 'aac']


In [None]:
num_decoder_tokens += 1 #for zero padding


In [None]:
# dictionary containing word and index

input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [None]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [None]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
35273,indic2012,premchand and key to hindi,START_ प्रेमचंद हिंदीकुंज में _END,5,5
110967,indic2012,in last yudhirthir aboard on indra chariot with brother and went to havens,START_ अन्त में युधिष्ठिर इन्द्र के रथ पर आरूढ़ हो दिव्य रूप धारी भाइयों सहित स्वर्ग को चले गये। _END,13,20
32202,indic2012,he knew each and every bit in this type of poems,START_ इन क्षेत्रों का तो वे कोनाकोना झाँक आये। _END,11,10
84911,indic2012,indias mumbaiold name bombayit is the capital of maharashtra,START_ भारत के पश्चिमी तट पर स्थित मुम्बई पूर्व नाम बम्बई भारतीय राज्य महाराष्ट्र की राजधानी है। _END,9,18
126492,indic2012,emegadictionary cdac presented by indian government,START_ इमहाशब्दकोश सीडैक भारत सरकार की प्रस्तुत _END,6,8
25719,indic2012,main futures of parliament,START_ लोकसभा की विशेष शक्तियाँ _END,4,6
46734,indic2012,newracreestaoph and explain mobileepistemology of all over communication,START_ नईऋ क्रीस्टॉफ एड मोबाइल को समझना सर्वव्यापक संचार की एपीस्टेमोलोगी _END,8,12
50907,indic2012,subhashchandra bose short introduction world web,START_ सुभाषचंद्र बोस संक्षिप्त परिचय वेबदुनिया _END,6,7
20073,indic2012,the grand opening of all these messages was done by adam,START_ इन संदेशों का शुभारम्भ आदम से हुआ था। _END,11,10
53837,indic2012,in the insights of junaghar there is old museum which contains the historic things,START_ जूनागढ़ दुर्ग के अन्दर एक संग्रहालय है जिसमें बहुमूल्य पुरातन वस्तुओं का संग्रह है। _END,14,16


### Split the data into train and test

In [None]:
X, y = lines['english_sentence'], lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((13576,), (3394,))

### Let us save this data

In [None]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')


In [None]:
# make batch for encoder decoder model


def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            # make context vector using numpy zeros
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            # itna nhi puchenge
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [None]:
latent_dim=300

In [None]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [None]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    9134100     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 300)    6152100     input_4[0][0]                    
____________________________________________________________________________________________

In [None]:
train_samples = len(X_train)
val_samples = len(X_test)
# take 128 sequence
batch_size = 128

# steps
epochs = 20

In [None]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa64cf385d0>

## Run till 20 epochs.

In [None]:
model.save_weights('nmt_weights.h5')

In [None]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence


In [None]:
# get 1 sentence pair (english to hindi)
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1


In [None]:
k+=1
# get hindi sentence
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
# -4 for removing _END
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: pilgrim and pilgrimage
Actual Hindi Translation:  तीर्थ एवं तीर्थ यात्रा 
Predicted Hindi Translation:  तीर्थ एवं तीर्थ यात्रा 


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: other types of cricket
Actual Hindi Translation:  क्रिकेट के अन्य प्रकार 
Predicted Hindi Translation:  चीन के प्रमुख प्रकार 


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: at last in along with his disciples he has to leave makka for madina
Actual Hindi Translation:  अंत में में उन्हें अपने अनुयायियों के साथ मक्का से मदीना के लिए कूच करना पड़ा। 
Predicted Hindi Translation:  में उन्हें उपन्यास की जाती है जिसे वे श्यामा के 


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: traditional indian family values are observed with great honor
Actual Hindi Translation:  पारंपरिक भारतीय पारिवारिक मूल्यों को काफी आदर की दृष्टि से देखा जाता है। 
Predicted Hindi Translation:  भारतीय भारतीय संस्कृति मे भारतीय संस्कार व पाक


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

Input English sentence: city of agra uttar pradesh province adopt hahshial
Actual Hindi Translation:  आगरा उत्तर प्रदेश प्रान्त का एक ज़िला शहर व तहसील है। 
Predicted Hindi Translation:  आगरा उत्तर प्रदेश प्रान्त का एक ज़िला शहर व तहसील


**Bleu Score calculation**

In [None]:
a = y_train[k:k+1].values[0][6:-4]
b = decoded_sentence[:-4]

In [None]:
from nltk.translate.bleu_score import sentence_bleu
score = sentence_bleu( a, b)
print('Bleu score:', '%3f'%score)

Bleu score: 0.823549


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
