In [None]:
import numpy as np
import pandas as pd

import os
import string
import re
from string import digits
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from keras.layers import Input, LSTM, Embedding, Dense
from sklearn.model_selection import train_test_split
from keras.models import Model

In [None]:
# Read the English/Hindi sentence corpus; the path is relative to the
# notebook's working directory.
corpus_path = "Hindi_English_Truncated_Corpus.csv"
lines = pd.read_csv(corpus_path)

In [None]:
# Preview the first rows only; rendering the whole frame adds bulk to the
# saved notebook without telling the reader anything more.
lines.head()

In [None]:
# Keep TED-sourced rows that have text on both sides, then de-duplicate.
# Both sentence columns are checked for nulls because the lower-casing step
# calls `.lower()` on every value of each column and would fail on a missing one.
# Chaining replaces `drop_duplicates(inplace=True)` on a filtered slice.
lines = (
    lines[lines['source'] == 'ted']
    .dropna(subset=['english_sentence', 'hindi_sentence'])
    .drop_duplicates()
)

# Seeded subsample of at most 25000 rows; `min` keeps the cell from raising
# when fewer rows survive the filters above.
lines = lines.sample(n=min(25000, len(lines)), random_state=42)
lines.shape

In [5]:
# Normalise case on both sides of the corpus.
lines['english_sentence'] = lines['english_sentence'].map(str.lower)
lines['hindi_sentence'] = lines['hindi_sentence'].map(str.lower)

In [None]:
# Drop apostrophes from both columns.
lines['english_sentence'] = lines['english_sentence'].map(lambda text: text.replace("'", ''))
lines['hindi_sentence'] = lines['hindi_sentence'].map(lambda text: text.replace("'", ''))

In [None]:
# Characters to remove: everything in `string.punctuation`.
exclude = set(string.punctuation)

# A translation table that deletes every excluded character in one pass.
_punctuation_table = str.maketrans('', '', ''.join(exclude))
lines['english_sentence'] = lines['english_sentence'].map(
    lambda text: text.translate(_punctuation_table))
lines['hindi_sentence'] = lines['hindi_sentence'].map(
    lambda text: text.translate(_punctuation_table))

In [None]:
# Strip digits. `string.digits` covers ASCII 0-9 only, so the Hindi column uses
# a second table that also deletes the Devanagari digits (the original second
# pass repeated the ASCII removal and therefore had no effect).
remove_digits = str.maketrans('', '', digits)
remove_hindi_digits = str.maketrans('', '', digits + '०१२३४५६७८९')
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: x.translate(remove_hindi_digits))

# Remove extra spaces: trim both ends, then collapse runs of spaces into one.
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x.strip()))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x.strip()))

# Wrap every target sentence in start/end markers. The separating spaces are
# required: the vocabulary is built with `str.split()`, and decoding looks up
# 'START_' and compares against '_END' as stand-alone tokens. Without the spaces
# the markers fuse with the first and last words and those tokens never exist.
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: 'START_ ' + x + ' _END')

In [None]:
# Vocabulary of each language: every distinct whitespace-separated token.
all_eng_words = {
    token
    for sentence in lines['english_sentence']
    for token in sentence.split()
}

all_hindi_words = {
    token
    for sentence in lines['hindi_sentence']
    for token in sentence.split()
}

# Per-row sentence length, counted as the number of pieces after splitting on a
# single space.
lines['length_eng_sentence'] = lines['english_sentence'].map(lambda sentence: len(sentence.split(" ")))
lines['length_hin_sentence'] = lines['hindi_sentence'].map(lambda sentence: len(sentence.split(" ")))

In [None]:
# Keep only pairs where both sides are at most 20 tokens long.
lines = lines[(lines['length_eng_sentence'] <= 20) & (lines['length_hin_sentence'] <= 20)]

# English is the source (encoder) side and Hindi the target (decoder) side:
# `generate_batch` sizes the encoder array with `max_length_src` and fills it
# from English sentences, and sizes the decoder arrays with `max_length_tar`.
# The original assignment had the two columns swapped.
max_length_src = max(lines['length_eng_sentence'])
max_length_tar = max(lines['length_hin_sentence'])

input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))

# Token ids start at 1 (see the `i + 1` below) and id 0 is left for padding, so
# each Embedding needs `vocabulary size + 1` rows. The original overwrote the
# encoder size with the Hindi vocabulary size and never added the padding slot.
num_encoder_tokens = len(all_eng_words) + 1
num_decoder_tokens = len(all_hindi_words) + 1

# word -> id lookups and their inverses (id -> word).
input_token_index = dict([(word, i + 1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i + 1) for i, word in enumerate(target_words)])
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

# Seeded so that re-running the notebook yields the same row order.
lines = shuffle(lines, random_state=42)

In [None]:
# Features are the English sentences, labels the Hindi sentences.
x = lines['english_sentence']
y = lines['hindi_sentence']

# 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Persist the English side of both splits next to the notebook.
x_train.to_pickle('x_train.pkl')
x_test.to_pickle('x_test.pkl')

In [None]:
def generate_batch(x=x_train, y=y_train, batch_size = 128):
    '''Yield zero-padded batches of token ids indefinitely.

    Args:
        x: sequence of source sentences (space-separated tokens found in
            `input_token_index`).
        y: sequence of target sentences (space-separated tokens found in
            `target_token_index`), aligned with `x`.
        batch_size: number of rows allocated for every batch.

    Yields:
        `([encoder_input_data, decoder_input_data], decoder_target_data)` where
        the first two arrays hold token ids and the third is one-hot over
        `num_decoder_tokens`. The decoder input holds every target token except
        the last; the decoder target holds every token except the first, shifted
        one step to the left. The final slice of `x`/`y` may be shorter than
        `batch_size`; the unused rows stay all-zero.
    '''
    while True:
        for j in range(0, len(x), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype='float32')
            batch_pairs = zip(x[j:j + batch_size], y[j:j + batch_size])
            for i, (input_text, target_text) in enumerate(batch_pairs):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word]

                target_tokens = target_text.split()
                last_position = len(target_tokens) - 1
                for t, word in enumerate(target_tokens):
                    token_id = target_token_index[word]
                    if t < last_position:
                        # Decoder input: everything but the final token.
                        decoder_input_data[i, t] = token_id
                    if t > 0:
                        # Decoder target: everything but the first token, one
                        # step ahead of the input. This check must not be nested
                        # under the one above, otherwise the final token is never
                        # used as a target.
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)


# Size used for both embeddings and both LSTM layers.
latent_dim = 300

# ---- Encoder: embed the source ids and keep only the LSTM's final states ----
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_output, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# ---- Decoder: embed the target ids and start the LSTM from the encoder states ----
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Per-timestep softmax over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# ---- Training model: (source ids, target ids) -> next-token distribution ----
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

# ---- Training-run settings used by the fit cell ----
train_samples = len(x_train)
val_sample = len(x_test)
batch_size = 128
epochs = 100

In [None]:
# Train from the batch generators; step counts are whole batches per split.
# NOTE(review): newer Keras/TensorFlow releases deprecate `fit_generator` in
# favour of `fit` — confirm against the installed version.
model.fit_generator(
    generator=generate_batch(x_train, y_train, batch_size=batch_size),
    steps_per_epoch=train_samples // batch_size,
    epochs=epochs,
    validation_data=generate_batch(x_test, y_test, batch_size=batch_size),
    validation_steps=val_sample // batch_size,
)

# The file name must be a string literal; the bare `nmt_weights.h5` was parsed
# as an attribute lookup on an undefined name and raised NameError.
model.save_weights('nmt_weights.h5')

In [None]:
# Inference encoder: source ids -> [hidden state, cell state].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup: at inference time the LSTM states are fed in explicitly so the
# decoder can be stepped one token at a time.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the embedding, LSTM and dense layer objects from the training model.
dec_emb2 = dec_emb_layer(decoder_inputs)

# Predict the next word. The LSTM output is bound to `decoder_outputs2` so the
# dense layer below receives a defined tensor (the original bound it to
# `decoder_outputs_2` and then read the undefined `decoder_outputs2`).
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_state2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Inference decoder: (previous token, states) -> (token distribution, new states).
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_state2)

def decode_sequence(input_seq):
    '''Greedily decode one source sequence into target-language text.

    The encoder states seed the decoder, which is then stepped one token at a
    time starting from 'START_'; at each step the highest-scoring token is
    taken and fed back in as the next input.

    Args:
        input_seq: array handed unchanged to `encoder_model.predict`
            (presumably a (1, timesteps) array of source token ids — confirm
            against the caller).

    Returns:
        The decoded tokens joined into one string, each preceded by a single
        space. The '_END' token is included when it was produced. Decoding
        stops after '_END' or once the text exceeds 50 characters.
    '''
    state_value = encoder_model.predict(input_seq)

    # One-token target sequence holding the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + state_value)

        # Greedy choice: the most probable token at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        # Separate tokens with a space so the output stays readable and the
        # trailing '_END' remains a distinct word.
        decoded_sentence += ' ' + sampled_char

        if sampled_char == '_END' or len(decoded_sentence) > 50:
            stop_condition = True

        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        state_value = [h, c]

    return decoded_sentence

# One-sample-at-a-time generator over the training split, used for spot checks.
train_gen = generate_batch(x_train, y_train, batch_size=1)

# The inspection cell increments `k` before using it, so starting at -1 makes
# the first printed row (index 0) the same row the generator yields first.
k = -1

In [None]:
# Advance to the next training sample; re-run this cell to step through more.
k += 1

# The generator yields ([encoder_input, decoder_input], decoder_target); only
# the encoder input is needed for decoding, so the target is discarded.
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)

print('Input English sentence:', x_train[k:k+1].values[0])
# [6:-4] drops the leading 'START_' and trailing '_END' markers.
print('Hindi Translation:', y_train[k:k+1].values[0][6:-4])
# [:-4] drops the trailing '_END' from the prediction.
print('Prediction Translation:', decoded_sentence[:-4])