In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

In [None]:
# Read the whole corpus and split it into raw rows, one sentence pair per row.
# The context manager closes the file handle as soon as the text is in memory.
# NOTE(review): this cell needs mar.txt to already exist on disk.
with open('mar.txt', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().split('\n')

In [2]:
# Download and unpack the English-Marathi sentence-pair archive; per the recorded
# output it inflates mar.txt (the corpus read by the next cell) and _about.txt.
# `!!cmd` is IPython shorthand for %sx: it runs the shell command and returns the
# command's output as a list of lines.
!!curl -O http://www.manythings.org/anki/mar-eng.zip
!!unzip mar-eng.zip

['Archive:  mar-eng.zip',
 '  inflating: mar.txt                 ',
 '  inflating: _about.txt              ']

In [3]:
# Load the tab-separated corpus and split it into raw rows (one sentence pair per row).
# Using a context manager guarantees the file handle is closed after reading.
with open('mar.txt', encoding='utf-8') as corpus_file:
    lines = corpus_file.read().split('\n')

In [4]:
eng_sent = []
mar_sent = []
eng_chars = set()
mar_chars = set()
nb_samples = 10000  # upper bound on the number of sentence pairs to use

# Collect English/Marathi sentence pairs and the character vocabulary of each side.
for line in lines[:nb_samples]:
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed row (e.g. the empty string left by a trailing newline):
        # there is no target sentence to pair with, so skip it instead of crashing.
        continue

    eng_line = fields[0]
    # '\t' marks the start of the target sentence and '\n' marks its end.
    mar_line = '\t' + fields[1] + '\n'

    eng_sent.append(eng_line)
    mar_sent.append(mar_line)

    # set.update adds every character of the string; duplicates are ignored.
    eng_chars.update(eng_line)
    mar_chars.update(mar_line)

# Later cells size and index their arrays with nb_samples, so keep it equal to
# the number of pairs actually collected.
nb_samples = len(eng_sent)

In [5]:

# Freeze each character set into a sorted list so every character gets a
# stable position that can serve as its one-hot index.
eng_chars = sorted(eng_chars)
mar_chars = sorted(mar_chars)

In [6]:
# Lookup tables between English characters and their positions in eng_chars.
# index -> character (used to turn an index back into text)
eng_index_to_char_dict = dict(enumerate(eng_chars))

# character -> index (used to one-hot encode text)
eng_char_to_index_dict = {
    char: index for index, char in eng_index_to_char_dict.items()
}

In [7]:
# Lookup tables between Marathi characters and their positions in mar_chars.
# index -> character (used to turn a predicted index back into text)
mar_index_to_char_dict = dict(enumerate(mar_chars))

# character -> index (used to one-hot encode text)
mar_char_to_index_dict = {
    char: index for index, char in mar_index_to_char_dict.items()
}

In [9]:
# Longest sentence, in characters, on each side. The Marathi length includes
# the '\t' start and '\n' end markers that were added to every target sentence.
max_len_eng_sent = max(map(len, eng_sent))
max_len_mar_sent = max(map(len, mar_sent))

In [10]:

# Only the last bare expression of a cell is echoed, so the value shown below
# is max_len_mar_sent; the max_len_eng_sent line on its own displays nothing.
max_len_eng_sent
max_len_mar_sent

42

In [11]:
# Zero-filled one-hot tensors, each shaped (sample, timestep, vocabulary index).
# Encoder input: the English sentences.
tokenized_eng_sentences = np.zeros(
    (nb_samples, max_len_eng_sent, len(eng_chars)),
    dtype='float32',
)
# Decoder input: the Marathi sentences, including the start/end markers.
tokenized_mar_sentences = np.zeros(
    (nb_samples, max_len_mar_sent, len(mar_chars)),
    dtype='float32',
)
# Decoder target: same shape as the decoder input (filled one step ahead later).
target_data = np.zeros(
    (nb_samples, max_len_mar_sent, len(mar_chars)),
    dtype='float32',
)

In [12]:
# One-hot encode every sentence pair, one whole sentence at a time.
for sample in range(nb_samples):
    # English sentence: timestep t gets a 1 at the index of its t-th character.
    eng_ids = [eng_char_to_index_dict[ch] for ch in eng_sent[sample]]
    tokenized_eng_sentences[sample, np.arange(len(eng_ids)), eng_ids] = 1

    # Marathi sentence (decoder input), start marker included.
    mar_ids = [mar_char_to_index_dict[ch] for ch in mar_sent[sample]]
    tokenized_mar_sentences[sample, np.arange(len(mar_ids)), mar_ids] = 1

    # The target is the decoder input shifted one step ahead: it drops the
    # start character, so timestep t holds the character at position t + 1.
    target_data[sample, np.arange(len(mar_ids) - 1), mar_ids[1:]] = 1

In [24]:
# Encoder: reads a one-hot English sentence of any length and summarises it in
# the LSTM's final hidden and cell states. The per-step output is not used;
# only the two states are passed on to the decoder.
encoder_input = Input(shape=(None, len(eng_chars)))
encoder_LSTM = LSTM(256, return_state=True)
encoder_outputs, *encoder_states = encoder_LSTM(encoder_input)
encoder_h, encoder_c = encoder_states

# (An Embedding-based encoder over integer token ids was sketched here as an
# alternative; this notebook uses the one-hot character encoder above.)

In [14]:

# Decoder for training: receives the one-hot Marathi sentence as input and
# starts from the encoder's final states.
decoder_input = Input(shape=(None, len(mar_chars)))

# return_sequences=True yields an output for every timestep; return_state=True
# also returns the final states, which are ignored here but needed when the
# same layer is reused for step-by-step inference.
decoder_LSTM = LSTM(256, return_sequences=True, return_state=True)
decoder_out, _, _ = decoder_LSTM(decoder_input, initial_state=encoder_states)

# Softmax over the Marathi character vocabulary at every timestep.
decoder_dense = Dense(len(mar_chars), activation='softmax')
decoder_out = decoder_dense(decoder_out)

In [25]:
# Training model: (English one-hot, Marathi one-hot) -> next-character
# distribution for every Marathi timestep.
model = Model(
    inputs=[encoder_input, decoder_input],
    outputs=[decoder_out],
)

# Character prediction is a multi-class problem over the Marathi vocabulary.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Train with the last 10% of the samples held out for validation.
model.fit(
    [tokenized_eng_sentences, tokenized_mar_sentences],
    target_data,
    epochs=50,
    batch_size=64,
    validation_split=0.1,
)

In [18]:
# Inference models for testing. They reuse the trained layers, so no weights
# are copied or retrained here.

# Encoder inference model: English one-hot sentence -> [hidden state, cell state].
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model: runs the decoder with its states supplied explicitly,
# so it can be driven one character at a time.
# The state width is read from the layer so it always matches the trained LSTM.
decoder_state_input_h = Input(shape=(decoder_LSTM.units,))
decoder_state_input_c = Input(shape=(decoder_LSTM.units,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

# Use a separate name for the inference output so the training graph's
# `decoder_out` is not overwritten; otherwise re-running the training cell after
# this one would build the training model on the inference tensor.
decoder_out_inf, decoder_h, decoder_c = decoder_LSTM(
    decoder_input, initial_state=decoder_input_states
)
decoder_states = [decoder_h, decoder_c]

decoder_out_inf = decoder_dense(decoder_out_inf)

decoder_model_inf = Model(
    inputs=[decoder_input] + decoder_input_states,
    outputs=[decoder_out_inf] + decoder_states,
)

In [19]:
def decode_seq(inp_seq):
    """Greedily decode one encoded English sentence into Marathi text.

    The input is passed to ``encoder_model_inf`` to obtain the initial decoder
    states. The decoder is then fed one one-hot character at a time, starting
    with the '\\t' start marker, and the most probable character of each step
    becomes the next input.

    Args:
        inp_seq: Encoder input for a single sentence, as accepted by
            ``encoder_model_inf.predict``.

    Returns:
        The decoded string. Decoding stops once '\\n' has been produced (it is
        included in the result) or once the result is longer than
        ``max_len_mar_sent`` characters.
    """
    # The encoder's final states seed the decoder.
    states = encoder_model_inf.predict(inp_seq)

    vocab_size = len(mar_chars)
    next_index = mar_char_to_index_dict['\t']
    decoded = ''

    while True:
        # One-hot vector for the single character fed at this step.
        step_input = np.zeros((1, 1, vocab_size))
        step_input[0, 0, next_index] = 1

        char_probs, state_h, state_c = decoder_model_inf.predict(x=[step_input] + states)

        # Greedy choice: take the most probable character of the last timestep.
        next_index = np.argmax(char_probs[0, -1, :])
        next_char = mar_index_to_char_dict[next_index]
        decoded += next_char

        if next_char == '\n' or len(decoded) > max_len_mar_sent:
            return decoded

        # Carry the updated states into the next step.
        states = [state_h, state_c]

In [21]:
# Decode the first 50 training sentences and print each source sentence next
# to the model's output.
for sample_idx in range(50):
    # Slice (rather than index) so the batch dimension of size 1 is kept.
    one_hot_sentence = tokenized_eng_sentences[sample_idx:sample_idx + 1]
    decoded = decode_seq(one_hot_sentence)
    print('-')
    print('Input sentence:', eng_sent[sample_idx])
    print('Decoded sentence:', decoded)

-
Input sentence: Go.
Decoded sentence: ऊ.

-
Input sentence: Run!
Decoded sentence: पा.

-
Input sentence: Run!
Decoded sentence: पा.

-
Input sentence: Run!
Decoded sentence: पा.

-
Input sentence: Run!
Decoded sentence: पा.

-
Input sentence: Who?
Decoded sentence: को.

-
Input sentence: Wow!
Decoded sentence: पा.

-
Input sentence: Fire!
Decoded sentence: पा.

-
Input sentence: Fire!
Decoded sentence: पा.

-
Input sentence: Help!
Decoded sentence: वा.

-
Input sentence: Help!
Decoded sentence: वा.

-
Input sentence: Jump!
Decoded sentence: उडी मार.

-
Input sentence: Jump!
Decoded sentence: उडी मार.

-
Input sentence: Jump.
Decoded sentence: उडी मार.

-
Input sentence: Jump.
Decoded sentence: उडी मार.

-
Input sentence: Stop!
Decoded sentence: थां.

-
Input sentence: Stop!
Decoded sentence: थां.

-
Input sentence: Wait!
Decoded sentence: वा.

-
Input sentence: Wait!
Decoded sentence: वा.

-
Input sentence: Hello!
Decoded sentence: सांग.

-
Input sentence: Hurry!
Decoded sentence: ल