# Neural Machine Translation

In [0]:
!wget http://www.manythings.org/anki/deu-eng.zip

--2019-12-13 06:38:28--  http://www.manythings.org/anki/deu-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 2606:4700:30::6818:6cc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7612057 (7.3M) [application/zip]
Saving to: ‘deu-eng.zip’


2019-12-13 06:38:31 (3.35 MB/s) - ‘deu-eng.zip’ saved [7612057/7612057]



In [0]:
!unzip deu-eng.zip

Archive:  deu-eng.zip
  inflating: deu.txt                 
  inflating: _about.txt              


In [0]:
# !git clone https://github.com/kmsravindra/ML-AI-experiments.git

Cloning into 'ML-AI-experiments'...
remote: Enumerating objects: 205, done.[K
remote: Total 205 (delta 0), reused 0 (delta 0), pack-reused 205[K
Receiving objects: 100% (205/205), 14.28 MiB | 12.97 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [0]:
# cd /content/ML-AI-experiments/AI/Neural\ Machine\ Translation

/content/ML-AI-experiments/AI/Neural Machine Translation


In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
from keras.models import load_model

In [0]:
lines = open('/content/deu.txt', encoding='utf-8').read().split('\n')

In [0]:
len(lines)

200520

In [0]:
eng_sent = []
fra_sent = []
eng_chars = set()
fra_chars = set()
nb_samples = 20000

# Process english and french sentences
for line in range(nb_samples):
    
    eng_line = str(lines[line]).split('\t')[0]
    
    # Append '\t' for start of the sentence and '\n' to signify end of the sentence
    fra_line = '\t' + str(lines[line]).split('\t')[1] + '\n'
    eng_sent.append(eng_line.lower())
    fra_sent.append(fra_line.lower())
    
    for ch in eng_line:
        if (ch not in eng_chars):
            eng_chars.add(ch)
            
    for ch in fra_line:
        if (ch not in fra_chars):
            fra_chars.add(ch)

In [0]:
fra_chars = sorted(list(fra_chars))
eng_chars = sorted(list(eng_chars))

In [0]:
# dictionary to index each english character - key is index and value is english character
eng_index_to_char_dict = {}

# dictionary to get english character given its index - key is english character and value is index
eng_char_to_index_dict = {}

for k, v in enumerate(eng_chars):
    eng_index_to_char_dict[k] = v
    eng_char_to_index_dict[v] = k

In [0]:
# dictionary to index each french character - key is index and value is french character
fra_index_to_char_dict = {}

# dictionary to get french character given its index - key is french character and value is index
fra_char_to_index_dict = {}
for k, v in enumerate(fra_chars):
    fra_index_to_char_dict[k] = v
    fra_char_to_index_dict[v] = k

In [0]:
max_len_eng_sent = max([len(line) for line in eng_sent])
max_len_fra_sent = max([len(line) for line in fra_sent])

In [0]:
tokenized_eng_sentences = np.zeros(shape = (nb_samples,max_len_eng_sent,len(eng_chars)), dtype='float32')
tokenized_fra_sentences = np.zeros(shape = (nb_samples,max_len_fra_sent,len(fra_chars)), dtype='float32')
target_data = np.zeros((nb_samples, max_len_fra_sent, len(fra_chars)),dtype='float32')

In [0]:
len(eng_chars),len(fra_chars)

(72, 91)

In [0]:
# Vectorize the english and french sentences

for i in range(nb_samples):
    for k,ch in enumerate(eng_sent[i]):
        tokenized_eng_sentences[i,k,eng_char_to_index_dict[ch]] = 1
        
    for k,ch in enumerate(fra_sent[i]):
        tokenized_fra_sentences[i,k,fra_char_to_index_dict[ch]] = 1

        # decoder_target_data will be ahead by one timestep and will not include the start character.
        if k > 0:
            target_data[i,k-1,fra_char_to_index_dict[ch]] = 1

In [0]:
# Encoder model

encoder_input = Input(shape=(None,len(eng_chars)))
encoder_LSTM = LSTM(256,return_state = True)
encoder_outputs, encoder_h, encoder_c = encoder_LSTM (encoder_input)
encoder_states = [encoder_h, encoder_c]

In [0]:

# Decoder model

decoder_input = Input(shape=(None,len(fra_chars)))
decoder_LSTM = LSTM(256,return_sequences=True, return_state = True)
decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(len(fra_chars),activation='softmax')
decoder_out = decoder_dense (decoder_out)

In [0]:
model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out])

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')


In [0]:
for i in range(5):
    model.fit(x=[tokenized_eng_sentences,tokenized_fra_sentences], 
            y=target_data,
            batch_size=64,
            epochs=10,
            validation_split=0.2)
    print(f'{i}'*20)
    model.save('NMT.h5')

Train on 16000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
00000000000000000000
Train on 16000 samples, validate on 4000 samples
Epoch 1/10

KeyboardInterrupt: ignored

In [0]:
model.save('NMT_final.h5')

In [0]:
# Inference models for testing

# Encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, 
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states )

In [0]:
def decode_seq(inp_seq):
    
    # Initial states value is coming from the encoder 
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1, len(fra_chars)))
    target_seq[0, 0, fra_char_to_index_dict['\t']] = 1
    
    translated_sent = ''
    stop_condition = False
    
    while not stop_condition:
        
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_fra_char = fra_index_to_char_dict[max_val_index]
        translated_sent += sampled_fra_char
        
        if ( (sampled_fra_char == '\n') or (len(translated_sent) > max_len_fra_sent)) :
            stop_condition = True
        
        target_seq = np.zeros((1, 1, len(fra_chars)))
        target_seq[0, 0, max_val_index] = 1
        
        states_val = [decoder_h, decoder_c]
        
    return translated_sent

In [0]:
for seq_index in range(10):
    inp_seq = tokenized_eng_sentences[seq_index:seq_index+1]
    translated_sent = decode_seq(inp_seq)
    print('-')
    print('Input sentence:', eng_sent[seq_index])
    print('Decoded sentence:', translated_sent)

-
Input sentence: hi.
Decoded sentence: hallo!

-
Input sentence: hi.
Decoded sentence: hallo!

-
Input sentence: run!
Decoded sentence: weille!

-
Input sentence: wow!
Decoded sentence: donnerwetter!

-
Input sentence: wow!
Decoded sentence: donnerwetter!

-
Input sentence: fire!
Decoded sentence: feuer!

-
Input sentence: help!
Decoded sentence: hilfollo!

-
Input sentence: help!
Decoded sentence: hilfollo!

-
Input sentence: stop!
Decoded sentence: stopp!

-
Input sentence: wait!
Decoded sentence: wartet mal!



In [0]:
test_sent = "What is your name?".lower()
max_len_test = len(test_sent)
tokenized_test_sentences = np.zeros(shape = (nb_samples,max_len_test,len(eng_chars)), dtype='float32')
for k,ch in enumerate(test_sent):
    tokenized_test_sentences[0,k,eng_char_to_index_dict[ch]] = 1
test_sent2 = "How are you?".lower()
max_len_test = len(test_sent2)
tokenized_test_sentences2 = np.zeros(shape = (nb_samples,max_len_test,len(eng_chars)), dtype='float32')
for k,ch in enumerate(test_sent2):
    tokenized_test_sentences2[0,k,eng_char_to_index_dict[ch]] = 1

In [0]:
for i in range(10):
    model.fit(x=[tokenized_eng_sentences,tokenized_fra_sentences], 
            y=target_data,
            batch_size=64,
            epochs=2,
            validation_split=0.2)
    print('-'*50)
    print(f"Iteration: {i+1}")
    inp_seq = tokenized_test_sentences2
    translated_sent = decode_seq(inp_seq)
    print('Input sentence:', test_sent)
    print('Decoded sentence:', translated_sent)
    inp_seq = tokenized_test_sentences
    translated_sent = decode_seq(inp_seq)
    print('-')
    print('Input sentence:', test_sent)
    print('Decoded sentence:', translated_sent)
    model.save('NMT.h5')

Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
--------------------------------------------------
Iteration: 1
Input sentence: what is your name?
Decoded sentence: ich bin den gute nachen.

-
Input sentence: what is your name?
Decoded sentence: was ist lieben wir?

Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
--------------------------------------------------
Iteration: 2
Input sentence: what is your name?
Decoded sentence: ich benötige einen hut.

-
Input sentence: what is your name?
Decoded sentence: was ist los eine wetter?

Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
--------------------------------------------------
Iteration: 3
Input sentence: what is your name?
Decoded sentence: ich bin der verlieren zu hart.

-
Input sentence: what is your name?
Decoded sentence: was ist lieben da?

Train on 16000 samples, validate on 4000 samples
Epoch 1/2
Epoch 2/2
--------------------------------------------------
Iterati

In [0]:
inp_seq = tokenized_test_sentences
translated_sent = decode_seq(inp_seq)
print('-')
print('Input sentence:', test_sent)
print('Decoded sentence:', translated_sent)

-
Input sentence: what is your name?
Decoded sentence: was ist lieben da?

