# Neural Machine Translation (NMT) - English to French



![NMT system](https://miro.medium.com/max/1928/1*CkeGXClZ5Xs0MhBc7xFqSA.png)


To better understand how seq2seq models work, you can look at:

[Andrew Ng's course](https://www.youtube.com/playlist?list=PL1F3ABbhcqa3BBWo170U4Ev2wfsF7FN8l) on YouTube

Keras machine translation tutorial: [LSTM seq2seq](https://keras.io/examples/nlp/lstm_seq2seq/)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import  Adam , RMSprop
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd
from my_utils import load_data , split_data

In [None]:

# Download the fra-eng.zip archive and extract it into the working directory.
# Both lines are IPython shell escapes, so they only run inside a notebook/IPython session.
!wget http://www.manythings.org/anki/fra-eng.zip -O fra-eng.zip
!unzip fra-eng.zip


In [None]:
# Read the raw dataset file (presumably the fra.txt extracted from fra-eng.zip);
# load_data is a helper from the project-local my_utils module.
data = load_data("fra.txt")
# Notebook preview of the first 100 items of whatever load_data returned.
data[:100]

In [None]:
# Separate the loaded data into an English corpus and a French corpus (split_data is from my_utils).
# NOTE(review): the decoding loop later looks up 'start' and 'end' in the French vocabulary —
# confirm that split_data wraps every French sentence with those markers.
eng_corpus , french_corpus =  split_data(data)

In [None]:
# Build a word-level English vocabulary; "<UNK>" stands in for unseen words.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(eng_corpus)
eng_sequences = tokenizer.texts_to_sequences(eng_corpus)

# Vocabulary size is one larger than the word index so that id 0 stays free
# for the padding value used below.
eng_word_index = tokenizer.word_index
num_eng_tokens = 1 + len(eng_word_index)

# Right-pad every sentence to the length of the longest one.
max_input_length_sequences = max(map(len, eng_sequences))
padded_eng_sequences = pad_sequences(
    eng_sequences,
    padding='post',
    maxlen=max_input_length_sequences,
)
encoder_input_data = np.array(padded_eng_sequences)


In [None]:


# Build the French vocabulary. This rebinds `tokenizer`, so from here on the
# name refers to the French tokenizer, not the English one.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(french_corpus)
french_sequences = tokenizer.texts_to_sequences(french_corpus)

# Vocabulary size is one larger than the word index so that id 0 stays free
# for the padding value used below.
french_word_index = tokenizer.word_index
num_french_tokens = 1 + len(french_word_index)

# Right-pad every target sentence to the length of the longest one.
max_output_length_sequences = max(map(len, french_sequences))
padded_french_sequences = pad_sequences(
    french_sequences,
    padding='post',
    maxlen=max_output_length_sequences,
)
decoder_input_data = np.array(padded_french_sequences)


In [None]:
# Teacher forcing: the target at step t is the decoder input at step t + 1,
# so each target sequence is the French sequence with its first token dropped.
decoder_target_data = [token_ids[1:] for token_ids in french_sequences]

padded_french_lines = pad_sequences(
    decoder_target_data,
    padding='post',
    maxlen=max_output_length_sequences,
)
# One-hot encode every position over the French vocabulary.
# NOTE(review): this is a dense (samples, steps, num_french_tokens) array and can
# be very large for a big corpus — confirm it fits in memory.
onehot_french_lines = utils.to_categorical(padded_french_lines, num_french_tokens)
decoder_target_data = np.array(onehot_french_lines)



> Defining the Encoder-Decoder model

 






 

In [None]:

# Encoder: embed the padded English token ids and keep only the final LSTM
# states (h, c), which are handed to the decoder as its initial state.
encoder_inputs = tf.keras.layers.Input(shape=(max_input_length_sequences,))
encoder_embedding = tf.keras.layers.Embedding( num_eng_tokens, 256 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 256 , return_state=True , dropout=0.2 )( encoder_embedding )
encoder_states = [ state_h , state_c ]

# Decoder: the time dimension is left as None instead of being fixed to
# max_output_length_sequences. Training still feeds the padded full-length
# sequences, but the same input tensor is reused by the inference decoder,
# which is called with one token at a time (shape (1, 1)); a fixed length
# would make that call incompatible with the declared input shape.
decoder_inputs = tf.keras.layers.Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding( num_french_tokens, 256 , mask_zero=True) (decoder_inputs)
# return_sequences=True gives one output per target position;
# return_state=True exposes (h, c) so inference can carry them between steps.
decoder_lstm = tf.keras.layers.LSTM( 256 , return_state=True , return_sequences=True , dropout=0.2)
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
# Per-position probability distribution over the French vocabulary.
decoder_dense = tf.keras.layers.Dense( num_french_tokens , activation=tf.keras.activations.softmax )
output = decoder_dense ( decoder_outputs )

# Training model: (English ids, French ids) -> next-token distribution at each position.
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(loss='categorical_crossentropy'  , optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

model.summary()


### 2) Training the model
We train the model for a number of epochs with the Adam optimizer and the categorical cross-entropy loss function.

In [None]:
# Training hyper-parameters.
batch_size = 250
num_epoch = 32

# Fit on (encoder input, decoder input) -> one-hot decoder targets, then
# persist the trained model to disk.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=num_epoch,
    batch_size=batch_size,
)
model.save('model.h5')


In [None]:

def make_inference_models():
    """Build the encoder and decoder models used for step-by-step decoding.

    Both models reuse the tensors and layers of the training graph
    (``encoder_inputs``, ``encoder_states``, ``decoder_inputs``,
    ``decoder_embedding``, ``decoder_lstm``, ``decoder_dense``), so they
    share the trained weights.

    Returns:
        A ``(encoder_model, decoder_model)`` tuple.
        ``encoder_model`` maps ``encoder_inputs`` to ``[state_h, state_c]``.
        ``decoder_model`` maps ``[decoder_inputs, state_h, state_c]`` to
        ``[token_probabilities, state_h, state_c]``.
    """
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Take the state width from the trained LSTM rather than repeating the
    # literal, so the two cannot drift apart if the layer size changes.
    state_size = decoder_lstm.units
    decoder_state_input_h = tf.keras.layers.Input(shape=(state_size,))
    decoder_state_input_c = tf.keras.layers.Input(shape=(state_size,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    # Same LSTM as in training, but its initial state is now an explicit
    # input so the caller can feed back the state from the previous step.
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs
    )
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states,
    )

    return encoder_model, decoder_model


In [None]:

def str_to_tokens( sentence : str ):
    """Convert an English sentence into a padded array of token ids.

    The sentence is split with the same lower-casing and punctuation
    filtering that the Keras ``Tokenizer`` defaults apply, so
    "Hello!" maps to the same id as "hello". Words missing from
    ``eng_word_index`` map to the ``"<UNK>"`` id instead of raising
    ``KeyError``.

    The lookup goes through ``eng_word_index`` directly because the
    module-level ``tokenizer`` name is rebound to the French tokenizer
    after the English vocabulary is built.

    Args:
        sentence: Raw English text typed by the user.

    Returns:
        The array returned by ``pad_sequences`` for this single sentence,
        post-padded to ``max_input_length_sequences``.
    """
    unk_id = eng_word_index["<UNK>"]
    words = preprocessing.text.text_to_word_sequence(sentence)
    tokens_list = [eng_word_index.get(word, unk_id) for word in words]

    return pad_sequences([tokens_list], maxlen=max_input_length_sequences, padding='post')


In [None]:

# Build the step-wise encoder/decoder and persist all three models.
enc_model , dec_model = make_inference_models()

enc_model.save( 'enc_model.h5' )
dec_model.save( 'dec_model.h5' )
model.save( 'model.h5' )

# Invert the French vocabulary once so each decoding step is a dict lookup
# instead of a scan over every (word, index) pair.
french_index_word = {index: word for word, index in french_word_index.items()}

# Interactive greedy decoding; the number of prompts is bounded by the
# number of training sentences, as before.
for _ in range(encoder_input_data.shape[0]):
    try:
        input_tokens = str_to_tokens(input('Enter eng sentence : '))
    except KeyError as unknown_word:
        # A failed vocabulary lookup should not end the whole session.
        print('Unknown word: {}'.format(unknown_word))
        continue

    # Encode the sentence; the encoder's final (h, c) seed the decoder.
    states_values = enc_model.predict(input_tokens)
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = french_word_index['start']

    decoded_words = []
    # Bound the number of decoding STEPS rather than the number of decoded
    # words: if the decoder keeps emitting id 0 (which maps to no word), a
    # word-count limit is never reached and the loop would not terminate.
    for _step in range(max_output_length_sequences + 1):
        dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
        sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
        sampled_word = french_index_word.get(sampled_word_index)

        # The 'end' marker terminates decoding and is not part of the translation.
        if sampled_word == 'end':
            break
        if sampled_word is not None:
            decoded_words.append(sampled_word)

        # Feed the predicted token and the updated states into the next step.
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    decoded_translation = ' '.join(decoded_words)
    print(decoded_translation)