# Seq2Seq Model using keras

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
import os
os.getcwd()

'/content'

###### Seq2seq model is a machine learning approach used mainly for language translation. This model has basically two components Encoder and Decoder. Both the Encoder and Decoder have their own LSTM layer where the input to the encoder is English sentences that collect all the information of the sentence. Then we pass this information to the LSTM layer of the decoder. However, decoder expects French sentences as the input. Finally, our target will be the French sentence one timestep ahead of the decoder inputs. So that at each timestep, the model will learn the corresponding word given the English sentence.

### Load the data - 1. English vacabulary 2. French vacabulary

In [0]:
data_path = "/content/gdrive/My Drive/Machine-Translation-Seq2Seq-Keras-master/Machine-Translation-Seq2Seq-Keras-master/data"

In [5]:
with open(data_path + "/" + "small_vocab_en", 'r') as f:
    eng_sentences = f.read().split('\n')
    f.close()
with open(data_path + "/" + "small_vocab_fr", 'r') as f:
    fre_sentences = f.read().split('\n')
    f.close()
print('Dataset Loaded')

Dataset Loaded


In [6]:
for i in range(2):
    print(f'english sentence {i+1}: {eng_sentences[i]}')
    print(f'fre_sentence {i+1}: {fre_sentences[i]}')
    print('\n')

english sentence 1: new jersey is sometimes quiet during autumn , and it is snowy in april .
fre_sentence 1: new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


english sentence 2: the united states is usually chilly during july , and it is usually freezing in november .
fre_sentence 2: les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .




In [1]:
import numpy as np
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model
from keras.optimizers import Adam
#from keras.losses import frerse_categorical_crossentropy

from tensorflow.keras.losses import categorical_crossentropy

from keras.layers import LSTM

from keras.layers import TimeDistributed

Using TensorFlow backend.


## Preprocessing Steps

#### Append 'START_ ' and ' _END' at the beginning and the end of the french sentences. So that our model can distinguish the start and end of the sentence

In [0]:
fre_sentences = ['START_ ' + text + ' _END' for text in fre_sentences]
# Create vocabulary of words
all_eng_words=set()
for eng in eng_sentences:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
    
all_french_words=set()
for fr in fre_sentences:
    for word in fr.split():
        if word not in all_french_words:
            all_french_words.add(word)
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_french_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_french_words)
# del all_eng_words, all_french_words

### Dictionary of word-index.

In [0]:
input_token_index = {w:i for i, w in enumerate(input_words)}
target_token_index = {w:i for i, w in enumerate(target_words)}

- The target sequence should be one timestep ahead of decoder inputs. Because the model takes starting word of the sentence as  input and predicts the next word in the sentence.
- Each sequence of the target data is converted to one-hot code representation vectors

In [0]:
eng_seq_len = 30
fre_seq_len = 30

In [0]:
encoder_input_data = np.zeros(
    (len(eng_sentences), eng_seq_len),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(fre_sentences), fre_seq_len),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(fre_sentences), fre_seq_len, num_decoder_tokens),
    dtype='float32')
# generate data
for i, (input_text, target_text) in enumerate(zip(eng_sentences, fre_sentences)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = input_token_index[word]
    for t, word in enumerate(target_text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = target_token_index[word]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[word]] = 1

In [31]:
decoder_target_data[0].shape

(30, 357)

# Build the se2seq model

### 1. Encoder architecture

In [None]:
embedding_size = 50
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.utils import plot_model
encoder_inputs = Input(shape=(None,))
# English words embedding
en_x=  Embedding(num_encoder_tokens, embedding_size)(encoder_inputs)
# Encoder lstm
encoder = LSTM(50, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

### 2. Decoder architecture

In [0]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
# french word embeddings
dex=  Embedding(num_decoder_tokens, embedding_size)
final_dex= dex(decoder_inputs)
# decoder lstm
decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
# While training, model takes eng and french words and outputs #translated french word
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# rmsprop is preferred for nlp tasks
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [14]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     11350       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 50)     17850       input_2[0][0]                    
____________________________________________________________________________________________

## Train the model

In [15]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=128,
          epochs=50,
          validation_split=0.20)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 110288 samples, validate on 27573 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7fb4330362b0>

# Inference Model

### 1.  Encoder

In [None]:
# define the encoder model 
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

### 2. Decoder

In [16]:
# Redefine the decoder model with decoder will be getting below inputs from encoder while in prediction
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
final_dex2= dex(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
# sampling model will take encoder states and decoder_input(seed initially) and output the predictions(french word index) We dont care about decoder_states2
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_token_index = dict(
    (i, token) for token, i in input_token_index.items())
reverse_target_token_index = dict(
    (i, token) for token, i in target_token_index.items())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 50)          11350     
_________________________________________________________________
lstm_1 (LSTM)                [(None, 50), (None, 50),  20200     
Total params: 31,550
Trainable params: 31,550
Non-trainable params: 0
_________________________________________________________________


## Function for making predictions using encoder and decoder

In [0]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']
# Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
# Sample a token
        sampled_token_index = np.argmax(output_tokens[0, 0, :])
        sampled_token = reverse_target_token_index[sampled_token_index]
        decoded_sentence += ' '+sampled_token
# Exit condition: either hit max length
        # or find stop character.
        if (sampled_token == '_END' or
           len(decoded_sentence) > 31):
            stop_condition = True
# Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
# Update states
        states_value = [h, c]
    return decoded_sentence

## Making a prediction - Language translation.

In [18]:
input_seq = encoder_input_data[14077:14078]
decoder_sentence = decode_sequence(input_seq)
decoder_sentence

" paris est jamais chaud au mois d' octobre , et il est"