
To implement ```Bi-LSTM```, we will need the following code to build the encoder **in Section 3**. Do NOT use Bi-LSTM for the decoder. 

In [1]:
# from tensorflow.keras.layers import Bidirectional, Concatenate

# encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, 
#                                   dropout=0.5, name='encoder_lstm'))
# _, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

# state_h = Concatenate()([forward_h, backward_h])
# state_c = Concatenate()([forward_c, backward_c])

In [2]:
import warnings
warnings.filterwarnings("ignore")

## 1. Data preparation

1. Download spanish-english data from http://www.manythings.org/anki/
2. You may try to use other languages.
3. Unzip the .ZIP file.
4. Put the .TXT file (e.g., "deu.txt") in the directory "./Data/".
5. Fill in your data directory in section 1.1.

### 1.1. Load and clean text


In [3]:
import re
import string
from unicodedata import normalize
import numpy

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

def clean_data(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            # tokenize on white space
            line = line.split()
            # convert to lowercase
            line = [word.lower() for word in line]
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return numpy.array(cleaned)

In [4]:
# e.g., filename = 'Data/deu.txt'
# filename = <file path>
filename = '/content/spa.txt'

# e.g., n_train = 20000
n_train = 20000

In [5]:
# load dataset
doc = load_doc(filename)

# split into Language1-Language2 pairs
pairs = to_pairs(doc)

# clean sentences
clean_pairs = clean_data(pairs)[0:n_train, :]

In [6]:
for i in range(3000, 3010):
    print('[' + clean_pairs[i, 0] + '] => [' + clean_pairs[i, 1] + ']')

[were young] => [somos jovenes]
[weve eaten] => [hemos comido]
[what a bore] => [que aburrimiento]
[what a dope] => [que burro eres]
[what a dope] => [que burro]
[what a heel] => [que tipo tan arrastrado]
[what a jerk] => [que pendejo]
[what a jerk] => [que imbecil]
[what a jerk] => [que cretino]
[what a life] => [que vida]


In [7]:
input_texts = clean_pairs[:, 0]
target_texts = ['\t' + text + '\n' for text in clean_pairs[:, 1]]

print('Length of input_texts:  ' + str(input_texts.shape))
print('Length of target_texts: ' + str(input_texts.shape))

Length of input_texts:  (20000,)
Length of target_texts: (20000,)


In [8]:
max_encoder_seq_length = max(len(line) for line in input_texts)
max_decoder_seq_length = max(len(line) for line in target_texts)

print('max length of input  sentences: %d' % (max_encoder_seq_length))
print('max length of target sentences: %d' % (max_decoder_seq_length))

max length of input  sentences: 18
max length of target sentences: 48


**Remark:** To this end, you have two lists of sentences: input_texts and target_texts

## 2. Text processing

### 2.1. Convert texts to sequences

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# encode and pad sequences
def text2sequences(max_len, lines):
    tokenizer = Tokenizer(char_level=True, filters='')
    tokenizer.fit_on_texts(lines)
    seqs = tokenizer.texts_to_sequences(lines)
    seqs_pad = pad_sequences(seqs, maxlen=max_len, padding='post')
    return seqs_pad, tokenizer.word_index


encoder_input_seq, input_token_index = text2sequences(max_encoder_seq_length, 
                                                      input_texts)
decoder_input_seq, target_token_index = text2sequences(max_decoder_seq_length, 
                                                       target_texts)

print('shape of encoder_input_seq: ' + str(encoder_input_seq.shape))
print('shape of input_token_index: ' + str(len(input_token_index)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq.shape))
print('shape of target_token_index: ' + str(len(target_token_index)))

shape of encoder_input_seq: (20000, 18)
shape of input_token_index: 27
shape of decoder_input_seq: (20000, 48)
shape of target_token_index: 29


In [10]:
# print(input_texts[1190:1210])
# print(encoder_input_seq[1190:1210])

In [11]:
num_encoder_tokens = len(input_token_index) + 1
num_decoder_tokens = len(target_token_index) + 1

print('num_encoder_tokens: ' + str(num_encoder_tokens))
print('num_decoder_tokens: ' + str(num_decoder_tokens))

num_encoder_tokens: 28
num_decoder_tokens: 30


**Remark:** To this end, the input language and target language texts are converted to 2 matrices. 

- Their number of rows are both n_train.
- Their number of columns are respective max_encoder_seq_length and max_decoder_seq_length.

The followings print a sentence and its representation as a sequence.

In [12]:
target_texts[100]

'\tsali\n'

In [13]:
decoder_input_seq[100, :]

array([ 6,  5,  4, 12, 11,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

## 2.2. One-hot encode

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.
- It is represented by a $n\times t \times v$ tensor ($t$ is the number of unique chars) after the one-hot encoding.

In [14]:
from tensorflow.keras.utils import to_categorical

# one hot encode target sequence
def onehot_encode(sequences, max_len, vocab_size):
    n = len(sequences)
    data = numpy.zeros((n, max_len, vocab_size))
    for i in range(n):
        data[i, :, :] = to_categorical(sequences[i], num_classes=vocab_size)
    return data

encoder_input_data = onehot_encode(encoder_input_seq, max_encoder_seq_length, num_encoder_tokens)
decoder_input_data = onehot_encode(decoder_input_seq, max_decoder_seq_length, num_decoder_tokens)

decoder_target_seq = numpy.zeros(decoder_input_seq.shape)
decoder_target_seq[:, 0:-1] = decoder_input_seq[:, 1:]
decoder_target_data = onehot_encode(decoder_target_seq, 
                                    max_decoder_seq_length, 
                                    num_decoder_tokens)

print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

(20000, 18, 28)
(20000, 48, 30)
(20000, 48, 30)


## 3. Build the networks (for training)


- Build encoder, decoder, and connect the two modules to get "model". 

- Fit the model on the bilingual data to train the parameters in the encoder and decoder.



### 3.1. Encoder network

- Input:  one-hot encode of the input language

- Return: 

    -- output (all the hidden states   $h_1, \cdots , h_t$) are always discarded
    
    -- the final hidden state  $h_t$
    
    -- the final conveyor belt $c_t$

In [15]:
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Concatenate
from tensorflow.keras.models import Model

latent_dim = 256

# inputs of the encoder network
encoder_inputs = Input(shape=(None, num_encoder_tokens), 
                       name='encoder_inputs')


# set the LSTM layer
# encoder_lstm = Bidirectional(LSTM(latent_dim, return_state=True, 
#                     dropout=0.5, name='encoder_lstm'))
# _, state_h, state_c = encoder_lstm(encoder_inputs)


encoder_bilstm = Bidirectional(LSTM(latent_dim, return_state=True, 
                                  dropout=0.5, name='encoder_bilstm'))
_, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_inputs)

state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# build the encoder network model
encoder_model = Model(inputs=encoder_inputs, 
                      outputs=[state_h, state_c],
                      name='encoder')

Print a summary and save the encoder network structure to "./encoder.pdf"

In [16]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(encoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=encoder_model, show_shapes=False,
    to_file='encoder.pdf'
)

encoder_model.summary()

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None, 28)]   0           []                               
                                                                                                  
 bidirectional (Bidirectional)  [(None, 512),        583680      ['encoder_inputs[0][0]']         
                                 (None, 256),                                                     
                                 (None, 256),                                                     
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                            

### 3.2. Decoder network

- Inputs:  

    -- one-hot encode of the target language
    
    -- The initial hidden state $h_t$ 
    
    -- The initial conveyor belt $c_t$ 

- Return: 

    -- output (all the hidden states) $h_1, \cdots , h_t$

    -- the final hidden state  $h_t$ (discarded in the training and used in the prediction)
    
    -- the final conveyor belt $c_t$ (discarded in the training and used in the prediction)

In [17]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

# inputs of the decoder network
decoder_input_h = Input(shape=(2*latent_dim,), name='decoder_input_h')
decoder_input_c = Input(shape=(2*latent_dim,), name='decoder_input_c')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# set the LSTM layer
decoder_lstm = LSTM(2*latent_dim, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm')
decoder_lstm_outputs, state_h, state_c = decoder_lstm(decoder_input_x, 
                                                      initial_state=[decoder_input_h, decoder_input_c])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# build the decoder network model
decoder_model = Model(inputs=[decoder_input_x, decoder_input_h, decoder_input_c],
                      outputs=[decoder_outputs, state_h, state_c],
                      name='decoder')

Print a summary and save the encoder network structure to "./decoder.pdf"

In [18]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(decoder_model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=decoder_model, show_shapes=False,
    to_file='decoder.pdf'
)

decoder_model.summary()

Model: "decoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_input_x (InputLayer)   [(None, None, 30)]   0           []                               
                                                                                                  
 decoder_input_h (InputLayer)   [(None, 512)]        0           []                               
                                                                                                  
 decoder_input_c (InputLayer)   [(None, 512)]        0           []                               
                                                                                                  
 decoder_lstm (LSTM)            [(None, None, 512),  1112064     ['decoder_input_x[0][0]',        
                                 (None, 512),                     'decoder_input_h[0][0]',  

### 3.3. Connect the encoder and decoder

In [19]:
# input layers
encoder_input_x = Input(shape=(None, num_encoder_tokens), name='encoder_input_x')
decoder_input_x = Input(shape=(None, num_decoder_tokens), name='decoder_input_x')

# connect encoder to decoder
encoder_final_states = encoder_model([encoder_input_x])
decoder_lstm_output, _, _ = decoder_lstm(decoder_input_x, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model = Model(inputs=[encoder_input_x, decoder_input_x], 
              outputs=decoder_pred, 
              name='model_training')

In [20]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot, plot_model

SVG(model_to_dot(model, show_shapes=False).create(prog='dot', format='svg'))

plot_model(
    model=model, show_shapes=False,
    to_file='model_training.pdf'
)

model.summary()

Model: "model_training"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_x (InputLayer)   [(None, None, 28)]   0           []                               
                                                                                                  
 decoder_input_x (InputLayer)   [(None, None, 30)]   0           []                               
                                                                                                  
 encoder (Functional)           [(None, 512),        583680      ['encoder_input_x[0][0]']        
                                 (None, 512)]                                                     
                                                                                                  
 decoder_lstm (LSTM)            [(None, None, 512),  1112064     ['decoder_input_x[0]

### 3.4. Fit the model on the bilingual dataset

- encoder_input_data: one-hot encode of the input language

- decoder_input_data: one-hot encode of the input language

- decoder_target_data: labels (left shift of decoder_input_data)

- tune the hyper-parameters

- stop when the validation loss stop decreasing.

In [21]:
print('shape of encoder_input_data' + str(encoder_input_data.shape))
print('shape of decoder_input_data' + str(decoder_input_data.shape))
print('shape of decoder_target_data' + str(decoder_target_data.shape))

shape of encoder_input_data(20000, 18, 28)
shape of decoder_input_data(20000, 48, 30)
shape of decoder_target_data(20000, 48, 30)


In [22]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

model.fit([encoder_input_data, decoder_input_data],  # training data
          decoder_target_data,                       # labels (left shift of the target sequences)
          batch_size=64, epochs=50, validation_split=0.2)

model.save('seq2seq.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## 4. Make predictions

- In this section, you need to complete section 4.2 to translate English to the target language.


### 4.1. Translate English to Spanish

1. Encoder read a sentence (source language) and output its final states, $h_t$ and $c_t$.
2. Take the [star] sign "\t" and the final state $h_t$ and $c_t$ as input and run the decoder.
3. Get the new states and predicted probability distribution.
4. sample a char from the predicted probability distribution
5. take the sampled char and the new states as input and repeat the process (stop if reach the [stop] sign "\n").

In [23]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [24]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    target_seq = numpy.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = numpy.argmax(output_tokens[0, -1, :])
        
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        target_seq = numpy.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence


In [25]:
for seq_index in range(2100, 2120):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('Spanish (true): ', target_texts[seq_index][1:-1])
    print('Spanish (pred): ', decoded_sentence[0:-1])


-
English:        dont smoke
Spanish (true):  no fumeis
Spanish (pred):  no seas nana
-
English:        dont smoke
Spanish (true):  no fumais
Spanish (pred):  no seas nana
-
English:        dont speak
Spanish (true):  no hables
Spanish (pred):  no seas derado
-
English:        dont worry
Spanish (true):  no te preocupes
Spanish (pred):  no seas nana
-
English:        dont worry
Spanish (true):  no os preocupeis
Spanish (pred):  no seas nana
-
English:        dont worry
Spanish (true):  no se preocupen
Spanish (pred):  no seas nana
-
English:        finish this
Spanish (true):  termine esto
Spanish (pred):  lo hicid esto
-
English:        finish this
Spanish (true):  termina esto
Spanish (pred):  lo hicid esto
-
English:        finish this
Spanish (true):  termina esto
Spanish (pred):  lo hicid esto
-
English:        finish this
Spanish (true):  terminen esto
Spanish (pred):  lo hicid esto
-
English:        finish this
Spanish (true):  terminad esto
Spanish (pred):  lo hicid esto
-
Engl

### 4.2. Translate an English sentence to Spanish Sentence

1. Tokenization
2. One-hot encode
3. Translate

In [26]:
input_sentence = 'I love you'

input_low = input_sentence.lower()
input_sequence = []

for token in  input_low:
  # Using the individual words(tokens) in the input sentence, find the corresponding dictionary index created earlier in Section 2.1 ("text to Sequence")
  token_index = input_token_index[token]
  # Appending the token indices in the input_sequence list 
  input_sequence.append(token_index)

# Need to pad the sequences as the lenght will vary for each sequence
# Pad_sequences is provided a list of sequences as an input and sequence is a list of integers. Therefore [input_sequence]
input_sequence_pad = pad_sequences([input_sequence], maxlen=max_encoder_seq_length, padding='post')

# One hot encode the Input sequence
input_x = onehot_encode(input_sequence_pad, max_encoder_seq_length, num_encoder_tokens)

# Translate the sentence using decode

translated_sentence = decode_sequence(input_x)

print('source sentence is: ' + input_sentence)
print('translated sentence is: ' + translated_sentence)

source sentence is: I love you
translated sentence is: me encanta



# 5. Evaluate the translation using BLEU score


- Randomly partition the dataset to training, validation, and test.

- Evaluate the BLEU score using the test set. Report the average.


### 5.1. Partition the dataset to training, validation, and test. Build new token index.

1. You may try to load more data/lines from text file.
2. Convert text to sequences and build token index using training data.
3. One-hot encode your training and validation text sequences.

In [27]:
import numpy as np
n_train = 40000

# load dataset
doc1 = load_doc('/content/spa.txt')

# split into Language1-Language2 pairs
pairs1 = to_pairs(doc1)
# clean sentences
clean_pairs1 = clean_data(pairs1)[0:n_train, :]

input_texts1 = clean_pairs1[:, 0]
target_texts1 = np.array(['\t' + text1 + '\n' for text1 in clean_pairs1[:, 1]]) #Change the np.array to list later
print('Length of input_texts:  ' + str(input_texts1.shape))
print('Length of target_texts: ' + str(target_texts1.shape))

Length of input_texts:  (40000,)
Length of target_texts: (40000,)


### Splitting the dataset into test and train

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(input_texts1, target_texts1, test_size=0.005, random_state=42, shuffle = True)

print('Train Feature Shape: ', X_train.shape)
# print('Train Target Shape: ', len(y_train))
print('Train Target Shape: ', y_train.shape)

print('----------------------------------------')
print('Test Feature Shape: ', X_test.shape)
# print('Test Target Shape: ', len(y_test))
print('Test Target Shape: ', y_test.shape)

Train Feature Shape:  (39800,)
Train Target Shape:  (39800,)
----------------------------------------
Test Feature Shape:  (200,)
Test Target Shape:  (200,)


In [29]:
max_encoder_seq_length1 = max(len(line) for line in X_train)
max_decoder_seq_length1 = max(len(line) for line in y_train)

print('Max length of input  sentences: %d' % (max_encoder_seq_length1))
print('Max length of target sentences: %d' % (max_decoder_seq_length1))

Max length of input  sentences: 22
Max length of target sentences: 68


## Text processing

### Convert texts to sequences

- Input: A list of $n$ sentences (with max length $t$).
- It is represented by a $n\times t$ matrix after the tokenization and zero-padding.

In [30]:
encoder_input_seq1, input_token_index1 = text2sequences(max_encoder_seq_length1, 
                                                      X_train)
decoder_input_seq1, target_token_index1 = text2sequences(max_decoder_seq_length1, 
                                                       y_train)

print('shape of encoder_input_seq: ' + str(encoder_input_seq1.shape))
print('shape of input_token_index: ' + str(len(input_token_index1)))
print('shape of decoder_input_seq: ' + str(decoder_input_seq1.shape))
print('shape of target_token_index: ' + str(len(target_token_index1)))

shape of encoder_input_seq: (39800, 22)
shape of input_token_index: 27
shape of decoder_input_seq: (39800, 68)
shape of target_token_index: 29


In [31]:
num_encoder_tokens1 = len(input_token_index1) + 1
num_decoder_tokens1 = len(target_token_index1) + 1
print('num_encoder_tokens: ' + str(num_encoder_tokens1))
print('num_decoder_tokens: ' + str(num_decoder_tokens1))

num_encoder_tokens: 28
num_decoder_tokens: 30


### Partitioning Data into Training and Validation Set

In [32]:
X_train1, X_val, y_train1, y_val = train_test_split(encoder_input_seq1, decoder_input_seq1, test_size=0.2, random_state=42, shuffle = True)

print('Train Feature Shape: ', X_train1.shape)
print('Train Target Shape: ', y_train1.shape)
print('-------------------------------------------')
print('Val Feature Shape: ', X_val.shape)
print('Val Target Shape: ', y_val.shape)

Train Feature Shape:  (31840, 22)
Train Target Shape:  (31840, 68)
-------------------------------------------
Val Feature Shape:  (7960, 22)
Val Target Shape:  (7960, 68)


In [33]:
max_encoder_seq_length1 = max(len(line) for line in X_train1)
max_decoder_seq_length1 = max(len(line) for line in y_train1)

print('max length of input  sentences: %d' % (max_encoder_seq_length1))
print('max length of target sentences: %d' % (max_decoder_seq_length1))

max length of input  sentences: 22
max length of target sentences: 68


### One Hot Encoding Training Dataset




In [34]:
encoder_input_data_train = onehot_encode(X_train1, max_encoder_seq_length1, num_encoder_tokens1)
decoder_input_data_train = onehot_encode(y_train1, max_decoder_seq_length1, num_decoder_tokens1)

decoder_target_seq_train = numpy.zeros(y_train1.shape)
decoder_target_seq_train[:, 0:-1] = y_train1[:, 1:]
decoder_target_data_train = onehot_encode(decoder_target_seq_train, 
                                    max_decoder_seq_length1, 
                                    num_decoder_tokens)

print(encoder_input_data_train.shape)
print(decoder_input_data_train.shape)
print(decoder_target_data_train.shape)

(31840, 22, 28)
(31840, 68, 30)
(31840, 68, 30)


In [35]:
print(encoder_input_seq)

[[17  4  0 ...  0  0  0]
 [17  4  0 ...  0  0  0]
 [17  4  0 ...  0  0  0]
 ...
 [ 8  2  1 ...  7  5 25]
 [ 8  2  1 ...  5  9 13]
 [ 8  2  1 ... 14  2  7]]


### One Hot Encoding Validation Dataset




In [36]:
encoder_input_data_val = onehot_encode(X_val, max_encoder_seq_length1, num_encoder_tokens1)
decoder_input_data_val = onehot_encode(y_val, max_decoder_seq_length1, num_decoder_tokens1)

decoder_target_seq_val = numpy.zeros(y_val.shape)
decoder_target_seq_val[:, 0:-1] = y_val[:, 1:]
decoder_target_data_val = onehot_encode(decoder_target_seq_val, 
                                    max_decoder_seq_length1, 
                                    num_decoder_tokens1)

print(encoder_input_data_val.shape)
print(decoder_input_data_val.shape)
print(decoder_target_data_val.shape)

(7960, 22, 28)
(7960, 68, 30)
(7960, 68, 30)


### 5.2 Retrain the previous Bidirectional LSTM model with training and validation data and tune the parameters (learning rate, optimizer, etc) based on validation score

1. Use the model structure in section 3 to train a new model with new training and validation datasets.
2. Based on validation BLEU score or loss to tune parameters.

In [37]:
latent_dim = 256

# inputs of the encoder network
encoder_inputs_full = Input(shape=(None, num_encoder_tokens1), 
                       name='encoder_inputs_full')

encoder_bilstm_full = Bidirectional(LSTM(latent_dim, return_state=True, 
                                  dropout=0.5, name='encoder_bilstm_full'))
_, forward_h_full, forward_c_full, backward_h_full, backward_c_full = encoder_bilstm_full(encoder_inputs_full)

state_h_full = Concatenate()([forward_h_full, backward_h_full])
state_c_full = Concatenate()([forward_c_full, backward_c_full])

# build the encoder network model
encoder_model_full = Model(inputs=encoder_inputs_full, 
                      outputs=[state_h_full, state_c_full],
                      name='encoder_full')
encoder_model_full.summary()

Model: "encoder_full"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs_full (InputLaye  [(None, None, 28)]  0           []                               
 r)                                                                                               
                                                                                                  
 bidirectional_1 (Bidirectional  [(None, 512),       583680      ['encoder_inputs_full[0][0]']    
 )                               (None, 256),                                                     
                                 (None, 256),                                                     
                                 (None, 256),                                                     
                                 (None, 256)]                                          

In [38]:
# inputs of the decoder network
decoder_input_h_full = Input(shape=(2*latent_dim,), name='decoder_input_h_full')
decoder_input_c_full = Input(shape=(2*latent_dim,), name='decoder_input_c_full')
decoder_input_x_full = Input(shape=(None, num_decoder_tokens1), name='decoder_input_x_full')

# set the LSTM layer
decoder_lstm_full = LSTM(2*latent_dim, return_sequences=True, 
                    return_state=True, dropout=0.5, name='decoder_lstm_full')
decoder_lstm_outputs_full, state_h_full, state_c_full = decoder_lstm_full(decoder_input_x_full, 
                                                      initial_state=[decoder_input_h_full, decoder_input_c_full])

# set the dense layer
decoder_dense = Dense(num_decoder_tokens1, activation='softmax', name='decoder_dense_full')
decoder_outputs = decoder_dense(decoder_lstm_outputs_full)

# build the decoder network model
decoder_model_full = Model(inputs=[decoder_input_x_full, decoder_input_h_full, decoder_input_c_full],
                      outputs=[decoder_outputs, state_h_full, state_c_full],
                      name='decoder_full')
decoder_model_full.summary()

Model: "decoder_full"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_input_x_full (InputLay  [(None, None, 30)]  0           []                               
 er)                                                                                              
                                                                                                  
 decoder_input_h_full (InputLay  [(None, 512)]       0           []                               
 er)                                                                                              
                                                                                                  
 decoder_input_c_full (InputLay  [(None, 512)]       0           []                               
 er)                                                                                   

### Connect the encoder and decoder

In [39]:
# input layers
encoder_input_x_full = Input(shape=(None, num_encoder_tokens1), name='encoder_input_x_full')
decoder_input_x_full = Input(shape=(None, num_decoder_tokens1), name='decoder_input_x_full')

# connect encoder to decoder
encoder_final_states = encoder_model_full([encoder_input_x_full])
decoder_lstm_output, _, _ = decoder_lstm_full(decoder_input_x_full, initial_state=encoder_final_states)
decoder_pred = decoder_dense(decoder_lstm_output)

model_full = Model(inputs=[encoder_input_x_full, decoder_input_x_full], 
              outputs=decoder_pred, 
              name='model_training_full')

In [40]:
model_full.summary()

Model: "model_training_full"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input_x_full (InputLay  [(None, None, 28)]  0           []                               
 er)                                                                                              
                                                                                                  
 decoder_input_x_full (InputLay  [(None, None, 30)]  0           []                               
 er)                                                                                              
                                                                                                  
 encoder_full (Functional)      [(None, 512),        583680      ['encoder_input_x_full[0][0]']   
                                 (None, 512)]                                   

In [41]:
decoder_target_data_train.shape

(31840, 68, 30)

In [42]:
from tensorflow import keras
model_full.compile(keras.optimizers.Adam(learning_rate=3e-3), loss='categorical_crossentropy')

model_full.fit([encoder_input_data_train, decoder_input_data_train],  # training data
          decoder_target_data_train,                       # labels (left shift of the target sequences)
          batch_size=128, epochs=50, validation_data = ([encoder_input_data_val, decoder_input_data_val],  # validation data
          decoder_target_data_val))

model.save('seq2seq_full.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [43]:
# Reverse-lookup token index to decode sequences back to something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index1.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index1.items())

In [44]:
def decode_sequence1(input_seq):
    states_value = encoder_model_full.predict(input_seq)

    target_seq = numpy.zeros((1, 1, num_decoder_tokens1))
    target_seq[0, 0, target_token_index1['\t']] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model_full.predict([target_seq] + states_value)

        # this line of code is greedy selection
        # try to use multinomial sampling instead (with temperature)
        sampled_token_index = numpy.argmax(output_tokens[0, -1, :])
        # print(sampled_token_index)
        if sampled_token_index in reverse_target_char_index.keys():
          sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length1):
            stop_condition = True

        target_seq = numpy.zeros((1, 1, num_decoder_tokens1))
        target_seq[0, 0, sampled_token_index] = 1.

        states_value = [h, c]

    return decoded_sentence


In [45]:
for seq_index in range(2100, 2120):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data_train[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence1(input_seq)
    print('-')
    print('English:       ', input_texts[seq_index])
    print('Spanish (true): ', target_texts[seq_index][1:-1])
    print('Spanish (pred): ', decoded_sentence[0:-1])


-
English:        dont smoke
Spanish (true):  no fumeis
Spanish (pred):  el no comparto
-
English:        dont smoke
Spanish (true):  no fumais
Spanish (pred):  me gusta esta comida
-
English:        dont speak
Spanish (true):  no hables
Spanish (pred):  sos muy enteligente
-
English:        dont worry
Spanish (true):  no te preocupes
Spanish (pred):  extrano a mis amigos
-
English:        dont worry
Spanish (true):  no os preocupeis
Spanish (pred):  no esta buscando trabajar
-
English:        dont worry
Spanish (true):  no se preocupen
Spanish (pred):  hice mi conscinn
-
English:        finish this
Spanish (true):  termine esto
Spanish (pred):  todavia esta confundido
-
English:        finish this
Spanish (true):  termina esto
Spanish (pred):  no le vi nada
-
English:        finish this
Spanish (true):  termina esto
Spanish (pred):  estas equivocado
-
English:        finish this
Spanish (true):  terminen esto
Spanish (pred):  dame un asiento
-
English:        finish this
Spanish (true

In [46]:
input_sentence = 'I love you'

input_low = input_sentence.lower()
input_sequence = []

for token in  input_low:
  # Using the individual words(tokens) in the input sentence, find the corresponding dictionary index created earlier in Section 2.1 ("text to Sequence")
  token_index = input_token_index1[token]
  # Appending the token indices in the input_sequence list 
  input_sequence.append(token_index)

# Need to pad the sequences as the lenght will vary for each sequence
# Pad_sequences is provided a list of sequences as an input and sequence is a list of integers. Therefore [input_sequence]
input_sequence_pad = pad_sequences([input_sequence], maxlen=max_encoder_seq_length1, padding='post')

# One hot encode the Input sequence
input_x = onehot_encode(input_sequence_pad, max_encoder_seq_length1, num_encoder_tokens1)

# Translate the sentence using decode
translated_sentence = decode_sequence1(input_x)
print('source sentence is: ' + input_sentence)
print('translated sentence is: ' + translated_sentence)

source sentence is: I love you
translated sentence is: yo amo



### 5.3 Evaluate the BLEU score using the test set.

1. Use trained model above to calculate the BLEU score with testing dataset.
2. A reasonable should be 0.1-0.3. The higher, the better.

In [47]:
from nltk.translate.bleu_score import corpus_bleu


# Generating a two dimesnional list of all the encoded test datapoints 
input_sequence_final = []
for input in X_test:
  input_sequence = []
  for token in input.lower():
    # Using the individual words(tokens) in the input sentence, find the corresponding dictionary index created earlier in Section 5.1 ("text to Sequence")
    token_index = input_token_index1[token]
    # Appending the token indices in the input_sequence list 
    input_sequence.append(token_index)
  input_sequence_final.append(input_sequence)

# Need to pad the sequences as the lenght will vary for each sequence
input_sequence_pad = np.array(pad_sequences(input_sequence_final, maxlen=max_encoder_seq_length1, padding='post'))

# One hot encode the Input sequence
input_x = onehot_encode(input_sequence_pad, max_encoder_seq_length1, num_encoder_tokens1)

# Translate the sentence using decode
translated_sentence_final = []
# for i in range(0,input_x.shape[0]):
# print(input_x.shape[0])
for i in range(0,input_x.shape[0]):
   translated_sentence = decode_sequence1(input_x[i:i+1])
   translated_sentence_final.append(translated_sentence)



In [48]:
# print('Source sentence is: ' + X_test[10] +'\n')
# print('Translated sentence is: ' + translated_sentence_final[10])
# print('Actual Sentence is:' + y_test[10])

for i in range(10,20):
  print('Source sentence is:  '+ X_test[i].rstrip())
  print("Actual Sentence:-    ",y_test[i].rstrip())
  print("Translated Sentence:-",translated_sentence_final[i].strip())
  print("\n")


Source sentence is:  this ought to help
Actual Sentence:-     	esto deberia ayudar
Translated Sentence:- esto perro es de ayuda


Source sentence is:  its not unusual
Actual Sentence:-     	no es inusual
Translated Sentence:- no es insulto


Source sentence is:  i burned them
Actual Sentence:-     	yo los queme
Translated Sentence:- lo abraze


Source sentence is:  you may return
Actual Sentence:-     	puede que vuelvas
Translated Sentence:- puede que tengas razon


Source sentence is:  its a dead giveaway
Actual Sentence:-     	es una clara senal de la verdad
Translated Sentence:- es una cara de calera


Source sentence is:  i hate beans
Actual Sentence:-     	detesto las habas
Translated Sentence:- odio las persos


Source sentence is:  i didnt clean it
Actual Sentence:-     	yo no lo limpie
Translated Sentence:- no lo he locado


Source sentence is:  are you productive
Actual Sentence:-     	eres productivo
Translated Sentence:- eres prisiona


Source sentence is:  she rubbed her ey

In [65]:
translated_sentence_final_token = []
for i in translated_sentence_final:
  translated_sequence_token = []
  # print(list(i.replace("\n","").split(" ")))
  translated_sentence_final_token.append(list(i.replace("\n","").split(" ")))
#   translated_sentence_final_token.append(list(i.split()))
# # print(translated_sentence_final_token[0:10])

actual_sentence_final_token = []
for i in range(0,y_test.shape[0]):
  # print(list(y_test[i].replace("\t","").replace("\n","").split(" ")))
  actual_sentence_final_token.append(list(y_test[i].replace("\t","").replace("\n","").split(" ")))
#   actual_sentence_final_token.append(list(y_test[i].replace('\t','')))
# # print(actual_sentence_final_token[0:10])

# # for i in range(0,20):
# #   print("Actual Sentence:-    ", len(translated_sentence_final[i].strip()), ":" ,translated_sentence_final[i].rstrip())
# #   print("Translated Sentence:-", len(y_test[i].strip()), ":"  ,y_test[i].strip())
# #   print("\n")




In [79]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# Here the translation and references are split into words to be used for Blue Scores

bleu_scores = 0
for i in range(0,y_test.shape[0]):
  bleu_scores += sentence_bleu(actual_sentence_final_token[i],translated_sentence_final_token[i],weights=(1, 0, 0, 0))
  # print("Actual Sentence",  actual_sentence_final_token[i],  "\n")
  # print("Translated Sentence",  translated_sentence_final_token[i],  "\n")
  # print(sentence_bleu(actual_sentence_final_token[i],translated_sentence_final_token[i],weights=(1, 0, 0, 0)))
score = bleu_scores/y_test.shape[0]
# print(str(bleu_scores/150))
print("Blue Score of the model at Word level is:", str(score))

Blue Score of the model at Word level is: 0.019067904264967454


In [80]:
translated_sentence_final_token1 = []
for i in translated_sentence_final[0:X_test.shape[0]]:
  translated_sequence_token = []
  # print(list(i))
  translated_sentence_final_token1.append(list(i))
# print(translated_sentence_final_token[0:10])

actual_sentence_final_token1 = []
for i in range(0,X_test.shape[0]):
  # print(list(y_test[i]))
  actual_sentence_final_token1.append(list(y_test[i].replace('\t','')))
# print(actual_sentence_final_token[0:10])

# for i in range(0,20):
#   print("Actual Sentence:-    ", len(translated_sentence_final[i].strip()), ":" ,translated_sentence_final[i].rstrip())
#   print("Translated Sentence:-", len(y_test[i].strip()), ":"  ,y_test[i].strip())
#   print("\n")




In [81]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

# Here the translation and references are split into characters to be used for Blue Scores

bleu_scores1 = 0
for i in range(0,X_test.shape[0]):
  bleu_scores1 += sentence_bleu(actual_sentence_final_token1[i],translated_sentence_final_token1[i],weights=(1, 0, 0, 0))
  # print("Actual Sentence",  actual_sentence_final_token[i],  "\n")
  # print("Translated Sentence",  translated_sentence_final_token[i],  "\n")
  # print(sentence_bleu(actual_sentence_final_token[i],translated_sentence_final_token[i],weights=(1, 0, 0, 0)))
score1 = bleu_scores1/X_test.shape[0]
# print(str(bleu_scores/150))
print("Blue Score of the model at Character level is:", str(score1))

Blue Score of the model at Character level is: 0.4886178155083214
