In [1]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense


## Data Preparation 

In [2]:
# Data preparation: load both vocabularies and the index-encoded sentence pairs,
# wrap the target sentences in start/end markers, pad everything to a fixed
# length and split the result into train / validation / test sets.

RANDOM_SEED = 42            # fixes both splits so a fresh "Restart & Run All" reproduces them
START_TOKEN_INDEX = 1       # marker placed in front of every target sentence
END_TOKEN_INDEX = 2         # marker placed behind every target sentence


def _load_vocabulary(path):
    """Read a one-word-per-line vocabulary file.

    Returns:
        (index_to_word, word_to_index): two dicts where a word's index is its
        zero-based line number in the file. If a word occurs more than once,
        word_to_index keeps the index of its last occurrence.
    """
    with open(path, 'r', encoding='utf-8') as f:
        words = [line.rstrip('\n') for line in f]

    index_to_word = dict(enumerate(words))
    word_to_index = {word: index for index, word in enumerate(words)}
    return index_to_word, word_to_index


def _load_token_sentences(path):
    """Read a file holding one sentence per line as whitespace-separated integer indices.

    Splitting on arbitrary whitespace (instead of a single space) tolerates trailing
    or doubled spaces. A blank line yields an empty sentence rather than being
    skipped, so the source and target files stay aligned line by line.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        return [[int(token) for token in line.split()] for line in fp]


# Creating 2 dictionaries each (word-index and index-word) for the source and target languages
english_index_to_word_dict, english_word_to_index_dict = _load_vocabulary('../data/english_vocabulary.txt')
yoda_index_to_word_dict, yoda_word_to_index_dict = _load_vocabulary('../data/yoda_vocabulary.txt')

# Obtaining the number of unique tokens in each vocabulary
english_vocab_length = len(english_word_to_index_dict)
yoda_vocab_length = len(yoda_word_to_index_dict)

# Limiting the lengths of the sequences (in terms of number of words) in both the source and target languages
# For source language, max. length = 15 (97% of the sentences have length <= 15)
# For target language, max. length = 20 (97% of the sentences have length <= 20)
max_length_source, max_length_target = 15, 20

english_sentences = _load_token_sentences('../data/english_sentences.txt')

# Adding the start and end tokens to the target sentences.
# The sentence body is clipped *before* the markers are added so that the wrapped
# sentence never exceeds max_length_target. Otherwise pad_sequences (which truncates
# from the front by default) would cut the start token off every over-length target.
max_target_body_length = max_length_target - 2
yoda_english_sentences = [
    [START_TOKEN_INDEX] + tokens[:max_target_body_length] + [END_TOKEN_INDEX]
    for tokens in _load_token_sentences('../data/yoda_english_sentences.txt')
]

# Every source sentence needs exactly one target sentence; fail early with a clear message.
if len(english_sentences) != len(yoda_english_sentences):
    raise ValueError(
        'Source and target files are misaligned: %d English sentences vs %d Yoda sentences'
        % (len(english_sentences), len(yoda_english_sentences))
    )

# Padding the source and target sentences (with index 0, the pad_sequences default)
# to ensure that all of them have the same length.
# NOTE(review): over-length *source* sentences are still truncated from the front
# (Keras default truncating='pre') - confirm that keeping the tail is intended.
encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(english_sentences, maxlen=max_length_source, padding='post')
decoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(yoda_english_sentences, maxlen=max_length_target, padding='post')

# Splitting the data into training, validation/dev and test sets:
# 10% is held out for testing, then 10% of the remainder for validation.
encoder_input_train, encoder_input_test, decoder_input_train, decoder_input_test = train_test_split(
    encoder_input_data, decoder_input_data, test_size=0.1, random_state=RANDOM_SEED)
encoder_input_train, encoder_input_val, decoder_input_train, decoder_input_val = train_test_split(
    encoder_input_train, decoder_input_train, test_size=0.1, random_state=RANDOM_SEED)


In [3]:
'''
 The output of the decoder uses the one-hot representation of each word in a sentence because the output of the seq2seq model
 is obtained from a softmax unit. 
 The size of the decoder output sentence would be (max_length_target, yoda_vocab_length).
 The decoder output does not start with the "_GO" token. The rest of the content is the same as that of decoder input. 
 So the decoder output can be defined as the decoder input shifted or offset by one timestep.
'''
def generate_decoder_output(decoder_input_data, max_length=None, vocab_length=None):
    """Build one-hot decoder targets: the decoder input shifted left by one timestep.

    For every sentence i and every timestep j >= 1, the token at
    decoder_input_data[i, j] becomes the target class at output timestep j - 1.
    The first input token (the start marker) therefore never appears as a target,
    and the last output timestep is left all-zero. Padding positions (token 0)
    are encoded as class 0 like any other token.

    Args:
        decoder_input_data: 2-D integer array of token indices with at least
            max_length columns (one row per sentence).
        max_length: number of output timesteps. Defaults to the notebook-level
            max_length_target.
        vocab_length: size of the one-hot axis. Defaults to the notebook-level
            yoda_vocab_length.

    Returns:
        float32 array of shape (number_of_sentences, max_length, vocab_length).

    Raises:
        IndexError: if a token index is outside [−vocab_length, vocab_length) or
            the input has fewer than max_length columns.
    """
    # Fall back to the notebook globals only when the caller did not override them.
    if max_length is None:
        max_length = max_length_target
    if vocab_length is None:
        vocab_length = yoda_vocab_length

    number_of_sentences = len(decoder_input_data)
    # NOTE(review): this dense tensor takes sentences * max_length * vocab_length * 4 bytes.
    decoder_output_data = np.zeros((number_of_sentences, max_length, vocab_length), dtype='float32')

    if number_of_sentences == 0:
        return decoder_output_data

    # Tokens at timesteps 1 .. max_length-1 are the targets for timesteps 0 .. max_length-2.
    shifted_tokens = np.asarray(decoder_input_data)[:, 1:max_length]

    # One vectorized assignment instead of a Python loop over every (sentence, timestep) pair.
    sentence_index = np.arange(number_of_sentences)[:, None]
    timestep_index = np.arange(max_length - 1)[None, :]
    decoder_output_data[sentence_index, timestep_index, shifted_tokens] = 1

    return decoder_output_data

# Build the one-hot decoder targets for each of the three splits (train, validation, test)
decoder_output_train, decoder_output_val, decoder_output_test = (
    generate_decoder_output(split_inputs)
    for split_inputs in (decoder_input_train, decoder_input_val, decoder_input_test)
)

## Building the Model