# Encoder Decoder - YouTube Tutorial

Example for machine translation using encoder-decoder. The dataset is set in a way of [english word,french word]

The input (encoder) will be character based and one-hot encoded

The decoder input will be the vector outputted from the encoder and the one-hot encoded characters for the decoder.

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np

In [34]:
# Initial parameters

batch_size = 64
epochs = 100

# LSTM dimension
latent_dim = 256
num_samples = 10000

data_path = '../datasets/fra-eng/fra.txt'

In [18]:
# Vectorize the data

# English 
input_texts = []
input_characters = set()

# French
target_texts = []
target_characters = set()

with open(data_path, 'r', encoding='utf-8') as f:
  lines = f.read().split('\n')

num_lines = min(num_samples, len(lines) - 1)
for line in lines[:num_lines]:
  input_text, target_text, _ = line.split('\t')
  
  # we use "tab" as the start sequence character 
  # we use "\n" as end sequence character
  target_text = '\t' + target_text + '\n'

  input_texts.append(input_text)
  target_texts.append(target_text)

  for char in input_text:
    if char not in input_characters:
      input_characters.add(char)
  
  for char in target_text:
    if char not in target_characters:
      target_characters.add(char)


input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print ("Num unique tokens input: ", len(input_characters))
print ("Length of input texts: ", len(input_texts))
print ("Max input sequence length: ", max_encoder_seq_length)
print ("Max target sequence length: ", max_decoder_seq_length)
print ("Num unique tokens target: ", len(target_characters))
print ("Length of target texts: ", len(target_texts))

Num unique tokens input:  70
Length of input texts:  10000
Max input sequence length:  14
Max target sequence length:  59
Num unique tokens target:  93
Length of target texts:  10000


In [37]:
# Create a map from character to index

input_token_index = dict(
  [(char, i) for i,char in enumerate(input_characters)]
)

target_token_index = dict(
  [(char, i) for i,char in enumerate(target_characters)]
)

reverse_target_char_index = dict(
  [(i, char) for i,char in enumerate(target_characters)]
)

In [30]:
# Create a one-hot representation of the input characters

# Each text gets a one-hot matrix win which the row represents the character index in the text (num rows is equal to max
# seq length) and the number of columns is equal to the number of unique tokens.

encoder_input_data = np.zeros(
  shape=(len(input_texts), max_encoder_seq_length + 1, num_encoder_tokens),
  dtype='float32'
)

decoder_input_data = np.zeros(
  shape=(len(target_texts), max_decoder_seq_length + 1, num_decoder_tokens),
  dtype='float32'
)

decoder_target_data = np.zeros(
  shape=(len(target_texts), max_decoder_seq_length + 1, num_decoder_tokens),
  dtype='float32'
)

# Fill in the one-hot matrices

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
  
  for t, char in enumerate(input_text):
    # Processing the ith sequence, tth character and marking the column in which it equals 1
    encoder_input_data[i, t, input_token_index[char]] = 1
  encoder_input_data[i, t + 1, input_token_index[' ']] = 1

  for t, char in enumerate(target_text): 
    decoder_input_data[i, t, target_token_index[char]] = 1
    if t > 0:
      # decoder target data will be ahead by one timestep and will
      # not include the start character. For example, if in the decoder input data the character
      # V is in place 1, in the target data the same character is in place 0.
      decoder_target_data[i, t - 1, target_token_index[char]] = 1
  
  decoder_input_data[i, t + 1, target_token_index[' ']] = 1
  decoder_target_data[i, t:, target_token_index[' ']] = 1

print ("encoder_input_data: ", encoder_input_data.shape)
print ("decoder_input_data: ", decoder_input_data.shape)
print ("decoder_target_data: ", decoder_target_data.shape)


encoder_input_data:  (10000, 15, 70)
decoder_input_data:  (10000, 60, 93)
decoder_target_data:  (10000, 60, 93)


In [31]:
# Encoder definitions

# The shape argument indicates that the input sequences can have variable length (None) and each time step has 
# num_encoder_tokens features.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)

# We will pass this as the initial state for the decoder
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [32]:
# Decoder definitions

decoder_inputs = Input(shape=(None, num_decoder_tokens))

# We take the whole LSTM output sequence and pass it to a Dense layer.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# The dense layer runs a softmax on the decoder outputs which will give the 
# probability of the next token
decoder_dense = Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

In [36]:
# Model definitions

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

model.fit([encoder_input_data, decoder_input_data], decoder_target_data, 
          batch_size=batch_size, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2da19d950>

In [51]:
# Inference

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

def decode_sequence(input_seq):
    encoded_seq = [input_token_index[char.lower()] for char in input_seq]

    one_hot_input = np.zeros(shape=(1, len(input_seq) + 1, num_encoder_tokens))
    for t, char in enumerate(input_seq):
        one_hot_input[0, t, input_token_index[char]] = 1

    # Encode the input as state vectors.
    states_value = encoder_model.predict(one_hot_input)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))

    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

print (decode_sequence("I am"))

Prez-vous !

