# Machine Translation with Attention Char-based
- Character-based machine translation with an attention-based seq-to-seq model
- Bahdanau Attention Layer developed by [Thushan](https://github.com/thushv89/attention_keras)
- Thushan Ganegedara's
[Attention in Deep Networks with Keras](https://towardsdatascience.com/light-on-math-ml-attention-with-keras-dc8dbc1fad39)
- This notebook implements the example of English-to-Chinese neural machine translation. (It took 14h 26min 15s to train the model on CPU)

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# import os
# os.chdir('/content/drive/My Drive/_MySyncDrive/Repository/python-notes/nlp')

In [3]:
%pwd

'/Users/Alvin/GoogleDrive/_MySyncDrive/Repository/ENC2045/temp'

## Set up Dependencies

In [4]:
import re
import tensorflow as tf
from tensorflow import keras # use tensorflow keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU
import numpy as np
from random import randint, randrange
from numpy import array
from numpy import argmax
from numpy import array_equal
from keras import Model
from keras.models import Sequential
from keras.layers import LSTM, GRU, Concatenate
from keras.layers import Attention
from keras.layers import Dense, Masking
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras import Input
from keras.utils import to_categorical, plot_model

In [5]:
print("Tensorflow Version:",tf.__version__)

Tensorflow Version: 2.4.1


## DL Hyperparameters

In [6]:
#### hyperparameters
batch_size = 128 # Batch size for training
latent_dim = 256 # Latent dimensionality of the encoder and decoder
epochs = 100 # Number of epochs to train for
num_samples=10000 # Number of samples to train on

## Data Preprocessing

- A character-based processing

In [7]:
# Path to the tab-separated parallel corpus (one sentence pair per line).
data_path = '../../../RepositoryData/data/cmn.txt'

# Raw sentences and the character inventory of each language.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Use at most `num_samples` lines. `len(lines) - 1` leaves out the last element
# of the split (presumably the empty string after a trailing newline).
for line in lines[: min(num_samples, len(lines) - 1)]:
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed line: there is no sentence pair to keep.
        continue
    # Only the first two columns form the pair; any further column
    # (e.g. an attribution field in other exports of the corpus) is ignored
    # instead of breaking the unpacking.
    input_text, target_text = fields[0], fields[1]
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    input_characters.update(input_text)
    target_characters.update(target_text)

# Sorted character vocabularies and their sizes
input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# Find maximum sentence lengths (in characters, including the start/end markers on the target side)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of lines:', len(lines))
print('Number of samples:', len(input_texts))
print('Number of unique input tokens (char):', num_encoder_tokens)
print('Number of unique output tokens (char):', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of lines: 19578
Number of samples: 10000
Number of unique input tokens (char): 73
Number of unique output tokens (char): 2640
Max sequence length for inputs: 31
Max sequence length for outputs: 22


## Keras Processing

### Text to Sequences

In [16]:
## Both languages use the same character-level tokenizer settings.
tokenizer_options = dict(oov_token='UNK', char_level=True, filters=None, lower=False)

## input texts: fit, encode to index sequences, then right-pad to the longest one
input_tokenizer = keras.preprocessing.text.Tokenizer(**tokenizer_options)
input_tokenizer.fit_on_texts(input_texts)
encoder_input_sequences = input_tokenizer.texts_to_sequences(input_texts)
input_maxlen = np.max([len(seq) for seq in encoder_input_sequences])
encoder_input_sequences = pad_sequences(encoder_input_sequences, maxlen=input_maxlen, padding='post')

## target texts: same procedure (the start/end markers are already part of each text)
target_tokenizer = keras.preprocessing.text.Tokenizer(**tokenizer_options)
target_tokenizer.fit_on_texts(target_texts)
target_sequences = target_tokenizer.texts_to_sequences(target_texts)
target_maxlen = np.max([len(seq) for seq in target_sequences])
decoder_sequences = pad_sequences(target_sequences, maxlen=target_maxlen, padding='post')

In [19]:
# Shapes of Input and Target Sequences
print(encoder_input_sequences.shape)
print(decoder_sequences.shape)

(10000, 31)
(10000, 22)


In [18]:
# ### vocab size
input_vsize = max(input_tokenizer.index_word.keys()) + 1
target_vsize = max(target_tokenizer.index_word.keys()) + 1
print(input_vsize)
print(target_vsize)

75
2642


In [20]:
print(num_encoder_tokens)
print(num_decoder_tokens)

## The differences between the tokenizer vocab size and the num_encoder_tokens
## came from the padding and unknown character

73
2640


### One-hot Encoding

In [21]:
## One-hot encode the padded index sequences:
## (samples, timesteps) becomes (samples, timesteps, vocab size).
encoder_input_data = to_categorical(encoder_input_sequences, num_classes=input_vsize)
decoder_data = to_categorical(decoder_sequences, num_classes=target_vsize)

## Shift by one timestep: the decoder input drops the last step,
## the decoder output drops the first step.
decoder_input_data = decoder_data[:, :-1, ...]
decoder_output_data = decoder_data[:, 1:, ...]

In [22]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_output_data.shape)

(10000, 31, 75)
(10000, 21, 2642)
(10000, 21, 2642)


### Indices of Word Tokens

In [23]:
## Lookup tables copied from the fitted tokenizers, so that they agree with the
## indices stored in the encoded sequences. The tokenizer numbers its entries
## from 1 (0 is left for padding); numbering them again with `enumerate`
## started at 0 and therefore shifted every lookup by one position.

## index -> char: decode an argmax position back into a character
reverse_input_char_index = dict(input_tokenizer.index_word)
reverse_target_char_index = dict(target_tokenizer.index_word)

## char -> index: encode a character as its one-hot position
input_token_index = dict(input_tokenizer.word_index)
target_token_index = dict(target_tokenizer.word_index)

## Manual Processing

### Token Indices

In [24]:
# Manual vocabularies: each character is mapped to its position in the
# sorted character list (char is the key, index is the value).
input_token_index2 = {char: position for position, char in enumerate(input_characters)}
target_token_index2 = {char: position for position, char in enumerate(target_characters)}

In [25]:
# Reverse-lookup tables (index -> char) for the manual vocabularies, used to
# decode sequences back to something readable. They are inverted from the
# manual `*_token_index2` dictionaries so that both directions of the manual
# mapping describe the same vocabulary.
reverse_input_char_index2 = {i: char for char, i in input_token_index2.items()}
reverse_target_char_index2 = {i: char for char, i in target_token_index2.items()}

### One-Hot Encoding of Tokens in Sequences

- For each token of the input and target sequences, convert it into a one-hot encoding vector.
- The size of the one-hot vector is the vocabulary size of the input/target language.

In [None]:
# Allocate the zero-filled one-hot tensors for the manual encoding.
## Every tensor is three-dimensional: [sample, timestep, vocabulary entry].
n_samples = len(input_texts)

## Encoder side: [n_samples, max input length, input char vocabulary size]
encoder_onehot_shape = (n_samples, max_encoder_seq_length, num_encoder_tokens)
## Decoder side: [n_samples, max target length, target char vocabulary size]
decoder_onehot_shape = (n_samples, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data2 = np.zeros(encoder_onehot_shape, dtype='float32')
decoder_input_data2 = np.zeros(decoder_onehot_shape, dtype='float32')
decoder_target_data2 = np.zeros(decoder_onehot_shape, dtype='float32')

In [None]:
### Creating Masks
## The masks are zero-filled tensors with the same shapes as the manual one-hot tensors.
n_samples = len(input_texts)

## Encoder mask: [n_samples, max input length, input char vocabulary size]
encoder_mask_shape = (n_samples, max_encoder_seq_length, num_encoder_tokens)
## Decoder masks: [n_samples, max target length, target char vocabulary size]
decoder_mask_shape = (n_samples, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data_mask = np.zeros(encoder_mask_shape, dtype='float32')
decoder_input_data_mask = np.zeros(decoder_mask_shape, dtype='float32')
decoder_target_data_mask = np.zeros(decoder_mask_shape, dtype='float32')

In [None]:
# NOTE(review): `decoder_inputs_mask` is only assigned in the Decoder cell further
# down (as the result of calling a Masking layer on a symbolic Input), so this
# cell cannot run top-to-bottom on a fresh kernel. Presumably one of the Masking
# layers themselves was meant to be called here -- confirm or delete this cell.
print(decoder_inputs_mask(encoder_input_data[0:1,:,:])._keras_mask)

In [None]:
# NOTE(review): depends on `decoder_inputs_mask`, which is first assigned in the
# Decoder cell further down; this looks like leftover debugging -- confirm or delete.
decoder_inputs_mask(encoder_input_data[0:1,:,:])

In [None]:
print(encoder_input_data.shape)
print(decoder_input_data.shape)
print(decoder_target_data.shape)

In [None]:
print(input_texts[:10])
print(target_texts[:10])

In [None]:
# One-hot encode input and output texts
# For every sentence pair this loop sets one entry per character to 1 and fills
# the remaining (padding) timesteps with -1, the `mask_value` of the Masking
# layers in the model cells.
# NOTE(review): the tensors allocated in the manual section above are named
# `encoder_input_data2` / `decoder_input_data2` / `decoder_target_data2`, while
# this loop writes to the names without the `2` suffix, and `decoder_target_data`
# is not assigned anywhere in the visible notebook -- confirm which arrays and
# which token index (`input_token_index` vs `input_token_index2`) are intended.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    
    ## One-hot encode input_text
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    
    ## End of encoder_input_data
    ## `t` is the last index of the loop above, so `t + 1` is the first padding step
    ## (this relies on `input_text` being non-empty).
    #encoder_input_data[i, t + 1:, input_token_index[' ']] = 1. 
    encoder_input_data[i, t + 1:, :] = -1. 
    ## The first padding step additionally gets the ' ' entry set to 1, so that step
    ## is no longer all -1.
    encoder_input_data[i, t+1:t+2, input_token_index[' ']]= 1.0
    
    ## One-hot encode target_text
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        
        # When t > 0, this is the starting character of decoder_target_data
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    
    ## End of decoder_input_data and decoder_output_data
    ## Padding steps are filled with -1 here as well; the target is one step shorter,
    ## so its padding starts at `t` rather than `t + 1`.
#     decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
#     decoder_target_data[i, t:, target_token_index[' ']] = 1.
    decoder_input_data[i, t + 1:, :] = -1.
    decoder_target_data[i, t:, :] = -1.

In [None]:
## Creating Masks
## Flag the real (non-padding) timesteps of every sample with 1.0. The mask
## tensors start out as zeros, so padded timesteps keep the value 0.
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    n_input_steps = len(input_text)
    n_target_steps = len(target_text)

    ## every character of the input sentence is a real timestep
    encoder_input_data_mask[i, :n_input_steps, :] = 1.

    ## every character of the target sentence is a real decoder-input timestep
    decoder_input_data_mask[i, :n_target_steps, :] = 1.

    ## the decoder target is ahead by one timestep and does not include the
    ## start character, so it has one real timestep fewer
    decoder_target_data_mask[i, :max(n_target_steps - 1, 0), :] = 1.



In [None]:
# check  sequence
print([reverse_input_char_index.get(np.argmax(i)) for i in list(encoder_input_data[134,:,:])])
print([reverse_target_char_index.get(np.argmax(i)) for i in list(decoder_input_data[134,:,:])])
print([reverse_target_char_index.get(np.argmax(i)) for i in list(decoder_target_data[134,:,:])])


In [None]:
#check ont-hot inputs/targets
[reverse_input_char_index.get(np.argmax(i)) for i in encoder_input_data[0]]

In [None]:
[reverse_target_char_index.get(np.argmax(i)) for i in decoder_input_data[0]]

In [None]:
[reverse_target_char_index.get(np.argmax(i)) for i in decoder_target_data[0]]

## Define Model Architecture

### Encoder

In [None]:
# encoder_inputs_mask.compute_mask(encoder_input_data[0:1,:,:])

In [None]:
## Define Model

## Set up encoder
# Define an input sequence and process it.
# NOTE(review): the inference cells later address layers by position
# (`model.layers[4]`, ...), so the order in which layers are created here matters.
encoder_inputs = Input(shape=(None, num_encoder_tokens)) # variable-length sequence of vectors of size num_encoder_tokens (one-hot per character)

# Masking marks timesteps whose features all equal -1.0 so that downstream layers can skip them.
encoder_inputs_masking = Masking(mask_value=-1.0, input_shape=(None, num_encoder_tokens)) # same per-timestep vector size as `encoder_inputs`
encoder_inputs_mask = encoder_inputs_masking(encoder_inputs)
# return_sequences=True keeps the output at every timestep (needed by the attention layer);
# return_state=True additionally returns the final hidden and cell states.
encoder = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs_mask)


# The `encoder_outputs` will be used by the attention layer
## By default, in LSTM/GRU, when return_sequences=False, the `encoder_outputs` = `state_h`.
encoder_states = [state_h, state_c] # Two tensors, state_h and state_c, from the LSTM

### Decoder

In [None]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens)) # variable-length sequence of vectors of size num_decoder_tokens (one-hot per character)

# Masking marks timesteps whose features all equal -1.0 so that downstream layers can skip them.
decoder_inputs_masking = Masking(mask_value=-1.0, input_shape=(None, num_decoder_tokens)) # same per-timestep vector size as `decoder_inputs`
decoder_inputs_mask = decoder_inputs_masking(decoder_inputs)
# We set up our decoder to return full output sequences, (i.e, `return_sequences=True`)
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(decoder_inputs_mask,
                                     initial_state=encoder_states)

## Attention Layer
## The decoder outputs are the query and the encoder outputs are the value.
## NOTE(review): `keras.layers.Attention` is the dot-product (Luong-style) layer;
## the additive (Bahdanau-style) variant is `AdditiveAttention`. The notebook
## introduction mentions Bahdanau attention -- confirm which one is intended.
attn_layer = Attention()
attn_outputs= attn_layer([decoder_lstm_outputs,encoder_outputs])
## attention will output a tensor of the same shape as the first tensor in the input


## Use both Attention Outputs and Decoder Outputs to make prediction
# Concat attention output and decoder LSTM output along the feature axis
decoder_concat_input = Concatenate(axis=-1)([decoder_lstm_outputs,attn_outputs])

# Dense layer: softmax over the target characters at every timestep
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_dense_time = TimeDistributed(decoder_dense)
decoder_outputs = decoder_dense_time(decoder_concat_input)

### Note: This output Dense layer does not need to be wrapped within TimeDistributed,
### because the Dense layer takes in the decoder LSTM's outputs as the inputs.

### In training, the decoder LSTM's inputs are the entire target sequences
###    and its outputs are all the hidden states h,
###    and the Dense layer applies softmax to every h to get a distribution over the target language characters.

### In inference, the decoder LSTM's input is the single previously predicted target character
###    and its output is again all hidden states h (in fact only one, because the input has only 1 char),
###    and the Dense layer applies softmax to that h to get a distribution over the target language characters.

In [None]:
# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

In [None]:
plot_model(model,show_shapes=True)

In [None]:
model.summary()

In [None]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train with teacher forcing; 20% of the samples are held out for validation.
# The number of epochs comes from the hyperparameter cell instead of a second
# hard-coded literal, so changing `epochs` there actually takes effect.
history = model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2)

In [None]:
# # Save model
# model.save('keras_models/s2s-cmn-attention.h5')

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
# Plotting results
def plot1(history):
    """Plot training vs. validation accuracy and loss as two separate figures.

    Args:
        history: object whose `history` attribute is a dict holding the
            per-epoch lists 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
            (e.g. the return value of `model.fit`).

    Side effects:
        Sets the global matplotlib figure dpi to 100 and shows the figures.
    """

    def _draw_pair(x_values, train_values, val_values, short_name, title):
        # dots for the training curve, a solid line for the validation curve
        plt.plot(x_values, train_values, 'bo', label='Training ' + short_name)
        plt.plot(x_values, val_values, 'b', label='Validation ' + short_name)
        plt.title(title)
        plt.legend()

    matplotlib.rcParams['figure.dpi'] = 100

    records = history.history
    train_acc, valid_acc = records['accuracy'], records['val_accuracy']
    train_loss, valid_loss = records['loss'], records['val_loss']

    # epochs are numbered from 1 on the x-axis
    epoch_axis = range(1, len(train_acc)+1)

    ## Accuracy plot (drawn on the current figure)
    _draw_pair(epoch_axis, train_acc, valid_acc, 'acc', 'Training and validation accuracy')

    ## Loss plot (drawn on a new figure)
    plt.figure()
    _draw_pair(epoch_axis, train_loss, valid_loss, 'loss', 'Training and validation loss')

    plt.show()

    
def plot2(history):
    """Plot every metric recorded in `history.history` on one figure with a grid.

    Args:
        history: object whose `history` attribute is a dict of per-epoch
            metric lists (e.g. the return value of `model.fit`).
    """
    metrics_frame = pd.DataFrame(history.history)
    metrics_frame.plot(figsize=(8,5))
    plt.grid(True)
    plt.show()

In [None]:
plot1(history)

## Load Trained Model

In [None]:
## If the model is loaded via external files
## Load the encoder_model, decoder_model this way
## NOTE(review): this overwrites the weights of the `model` built (and possibly
## just trained) above with the contents of the h5 file. The only statement in
## the visible notebook that would write that file (`model.save(...)`) is
## commented out, and `load_model` is imported but not used -- confirm the
## intended workflow.
from keras.models import load_model
model.load_weights('keras_models/s2s-cmn-attention.h5')


## Define Inference Model

### Inference Encoder Model

- When using the trained model (Encoder-Decoder), we need to define Encoder and Decoder for inferencing.
- For **Encoder Model**:
    - We use the input layer of the trained model's Encoder, which includes the encoder_input_data (input sequences one-hot)
    - We use the output last_h and last_c from the trained model's Encoder LSTM.
    

In [None]:
## Create Inference model
## The inference encoder reuses the trained model's first input and returns
## the encoder LSTM's full output sequence plus its final [h, c] states.
encoder_inputs = model.input[0] #input_1

## NOTE(review): the layer is addressed by position; index 4 is presumably the
## encoder LSTM given the layer creation order in the model cells -- verify
## against `model.summary()` whenever the architecture changes.
encoder_outputs, state_h_enc, state_c_enc = model.layers[4].output # trained encoder's lstm outputs
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, encoder_states])

In [None]:
plot_model(encoder_model, show_shapes=True)

### Inference Decoder Model

- For **Decoder Model**:
    - We use the input layer of the trained model's Decoder, which includes the decoder_input_data (target sequences one-hot)
    - We create two more Inputs, representing the Encoder's LSTM output, [last_h and last_c]
    - We use the trained Decoder's LSTM. In the training stage, the decoder LSTM takes in the output of Encoder's last_h and last_c as well as the entire decoder_inputs (complete target sequences). It outputs directly the predicted h's at each time step.
    - But in inferencing stage, the Inference Decoder decodes one token at a time, and returns the predicted h, as well as the last_h and last_c. These last_h and last_c will turn out to be the initial states of the Inference Decoder.

In [None]:

## The inference decoder reuses the trained decoder layers but takes the
## LSTM states and the encoder outputs as explicit inputs, so that it can be
## run one step at a time.
decoder_inputs = model.input[1]

decoder_state_input_h = Input(shape=(latent_dim,)) # state_h
decoder_state_input_c = Input(shape=(latent_dim,)) # state_c
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c] # list of [state_h, state_c]

## NOTE(review): layers are addressed by position; indices 5, 6 and 8 are
## presumably the decoder LSTM, the attention layer and the TimeDistributed
## dense layer -- verify against `model.summary()`.
decoder_lstm = model.layers[5] ## trained decoder's LSTM
decoder_attention = model.layers[6] ## trained decoder's Attention Layer

## Placeholder for the encoder's output sequence that the attention layer attends to
decoder_encoder_outputs_for_attention = Input(shape=(max_encoder_seq_length, latent_dim,))


## In training, we use `decoder_outputs` only.

## In inference, we need `decoder_c` and `decoder_h`
## because these c and h form the basis for next decoder input
## NOTE(review): the LSTM is applied to `decoder_inputs` directly here, i.e.
## without the Masking layer that precedes it in the training graph.
decoder_lstm_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h_dec, state_c_dec]


## `return_attention_scores=True` additionally returns the attention weights (used for plotting later)
attn_outputs, attn_weights = decoder_attention([decoder_lstm_outputs,decoder_encoder_outputs_for_attention], return_attention_scores=True)
## attention will output a tensor of the same shape as the first tensor in the input


## Use both Attention Outputs and Decoder Outputs to make prediction
# Concat attention output and decoder LSTM output
decoder_concat_input = Concatenate(axis=-1)([decoder_lstm_outputs,attn_outputs])


decoder_time_dense = model.layers[8] ## trained decoder's TimeDistributed(Dense)
decoder_outputs=decoder_time_dense(decoder_concat_input)


## Inference Model
decoder_model = Model(
    [decoder_inputs, decoder_encoder_outputs_for_attention] + decoder_states_inputs, # target char(s) + encoder output sequence + previous h and c
    [decoder_outputs,attn_weights]+ decoder_states ) # predicted char distribution + attention weights + new h and c











#     """ Decoder (Inference) model """
#     decoder_inf_inputs = Input(batch_shape=(batch_size, 1, fr_vsize), name='decoder_word_inputs')
#     encoder_inf_states = Input(batch_shape=(batch_size, en_timesteps, hidden_size), name='encoder_inf_states')
#     decoder_init_state = Input(batch_shape=(batch_size, hidden_size), name='decoder_init')

#     decoder_inf_out, decoder_inf_state = decoder_gru(decoder_inf_inputs, initial_state=decoder_init_state)
#     attn_inf_out, attn_inf_states = attn_layer([encoder_inf_states, decoder_inf_out])
#     decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_inf_out, attn_inf_out])
#     decoder_inf_pred = TimeDistributed(dense)(decoder_inf_concat)
#     decoder_model = Model(inputs=[encoder_inf_states, decoder_init_state, decoder_inf_inputs],
#                           outputs=[decoder_inf_pred, attn_inf_states, decoder_inf_state])


In [None]:
plot_model(decoder_model, show_shapes=True)

In [None]:
decoder_model.summary()

### Decode Sequence

- Get the outputs [`last_h`, `last_c`] from the Inference Encoder given the input sequence.
- In the decoding stage, initialize the `target_seq` with `\t` and the initial states of Inference Decoder to be [`last_h`, `last_c`] from Inference Encoder.
- During the decoding stage:
    - Inference Decoder takes in one `target_seq` and [`last_h`, `last_c`] to predict next `target_seq`.
    - At the same time, Inference Decoder returns its [`last_h`, `last_c`].
    - The predicted `target_seq` and [`last_h`, `last_c`] are fed back as the inputs of the next Inference Decoder step.


In [None]:
def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence into a target string.

    Args:
        input_seq: encoder input for a single sample (batch of size 1), passed
            unchanged to `encoder_model.predict`.

    Returns:
        The decoded string. Decoding stops after the first '\n' (which is kept
        in the result) or as soon as the string is longer than
        `max_decoder_seq_length`.
    """

    def _one_hot_step(token_id):
        # a (1, 1, vocab) tensor holding a single one-hot timestep
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_id] = 1.0
        return step

    # Encoder pass: full output sequence (for attention) and final [h, c] states.
    encoder_seq_outputs, decoder_state = encoder_model.predict(input_seq)

    # Decoding starts from the start character.
    step_input = _one_hot_step(target_token_index['\t'])

    pieces = []
    decoded_length = 0

    # Batch of size 1: feed one character per iteration and carry [h, c] forward.
    while True:
        token_probs, _attention, h, c = decoder_model.predict(
            [step_input, encoder_seq_outputs] + decoder_state)

        # Greedy choice: the character with the highest probability.
        token_id = np.argmax(token_probs[0, -1, :])
        char = reverse_target_char_index[token_id]
        pieces.append(char)
        decoded_length += len(char)

        # Exit condition: either hit max length or find stop character.
        if char == '\n' or decoded_length > max_decoder_seq_length:
            break

        # The predicted character and the new states feed the next step.
        step_input = _one_hot_step(token_id)
        decoder_state = [h, c]

    return ''.join(pieces)

In [None]:
for seq_index in range(100,120):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

### Plot Attention?

In [None]:
input_texts_len = [len(i) for i in input_texts]
input_texts_len[np.argmax(input_texts_len)]

In [None]:
seq_index=500

input_seq = encoder_input_data[seq_index:seq_index+1,]

input_seq_len = input_texts_len[seq_index:seq_index+1]
# Encode the input as state vectors.
all_encoder_states, states_value = encoder_model.predict(input_seq) # output: [encoder_last_h, encoder_last_c]

print(input_texts[seq_index:seq_index+1])
print(input_texts_len[seq_index : seq_index+1])
print(input_seq)
print(all_encoder_states.shape)

In [None]:
# NOTE(review): this cell repeats the body of `decode_sequence` for the single
# sample selected in the previous cell, with one addition: the attention
# weights returned at every step are collected in `decoder_weights`.

# Generate empty target sequence of length 1.
target_seq = np.zeros((1, 1, num_decoder_tokens))

# Populate the first character of target sequence with the start character.
target_seq[0, 0, target_token_index['\t']] = 1.0

# Sampling loop for a batch of sequences
# (to simplify, here we assume a batch of size 1).
stop_condition = False
decoded_sentence = ''

# one attention-weight array per decoded character
decoder_weights=[]

# Within the WHILE-LOOP
## Update `target_seq`, `states_value, i.e., [h,c]`
while not stop_condition:
    # inference starts at the first target char
    # first target char + encoder output sequence + encoder h + c
    output_tokens, output_weights, h, c = decoder_model.predict(
        [target_seq, all_encoder_states ] + states_value)

    decoder_weights.append(output_weights)
    # Sample a token
    ## Choose the output char of the argmax prob
    ## one-hot decode the char and append to the `decoded_sentence`
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = reverse_target_char_index[sampled_token_index]
    decoded_sentence += sampled_char

    # Exit condition: either hit max length
    # or find stop character.
    if (sampled_char == '\n' or
       len(decoded_sentence) > max_decoder_seq_length):
        stop_condition = True

    # Update the target sequence (of length 1).
    ## every time the target_seq is the current char, one char at a time
    ## the shape should be [1, 1, vocab_size]
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, sampled_token_index] = 1.0

    # Update states
    ## the h and c output from decoder at the current step
    states_value = [h, c]

In [None]:
print(len(decoded_sentence)) # for each character in the target
print(len(decoder_weights)) # there is an attention weight

# each attention weight measures the relevance of each input sequence's character with the target sequence chacter at time t




In [None]:
np.log(decoder_weights)

In [None]:
import matplotlib.pyplot as plt

# Sans-serif fonts that can render Chinese characters, in order of preference.
# "PingFang HK" stays first; the remaining names are fallbacks for systems on
# which it is not installed.
plt.rcParams['font.sans-serif'] = ["PingFang HK", "Microsoft YaHei", "Noto Sans CJK SC",
                                   "SimHei", "Arial Unicode MS"]

# Number of real (non-padding) characters of the selected input sentence
n_input_chars = int(input_seq_len[0])

input_char = [reverse_input_char_index.get(np.argmax(i)) for i in input_seq[0,:,:]] ## y-axis
target_char = list(decoded_sentence) ## x-axis
# One weight array per decoded character: flatten each to one row of length
# `max_encoder_seq_length`, then transpose so that rows are input characters
# and columns are target characters.
atten_weights = np.transpose(
    np.array(decoder_weights).reshape(len(target_char), max_encoder_seq_length))

## remove padding characters
input_char = input_char[:n_input_chars]
atten_weights = atten_weights[:n_input_chars, :]

# Show the end-of-sequence newline as a visible label instead of an empty tick
target_labels = ['\\n' if ch == '\n' else ch for ch in target_char]

fig, ax = plt.subplots(figsize=(10,10))
im = ax.imshow(atten_weights)

# We want to show all ticks...
ax.set_xticks(np.arange(len(target_labels)))
ax.set_yticks(np.arange(len(input_char)))
# ... and label them with the respective list entries
ax.set_xticklabels(target_labels)
ax.set_yticklabels(input_char)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

ax.set_xlabel("Target characters")
ax.set_ylabel("Input characters")
ax.set_title("Attention Weights of Each Target Char")
ax.tick_params(labelsize=12)
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()


## End-to-End Implementation

In [None]:
## in_texts: a list of input sequences
def trans(in_texts):
    """Translate a list of raw input sentences with the inference models.

    Each sentence is one-hot encoded character by character into a tensor of
    shape [len(in_texts), max_encoder_seq_length, num_encoder_tokens]; the
    remaining timesteps are padded with the one-hot vector of ' '. Every
    encoded sentence is then decoded on its own with `decode_sequence`.

    Characters that are not in `input_token_index` are dropped, and sentences
    longer than `max_encoder_seq_length` are truncated, because the encoder
    cannot represent either. Empty sentences are encoded as padding only.

    NOTE(review): the manual training-data cell fills padding timesteps with
    -1 rather than with the ' ' one-hot used here -- confirm which padding the
    trained model expects.

    Args:
        in_texts: a list of input sequences (strings).

    Returns:
        A list with one decoded string per input sentence, in input order.
    """
    pad_index = input_token_index[' ']

    ## Input tensor dimensions: [input_batch_size, input_sequence_length, input_vocab/char_size]
    in_texts_onehot = np.zeros(
        (len(in_texts), max_encoder_seq_length, num_encoder_tokens),
        dtype='float32')

    ## One-hot encoding in_texts
    for i, text in enumerate(in_texts):
        # keep only encodable characters, then cut to the encoder's length
        token_ids = [input_token_index[char] for char in text if char in input_token_index]
        token_ids = token_ids[:max_encoder_seq_length]

        for t, token_id in enumerate(token_ids):
            in_texts_onehot[i, t, token_id] = 1.

        ## Padding starts right after the last encoded character. Using the
        ## encoded length (not a leaked loop variable) keeps this correct for
        ## empty sentences as well.
        in_texts_onehot[i, len(token_ids):, pad_index] = 1.

    ## Decoding Sequence: one sentence (batch of size 1) at a time
    return [decode_sequence(in_texts_onehot[seq_index: seq_index + 1])
            for seq_index in range(len(in_texts))]

In [None]:
trans(['How are you?', 'Ok!', 'Hurry!',"How's the weather?", "My name is Alvin."])

In [None]:

# NOTE(review): this cell repeats, at top level, the statements that form the
# body of `define_nmt` in the Functions section below. `en_timesteps`,
# `en_vsize`, `fr_timesteps`, `fr_vsize` and `hidden_size` are not assigned
# anywhere in the visible notebook, and `AttentionLayer` is only imported in
# the Functions section, so this cell presumably cannot run as-is -- confirm
# whether it should be removed in favour of calling `define_nmt`.

# Define an input sequence and process it.
if batch_size:
    encoder_inputs = Input(batch_shape=(batch_size, en_timesteps, en_vsize), name='encoder_inputs')
    decoder_inputs = Input(batch_shape=(batch_size, fr_timesteps - 1, fr_vsize), name='decoder_inputs')
else:
    encoder_inputs = Input(shape=(en_timesteps, en_vsize), name='encoder_inputs')
    if fr_timesteps:
        decoder_inputs = Input(shape=(fr_timesteps - 1, fr_vsize), name='decoder_inputs')
    else:
        decoder_inputs = Input(shape=(None, fr_vsize), name='decoder_inputs')

# Encoder GRU
encoder_gru = GRU(hidden_size, return_sequences=True, return_state=True, name='encoder_gru')
encoder_out, encoder_state = encoder_gru(encoder_inputs)

# Set up the decoder GRU, using `encoder_state` as initial state.
decoder_gru = GRU(hidden_size, return_sequences=True, return_state=True, name='decoder_gru')
decoder_out, decoder_state = decoder_gru(decoder_inputs, initial_state=encoder_state)

# Attention layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_out, decoder_out])

# Concat attention output and decoder GRU output
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_out, attn_out])

# Dense layer
dense = Dense(fr_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_concat_input)

# Full model
full_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_pred)
full_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

full_model.summary()


## How to Further Improve the Model?

- Use word embeddings
- Increase the training data size
- Try other sequence models (GRU)
- Try bidirectional RNN
- Increase the widths and depths of the sequence models

## References

## Functions

In [None]:
import re
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU
import numpy as np
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from keras import Model
from keras.models import Sequential
from keras.layers import LSTM, GRU, Concatenate
from keras.layers import Attention
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras import Input
from attention import AttentionLayer
from keras.utils import to_categorical, plot_model


def clean_en_data(en_text):
  """Normalise one English sentence for tokenisation.

  Surrounds each of the punctuation marks ? . ! , ¿ with spaces, collapses
  every run of spaces and double-quote characters into a single space, and
  strips leading/trailing whitespace.

  Args:
    en_text: the raw sentence (str).

  Returns:
    The cleaned sentence (str). No start/end tokens are added.
  """
  # put a space on both sides of every punctuation mark
  spaced = re.sub(r"([?.!,¿])", r" \1 ", en_text)
  # the character class contains both the space and the double quote, so
  # double quotes are replaced by a space here as well
  collapsed = re.sub(r'[" "]+', " ", spaced)
  return collapsed.strip()


import jieba
def clean_zh_data(fr_text):
    """Segment each sentence with jieba and join its tokens with single spaces.

    Args:
        fr_text: an iterable of sentences (str).

    Returns:
        A list with one space-joined string of `jieba.lcut` tokens per sentence.
    """
    segmented = []
    for sentence in fr_text:
        tokens = jieba.lcut(sentence)
        segmented.append(' '.join(tokens))
    return segmented


# Path to the data txt file on disk.
def get_data(data_path, train_test = 0.9):
    """Load a tab-separated parallel corpus and split it into train/test sets.

    Each usable line must have at least two tab-separated columns: the
    English sentence first and the target sentence second. Blank or malformed
    lines are skipped and any further column is ignored. English sentences go
    through `clean_en_data`; target sentences go through `clean_zh_data` and
    are wrapped as 'sos ... eos'.

    Args:
        data_path: path to the UTF-8 encoded corpus file.
        train_test: fraction of the usable sentence pairs put into the
            training split.

    Returns:
        (tr_en_text, tr_fr_text, ts_en_text, ts_fr_text): four lists of
        strings; the training pairs followed by the test pairs.

    Side effects:
        Seeds NumPy's global random generator with 123, so the shuffle is the
        same on every call.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        rows = [line.split('\t') for line in f.read().split('\n')]

    # A trailing newline yields an empty last "line"; such rows carry no
    # sentence pair and must not become a training sample.
    rows = [cols for cols in rows if len(cols) >= 2]

    en_text = [clean_en_data(cols[0]) for cols in rows]
    # Column 1 is the target sentence. Taking the last column instead would
    # pick up an extra trailing column (e.g. an attribution field) if present.
    fr_text = clean_zh_data([cols[1] for cols in rows])
    fr_text = ['sos ' + sent + ' eos' for sent in fr_text]

    np.random.seed(123)
    inds = np.arange(len(en_text))
    np.random.shuffle(inds)

    # Size the split from the usable pairs, not from the raw line count.
    train_size = int(round(len(en_text)*train_test))
    train_inds = inds[:train_size]
    test_inds = inds[train_size:]
    tr_en_text = [en_text[ti] for ti in train_inds]
    tr_fr_text = [fr_text[ti] for ti in train_inds]

    ts_en_text = [en_text[ti] for ti in test_inds]
    ts_fr_text = [fr_text[ti] for ti in test_inds]

    return tr_en_text, tr_fr_text, ts_en_text, ts_fr_text


## when the max_len is known, use this func to convert text to seq
def sents2sequences(tokenizer, sentences, reverse=False, pad_length=None, padding_type='post'):
    """Turn sentences into a padded matrix of word ids.

    :param tokenizer: object providing ``texts_to_sequences``
    :param sentences: list of str
    :param reverse: if truthy, flip every row along the time axis (axis 1)
        after padding
    :param pad_length: forwarded to ``pad_sequences`` as ``maxlen``
    :param padding_type: forwarded to ``pad_sequences`` as ``padding``
    :return: 2-D array of word ids, one row per sentence
    """
    padded = pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=pad_length,
        padding=padding_type,
    )
    return np.flip(padded, axis=1) if reverse else padded



def preprocess_data(en_tokenizer, fr_tokenizer, en_text, fr_text):
    """Convert parallel sentences into post-padded matrices of word ids.

    The sentences passed in ``en_text`` / ``fr_text`` are the ones that get
    converted (earlier this function ignored them and always read the
    notebook globals ``tr_en_text`` / ``tr_fr_text``).

    Each side is padded to the length of its own longest sentence *in the
    data passed in*. A caller that feeds the result to a model built for
    fixed sequence lengths must therefore make sure the lengths agree.

    :param en_tokenizer: fitted tokenizer for the source sentences
    :param fr_tokenizer: fitted tokenizer for the target sentences
    :param en_text: list of source sentences (str)
    :param fr_text: list of target sentences (str)
    :return: (en_seq, fr_seq), two 2-D arrays of word ids
    """
    en_seq = en_tokenizer.texts_to_sequences(en_text)
    en_timesteps = np.max([len(ids) for ids in en_seq])
    en_seq = pad_sequences(en_seq, padding='post', maxlen=en_timesteps)

    fr_seq = fr_tokenizer.texts_to_sequences(fr_text)
    fr_timesteps = np.max([len(ids) for ids in fr_seq])
    fr_seq = pad_sequences(fr_seq, padding='post', maxlen=fr_timesteps)

    return en_seq, fr_seq

In [None]:
def define_nmt(hidden_size, batch_size, en_timesteps, en_vsize, fr_timesteps, fr_vsize):
    """Build the attention-based GRU encoder-decoder and its inference models.

    Three Keras models are created. They share the same encoder GRU, decoder
    GRU, attention layer and softmax layer objects:

    * ``full_model``: takes one-hot encoder and decoder inputs and outputs a
      softmax over ``fr_vsize`` classes for every decoder step. It is compiled
      with the Adam optimizer, categorical cross-entropy loss and an accuracy
      metric, and its summary is printed.
    * ``encoder_model``: inference encoder with a fixed batch size of 1; returns
      the encoder output sequence and the final encoder state.
    * ``decoder_model``: inference decoder with a fixed batch size of 1 that
      consumes ONE decoder step per call; returns the prediction, the second
      output of the attention layer and the new decoder state.

    :param hidden_size: number of units of both GRUs
    :param batch_size: if truthy, the training inputs get a fixed batch
        dimension of this size; otherwise the batch dimension is left open
    :param en_timesteps: length of the encoder input sequences
    :param en_vsize: size of the one-hot vectors fed to the encoder
    :param fr_timesteps: length of the target sequences; the decoder input has
        ``fr_timesteps - 1`` steps. A falsy value is only handled when
        ``batch_size`` is falsy as well (the time dimension is then ``None``)
    :param fr_vsize: size of the one-hot vectors fed to / predicted by the decoder
    :return: (full_model, encoder_model, decoder_model)
    """

    # Define an input sequence and process it.
    if batch_size:
        # Fixed batch dimension: every batch fed to the model must have exactly this size.
        encoder_inputs = Input(batch_shape=(batch_size, en_timesteps, en_vsize), name='encoder_inputs')
        decoder_inputs = Input(batch_shape=(batch_size, fr_timesteps - 1, fr_vsize), name='decoder_inputs')
    else:
        encoder_inputs = Input(shape=(en_timesteps, en_vsize), name='encoder_inputs')
        if fr_timesteps:
            decoder_inputs = Input(shape=(fr_timesteps - 1, fr_vsize), name='decoder_inputs')
        else:
            # Decoder length left unspecified.
            decoder_inputs = Input(shape=(None, fr_vsize), name='decoder_inputs')

    # Encoder GRU
    # Returns the output at every timestep (consumed by the attention layer) and the final state.
    encoder_gru = GRU(hidden_size, return_sequences=True, return_state=True, name='encoder_gru')
    encoder_out, encoder_state = encoder_gru(encoder_inputs)

    # Set up the decoder GRU, using `encoder_state` as initial state.
    decoder_gru = GRU(hidden_size, return_sequences=True, return_state=True, name='decoder_gru')
    decoder_out, decoder_state = decoder_gru(decoder_inputs, initial_state=encoder_state)

    # Attention layer
    # AttentionLayer is imported from the external `attention` module; it is called with
    # [encoder outputs, decoder outputs] and returns two tensors.
    attn_layer = AttentionLayer(name='attention_layer')
    attn_out, attn_states = attn_layer([encoder_out, decoder_out])

    # Concat attention input and decoder GRU output
    decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_out, attn_out])

    # Dense layer
    # The same `dense` object is reused by the inference decoder below, so both share weights.
    dense = Dense(fr_vsize, activation='softmax', name='softmax_layer')
    dense_time = TimeDistributed(dense, name='time_distributed_layer')
    decoder_pred = dense_time(decoder_concat_input)

    # Full model
    full_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_pred)
    full_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

    full_model.summary()

    """ Inference model """
    # From here on `batch_size` is overwritten: inference handles one sentence at a time.
    batch_size = 1

    """ Encoder (Inference) model """
    # Reuses `encoder_gru`, i.e. the weights of the training encoder.
    encoder_inf_inputs = Input(batch_shape=(batch_size, en_timesteps, en_vsize), name='encoder_inf_inputs')
    encoder_inf_out, encoder_inf_state = encoder_gru(encoder_inf_inputs)
    encoder_model = Model(inputs=encoder_inf_inputs, outputs=[encoder_inf_out, encoder_inf_state])

    """ Decoder (Inference) model """
    # Inputs of one decoding step: the current one-hot word (a single timestep),
    # the full sequence of encoder outputs, and the decoder state to start from.
    decoder_inf_inputs = Input(batch_shape=(batch_size, 1, fr_vsize), name='decoder_word_inputs')
    encoder_inf_states = Input(batch_shape=(batch_size, en_timesteps, hidden_size), name='encoder_inf_states')
    decoder_init_state = Input(batch_shape=(batch_size, hidden_size), name='decoder_init')

    # Reuses `decoder_gru`, `attn_layer` and `dense` from the training graph.
    decoder_inf_out, decoder_inf_state = decoder_gru(decoder_inf_inputs, initial_state=decoder_init_state)
    attn_inf_out, attn_inf_states = attn_layer([encoder_inf_states, decoder_inf_out])
    decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_inf_out, attn_inf_out])
    decoder_inf_pred = TimeDistributed(dense)(decoder_inf_concat)
    # Input order expected by callers: encoder outputs, decoder state, current word.
    decoder_model = Model(inputs=[encoder_inf_states, decoder_init_state, decoder_inf_inputs],
                          outputs=[decoder_inf_pred, attn_inf_states, decoder_inf_state])

    return full_model, encoder_model, decoder_model

def train(full_model, en_seq, fr_seq, batch_size, n_epochs=10):
    """Train `full_model` batch by batch with teacher forcing.

    Each batch of word ids is one-hot encoded on the fly. The decoder input
    is the target sequence without its last step and the expected output is
    the target sequence without its first step. After every update the model
    is evaluated on the same batch, and the mean of these losses is printed
    once per epoch.

    Only full batches are used; trailing samples that do not fill a batch are
    skipped. Relies on the notebook-level globals `en_vsize` and `fr_vsize`
    for the one-hot sizes.

    :param full_model: compiled keras.Model taking [encoder input, decoder input]
    :param en_seq: 2-D array of source word ids, one row per sentence
    :param fr_seq: 2-D array of target word ids, one row per sentence
    :param batch_size: number of sentences per batch
    :param n_epochs: number of passes over the data
    :return: None
    """
    n_samples = en_seq.shape[0]

    for ep in range(n_epochs):
        losses = []
        # The `+ 1` keeps the final full batch when n_samples is an exact
        # multiple of batch_size (it used to be skipped).
        for bi in range(0, n_samples - batch_size + 1, batch_size):

            en_onehot_seq = to_categorical(
                en_seq[bi:bi + batch_size, :], num_classes=en_vsize)
            fr_onehot_seq = to_categorical(
                fr_seq[bi:bi + batch_size, :], num_classes=fr_vsize)

            full_model.train_on_batch(
                [en_onehot_seq, fr_onehot_seq[:, :-1, :]], fr_onehot_seq[:, 1:, :])

            # Loss on the batch just trained on, measured after the weight update.
            batch_loss, _ = full_model.evaluate(
                [en_onehot_seq, fr_onehot_seq[:, :-1, :]], fr_onehot_seq[:, 1:, :],
                batch_size=batch_size, verbose=0)

            losses.append(batch_loss)

        print("Loss in epoch {}: {}".format(ep + 1, np.mean(losses)))


def infer_nmt(encoder_model, decoder_model, test_en_seq, en_vsize, fr_vsize, fr_timesteps):
    """
    Greedily decode a translation for one padded source sequence.

    Decoding starts from the word 'sos' and feeds the most probable word of
    each step back into the decoder. It stops after `fr_timesteps` steps, or
    earlier as soon as class 0 is predicted.

    Relies on the notebook-level globals `fr_tokenizer` and `fr_index2word`.

    :param encoder_model: keras.Model (inference encoder)
    :param decoder_model: keras.Model (single-step inference decoder)
    :param test_en_seq: sequence of word ids
    :param en_vsize: int, one-hot size of the source side
    :param fr_vsize: int, one-hot size of the target side
    :param fr_timesteps: int, maximum number of decoding steps
    :return: (fr_text, attention_weights) where fr_text is the decoded words
        joined by (and ending with) a space, and attention_weights is a list
        of (word id, attention output of that step) tuples
    """

    # NOTE: the third positional parameter of sents2sequences is `reverse`,
    # not a vocabulary size, so `fr_vsize` must not be passed there.
    test_fr_seq = sents2sequences(fr_tokenizer, ['sos'])
    test_en_onehot_seq = to_categorical(test_en_seq, num_classes=en_vsize)
    test_fr_onehot_seq = np.expand_dims(
        to_categorical(test_fr_seq, num_classes=fr_vsize), 1)

    enc_outs, enc_last_state = encoder_model.predict(test_en_onehot_seq)
    # The decoder starts from the final encoder state.
    dec_state = enc_last_state
    attention_weights = []
    fr_text = ''
    for _ in range(fr_timesteps):

        dec_out, attention, dec_state = decoder_model.predict(
            [enc_outs, dec_state, test_fr_onehot_seq])
        dec_ind = np.argmax(dec_out, axis=-1)[0, 0]

        # Class 0 has no entry in fr_index2word; treat it as the stop signal.
        if dec_ind == 0:
            break

        # The predicted word becomes the decoder input of the next step.
        test_fr_seq = sents2sequences(fr_tokenizer, [fr_index2word[dec_ind]])
        test_fr_onehot_seq = np.expand_dims(
            to_categorical(test_fr_seq, num_classes=fr_vsize), 1)

        attention_weights.append((dec_ind, attention))
        fr_text += fr_index2word[dec_ind] + ' '

    return fr_text, attention_weights


In [None]:
import matplotlib.pyplot as plt
# Use the "PingFang HK" font for sans-serif text in every figure
# (presumably so that the Chinese tick labels render; confirm the font is installed on your system).
plt.rcParams['font.sans-serif']=["PingFang HK"]
def plot_attention_weights(encoder_inputs, attention_weights, en_id2word, fr_id2word, filename=None):
    """
    Draw the attention weights of one translation as a heat map.

    Rows correspond to the encoder input positions and columns to the
    decoding steps. Tick labels are looked up in the id-to-word tables;
    id 0 is shown as "<Res>". If `attention_weights` is empty a notice is
    printed and nothing is drawn.

    :param encoder_inputs: Sequence of word ids (list/numpy.ndarray)
    :param attention_weights: Sequence of (<word_id_at_decode_step_t>, <attention_weights_at_decode_step_t>) pairs
    :param en_id2word: dict
    :param fr_id2word: dict
    :param filename: currently unused
    :return: None
    """

    if len(attention_weights) == 0:
        print('Your attention weights was empty. No attention map saved to the disk. ' +
              '\nPlease check if the decoder produced  a proper translation')
        return

    # One flattened weight vector per decoding step, plus the word id chosen at that step.
    dec_inputs = [word_id for word_id, _ in attention_weights]
    step_vectors = [weights.reshape(-1) for _, weights in attention_weights]
    # Transpose so that encoder positions run along the rows.
    attention_mat = np.array(step_vectors).T

    n_rows, n_cols = attention_mat.shape[0], attention_mat.shape[1]

    fig, ax = plt.subplots(figsize=(16, 16))
    ax.imshow(attention_mat)

    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))

    x_labels = ["<Res>" if word_id == 0 else fr_id2word[word_id] for word_id in dec_inputs]
    y_labels = ["<Res>" if word_id == 0 else en_id2word[word_id] for word_id in encoder_inputs.ravel()]
    ax.set_xticklabels(x_labels)
    ax.set_yticklabels(y_labels)

    ax.tick_params(labelsize=32)
    ax.tick_params(axis='x', labelrotation=90)

#     if not os.path.exists(config.RESULTS_DIR):
#         os.mkdir(config.RESULTS_DIR)
#     if filename is None:
#         plt.savefig( 'attention.png'))
#     else:
#         plt.savefig(os.path.join(config.RESULTS_DIR, '{}'.format(filename)))

In [None]:
### Load the parallel corpus, already split into train / test sentences
tr_en_text, tr_fr_text, ts_en_text, ts_fr_text = get_data(data_path='../../../RepositoryData/data/cmn.txt')

### Word-level tokenizers, fitted on the training sentences only
en_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='UNK')
en_tokenizer.fit_on_texts(tr_en_text)

fr_tokenizer = keras.preprocessing.text.Tokenizer(oov_token='UNK')
fr_tokenizer.fit_on_texts(tr_fr_text)

### Padded matrices of word ids for the training sentences
en_seq, fr_seq = preprocess_data(en_tokenizer, fr_tokenizer, tr_en_text, tr_fr_text)

### Sequence lengths (number of timesteps) after padding
en_timesteps, fr_timesteps = en_seq.shape[1], fr_seq.shape[1]

### Vocabulary sizes: highest word id + 1 (used as the number of one-hot classes)
en_vsize = max(en_tokenizer.index_word) + 1
fr_vsize = max(fr_tokenizer.index_word) + 1






In [None]:
### Build the training model and the two inference models (encoder / decoder)
# NOTE(review): `hidden_size` is not assigned in the cells shown here; confirm it is
# defined earlier in the notebook before running this cell on a fresh kernel.
full_model, infer_enc_model, infer_dec_model = define_nmt(
    hidden_size=hidden_size,
    batch_size=batch_size,
    en_timesteps=en_timesteps,
    fr_timesteps=fr_timesteps,
    en_vsize=en_vsize,
    fr_vsize=fr_vsize)

In [None]:
# Draw the layer graph of the training model, annotated with tensor shapes
from keras.utils import plot_model
plot_model(full_model, show_shapes=True)

In [None]:
# %%time

# train(full_model, en_seq, fr_seq, batch_size, n_epochs)


### Model Saving

In [None]:
# full_model.save('../../../RepositoryData/output/nmt-en-zh/nmt-en-zh-full-model.h5')
# infer_enc_model.save('../../../RepositoryData/output/nmt-en-zh/nmt-en-zh-infer-enc-model.h5')
# infer_dec_model.save('../../../RepositoryData/output/nmt-en-zh/nmt-en-zh-infer-dec-model.h5')

### Prediction

In [None]:
## Load previously saved weights into the three models built by define_nmt
## (the calls that write these .h5 files are commented out in the "Model Saving" cell)
full_model.load_weights('../../../RepositoryData/output/nmt-en-zh/nmt-en-zh-full-model.h5')
infer_enc_model.load_weights('../../../RepositoryData/output/nmt-en-zh/nmt-en-zh-infer-enc-model.h5')
infer_dec_model.load_weights('../../../RepositoryData/output/nmt-en-zh/nmt-en-zh-infer-dec-model.h5')

In [None]:
# Layer graph of the inference encoder
plot_model(infer_enc_model,show_shapes=True)

In [None]:
# Layer graph of the single-step inference decoder
plot_model(infer_dec_model, show_shapes=True)

In [None]:
""" Index2word """
# Reverse lookup tables (word id -> word), obtained by inverting each tokenizer's word_index
en_index2word = {word_id: word for word, word_id in en_tokenizer.word_index.items()}
fr_index2word = {word_id: word for word, word_id in fr_tokenizer.word_index.items()}


In [None]:
def translate(infer_enc_model, infer_dec_model, test_en_text,
              en_vsize, fr_vsize, en_timesteps, fr_timesteps,
              en_tokenizer, fr_tokenizer):
    """Translate one source sentence with the inference models and print the result.

    The sentence is converted to word ids padded to `en_timesteps` and then
    decoded by `infer_nmt`. Both the source sentence and the decoded text
    are printed.

    :param infer_enc_model: keras.Model, inference encoder
    :param infer_dec_model: keras.Model, inference decoder
    :param test_en_text: source sentence (str)
    :param en_vsize: int, one-hot size of the source side
    :param fr_vsize: int, one-hot size of the target side
    :param en_timesteps: int, length the source sequence is padded to
    :param fr_timesteps: int, maximum number of decoding steps
    :param en_tokenizer: fitted tokenizer for the source sentences
    :param fr_tokenizer: not used inside this function
    :return: (padded source id sequence, decoded text, attention weights)
    """
    print('Translating: {}'.format(test_en_text))

    source_seq = sents2sequences(en_tokenizer, [test_en_text], pad_length=en_timesteps)

    decoded_text, attention = infer_nmt(
        encoder_model=infer_enc_model,
        decoder_model=infer_dec_model,
        test_en_seq=source_seq,
        en_vsize=en_vsize,
        fr_vsize=fr_vsize,
        fr_timesteps=fr_timesteps,
    )
    print('\tFrench: {}'.format(decoded_text))

    return source_seq, decoded_text, attention

In [None]:
# Translate a single held-out sentence (index 2 of the test split)
test_en_seq, test_fr, attn_weights=translate(infer_enc_model=infer_enc_model,
          infer_dec_model=infer_dec_model,
          test_en_text=ts_en_text[2],
          en_vsize=en_vsize,
          fr_vsize=fr_vsize,
          en_timesteps=en_timesteps,
          fr_timesteps=fr_timesteps,
          en_tokenizer=en_tokenizer,
          fr_tokenizer=fr_tokenizer)



In [None]:
# Translate the first ten test sentences. The three result variables are overwritten
# on every pass, so after the loop they hold the outputs for the last sentence (i = 9).
for i in range(10):
    test_en_seq, test_fr, attn_weights=translate(infer_enc_model=infer_enc_model,
              infer_dec_model=infer_dec_model,
              test_en_text=ts_en_text[i],
              en_vsize=en_vsize,
              fr_vsize=fr_vsize,
              en_timesteps=en_timesteps,
              fr_timesteps=fr_timesteps,
              en_tokenizer=en_tokenizer,
              fr_tokenizer=fr_tokenizer)

In [None]:
# Reference targets of the first ten test sentences, for comparison with the model output
ts_fr_text[:10]

In [None]:
""" Attention plotting """
# Heat map for the most recent translate() call: test_en_seq / attn_weights hold
# whatever the last executed translation cell assigned to them.
plot_attention_weights(test_en_seq, attn_weights,
                       en_index2word, fr_index2word)

In [None]:
%%time
def test(full_model, ts_enc_text, ts_dec_text, enc_tokenizer, dec_tokenizer, enc_vsize, dec_vsize, batch_size):
    """Print the average loss and accuracy of `full_model` on the given sentence pairs.

    The sentences passed in are tokenized and padded to the sequence lengths
    the model was built with, read from `full_model.input_shape`. Sentences
    longer than that are truncated at the end; a length of ``None`` means
    "pad to the longest sentence". The model is then evaluated batch by batch
    with the decoder input being the target without its last step and the
    expected output being the target without its first step.

    Only full batches are evaluated; trailing samples that do not fill a
    batch are skipped.

    :param full_model: compiled keras.Model taking [encoder input, decoder input]
    :param ts_enc_text: list of source sentences (str)
    :param ts_dec_text: list of target sentences (str)
    :param enc_tokenizer: fitted tokenizer for the source sentences
    :param dec_tokenizer: fitted tokenizer for the target sentences
    :param enc_vsize: int, one-hot size of the source side
    :param dec_vsize: int, one-hot size of the target side
    :param batch_size: number of sentences per evaluation batch
    :return: None (results are printed)
    """
    # Pad to the lengths the model expects instead of the longest sentence of
    # this data set; otherwise a model with fixed input shapes rejects the batch.
    enc_shape, dec_shape = full_model.input_shape
    enc_len = enc_shape[1]
    # The decoder input is the target minus one step, so the target itself is one step longer.
    dec_len = None if dec_shape[1] is None else dec_shape[1] + 1

    ts_enc_seq = pad_sequences(enc_tokenizer.texts_to_sequences(ts_enc_text),
                               maxlen=enc_len, padding='post', truncating='post')
    ts_dec_seq = pad_sequences(dec_tokenizer.texts_to_sequences(ts_dec_text),
                               maxlen=dec_len, padding='post', truncating='post')

    losses = []
    accuracies = []
    # The `+ 1` keeps the final full batch when the sample count is an exact
    # multiple of batch_size.
    for bi in range(0, ts_enc_seq.shape[0] - batch_size + 1, batch_size):
        enc_onehot_seq = to_categorical(
            ts_enc_seq[bi:bi + batch_size, :], num_classes=enc_vsize)
        dec_onehot_seq = to_categorical(
            ts_dec_seq[bi:bi + batch_size, :], num_classes=dec_vsize)

        batch_loss, batch_acc = full_model.evaluate(
            [enc_onehot_seq, dec_onehot_seq[:, :-1, :]], dec_onehot_seq[:, 1:, :],
            batch_size=batch_size, verbose=0)
        losses.append(batch_loss)
        accuracies.append(batch_acc)

    if not losses:
        # Fewer samples than one batch: there is nothing to average.
        print('No full batch of size {} available; nothing was evaluated.'.format(batch_size))
        return

    print('Average Loss:{}'.format(np.mean(losses)))
    print('Average Accuracy:{}'.format(np.mean(accuracies)))

# Evaluate the model on the held-out test sentences and print the averaged metrics
test(full_model, ts_enc_text = ts_en_text, ts_dec_text = ts_fr_text, 
     enc_tokenizer = en_tokenizer, dec_tokenizer = fr_tokenizer, enc_vsize = en_vsize, dec_vsize = fr_vsize, batch_size = batch_size)