# Sequence Model with Attention

- Keras built-in layers

In [2]:
import tensorflow as tf
import keras as keras
print(tf.__version__)
print(keras.__version__)

2.2.0
2.4.3


In [160]:
from random import randint

from numpy import array
from numpy import argmax
from numpy import array_equal
from numpy import mean
from keras import Model
from keras.models import Sequential
from keras.layers import LSTM, GRU, Concatenate
from keras.layers import Attention
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras import Input

from attention import AttentionLayer

# generate a sequence of random integers
def generate_sequence(length, n_unique):
    """Draw a list of random class indices.

    Args:
        length: number of integers to draw.
        n_unique: number of distinct classes; every draw lies in the
            closed range [0, n_unique - 1].

    Returns:
        A list of `length` ints produced by `random.randint`.
    """
    drawn = []
    for _ in range(length):
        # randint is inclusive on both ends, hence the "- 1".
        drawn.append(randint(0, n_unique - 1))
    return drawn


# one hot encode sequence
def one_hot_encode(sequence, n_unique):
    """One-hot encode a sequence of integer class indices.

    Args:
        sequence: iterable of class indices, each expected in
            [0, n_unique - 1].
        n_unique: number of classes, i.e. the width of every one-hot row.

    Returns:
        An integer numpy array of shape (len(sequence), n_unique) with
        exactly one 1 per row. An empty sequence yields shape
        (0, n_unique) so that callers can still read both dimensions.

    Raises:
        IndexError: if a value lies outside [0, n_unique - 1]. Negative
            values are rejected explicitly; plain list indexing would
            otherwise wrap around and silently mark the wrong class.
    """
    values = list(sequence)
    encoding = []
    for value in values:
        if not 0 <= value < n_unique:
            raise IndexError(
                'class index %r is outside [0, %r)' % (value, n_unique))
        vector = [0] * n_unique
        vector[value] = 1
        encoding.append(vector)
    # The explicit reshape keeps the result 2-D even when `values` is empty.
    return array(encoding, dtype=int).reshape(len(values), n_unique)


# decode a one hot encoded string
def one_hot_decode(encoded_seq):
    """Turn a sequence of score / one-hot vectors back into class indices.

    Args:
        encoded_seq: iterable of vectors, one per time step.

    Returns:
        A list holding, for each vector, the position of its largest
        entry as reported by `numpy.argmax`.
    """
    decoded = []
    for row in encoded_seq:
        decoded.append(argmax(row))
    return decoded


# prepare data for the LSTM
def get_pair(n_in, n_out, cardinality):
    """Build one random (input, target) pair of one-hot encoded sequences.

    The target repeats the first `n_out` input values and fills the
    remaining `n_in - n_out` positions with class 0, so the padding is
    the one-hot row for class 0 rather than an all-zero row.

    Args:
        n_in: number of time steps in the input sequence.
        n_out: number of leading input values echoed in the target.
        cardinality: number of distinct classes (one-hot width).

    Returns:
        A tuple (X, y); each encoded matrix is reshaped to
        (1, rows, columns), i.e. a single-sample batch.
    """
    def _with_batch_axis(matrix):
        # Prepend a batch axis of size 1 to a 2-D matrix.
        return matrix.reshape((1, matrix.shape[0], matrix.shape[1]))

    source_steps = generate_sequence(n_in, cardinality)
    target_steps = source_steps[:n_out] + [0] * (n_in - n_out)

    X = _with_batch_axis(one_hot_encode(source_steps, cardinality))
    y = _with_batch_axis(one_hot_encode(target_steps, cardinality))
    return X, y


# # define the encoder-decoder model
# def baseline_model(n_timesteps_in, n_features):
#     model = Sequential()
#     model.add(LSTM(150, input_shape=(n_timesteps_in, n_features)))
#     model.add(RepeatVector(n_timesteps_in))
#     model.add(LSTM(150, return_sequences=True))
#     model.add(TimeDistributed(Dense(n_features, activation='softmax')))
#     model.compile(loss='categorical_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
#     return model


# # define the encoder-decoder with attention model
# def attention_model(n_timesteps_in, n_features):
#     model = Sequential()
#     model.add(
#         LSTM(150,
#              input_shape=(n_timesteps_in, n_features),
#              return_sequences=True))
#     model.add(AttentionDecoder(150, n_features))
#     model.compile(loss='categorical_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
#     return model


# # train and evaluate a model, return accuracy
# def train_evaluate_model(model, n_timesteps_in, n_timesteps_out, n_features):
#     # train LSTM
#     for epoch in range(5000):
#         # generate new random sequence
#         X, y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
#         # fit model for one epoch on this sequence
#         model.fit(X, y, epochs=1, verbose=0)
#     # evaluate LSTM
#     total, correct = 100, 0
#     for _ in range(total):
#         X, y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
#         yhat = model.predict(X, verbose=0)
#         if array_equal(one_hot_decode(y[0]), one_hot_decode(yhat[0])):
#             correct += 1
#     return float(correct) / float(total) * 100.0

In [161]:
# Toy problem size: 50 classes, 5 input steps, first 2 steps echoed in the target.
n_features, n_timesteps_in, n_timesteps_out = 50, 5, 2

# Draw a single example pair to inspect in the following cells.
X, y = get_pair(n_in=n_timesteps_in,
                n_out=n_timesteps_out,
                cardinality=n_features)

In [162]:
# Decoded class indices of the sample input and target, one per line.
print(one_hot_decode(X[0]), one_hot_decode(y[0]), sep='\n')
# Tensor shapes of the input and target batches, one per line.
print(X.shape, y.shape, sep='\n')

[35, 30, 25, 24, 47]
[35, 30, 0, 0, 0]
(1, 5, 50)
(1, 5, 50)


In [163]:
# Display the raw one-hot tensors of a freshly drawn (X, y) pair.
get_pair(n_in=n_timesteps_in, n_out=n_timesteps_out, cardinality=n_features)

(array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          1, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]]]),
 array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
          0, 0, 0, 

In [178]:
# Hyper-parameters of the encoder-decoder model.
batch_size = 1
en_timesteps = 5    # length of the encoder input sequence
fr_timesteps = 2    # number of leading input values echoed in the target
en_vsize = 50       # one-hot width of the encoder input
fr_vsize = 50       # one-hot width of the decoder input / output
hidden_size = 150   # GRU state size

# get_pair() always pads the target to the encoder length, and that padded
# target is what the training and prediction cells feed to the decoder.
# The decoder input must therefore span en_timesteps steps; declaring it
# with fr_timesteps steps does not match the tensors actually passed in.
decoder_timesteps = en_timesteps

encoder_inputs = Input(batch_shape=(batch_size, en_timesteps, en_vsize),
                       name='encoder_inputs')
decoder_inputs = Input(batch_shape=(batch_size, decoder_timesteps, fr_vsize),
                       name='decoder_inputs')

# Encoder: returns the per-step outputs and the final state.
encoder_gru = GRU(hidden_size, return_sequences=True, return_state=True,
                  name='encoder_gru')
encoder_out, encoder_state = encoder_gru(encoder_inputs)

# Decoder: starts from the encoder's final state.
decoder_gru = GRU(hidden_size, return_sequences=True, return_state=True,
                  name='decoder_gru')
decoder_out, decoder_state = decoder_gru(decoder_inputs,
                                         initial_state=encoder_state)

# AttentionLayer comes from the project-local `attention` module.
# NOTE(review): presumably attn_out holds one context vector per decoder
# step and attn_states the attention weights - confirm against attention.py.
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_out, decoder_out])

# Join decoder outputs and attention output along the feature axis.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')(
    [decoder_out, attn_out])

# Per-step softmax over the target vocabulary.
dense = Dense(fr_vsize, activation='softmax', name='softmax_layer')
dense_time = TimeDistributed(dense, name='time_distributed_layer')
decoder_pred = dense_time(decoder_concat_input)

full_model = Model(inputs=[encoder_inputs, decoder_inputs],
                   outputs=decoder_pred)
full_model.compile(optimizer='adam', loss='categorical_crossentropy')

In [179]:
# Print the layer-by-layer summary of the compiled model.
# NOTE(review): the recorded output below lists GRU outputs of width 20,
# while the build cell sets hidden_size = 150 - the output looks stale and
# should be regenerated with Restart & Run All.
full_model.summary()

Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     [(1, 5, 50)]         0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     [(1, 5, 50)]         0                                            
__________________________________________________________________________________________________
encoder_gru (GRU)               [(1, 5, 20), (1, 20) 4320        encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_gru (GRU)               [(1, 5, 20), (1, 20) 4320        decoder_inputs[0][0]             
                                                                 encoder_gru[0][1]         

In [181]:
# Online training: each iteration draws one fresh random pair, fits on it
# once, then records the loss on that same pair after the update.
# NOTE(review): the decoder is fed the very sequence it is asked to predict.
# An earlier variant of this cell used the shifted form
#     fit([X, y[:, :-1, :]], y[:, 1:, :])
# which would avoid exposing each target step to the model - confirm which
# behaviour is intended.
losses = []
for epoch in range(100):
    X, y = get_pair(en_timesteps, fr_timesteps, en_vsize)
    full_model.fit(x=[X, y], y=y)
    losses.append(
        full_model.evaluate(x=[X, y], y=y, batch_size=batch_size, verbose=2)
    )

1/1 - 0s - loss: 3.8247
1/1 - 0s - loss: 3.7739
1/1 - 0s - loss: 3.8382
1/1 - 0s - loss: 3.8196
1/1 - 0s - loss: 3.7958
1/1 - 0s - loss: 3.8420
1/1 - 0s - loss: 3.8095
1/1 - 0s - loss: 3.8428
1/1 - 0s - loss: 3.8432
1/1 - 0s - loss: 3.7790
1/1 - 0s - loss: 3.8351
1/1 - 0s - loss: 3.8435
1/1 - 0s - loss: 3.8539
1/1 - 0s - loss: 3.8156
1/1 - 0s - loss: 3.8289
1/1 - 0s - loss: 3.8397
1/1 - 0s - loss: 3.9150
1/1 - 0s - loss: 3.7489
1/1 - 0s - loss: 3.8430
1/1 - 0s - loss: 3.8034
1/1 - 0s - loss: 3.8793
1/1 - 0s - loss: 3.8648
1/1 - 0s - loss: 3.8271
1/1 - 0s - loss: 3.7741
1/1 - 0s - loss: 3.8058
1/1 - 0s - loss: 3.8363
1/1 - 0s - loss: 3.7898
1/1 - 0s - loss: 3.8136
1/1 - 0s - loss: 3.8395
1/1 - 0s - loss: 3.8326
1/1 - 0s - loss: 3.8415
1/1 - 0s - loss: 3.8188
1/1 - 0s - loss: 3.8561
1/1 - 0s - loss: 3.7906
1/1 - 0s - loss: 3.8790
1/1 - 0s - loss: 3.8017
1/1 - 0s - loss: 3.7722
1/1 - 0s - loss: 3.8135
1/1 - 0s - loss: 3.8108
1/1 - 0s - loss: 3.8549
1/1 - 0s - loss: 3.8365
1/1 - 0s - loss:

1/1 - 0s - loss: 3.7561
1/1 - 0s - loss: 3.7723
1/1 - 0s - loss: 3.7311
1/1 - 0s - loss: 3.8103
1/1 - 0s - loss: 3.7448
1/1 - 0s - loss: 3.8286
1/1 - 0s - loss: 3.7435
1/1 - 0s - loss: 3.7209
1/1 - 0s - loss: 3.7386
1/1 - 0s - loss: 3.7687


In [182]:
# Average of the per-iteration losses collected by the training loop.
# Uses `mean` imported from numpy in the imports cell; the notebook never
# binds the name `np`, so `np.mean` fails on a fresh kernel.
print(mean(losses))

3.795807890892029


In [184]:
# Spot-check the trained model on freshly drawn pairs and report how many
# sequences are reproduced exactly.
# NOTE(review): predict() receives the target y as decoder input, so the
# model sees the sequence it is asked to output - confirm this is intended.
total, correct = 10, 0
for _ in range(total):
    X, y = get_pair(en_timesteps, fr_timesteps, en_vsize)
    yhat = full_model.predict([X, y], verbose=0)
    expected, predicted = one_hot_decode(y[0]), one_hot_decode(yhat[0])
    print('Expected', expected,
          'Predicted', predicted)
    # A sample counts as correct only when every time step matches.
    if array_equal(expected, predicted):
        correct += 1
print('Accuracy: %.1f%%' % (100.0 * correct / total))

Expected [26, 20, 48, 31, 19] Predicted [39, 39, 41, 41, 19]
Expected [38, 38, 38, 49, 1] Predicted [42, 42, 42, 38, 1]
Expected [32, 1, 28, 30, 8] Predicted [32, 36, 42, 42, 8]
Expected [49, 19, 7, 0, 31] Predicted [18, 49, 7, 39, 39]
Expected [6, 29, 19, 7, 27] Predicted [6, 29, 32, 7, 7]
Expected [29, 43, 16, 14, 24] Predicted [29, 47, 39, 14, 39]
Expected [1, 19, 33, 36, 35] Predicted [1, 19, 33, 36, 36]
Expected [8, 3, 30, 20, 37] Predicted [8, 7, 6, 39, 35]
Expected [49, 33, 10, 31, 32] Predicted [40, 43, 49, 35, 32]
Expected [37, 46, 4, 22, 6] Predicted [18, 17, 4, 29, 6]
