[Reference](https://jscriptcoder.github.io/date-translator/Machine%20Translation%20with%20Attention%20model.html)

**Imports**

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np
import random
import pandas as pd
from tqdm import tqdm
from babel.dates import format_date
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


**Loading the data**

In [None]:
# Each line of the dataset holds a human-written date and its machine-readable
# counterpart, separated by a comma.
SEED = 42  # fixed so the shuffle below (and every cell that depends on row order) is reproducible
df = pd.read_csv(
    r"/content/drive/MyDrive/Data/DLNLP/Assignment4aDataset.txt",
    names=['human_date', 'machine_date'],
    sep=',',
)
df = df.sample(frac=1, random_state=SEED)  # shuffle the rows of the dataframe
df.shape[0]

40000

Making vocab-int mapping for machine and human dates:

In [None]:
def create_dataset(df):
    """Collect the date pairs and build the character vocabularies.

    Args:
        df: DataFrame with string columns 'human_date' and 'machine_date'.

    Returns:
        A 4-tuple ``(dataset, human, machine, inv_machine)``:
        dataset      -- list of (human_date, machine_date) tuples in row order;
        human        -- dict mapping every character seen in the human dates to an
                        index (sorted order), followed by '<unk>' and '<pad>';
        machine      -- dict mapping every character seen in the machine dates to
                        an index (sorted order);
        inv_machine  -- the inverse of ``machine`` (index -> character).
    """
    pairs = list(zip(df['human_date'], df['machine_date']))

    human_chars = set()
    machine_chars = set()
    for human_text, machine_text in pairs:
        human_chars.update(human_text)
        machine_chars.update(machine_text)

    # Two special symbols close the human vocabulary: <unk> for unknown
    # characters and <pad> for right-padding short inputs.
    human_symbols = sorted(human_chars) + ['<unk>', '<pad>']
    human = {symbol: position for position, symbol in enumerate(human_symbols)}

    inv_machine = dict(enumerate(sorted(machine_chars)))
    machine = {char: position for position, char in inv_machine.items()}

    return pairs, human, machine, inv_machine

In [None]:
dataset, human_vocab, machine_vocab, inv_machine_vocab = create_dataset(df)

In [None]:
dataset[:10]

[("'26 october 1662'", " '1662-10-26'"),
 ("'29 september 1533'", " '1533-09-29'"),
 ("'12 apr 1955'", " '1955-04-12'"),
 ("'15 january 1707'", " '1707-01-15'"),
 ("'1556 14 apr'", " '1556-04-14'"),
 ("'september 25 1731'", " '1731-09-25'"),
 ("'dec 5 1757'", " '1757-12-05'"),
 ("'24 january 1793'", " '1793-01-24'"),
 ("'tuesday august 21 1827'", " '1827-08-21'"),
 ("'30 september 2043'", " '2043-09-30'")]

In [None]:
human_vocab

{' ': 0,
 "'": 1,
 '/': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 'a': 13,
 'b': 14,
 'c': 15,
 'd': 16,
 'e': 17,
 'f': 18,
 'g': 19,
 'h': 20,
 'i': 21,
 'j': 22,
 'l': 23,
 'm': 24,
 'n': 25,
 'o': 26,
 'p': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'y': 34,
 '<unk>': 35,
 '<pad>': 36}

In [None]:
machine_vocab

{' ': 0,
 "'": 1,
 '-': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12}

In [None]:
inv_machine_vocab

{0: ' ',
 1: "'",
 2: '-',
 3: '0',
 4: '1',
 5: '2',
 6: '3',
 7: '4',
 8: '5',
 9: '6',
 10: '7',
 11: '8',
 12: '9'}

**Preprocessing**

In [None]:
def string_to_int(string, length, vocab):
    """Encode a string as a list of exactly ``length`` vocabulary indices.

    The text is lower-cased, commas and single quotes are removed, and surrounding
    whitespace is trimmed (raw dataset fields look like ``" '1662-10-26'"``; without
    trimming, the leading blank would push the last character out of a 10-slot target).
    The result is then truncated to ``length`` characters or right-padded with the
    '<pad>' index.

    Args:
        string: text to encode.
        length: number of indices to return.
        vocab: dict mapping characters to integer indices. It may contain the
            special entries '<unk>' (used for characters missing from the
            vocabulary) and '<pad>' (used for padding).

    Returns:
        A list of ``length`` integers.

    Raises:
        KeyError: if a character is unknown and ``vocab`` has no '<unk>' entry, or
            if padding is needed and ``vocab`` has no '<pad>' entry.
    """
    text = string.lower().replace(',', '').replace('\'', '').strip()
    text = text[:length]

    # Resolve <unk> to its *index*; falling back to the literal string '<unk>'
    # would put a str into the index list and break one-hot encoding later.
    unk_id = vocab.get('<unk>')

    rep = []
    for char in text:
        index = vocab.get(char, unk_id)
        if index is None:
            raise KeyError(
                "character %r is not in the vocabulary and the vocabulary has no '<unk>' entry" % char
            )
        rep.append(index)

    if len(rep) < length:
        rep += [vocab['<pad>']] * (length - len(rep))

    return rep

def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    """Turn the (human, machine) date pairs into index and one-hot arrays.

    Args:
        dataset: sequence of (human_date, machine_date) string pairs.
        human_vocab: character -> index mapping for the source strings.
        machine_vocab: character -> index mapping for the target strings.
        Tx: fixed length of every encoded source sequence.
        Ty: fixed length of every encoded target sequence.

    Returns:
        ``(X, Y, Xoh, Yoh)`` where X / Y are arrays of indices produced by
        ``string_to_int`` and Xoh / Yoh are their one-hot encodings, with
        ``len(human_vocab)`` and ``len(machine_vocab)`` classes respectively.
    """
    sources, targets = zip(*dataset)

    n_human = len(human_vocab)
    n_machine = len(machine_vocab)

    X = np.array([string_to_int(text, Tx, human_vocab) for text in sources])
    Y = [string_to_int(text, Ty, machine_vocab) for text in targets]

    # One-hot encode example by example, then stack into a single array.
    Xoh = np.array([to_categorical(row, num_classes=n_human) for row in X])
    Yoh = np.array([to_categorical(row, num_classes=n_machine) for row in Y])

    return X, np.array(Y), Xoh, Yoh

In [None]:
string_to_int('September 10, 1978', 30, human_vocab)

[29,
 17,
 27,
 30,
 17,
 24,
 14,
 17,
 28,
 0,
 4,
 3,
 0,
 4,
 12,
 10,
 11,
 36,
 36,
 36,
 36,
 36,
 36,
 36,
 36,
 36,
 36,
 36,
 36,
 36]

In [None]:
Tx = 30
Ty = 10
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

print("X.shape:", X.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)

X.shape: (40000, 30)
Y.shape: (40000, 10)
Xoh.shape: (40000, 30, 37)
Yoh.shape: (40000, 10, 13)


In [None]:
index = 0
print("Source date:", dataset[index][0])
print("Target date:", dataset[index][1])
print()
print("Source after preprocessing (indices):", X[index])
print("Target after preprocessing (indices):", Y[index])
print()
print("Source after preprocessing (one-hot):", Xoh[index])
print("Target after preprocessing (one-hot):", Yoh[index])

Source date: '26 october 1662'
Target date:  '1662-10-26'

Source after preprocessing (indices): [ 1  5  9  0 26 15 30 26 14 17 28  0  4  9  9  5  1 36 36 36 36 36 36 36
 36 36 36 36 36 36]
Target after preprocessing (indices): [0 1 4 9 9 5 2 4 3 2]

Source after preprocessing (one-hot): [[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
Target after preprocessing (one-hot): [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


**Define Model**



In [None]:
# Layers used by one_step_attention. They are created once at module level so
# that every decoder step applies the same layer objects (and therefore the same weights).
repeator = RepeatVector(Tx)            # repeats the previous decoder state Tx times
concatenator = Concatenate(axis=-1)    # joins encoder outputs and the repeated state feature-wise
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")  # one energy per input position
# Normalise the energies over the Tx input positions (axis 1). densor2 has a single
# unit, so the energies end in a dimension of size 1; a softmax over that last axis
# (the default of Activation('softmax')) would make every attention weight exactly 1.0.
activator = layers.Softmax(axis=1, name='attention_vec')
dotor = Dot(axes = 1)                  # weighted sum of the encoder outputs over axis 1

In [None]:
def one_step_attention(a, s_prev):
    """Run the attention mechanism for a single decoder step.

    Uses the module-level layers ``repeator``, ``concatenator``, ``densor1``,
    ``densor2``, ``activator`` and ``dotor``.

    Args:
        a: encoder output sequence (the Bidirectional LSTM output in ``model``).
        s_prev: previous hidden state of the decoder LSTM.

    Returns:
        ``(context, alphas)``: the context vector obtained by combining ``a`` with
        the attention weights through ``dotor``, and the attention weights themselves.
    """
    # Repeat the decoder state so it can be concatenated with every encoder step.
    s_tiled = repeator(s_prev)
    features = concatenator([a, s_tiled])

    # Two small dense layers score each input position, then the scores are normalised.
    scores = densor2(densor1(features))
    weights = activator(scores)

    return dotor([weights, a]), weights

In [None]:
n_a = 32  # units of the LSTM wrapped in Bidirectional inside model() (the encoder)
n_s = 64  # units of the post-attention LSTM below; also the width of the s0 / c0 model inputs
val_size = 0.1  # NOTE(review): not referenced by the visible code -- the fit cell hard-codes validation_split = 0.1
# Decoder layers are created once here so that model() applies the same layer
# objects at every one of the Ty output steps.
post_activation_LSTM_cell = LSTM(n_s, return_state = True, name='final_LSTM')
output_layer = Dense(len(machine_vocab), activation='softmax')

In [None]:
def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size, gib_attention=False):
    """Build the attention-based date translation network.

    A Bidirectional LSTM encodes the one-hot input sequence. For each of the Ty
    output positions, ``one_step_attention`` produces a context vector, which is
    fed to the module-level ``post_activation_LSTM_cell`` and ``output_layer``.

    Args:
        Tx: length of the input sequence.
        Ty: number of output characters (decoder steps).
        n_a: units of the encoder LSTM (per direction).
        n_s: units of the decoder LSTM; width of the ``s0`` / ``c0`` inputs.
        human_vocab_size: number of classes of the one-hot input.
        machine_vocab_size: kept for interface compatibility; the output width is
            fixed by the module-level ``output_layer``.
        gib_attention: if False (default) the returned model outputs the Ty
            character distributions. If True it outputs the Ty attention-weight
            tensors instead, one per decoder step.

    Returns:
        A Keras ``Model`` with inputs ``[X, s0, c0]`` and a list of Ty outputs.

    Note:
        The attention and decoder layers are module-level objects shared by every
        model built here, but the encoder is created anew on each call. A model
        built with ``gib_attention=True`` therefore has its own encoder weights
        unless they are copied from a trained model.
    """
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0

    predictions = []        # one softmax over the machine vocabulary per step
    attention_weights = []  # one attention tensor over the input positions per step

    a = Bidirectional(LSTM(n_a, return_sequences = True),name='bi_LSTM')(X)

    for _ in range(Ty):
        context, alphas = one_step_attention(a, s)
        # With return_state=True the LSTM returns (output, hidden state, cell state).
        s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])
        predictions.append(output_layer(s))
        attention_weights.append(alphas)

    if gib_attention:
        return Model([X, s0, c0], attention_weights)
    return Model([X, s0, c0], predictions)

TODO: write a custom metric class for overall accuracy (presumably whole-date accuracy rather than per-character accuracy).

In [None]:
mod = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab), gib_attention=False)

In [None]:
# model = Model(inputs=model.input, outputs=[model.output, model.get_layer('attention_vec').output])
# attention_map = plot_attention_map(model, human_vocab, inv_machine_vocab, "Tuesday 09 Oct 1993", num = 7, n_s = 64)

AttributeError: ignored

In [None]:
mod.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 30, 37)]     0           []                               
                                                                                                  
 s0 (InputLayer)                [(None, 64)]         0           []                               
                                                                                                  
 bi_output (Bidirectional)      (None, 30, 64)       17920       ['input_2[0][0]']                
                                                                                                  
 repeat_vector (RepeatVector)   (None, 30, 64)       0           ['s0[0][0]',                     
                                                                  'lstm[10][0]',            

**Train Model**

In [None]:
# mod1 = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab), gib_attention=True)
# mod1.summary()

In [None]:
opt = Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
# The model has Ty outputs: a single loss name (not a one-element list) is applied
# to each of them. Metrics passed as strings must be built-in names; 'overall_accuray'
# is not defined anywhere, so it is left out until a custom metric is written and
# passed here as an object.
mod.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
# mod1.compile(optimizer=opt, loss=['categorical_crossentropy'], metrics=['accuracy'])

In [None]:
# Zero initial hidden (s0) and cell (c0) states for the decoder LSTM, one row per example in df.
s0 = np.zeros((df.shape[0], n_s))
c0 = np.zeros((df.shape[0], n_s))
# The model has one output per decoder step, so swap the first two axes of Yoh
# and split the result into a list with one target array per output step.
outputs = list(Yoh.swapaxes(0,1))
# op=np.array(outputs)
# op.shape
# dummy = np.zeros((13,None,30,1))

In [None]:

# b = 

(10, 40000, 13)

In [None]:
mod.fit([Xoh, s0, c0], outputs, epochs=1, batch_size=100, validation_split = 0.1)

ValueError: ignored

In [None]:
final_alphas= mod.get_layer('attention_vec').output

In [None]:
import tensorflow as tf
print(final_alphas.numpy())
#  output_before_att = new_model.predict(x_test_sample) #extract layer output

AttributeError: ignored

In [None]:
# output_before_att = new_model.predict(x_test_sample) #extract layer output

In [None]:
EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']

# predict() requires every input to have the same number of rows. Each example is a
# batch of one, so use single-row zero states instead of the training-sized s0 / c0.
s0_single = np.zeros((1, n_s))
c0_single = np.zeros((1, n_s))

for example in EXAMPLES:
    source = string_to_int(example, Tx, human_vocab)
    source = to_categorical(source, num_classes=len(human_vocab))  # (Tx, vocab)
    source = source[np.newaxis, ...]                               # (1, Tx, vocab)

    prediction = mod.predict([source, s0_single, c0_single])
    # One array per output step -> flat vector with the most likely index of each step.
    prediction = np.argmax(prediction, axis = -1).ravel()
    output = [inv_machine_vocab[int(i)] for i in prediction]

    print("source:", example)
    print("output:", ''.join(output))

ValueError: ignored

In [None]:
mod.save('/content/drive/MyDrive/Data/DLNLP/dates_model.h5')
# !tensorflowjs_converter --input_format keras dates_model.h5 tfjsmodel

In [None]:
# a = tf.keras.utils.to_categorical([0, 1, 2, 3], num_classes=4)
# print(a)

https://jacobgil.github.io/deeplearning/class-activation-maps

In [None]:
# mod.load_weights('/content/drive/MyDrive/Data/DLNLP/dates_model.h5')

In [None]:
# NOTE(review): this rebinds the name `model`, which until this cell referred to the
# model-building function defined earlier in the notebook; calling model(...) after
# this cell has run will no longer build a network.
model = load_model('/content/drive/MyDrive/Data/DLNLP/dates_model.h5')
# model.summary()
# Wrap the loaded network so that it also returns the output of the 'attention_vec' layer.
# NOTE(review): 'attention_vec' is applied once per decoder step when the network is
# built -- confirm which of those applications `.output` refers to.
model = Model(inputs=model.input, outputs=[model.output, model.get_layer('attention_vec').output])

In [None]:
# Take the first training example as a batch of one.
# NOTE(review): the name `input` shadows the Python builtin of the same name.
input = Xoh[0:1]
# input.shape[0]
# s0 / c0 are rebound here to match the batch of `input` (they were sized for the
# whole training set in the training cell above).
s0 = np.zeros((input.shape[0], n_s))
c0 = np.zeros((input.shape[0], n_s))
# Model sharing mod's inputs whose output is the 'attention_vec' layer output.
# NOTE(review): 'attention_vec' is applied once per decoder step inside model();
# confirm which application `.output` returns.
new_model = Model(inputs=mod.input, outputs=mod.get_layer('attention_vec').output)
output_before_att = new_model.predict([input, s0, c0]) #extract layer output

In [None]:
# b = mod1.predict([input, s0, c0])
# b = mod.predict([input, s0, c0])
# 'attention_vec' is a shared layer applied once per decoder step, so the output of
# each application is collected explicitly rather than through `.output`.
# Assumes `mod` was the first model built with the shared attention layers (true on
# a fresh kernel run), so applications 0..Ty-1 belong to `mod` -- TODO confirm.
attention_layer = mod.get_layer('attention_vec')
# Named attention_model (not `model`) so the model-building function is not shadowed.
attention_model = Model(
    inputs=mod.inputs,
    outputs=mod.outputs + [attention_layer.get_output_at(t) for t in range(Ty)],
)
predictions = attention_model.predict([input, s0, c0])
# The first Ty arrays are the character distributions, the remaining Ty the attention
# weights. (The previous code assigned `outputs`, i.e. the training targets, here.)
model_outputs = predictions[:Ty]
attention_outputs = predictions[Ty:]
print(attention_outputs)
# b=np.array(b)
# print(b.shape)

ValueError: ignored

In [None]:
# def visualize_class_activation_map(model_path, input):
#         model = load_model(model_path)
#         model = Model(inputs=model.input, outputs=[model.output, model.get_layer('attention_vec').output])
#         ouputs,alphas1 = model.predict(input)
#         model_outputs = outputs
#         attention_outputs = alphas1
#         print(attention_outputs)

In [None]:
b = mod1.predict([input, s0, c0])

In [None]:
b=np.array(b)
print(b)

[[[[1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]]]


 [[[1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]]]


 [[[1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]]]


 [[[1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]
   [1.]]]


 [[[1.]
   [1.]
   [1.]


In [None]:
Xoh[0].shape

In [None]:
# For making attention map
example = '1 March 2001'
# source = string_to_int(example, Tx, human_vocab)
# source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source))).swapaxes(0,1)
# prediction = model.predict([sample, s0, c0])

# model_path = '/content/drive/MyDrive/Data/DLNLP/dates_model.h5'
# source = string_to_int(example, Tx, human_vocab)
# source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source))).swapaxes(0,1)

# visualize_class_activation_map(model_path, [source, s0, c0])


In [None]:
# prediction = mod.predict([np.reshape(Xoh[0],(1,37,30)), s0, c0])

In [None]:
# source = np.reshape(source,(1,37,30))
# source.shape

In [None]:
# ouputs,alphas1 = mod.predict([source, s0, c0])

In [None]:
# EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']
# for example in EXAMPLES:
    
#     source = string_to_int(example, Tx, human_vocab)
#     source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source))).swapaxes(0,1)
#     prediction = mod.predict([source, s0, c0])
#     prediction = np.argmax(prediction, axis = -1)
#     output = [inv_machine_vocab[int(i)] for i in prediction]
    
#     print("source:", example)
#     print("output:", ''.join(output))

In [None]:
import tensorflow as tf
from keras import backend as K
from keras import regularizers, constraints, initializers, activations
from keras.layers.recurrent import Recurrent, _time_distributed_dense
from keras.engine import InputSpec

tfPrint = lambda d, T: tf.Print(input_=T, data=[T, tf.shape(T)], message=d)

class AttentionDecoder(Recurrent):
    """Recurrent decoder layer with additive attention over its input sequence.

    At every step the layer scores all input timesteps against the previous
    hidden state, builds a context vector from those scores, updates the hidden
    state through reset (r) and update (z) gates plus a proposal state, and emits
    a softmax over ``output_dim`` labels (see ``step``).

    NOTE(review): the class subclasses ``Recurrent`` and uses
    ``_time_distributed_dense``, both imported from ``keras.layers.recurrent``
    in this cell. The cell's recorded output is an ImportError, so these names
    are presumably unavailable in the installed Keras -- confirm before relying
    on this class.
    """

    def __init__(self, units, output_dim,
                 activation='tanh',
                 return_probabilities=False,
                 name='AttentionDecoder',
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        """
        Implements an AttentionDecoder that takes in a sequence encoded by an
        encoder and outputs the decoded states
        :param units: dimension of the hidden state and the attention matrices
        :param output_dim: the number of labels in the output space
        :param activation: activation identifier, stored as ``self.activation``
        :param return_probabilities: if True, ``step`` returns the attention
            weights instead of the label distribution
        :param name: layer name, assigned after the base constructor has run
        :param kwargs: forwarded to the ``Recurrent`` constructor

        references:
            Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio.
            "Neural machine translation by jointly learning to align and translate."
            arXiv preprint arXiv:1409.0473 (2014).
        """
        self.units = units
        self.output_dim = output_dim
        self.return_probabilities = return_probabilities
        self.activation = activations.get(activation)
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        # NOTE(review): built from kernel_regularizer -- the constructor has no
        # separate recurrent_regularizer argument.
        self.recurrent_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        # NOTE(review): likewise built from kernel_constraint.
        self.recurrent_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        super(AttentionDecoder, self).__init__(**kwargs)
        self.name = name
        self.return_sequences = True  # must return sequences

    def build(self, input_shape):
        """
          Create the layer weights.

          See Appendix 2 of Bahdanau 2014, arXiv:1409.0473
          for model details that correspond to the matrices here.

          :param input_shape: unpacked as (batch_size, timesteps, input_dim)
        """

        self.batch_size, self.timesteps, self.input_dim = input_shape

        if self.stateful:
            super(AttentionDecoder, self).reset_states()

        self.states = [None, None]  # y, s

        """
            Matrices for creating the context vector
        """

        self.V_a = self.add_weight(shape=(self.units,),
                                   name='V_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.W_a = self.add_weight(shape=(self.units, self.units),
                                   name='W_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.U_a = self.add_weight(shape=(self.input_dim, self.units),
                                   name='U_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.b_a = self.add_weight(shape=(self.units,),
                                   name='b_a',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the r (reset) gate
        """
        self.C_r = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_r = self.add_weight(shape=(self.units, self.units),
                                   name='U_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_r = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_r = self.add_weight(shape=(self.units, ),
                                   name='b_r',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        """
            Matrices for the z (update) gate
        """
        self.C_z = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_z = self.add_weight(shape=(self.units, self.units),
                                   name='U_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_z = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_z = self.add_weight(shape=(self.units, ),
                                   name='b_z',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the proposal
        """
        self.C_p = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_p = self.add_weight(shape=(self.units, self.units),
                                   name='U_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_p = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_p = self.add_weight(shape=(self.units, ),
                                   name='b_p',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for making the final prediction vector
        """
        self.C_o = self.add_weight(shape=(self.input_dim, self.output_dim),
                                   name='C_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_o = self.add_weight(shape=(self.units, self.output_dim),
                                   name='U_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_o = self.add_weight(shape=(self.output_dim, self.output_dim),
                                   name='W_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_o = self.add_weight(shape=(self.output_dim, ),
                                   name='b_o',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        # For creating the initial state:
        self.W_s = self.add_weight(shape=(self.input_dim, self.units),
                                   name='W_s',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)

        self.input_spec = [
            InputSpec(shape=(self.batch_size, self.timesteps, self.input_dim))]
        self.built = True

    def call(self, x):
        """Cache the input sequence and its projection, then run the base recurrence."""
        # store the whole sequence so we can "attend" to it at each timestep
        self.x_seq = x

        # apply a dense layer over the time dimension of the sequence
        # do it here because it doesn't depend on any previous steps
        # therefore we can save computation time:
        self._uxpb = _time_distributed_dense(self.x_seq, self.U_a, b=self.b_a,
                                             input_dim=self.input_dim,
                                             timesteps=self.timesteps,
                                             output_dim=self.units)

        return super(AttentionDecoder, self).call(x)

    def get_initial_state(self, inputs):
        """Return the initial states ``[y0, s0]`` used by ``step``.

        ``s0`` is tanh of the first input timestep multiplied by ``W_s``; ``y0``
        is an all-zero tensor with ``output_dim`` columns.
        """
        # apply the matrix on the first time step to get the initial s0.
        s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))

        # from keras.layers.recurrent to initialize a vector of (batchsize,
        # output_dim)
        y0 = K.zeros_like(inputs)  # (samples, timesteps, input_dims)
        y0 = K.sum(y0, axis=(1, 2))  # (samples, )
        y0 = K.expand_dims(y0)  # (samples, 1)
        y0 = K.tile(y0, [1, self.output_dim])

        return [y0, s0]

    def step(self, x, states):
        """Compute one decoding step.

        :param x: not used by this method; the sequence cached in ``call``
            (``self.x_seq`` / ``self._uxpb``) is attended to instead
        :param states: ``[ytm, stm]`` -- previous output and previous hidden state
        :return: ``(at, [yt, st])`` if ``return_probabilities`` is set, otherwise
            ``(yt, [yt, st])``, where ``at`` are the attention weights, ``yt`` the
            softmax output and ``st`` the new hidden state
        """

        ytm, stm = states

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)

        # now multiply the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)

        # calculate the attention probabilities
        # this relates how much other timesteps contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))
        # normalise exp(et) by its sum over axis 1
        at = K.exp(et)
        at_sum = K.sum(at, axis=1)
        at_sum_repeated = K.repeat(at_sum, self.timesteps)
        at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
        # ~~~> calculate new hidden state
        # first calculate the "r" gate:

        rt = activations.sigmoid(
            K.dot(ytm, self.W_r)
            + K.dot(stm, self.U_r)
            + K.dot(context, self.C_r)
            + self.b_r)

        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z)
            + K.dot(stm, self.U_z)
            + K.dot(context, self.C_z)
            + self.b_z)

        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p)
            + K.dot((rt * stm), self.U_p)
            + K.dot(context, self.C_p)
            + self.b_p)

        # new hidden state:
        st = (1-zt)*stm + zt * s_tp

        # the output distribution is computed from the previous state (stm), not st
        yt = activations.softmax(
            K.dot(ytm, self.W_o)
            + K.dot(stm, self.U_o)
            + K.dot(context, self.C_o)
            + self.b_o)

        if self.return_probabilities:
            return at, [yt, st]
        else:
            return yt, [yt, st]

    def compute_output_shape(self, input_shape):
        """
            For Keras internal compatibility checking.

            Returns (None, timesteps, timesteps) when return_probabilities is
            set, otherwise (None, timesteps, output_dim).
        """
        if self.return_probabilities:
            return (None, self.timesteps, self.timesteps)
        else:
            return (None, self.timesteps, self.output_dim)

    def get_config(self):
        """
            For rebuilding models on load time.

            Adds output_dim, units and return_probabilities to the base config.
        """
        config = {
            'output_dim': self.output_dim,
            'units': self.units,
            'return_probabilities': self.return_probabilities
        }
        base_config = super(AttentionDecoder, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

ImportError: ignored