<a href="https://colab.research.google.com/github/abdalrahmenyousifMohamed/NLP/blob/main/attention_mechanism.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Vocabulary cap handed to the IMDB loader as `num_words`; also reused later
# as the input dimension of the Embedding layer.
vocab_size = 10_000

# Load the train / test splits of the IMDB review dataset.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=vocab_size,
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [8]:
# Length of every training review, used to choose a padding length.
review_lengths = [len(review) for review in X_train]

print('max seq : {}'.format(max(review_lengths)))
print('avg seq: {}'.format(sum(review_lengths) / len(X_train)))

max seq : 2494
avg seq: 238.71364


In [9]:
# Fixed sequence length fed to the model: roughly twice the average review
# length (~239 tokens) and well below the longest review (2494 tokens).
max_len = 500

# Bring both splits to exactly `max_len` tokens per review.
X_train, X_test = (
    pad_sequences(split, maxlen=max_len) for split in (X_train, X_test)
)

In [1]:
import tensorflow as tf

In [18]:
class BahdanauAttention(tf.keras.Model):
  """Additive (Bahdanau-style) attention over a sequence of hidden states.

  Every time step of `values` is scored against `query` with
  ``V(tanh(W1(values) + W2(query)))``. The scores are normalised with a
  softmax over the time axis and used to form a weighted sum of `values`.
  Keys and values are the same tensor here.

  Args:
    units: output width of the two projection layers `W1` and `W2`.
  """

  def __init__(self,units):
    super(BahdanauAttention,self).__init__()
    # Referenced through `tf.keras.layers` so the class only depends on the
    # `tf` import and not on a separately imported `Dense` name.
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self , values , query , mask=None):
    """Compute the context vector and the attention weights.

    Args:
      values: tensor of shape (batch_size, max_length, hidden size); the
        per-time-step states that are scored and then summed.
      query: tensor of shape (batch_size, hidden size).
      mask: optional tensor of shape (batch_size, max_length) that is truthy
        for real time steps and falsy for padding. Keras supplies it
        automatically when `values` carries a mask (for example one created
        by `Embedding(mask_zero=True)`). With `None`, every time step is
        attended to.

    Returns:
      A tuple `(context_vector, attention_weights)` with shapes
      (batch_size, hidden size) and (batch_size, max_length, 1).
    """
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # The extra axis lets the query broadcast against every time step below.
    hidden_with_time_axis = tf.expand_dims(query , 1)

    # score shape == (batch_size, max_length, 1)
    # The tensor is (batch_size, max_length, units) before self.V reduces the
    # last axis to a single score per time step.
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)
    ))

    if mask is not None:
      # (batch_size, max_length) -> (batch_size, max_length, 1) to match score.
      keep = tf.expand_dims(tf.cast(mask , tf.bool) , axis=-1)
      # Give padded steps the lowest representable score so the softmax
      # assigns them (effectively) zero weight.
      lowest = tf.ones_like(score) * score.dtype.min
      score = tf.where(keep , score , lowest)

    # attention_weights shape == (batch_size, max_length, 1)
    # Softmax over axis 1 normalises across time steps.
    attention_weights = tf.nn.softmax(score , axis=1)

    # context_vector shape after sum == (batch_size, hidden size)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector , axis=1)

    return context_vector , attention_weights



In [None]:
#

# BiLSTM with Attention Mechanism

In [4]:
from tensorflow.keras.layers import Dense , Embedding , Bidirectional , LSTM , Concatenate , Dropout

from tensorflow.keras import Input , Model
from tensorflow.keras import optimizers
import os

In [10]:
from tensorflow.python.ops.array_ops import sequence_mask

# One fixed-length sequence of `max_len` integer token ids per review.
sequence_input = Input(shape=(max_len,) , dtype='int32')

# Map each token id to a 128-dimensional vector. `mask_zero=True` makes the
# layer emit a mask for token id 0 so downstream layers can skip it
# (presumably the padding value; confirm against the padding step).
# `input_length` is intentionally not passed: it is deprecated in current
# Keras and redundant here because the Input shape already fixes the length.
embedded_sequences = Embedding(vocab_size , 128 , mask_zero=True)(sequence_input)

In [13]:
from tensorflow._api.v2.nn import dropout
# First recurrent layer: a bidirectional LSTM (64 units per direction) that
# returns its output at every time step so another layer can be stacked on it.
first_bilstm = Bidirectional(
    LSTM(units=64, return_sequences=True, dropout=0.5)
)
lstm = first_bilstm(embedded_sequences)

In [14]:
# Second bidirectional LSTM, stacked on the first one. `return_state=True`
# additionally yields the final hidden (h) and cell (c) state of the forward
# and of the backward LSTM, next to the per-time-step output sequence.
stacked_bilstm = Bidirectional(
    LSTM(units=64, return_sequences=True, return_state=True, dropout=0.5)
)
lstm, forward_h, forward_c, backward_h, backward_c = stacked_bilstm(lstm)

In [15]:
# Shapes of the output sequence and of the four returned states, in the order
# (lstm, forward_h, forward_c, backward_h, backward_c).
tuple(
    tensor.shape
    for tensor in (lstm, forward_h, forward_c, backward_h, backward_c)
)

(TensorShape([None, 500, 128]),
 TensorShape([None, 64]),
 TensorShape([None, 64]),
 TensorShape([None, 64]),
 TensorShape([None, 64]))

Each hidden state and each cell state has 64 dimensions, while `lstm` has a size of (500 × 128). In other words, `lstm` holds, for every time step, the forward and backward hidden states concatenated into a single 128-dimensional vector.

When using a bidirectional LSTM, the forward LSTM and the backward LSTM each have their own hidden state and cell state. To use the hidden state and cell state of the bidirectional LSTM as a whole, concatenate the states of the two LSTMs.

In [16]:
# Join the final forward and backward states along the last axis, giving one
# combined hidden state and one combined cell state.
merge_hidden = Concatenate(axis=-1)
state_h = merge_hidden([forward_h, backward_h])

merge_cell = Concatenate(axis=-1)
state_c = merge_cell([forward_c, backward_c])

In [19]:
# Attention layer whose two projection layers are 64 units wide.
attention = BahdanauAttention(units=64)

# Values: the per-time-step BiLSTM outputs. Query: the concatenated final
# hidden state.
attention_outputs = attention(lstm, state_h)
context_vector, attention_weights = attention_outputs

In [20]:
# Classification head on top of the attention context vector:
# 20-unit ReLU layer -> dropout -> single sigmoid unit.
dense1 = Dense(units=20, activation='relu')(context_vector)
dropout = Dropout(rate=0.5)(dense1)
output = Dense(units=1, activation='sigmoid')(dropout)

# End-to-end model from padded token ids to the sigmoid output.
model = Model(inputs=sequence_input, outputs=output)

In [21]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 3 epochs with batches of 256 reviews.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate once they are used to
# make modelling decisions.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=3,
    verbose=1,
)

Epoch 1/3