In [1]:
import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras import backend as K

#from tensorflow.keras.engine import Layer, InputSpec, InputLayer

from tensorflow.keras.models import Model, Sequential

from tensorflow.keras.layers import Dropout, Embedding, concatenate
from tensorflow.keras.layers import Conv1D, MaxPool1D, Conv2D, MaxPool2D, ZeroPadding1D, GlobalMaxPool1D
from tensorflow.keras.layers import Dense, Input, Flatten, BatchNormalization, Activation
from tensorflow.keras.layers import Concatenate, Dot, Concatenate, Multiply, RepeatVector
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

from gensim.models import KeyedVectors
from gensim.models import Word2Vec

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import sys
root_project = '../../SaRaH/'
sys.path.append(root_project)
sys.path.append('.')
from src.data.utils import load_data, set_unkmark
from src.features.word_embedding import get_index_key_association, get_int_seq, build_keras_embedding_matrix, get_data_to_emb

%load_ext autoreload
%autoreload 2

 The versions of TensorFlow you are currently using is 2.4.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


<h2> Data

In [2]:
# Locations of the preprocessed dev CSV and of the saved word2vec vectors,
# both resolved relative to root_project.
dataset_path   = root_project + "dataset/haspeede2/preprocessed/dev/dev.csv"
w2v_path       = root_project + "results/model/word2vec/word2vec.wordvectors"

# Read the CSV and pass it to the project loader. The result is indexed by
# "tokens" below; the split cell also reads "extra_features" and "target".
# NOTE(review): the meaning of load_data's second argument (True) is defined in
# src.data.utils and is not visible here — confirm.
df = pd.read_csv(dataset_path, sep=',')
dataset = load_data(df, True)
w2v = KeyedVectors.load(w2v_path)
# The return value is discarded, so this presumably rewrites dataset["tokens"]
# in place using the w2v vocabulary — verify against src.data.utils.set_unkmark.
set_unkmark(dataset["tokens"], w2v)
index_to_key, key_to_index = get_index_key_association(w2v)
dataset["int_tokens"] = get_int_seq(dataset["tokens"], index_to_key)  # for embedding layer
# X is what the split cell feeds to the model. The recorded X_train.shape output
# is (5845, 40, 128), so 40 is presumably the fixed sequence length and 128 the
# embedding size — TODO confirm against get_data_to_emb (incl. the trailing True).
X = get_data_to_emb(dataset["tokens"], w2v, 40, True)

In [3]:
# First split: hold out 10% of the samples as the validation set.
# The embedded text, the extra features and the targets are split together so
# that their rows stay aligned.
X_train, X_val, X_train2, X_val2, y_train, y_val = train_test_split(
    X,
    dataset["extra_features"],
    dataset["target"],
    test_size=0.10,
    random_state=128,
)

# Second split: hold out 5% of the remaining training samples as the test set.
X_train, X_test, X_train2, X_test2, y_train, y_test = train_test_split(
    X_train,
    X_train2,
    y_train,
    test_size=0.05,
    random_state=128,
)

# Convert every partition to a NumPy array.
y_train, y_val, y_test = (np.asarray(part) for part in (y_train, y_val, y_test))
X_train, X_val, X_test = (np.asarray(part) for part in (X_train, X_val, X_test))
X_train2, X_val2, X_test2 = (np.asarray(part) for part in (X_train2, X_val2, X_test2))

# Displayed as the cell output.
X_train.shape

(5845, 40, 128)

<h2> Model

In [5]:
def build_model(print_model=False):
  """Build the two-input CNN classifier (embedded text + extra features).

  Architecture, as wired below:
    * ``text`` input of shape (40, 128) and ``extra`` input of shape (5,).
    * Three parallel ``Conv1D`` branches over the text input (kernel sizes
      2, 3 and 4; 256 ReLU filters each), each followed by a ``MaxPool1D``
      whose pool size is the branch length integer-divided by
      ``pooling_units`` and whose stride is 3.
    * The pooled branches are concatenated along axis 1, then passed through
      ``Conv1D(110, 5)``, ``MaxPool1D(10)`` and ``Flatten``.
    * The flattened features are concatenated with the ``extra`` input and
      fed to two L2-regularised dense layers (128 and 64 units) and a
      sigmoid output layer.

  Args:
    print_model: when True, print ``model.summary()`` and write a diagram of
      the model to "my_first_model.png" via ``tf.keras.utils.plot_model``.

  Returns:
    The ``tf.keras`` ``Model`` taking ``[text, extra]`` as inputs. The model
    is not compiled here.
  """
  # Hyperparameters
  n_filters = 256          # filters per convolutional branch
  pooling_units = 10       # divisor used to derive each branch's pool size
  output_dims = 1          # width of the sigmoid output layer
  filter_sizes = [2, 3, 4]
  l2_strength = 0.007

  text_seq_input = Input(shape=(40, 128), name="text")
  extra_feature = Input(shape=(5,), name="extra")

  # One convolution + pooling branch per kernel size. Layers are created in
  # the same order as before so Keras' auto-generated layer names are stable.
  convs = []
  for filter_size in filter_sizes:
    l_conv = Conv1D(filters=n_filters, kernel_size=filter_size, activation='relu')(text_seq_input)
    # "Dynamic" pooling: the pool size follows the length of this branch.
    pool_size = l_conv.get_shape()[-2] // pooling_units
    l_pool = MaxPool1D(pool_size=pool_size, strides=3, padding='valid')(l_conv)
    convs.append(l_pool)

  l_merge = Concatenate(axis=1)(convs)
  l_cov1 = Conv1D(110, 5, activation='relu')(l_merge)
  # Max-pool over windows of 10 steps rather than using GlobalMaxPool1D.
  l_pool1 = MaxPool1D(10)(l_cov1)
  l_flat = Flatten()(l_pool1)
  l_flat = Concatenate(axis=1)([l_flat, extra_feature])
  l_hidden = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_strength))(l_flat)
  l_hidden = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_strength))(l_hidden)
  l_out = Dense(output_dims, activation='sigmoid')(l_hidden)

  model_cnn = Model(inputs=[text_seq_input, extra_feature], outputs=l_out)
  if print_model:
    model_cnn.summary()
    tf.keras.utils.plot_model(model_cnn, "my_first_model.png", show_shapes=True)
  return model_cnn


In [6]:
# Build the CNN and print its summary / write its diagram.
model_cnn = build_model(print_model=True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None, 40, 128)]    0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 39, 256)      65792       text[0][0]                       
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 38, 256)      98560       text[0][0]                       
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 37, 256)      131328      text[0][0]                       
______________________________________________________________________________________________

<h2> Attention 1

In [6]:
class peel_the_layer(tf.keras.layers.Layer):
    """Attention pooling over the time axis of a 3-D input.

    For an input ``x`` of shape ``(batch, time_steps, features)`` the layer
    computes one score per time step, ``e = tanh(x . w + b)``, turns the
    scores into attention weights with a softmax over the time axis, and
    returns the attention-weighted sum of ``x`` over time.

    Args:
        units: number of neurons of the scoring layer. Only ``1`` is
            supported, because each time step needs exactly one score.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).

    Raises:
        ValueError: if ``units`` is not 1.

    Calling the layer returns a tuple ``(a, output)``:
        a: attention weights, shape ``(batch, time_steps)``.
        output: attention-adjusted state, shape ``(batch, features)``.
    """

    def __init__(self, units=1, **kwargs):
        # The base-class initialiser must run before attributes/weights are
        # attached, otherwise Keras cannot track them.
        super().__init__(**kwargs)
        if units != 1:
            raise ValueError(
                "peel_the_layer needs exactly one score per time step, "
                "so units must be 1 (got %r)" % (units,)
            )
        self.units = units

    def build(self, input_shape):
        """Create the weights once the input shape is known."""
        # input_shape = (batch_size, time_steps, features)
        self.inp_dimensions = input_shape[-1]
        self.seq_length = input_shape[-2]

        # Scoring weights: one column per unit, one row per input feature.
        self.w = self.add_weight(
            name="attention_w",
            shape=(self.inp_dimensions, self.units),
            initializer="normal",
        )
        # Scoring bias: one value per time step.
        self.b = self.add_weight(
            name="attention_b",
            shape=(self.seq_length, self.units),
            initializer="zeros",
        )
        super().build(input_shape)

    def call(self, x):
        """Return ``(attention_weights, attention_adjusted_output)`` for ``x``."""
        # Unnormalised scores, shape (batch, time_steps, 1).
        e = K.tanh(K.dot(x, self.w) + self.b)
        # Drop the trailing unit axis -> (batch, time_steps).
        e = tf.squeeze(e, axis=-1)
        # Softmax over the time steps: weights lie in [0, 1] and sum to 1.
        a = tf.nn.softmax(e, axis=-1)
        # Scale every time step (all of its features) by its weight, then sum
        # over time. Broadcasting replaces the hard-coded RepeatVector(256),
        # so any feature size works.
        output = tf.reduce_sum(x * tf.expand_dims(a, axis=-1), axis=1)
        # 'a' is returned untouched so it can be inspected or visualised.
        return a, output

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

In [None]:
# To invoke this layer between others.
# NOTE: `lstm_out` is not defined in this notebook; it must be created (the
# output of step 3, presumably a recurrent layer) before this cell is run.

# lstm_out is the output of step 3 and the input to step 3.5.
# The layer's constructor requires `units`; one unit gives one score per time step.
a, attn_adjusted_op = peel_the_layer(units=1)(lstm_out)
# attn_adjusted_op is the output of step 3.5 and the input to step 4.

<h2> Attention 2

Same as Attention 1, but built from standard Keras layers instead of a custom layer.

In [None]:
# Import from tensorflow.keras, like the rest of the notebook. Importing the
# backend from standalone `keras` would rebind the global `K` that the custom
# attention layer relies on and mix two Keras packages in one graph.
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Flatten, Activation, RepeatVector, Permute, Multiply, Lambda, Dense

# NOTE: `lstm_out` is not defined in this notebook; it must be created before
# this cell is run. The shapes quoted below assume lstm_out = (?, 19, 256).

# Define a regular layer instead of writing a custom layer.
# This layer should have just one neuron - like before.
# The weight and bias shapes are calculated automatically by the framework,
# based on the input. This layer is defined at step 3.5 directly.
e = Dense(1, activation='tanh')(lstm_out)  # (?, 19, 1)

# Now do all the softmax business on the output above.
e = Flatten()(e)                 # (?, 19)
a = Activation('softmax')(e)     # attention weights, (?, 19)
# Repeat the weights once per feature of lstm_out (256 in the example) instead
# of hard-coding the feature size, then swap axes to line up with lstm_out.
temp = RepeatVector(lstm_out.shape[-1])(a)   # (?, 256, 19)
temp = Permute([2, 1])(temp)                 # (?, 19, 256)

# Multiply the weights with the LSTM layer output.
output = Multiply()([lstm_out, temp])

# Get the attention-adjusted output state by summing over the time axis.
output = Lambda(lambda values: K.sum(values, axis=1))(output)

# Pass output to step 4 and 'a' to any nice display

<h2> Attention 3

<p>This variant adds a 'context' vector, which is supposed to best summarize the sentiment of the sentence or the tweet in one word.
<p>The additional feedforward layer learns the value of this context during training, along with its weights and bias.
<p>(This takes one line in the build method to define the context ‘u’, and another in the call method to compute the dot product of the layer output with self.u.)
<p>Note that the dot product of two vectors gives a measure of their similarity. Applying a softmax to it returns a set of 19 probabilities adding up to 1. Each probability indicates how close the word is to the context vector. The rest of the processing is the same, and we finally end up with the ‘attention-adjusted output’.

In [7]:
#https://www.kaggle.com/sermakarevich/hierarchical-attention-network

<h2> Attention 4

<p>Yang et al. proposed the hierarchical model here (https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf), which uses two levels of attention — one at the word level and one at the sentence level.

<p>The attention mechanism here can also function as a pure ‘reduction’ operation, which could be used in place of any pooling step. This is because the ‘context’ that is derived, is 1 word and it best summarises the sentiment of the 19-word tweet — a classic ‘reduction’.