Encoder with Attention

- Encoder: The encoder is responsible for stepping through the input time steps and encoding the entire sequence into a fixed length vector called a context vector.
- Decoder: The decoder is responsible for stepping through the output time steps while reading from the context vector.

Attention is an extension to the architecture that addresses the poor performance. It works by first providing a richer context from the encoder to the decoder and a learning mechanism where the decoder can learn where to pay attention in the richer encoding when predicting each time step in the output sequence.

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# dfcl for data frame with cleaned text
dfcl = pd.read_excel("/content/drive/My Drive/Colab Notebooks/NLP/Capstone Project/AUTOMATIC TICKET ASSIGNMENT/Input Data Synthetic CleanedV2.xlsx")

In [0]:
dfcl.head()

Unnamed: 0.1,Unnamed: 0,Short description,Description,Caller,Assignment group,description_cleaned,short_description_cleaned,tmp_target_count,target1,spacy_col,language,score,lang_textblob
0,0,login issue,-verified user details.(employee# & manager na...,spxjnwir pjlcoqds,GRP_0,verify user detail employee manager name check...,login issue,3975,GRP_0,verify user detail employee manager name check...,en,0.999996,en
1,1,outlook,\n\nreceived from: hmjdrvpb.komuaywn@gmail.com...,hmjdrvpb komuaywn,GRP_0,team meeting skype meeting not appear outlook ...,outlook,3975,GRP_0,team meeting skype meeting not appear outlook ...,en,0.999994,en
2,2,cant log in to vpn,\n\nreceived from: eylqgodm.ybqkwiam@gmail.com...,eylqgodm ybqkwiam,GRP_0,can not log vpn,can not log vpn,3975,GRP_0,can not log vpn,en,0.571427,en
3,3,unable to access hr_tool page,unable to access hr_tool page,xbkucsvz gcpydteq,GRP_0,unable access hr tool page,unable access hr tool page,3975,GRP_0,unable access hr tool page,fr,0.428573,en
4,4,skype error,skype error,owlgqjme qhcozdfx,GRP_0,skype error,skype error,3975,GRP_0,skype error,no,0.999994,no


In [0]:
dfcl['lang_textblob'].value_counts()[0:15]

en       5294
sl        537
fr        487
af        418
de        412
it        174
sv        153
da        144
no        132
nl        131
ca        129
ro         75
error      73
es         70
pt         64
Name: lang_textblob, dtype: int64

In [4]:
## data only contains english language
dfen = dfcl[dfcl['lang_textblob']=='en']
dfen.reset_index(inplace=True)

dfen.shape

(5294, 14)

In [0]:
### Paramters
max_words = 55 ### based on 90 percentile
validation_split = 0.2

In [6]:
### Import few required libraries
import nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from nltk.tokenize import sent_tokenize, word_tokenize
import random

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
tk = Tokenizer(num_words=max_words)

In [0]:
tk.fit_on_texts(dfen['description_cleaned'])
# tk.fit_on_texts(dfen['short_description_cleaned'])

In [10]:
word_counts = tk.word_counts
print('number of unique words : ' + str(len(word_counts)))

# checking some random entries in the word_counts dictionary
print(random.choice(list(word_counts.items())))

number of unique words : 9005
('horrible', 1)


In [0]:
max_len = len(tk.word_counts)

In [0]:
words_list = []   ## list to be used to store the words after word_tokenize

In [0]:
# converting text to sequence of numbers
seq = tk.texts_to_sequences(dfen['description_cleaned'])

In [14]:
for i, sq in enumerate(seq):
  print(sq)
  if i == 10:
    break

[5, 45, 5, 26, 7, 5, 39, 45, 39, 10]
[3]
[3]
[51, 16, 8]
[28, 13]
[11, 1, 5]
[8, 3, 51]
[8, 15, 3]
[51, 39, 8]
[51, 39, 12, 42, 33, 3, 29, 25, 24]
[3]


In [0]:
dfen['description_cleaned'][1000]

'dac gso a basis oncall detail modify modify place location collaboration platform update record accordingly'

In [0]:
sent_tokenize(dfen['description_cleaned'][7])

['hr tool site not load page correctly']

In [15]:
articles = []
for i in range(len(dfen)):
  articles.append(sent_tokenize(dfen['description_cleaned'][i]))

articles[0]

['verify user detail employee manager name check user name reset password advise user login check caller confirm able login issue resolve']

In [0]:
### initializing the placeholder variable
MAX_SENTS = 1
data = np.zeros((len(dfen['short_description_cleaned']), MAX_SENTS, max_words), dtype='int32')

### word index encoding
for i, sentence in enumerate(articles):
    for j, sent in enumerate(sentence):
        if j < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                try:
                    if k < max_words and tk.word_index[word] < max_len:
                      data[i, j, k] = tk.word_index[word]
                      k = k + 1
                except:
                      #  print(word)
                        pass

In [0]:
tk.word_index['idbdaily']

4689

In [0]:
wordTokens[9]

'network'

In [0]:
data.shape

(5294, 1, 55)

In [0]:
wordTokens

['unable',
 'access',
 'machine',
 'utility',
 'finish',
 'drawer',
 'adjustment',
 'setting',
 'no',
 'network']

In [0]:
tk.word_index['price']

327

In [0]:
data[34,:,:]

array([[ 201,  214,   26,    7, 1922,  255,  199,  147,  342,  709,   25,
        1154,   59, 1280,  172,  252,   53,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)

In [0]:
# converting labels into one-hot vectors
labels = pd.get_dummies(dfen['target1']).values

In [0]:
print(data.shape)
print(labels.shape)

(5294, 1, 55)
(5294, 59)


In [0]:
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size = validation_split, random_state=9)

In [0]:
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(4235, 1, 55) (4235, 59)
(1059, 1, 55) (1059, 59)


In [0]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional, TimeDistributed
from keras.layers import Activation, Concatenate, SpatialDropout1D, Input, Lambda, Flatten
from keras.callbacks import EarlyStopping

In [0]:
import keras
from keras import Model
from keras.layers import Layer
import keras.backend as K
import tensorflow_hub as hub
import tensorflow as tf
sess = tf.Session()
K.set_session(sess)

Bidirectional LSTM

In [0]:
### Reshaping data for the input to LSTM model

x_train_re = np.reshape(x_train, (len(x_train), 55))
x_test_re = np.reshape(x_test, (len(x_test), 55))

In [22]:
### trying with regularizing embedding

lstm_dim = 128
output_length = 150

model = Sequential()
model.add(Embedding(output_dim = output_length, input_dim = max_len, input_length = 55, 
                    embeddings_regularizer=keras.regularizers.l2(.001)))
model.add(Dropout(0.2))

model.add(Bidirectional(LSTM(lstm_dim), merge_mode = 'sum'))
model.add(Dropout(0.2))

model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.2))

model.add(Dense(units = 59, activation = 'softmax'))

model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 55, 150)           1350750   
_________________________________________________________________
dropout_1 (Dropout)          (None, 55, 150)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               285696    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         

In [0]:
batch_size = 32
model.fit(x_train_re, y_train, epochs = 10, batch_size=batch_size, verbose = 2, validation_split=0.2)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 3388 samples, validate on 847 samples
Epoch 1/10





 - 25s - loss: 2.7760 - acc: 0.4684 - val_loss: 2.1475 - val_acc: 0.5325
Epoch 2/10
 - 22s - loss: 2.1040 - acc: 0.5345 - val_loss: 2.0252 - val_acc: 0.5620
Epoch 3/10
 - 22s - loss: 1.9456 - acc: 0.5587 - val_loss: 1.9750 - val_acc: 0.5714
Epoch 4/10
 - 22s - loss: 1.7936 - acc: 0.5885 - val_loss: 1.9342 - val_acc: 0.5998
Epoch 5/10
 - 22s - loss: 1.6827 - acc: 0.6110 - val_loss: 1.9726 - val_acc: 0.6009
Epoch 6/10
 - 22s - loss: 1.5380 - acc: 0.6417 - val_loss: 2.1422 - val_acc: 0.5785
Epoch 7/10
 - 22s - loss: 1.4368 - acc: 0.6588 - val_loss: 2.1143 - val_acc: 0.5714
Epoch 8/10
 - 22s - loss: 1.3252 - acc: 0.6948 - val_loss: 2.0885 - val_acc: 0.5762
Epoch 9/10
 - 22s - loss: 1.2182 - acc: 0.7329 - val_loss: 2.1226 - val_acc: 0.5986
Epoch 10/10
 - 22s - loss: 1.1378 - acc: 0.7479 - val_loss: 2.2295 - val_acc: 0.5856


<keras.callbacks.History at 0x7f3c29917ac8>

In [0]:
score,acc = model.evaluate(x_test_re, y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 2.26
acc: 0.58


ELMO Embedding

In [0]:
### Try ELMO embedding
elmo_model = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)

def ElmoEmbedding(x):
    return elmo_model(inputs={
                            "tokens": tf.squeeze(tf.cast(x, tf.string)),
                            "sequence_len": tf.constant(batch_size*[max_len])
                      },
                      signature="tokens",
                      as_dict=True)["elmo"]

In [25]:
batch_size = 32
input_text = Input(shape=(max_len, 55), dtype=tf.string)
embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)
x3 = Bidirectional(LSTM(lstm_dim, return_sequences=True, dropout=0.2))(embedding)
out = TimeDistributed(Dense(59, activation='softmax'))(x3)

model_el = Model(input_text, out)

model_el.summary()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 9005, 55)          0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 9005, 1024)        0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 9005, 256)         1180672   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 9005, 59)          15163     
Total params: 1,195,835
Trainable params: 1,195,835
Non-trainable params: 0
_________________________________________________________________


In [0]:
### Try with Glove Embedding
"""
# load the whole embedding into memory
embeddings_index = dict()
f = open('./glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 100))


for word, i in tk.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

Attention Model

In [0]:
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        ### Inside build (), weights and biases are defined
        self.W=self.add_weight(name="att_weight",shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias",shape=(input_shape[1],1),initializer="zeros")        
        super(attention, self).build(input_shape)

    def call(self,x):
        ### Inside call (), main logic of Attention will be written
        et=K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        at=K.softmax(et)
        at=K.expand_dims(at,axis=-1)
        output=x*at
        return K.sum(output,axis=1)

    def compute_output_shape(self,input_shape):
        return (input_shape[0],input_shape[-1])

    def get_config(self):
        ### get_config() method collects the input shape and other information about the model
        return super(attention,self).get_config()


In [27]:
### With attention layer
inputs2=Input(shape=(55,))
x1=Embedding(input_dim=len(tk.word_counts)+1,output_dim=output_length, 
             input_length=55,embeddings_regularizer=keras.regularizers.l2(.001))(inputs2)
att_in=LSTM(100,return_sequences=True,dropout=0.3)(x1)        ### see how to use recurrent_dropout=0.2
att_out=attention()(att_in)
x1=LSTM(100,dropout=0.3)(x1)
outputs1=Dense(59,activation='softmax')(x1)
model2=Model(inputs2,outputs1)

model2.summary()







Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 55)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 55, 150)           1350900   
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               100400    
_________________________________________________________________
dense_4 (Dense)              (None, 59)                5959      
Total params: 1,457,259
Trainable params: 1,457,259
Non-trainable params: 0
_________________________________________________________________


In [0]:
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# model.fit(x=x_train_re,y=y_train,batch_size=100,epochs=10,verbose=1,shuffle=True,validation_split=0.2)

Implementing BERT using FastAI

In [0]:
# For this we need to install libraries like fastai and transformers
# already colab has fastai; so installing transformers now

# pip install transformers

Implementing BERT using keras_bert

In [33]:
%%bash
pip install keras_bert
# from keras_bert import Tokenizer

Collecting keras_bert
  Downloading https://files.pythonhosted.org/packages/2c/0f/cdc886c1018943ea62d3209bc964413d5aa9d0eb7e493abd8545be679294/keras-bert-0.81.0.tar.gz
Collecting keras-transformer>=0.30.0
  Downloading https://files.pythonhosted.org/packages/54/0c/fede535ac576c03863c44bf2e0bf051fe21f5e10103631b6b6236ae446f3/keras-transformer-0.32.0.tar.gz
Collecting keras-pos-embd>=0.10.0
  Downloading https://files.pythonhosted.org/packages/09/70/b63ed8fc660da2bb6ae29b9895401c628da5740c048c190b5d7107cadd02/keras-pos-embd-0.11.0.tar.gz
Collecting keras-multi-head>=0.22.0
  Downloading https://files.pythonhosted.org/packages/40/3e/d0a64bb2ac5217928effe4507c26bbd19b86145d16a1948bc2d4f4c6338a/keras-multi-head-0.22.0.tar.gz
Collecting keras-layer-normalization>=0.12.0
  Downloading https://files.pythonhosted.org/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz
Collecting keras-position-wise-feed-forward>=0.5.0
  Downloading

In [35]:
pip install bert

Collecting bert
  Downloading https://files.pythonhosted.org/packages/e8/e6/55ed98ef52b168a38192da1aff7265c640f214009790220664ee3b4cb52a/bert-2.2.0.tar.gz
Collecting erlastic
  Downloading https://files.pythonhosted.org/packages/f3/30/f40d99fe35c38c2e0415b1e746c89569f2483e64ef65d054b9f0f382f234/erlastic-2.0.0.tar.gz
Building wheels for collected packages: bert, erlastic
  Building wheel for bert (setup.py) ... [?25l[?25hdone
  Created wheel for bert: filename=bert-2.2.0-cp36-none-any.whl size=3756 sha256=693c76c0d8188c861e26b6626f4bca12f4e18ac7ba1991f655e634ed9988c1e9
  Stored in directory: /root/.cache/pip/wheels/fe/71/b7/941459453bd38e5d97a8c886361dee19325e9933c9cf88ad46
  Building wheel for erlastic (setup.py) ... [?25l[?25hdone
  Created wheel for erlastic: filename=erlastic-2.0.0-cp36-none-any.whl size=6786 sha256=0f6b4097b5592c6fb08e09e1379acf086337103590d14bccf5e63b532e9253cf
  Stored in directory: /root/.cache/pip/wheels/02/62/46/93c713a5f061aeeb4f16eb6bf5ee798816e6ddda70fa

In [0]:
import tensorflow_hub as hub

In [0]:
# param path for bert model
# ref: https://github.com/strongio/keras-bert/blob/master/keras-bert.ipynb

bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [0]:
class BertLayer(keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [0]:
# Build model
def build_model(max_seq_length): 
    in_id = keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]
    
    bert_output = BertLayer(n_fine_tune_layers=3, pooling="first")(bert_inputs)
    dense = keras.layers.Dense(256, activation='relu')(bert_output)
    pred = keras.layers.Dense(59, activation='softmax')(dense)
    
    model = keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    
    return model

def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

In [0]:
# max_seq_length = max_len
max_seq_length = 55

In [43]:
model = build_model(max_seq_length)

# Instantiate variables
initialize_vars(sess)

model.fit(
    [x_train_re, x_train_re, x_train_re], 
    y_train,
    validation_split=0.2,
    epochs=2,
    batch_size=32
)

# validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
# model.fit(x_train_re, y_train, epochs = 10, batch_size=batch_size, verbose = 2, validation_split=0.2)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          (None, 55)           0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        (None, 55)           0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        (None, 55)           0                                            
__________________________________________________________________________________________________
bert_layer_1 (BertLayer)        ((None, 55), 768)    110104890   input_ids[0][0]                  
                                                                 input_masks[0][0]          






















Epoch 2/2


<keras.callbacks.History at 0x7f6402c40588>

In [0]:
# model.save('BertModel.h5')
# pre_save_preds = model.predict([test_input_ids[0:100], 
#                                 test_input_masks[0:100], 
#                                 test_segment_ids[0:100]]
#                               ) # predictions before we clear and reload model

# # Clear and load model
# model = None
# model = build_model(max_seq_length)
# initialize_vars(sess)
# model.load_weights('BertModel.h5')

# post_save_preds = model.predict([test_input_ids[0:100], 
#                                 test_input_masks[0:100], 
#                                 test_segment_ids[0:100]]
#                               ) # predictions after we clear and reload model
# all(pre_save_preds == post_save_preds) # Are they the same?