In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.initializers import TruncatedNormal
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds

### Encoder

In [2]:
class MultiHeadAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with three dense
    layers, split into ``num_heads`` heads of size
    ``hidden_size // num_heads``, attended per head, merged back and passed
    through a final dense projection.

    Arguments:
        hidden_size: width of the input/output and of the Q/K/V projections.
        num_heads: number of attention heads; must divide ``hidden_size``.

    Raises:
        ValueError: if ``hidden_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, hidden_size, num_heads):

        super(MultiHeadAttention, self).__init__()
        # Without this check the reshape in `separate_heads` can silently
        # produce heads that mix unrelated features.
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.projection_dim = hidden_size // num_heads
        self.Q = layers.Dense(hidden_size)
        self.K = layers.Dense(hidden_size)
        self.V = layers.Dense(hidden_size)
        self.out = layers.Dense(hidden_size)

    def attention(self, query, key, value, mask=None):
        """Compute softmax(Q K^T / sqrt(d_k) + mask * -1e9) V.

        Arguments:
            query, key, value: per-head tensors as produced by
                `separate_heads`, i.e. (batch, heads, seq, projection_dim).
            mask: optional tensor that is 1 at positions to hide and 0
                elsewhere; it must broadcast against the score matrix
                (batch, heads, seq, seq). ``None`` disables masking.

        Returns:
            (output, weights): the attended values and the attention
            probabilities over the key axis.
        """
        score = tf.matmul(query, key, transpose_b=True)  # Q K^T
        key_dimension = tf.cast(tf.shape(key)[-1], score.dtype)  # d_k
        scaled_score = score / tf.math.sqrt(key_dimension)
        if mask is not None:
            # Add a large negative value to masked positions so that they get
            # ~0 probability after the softmax. Broadcasting expands the mask
            # over heads and query positions; no explicit repeat is needed.
            scaled_score += tf.cast(mask, scaled_score.dtype) * (-1e9)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq, hidden) into (batch, heads, seq, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs, att_mask=None):
        """Apply self-attention to `inputs` and return a tensor of the same width.

        The attention probabilities of the most recent call are kept in
        ``self.att_weights`` so they can be read back after a forward pass.
        """
        batch_size = tf.shape(inputs)[0]
        query = self.separate_heads(self.Q(inputs), batch_size)
        key = self.separate_heads(self.K(inputs), batch_size)
        value = self.separate_heads(self.V(inputs), batch_size)
        attention, self.att_weights = self.attention(query, key, value, att_mask)
        # Undo `separate_heads`: back to (batch, seq, heads, projection_dim),
        # then merge the last two axes into hidden_size.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.hidden_size))
        output = self.out(concat_attention)
        return output

#### Feed-Forward Sub-Layer

In [3]:
# Unlike the original Transformer (which uses ReLU), BERT uses the GELU activation function.

@tf.function
def GELU(x):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Returns ``x * 0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """
    cubic_term = x + 0.044715 * tf.pow(x, 3)
    tanh_arg = np.sqrt(2 / np.pi) * cubic_term
    gate = 0.5 * (1.0 + tf.tanh(tanh_arg))
    return x * gate

In [4]:
class FFN(layers.Layer):
    """Feed-forward sub-layer: Dense(GELU) -> Dropout -> Dense.

    Arguments:
        intermediate_size: width of the inner (GELU-activated) dense layer.
        hidden_size: width of the output dense layer.
        drop_rate: dropout rate applied between the two dense layers.
    """

    def __init__(self, intermediate_size, hidden_size, drop_rate):
        super().__init__()
        # Each dense layer gets its own truncated-normal initializer instance.
        self.intermediate = layers.Dense(
            intermediate_size,
            activation=GELU,
            kernel_initializer=TruncatedNormal(stddev=0.02),
        )
        self.out = layers.Dense(
            hidden_size,
            kernel_initializer=TruncatedNormal(stddev=0.02),
        )
        self.drop = layers.Dropout(drop_rate)

    def call(self, inputs):
        """Expand to the intermediate width, apply dropout, project back."""
        expanded = self.intermediate(inputs)
        return self.out(self.drop(expanded))

#### Add & Norm

In [5]:
class AddNorm(layers.Layer):
    """Residual connection followed by layer normalization.

    Arguments:
        LNepsilon: epsilon passed to the LayerNormalization layer.
        drop_rate: dropout rate applied to the sub-layer output.
    """

    def __init__(self, LNepsilon, drop_rate):
        super().__init__()
        self.LN = layers.LayerNormalization(epsilon=LNepsilon)
        self.dropout = layers.Dropout(drop_rate)

    def call(self, sub_layer_in, sub_layer_out):
        """Return LayerNorm(sub_layer_in + Dropout(sub_layer_out))."""
        residual = sub_layer_in + self.dropout(sub_layer_out)
        return self.LN(residual)

#### Residual connections

In [6]:
class Encoder(layers.Layer):
    """One encoder layer: self-attention and feed-forward, each with Add & Norm.

    Arguments:
        hidden_size: width of the layer input and output.
        num_heads: number of attention heads.
        intermediate_size: inner width of the feed-forward sub-layer.
        drop_rate: dropout rate used by the feed-forward and Add & Norm parts.
        LNepsilon: epsilon of the two layer normalizations.
    """

    def __init__(self, hidden_size, num_heads, intermediate_size, drop_rate=0.1, LNepsilon=1e-12):
        super().__init__()
        self.attention = MultiHeadAttention(hidden_size, num_heads)
        self.ffn = FFN(intermediate_size, hidden_size, drop_rate)
        self.norm_1 = AddNorm(LNepsilon, drop_rate)
        self.norm_2 = AddNorm(LNepsilon, drop_rate)

    def call(self, inputs, mask):
        """Run attention then the feed-forward net, wrapping each in Add & Norm."""
        attended = self.norm_1(inputs, self.attention(inputs, mask))
        return self.norm_2(attended, self.ffn(attended))

    def compute_mask(self, x, mask):
        """Hand the incoming mask on unchanged."""
        return mask

### BERT

In [7]:
class BertEmbedding(layers.Layer):
    """Token embedding plus learned position embedding, then LayerNorm and dropout.

    Arguments:
        vocab_size: number of rows in the token embedding table.
        maxlen: number of positions in the position embedding table.
        hidden_size: embedding width.
        drop_rate: dropout rate applied to the normalized embeddings
            (defaults to the previously hard-coded 0.1).
        LNepsilon: epsilon of the layer normalization
            (defaults to the previously hard-coded 1e-12).
    """

    def __init__(self, vocab_size, maxlen, hidden_size, drop_rate=0.1, LNepsilon=1e-12):

        super(BertEmbedding, self).__init__()
        # mask_zero=True: token id 0 is treated as padding by the embedding.
        self.TokEmb = layers.Embedding(input_dim=vocab_size, output_dim=hidden_size, mask_zero=True)
        # One learned vector per position, shape (maxlen, hidden_size).
        self.PosEmb = tf.Variable(tf.random.truncated_normal(shape=(maxlen, hidden_size), stddev=0.02))
        self.LN = layers.LayerNormalization(epsilon=LNepsilon)
        self.dropout = layers.Dropout(drop_rate)

    def call(self, inputs):
        """Embed token ids and add the position embeddings.

        The position table is added by broadcasting, so the sequence axis of
        `inputs` has to be compatible with `maxlen`.
        """
        token_embedding = self.TokEmb(inputs)
        x = token_embedding + self.PosEmb
        x = self.LN(x)
        out = self.dropout(x)
        return out

    def compute_mask(self, x, mask=None):
        """Build the attention mask from the token ids.

        Returns a float tensor with two singleton axes inserted before the
        last one, holding 1.0 where the embedding layer reports padding
        and 0.0 elsewhere.
        """
        m = 1 - tf.cast(self.TokEmb.compute_mask(x), tf.float32)
        m = m[:, tf.newaxis, tf.newaxis, :]
        return m

For each input sentence, the pooler converts the hidden states of the last encoder layer (shape [batch size, sequence length, hidden size]) into a single vector representation (shape [batch size, hidden size]).
It does this by passing the hidden state of the first token, a special token placed at the beginning of each sentence, through a dense layer.

In [8]:
class Pooler(layers.Layer):
    """Summarize a sequence by its first position.

    Arguments:
        hidden_size: width of the tanh-activated dense layer.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.dense = layers.Dense(hidden_size, activation='tanh')

    def call(self, encoder_out):
        """Feed the hidden state at sequence index 0 through the dense layer."""
        first_token_state = encoder_out[:, 0, :]
        return self.dense(first_token_state)

In [9]:
def create_BERT(vocab_size, 
                maxlen, 
                hidden_size, 
                num_layers, 
                num_att_heads, 
                intermediate_size, 
                drop_rate=0.1
                ):

    """
    creates a BERT model based on the arguments provided

        Arguments:
        vocab_size: number of words in the vocabulary
        maxlen: maximum length of each sentence
        hidden_size: dimension of the hidden state of each encoder layer
        num_layers: number of encoder layers
        num_att_heads: number of attention heads in the multi-headed attention layer
        intermediate_size: dimension of the intermediate layer in the feed-forward sublayer of the encoders
        drop_rate: dropout rate passed to every encoder layer (it is not
            forwarded to the embedding layer)
        returns: a tf.keras.Model that maps token ids of shape (batch, maxlen)
            to the list [output of the last encoder layer, pooler output]
    """

    # `shape` must be a tuple; `(maxlen)` without the trailing comma is a plain int.
    input_tokens = layers.Input(shape=(maxlen,))
    embedding_out = BertEmbedding(vocab_size, maxlen, hidden_size)(input_tokens)
    # The encoders are called without an explicit mask: Keras supplies the
    # one produced by the embedding layer's compute_mask.
    for _ in range(num_layers):
        embedding_out = Encoder(hidden_size, num_att_heads, intermediate_size, drop_rate=drop_rate)(embedding_out)
    pooler_out = Pooler(hidden_size)(embedding_out)

    model = tf.keras.Model(inputs=input_tokens, outputs=[embedding_out, pooler_out])

    return model

### Load dataset

In [11]:
# Read both review files as arrays, dropping the first column of each.
train_reviews = pd.read_csv('./dataset/train_reviews.csv').values[:, 1:]
test_reviews = pd.read_csv('./dataset/test_reviews.csv').values[:, 1:]

# Of the remaining columns, 0 is used as the text and 1 as the label.
train_texts, train_labels = train_reviews[:, 0], train_reviews[:, 1]
test_texts, test_labels = test_reviews[:, 0], test_reviews[:, 1]

# Lower-case every review before building the vocabulary.
train_texts = [review.lower() for review in train_texts]
test_texts = [review.lower() for review in test_texts]

# Build a subword tokenizer from the training texts only, with the
# classification token reserved in its vocabulary.
aprx_vocab_size = 20000
cls_token = '[cls]'
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=train_texts,
    target_vocab_size=aprx_vocab_size,
    reserved_tokens=[cls_token],
)

In [12]:
def encode_sentence(s, maxlen):
    """Encode `s` into exactly `maxlen` token ids.

    The ids of the classification token are put in front of the ids of `s`;
    the result is cut to `maxlen` entries or right-padded with zeros.
    """
    sentence_ids = tokenizer.encode(s)
    token_id_list = tokenizer.encode(cls_token) + sentence_ids

    # Truncate first, then pad; the padding is empty when nothing is missing.
    token_id_list = token_id_list[:maxlen]
    token_id_list += [0] * (maxlen - len(token_id_list))
    return token_id_list

In [13]:
# Every sentence is encoded to exactly this many token ids.
MAXLEN = 32

# Labels as 64-bit integers.
y_train = train_labels.astype(np.int64)
y_test = test_labels.astype(np.int64)

# Token-id matrices with one row of MAXLEN ids per review.
x_train = np.array(
    [encode_sentence(text, MAXLEN) for text in train_texts],
    dtype=np.int64,
)
x_test = np.array(
    [encode_sentence(text, MAXLEN) for text in test_texts],
    dtype=np.int64,
)

### create and train BERT

In [14]:
# Model dimensions.
hidden_size = 768
num_heads = 12
num_layers = 12
vocab_size = tokenizer.vocab_size

bert = create_BERT(
    vocab_size=vocab_size,
    maxlen=MAXLEN,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_att_heads=num_heads,
    intermediate_size=4 * hidden_size,
    drop_rate=0.1,
)

inputs = bert.inputs

# Classification head on the second model output (the pooler output):
# dropout followed by a single sigmoid unit.
x = bert.outputs[1]
x = layers.Dropout(0.1)(x)
out = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=out)

In [15]:
# Binary classification: Adam optimizer, binary cross-entropy loss,
# accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32)]              0         
                                                                 
 bert_embedding (BertEmbeddi  (None, 32, 768)          15356928  
 ng)                                                             
                                                                 
 encoder (Encoder)           (None, 32, 768)           7087872   
                                                                 
 encoder_1 (Encoder)         (None, 32, 768)           7087872   
                                                                 
 encoder_2 (Encoder)         (None, 32, 768)           7087872   
                                                                 
 encoder_3 (Encoder)         (None, 32, 768)           7087872   
                                                           

In [16]:
# Train for two epochs; the test split is used as validation data.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=2,
    validation_data=(x_test, y_test),
)

Epoch 1/2
Epoch 2/2


### Attention Visualization

In [17]:
#@title Run this!
import sys

# !test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# # !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
# !test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
# if not 'bertviz_repo' in sys.path:
#   sys.path += ['bertviz_repo']
! pip install -q bertviz

from bertviz import head_view

def call_html():
    """Display an HTML snippet that loads require.js and configures its paths.

    The snippet registers the `base`, `d3` and `jquery` module paths with
    requirejs. `HTML` and `display` are imported from the public
    `IPython.display` module so the function does not depend on the
    `display` name injected by the kernel or on `IPython.core` internals.
    """
    from IPython.display import HTML, display

    display(HTML('''
    <script  src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 KB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.7/132.7 KB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m102.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.6/79.6 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
# This function takes a model (the trained BERT-based model from the previous section) and
#  a list of token ids (an encoded sentence), and returns the attention weights of every encoder layer.

def get_att_weights(model, tok_id_list):
    """Run `model` on one encoded sentence and collect the attention weights.

    Returns a list with the `attention.att_weights` attribute of every layer
    whose name starts with 'encoder', in the order of `model.layers`.
    """
    # The forward pass is run only for its side effect: it is what fills in
    # `att_weights` on each attention layer. The model output is not needed.
    model(np.array([tok_id_list]))

    att_weights = []
    for layer in model.layers:
        if not layer.name.startswith('encoder'):
            continue
        att_weights.append(model.get_layer(layer.name).attention.att_weights)
    return att_weights

In [19]:
import torch
def get_att_tok(model, sent):
    """Return the decoded tokens of `sent` and per-layer attention over them.

    Arguments:
        model: the classifier; the sequence length is read from the input
            shape of its first layer.
        sent: the sentence to encode.

    Returns:
        (toks, atts): `toks` is the list of decoded token strings up to the
        first padding id; `atts` is one torch.FloatTensor per encoder layer,
        cropped on its last two axes to those tokens.
    """
    maxlen = model.layers[0].input_shape[0][-1]
    encoded_toks = encode_sentence(sent, maxlen)
    att_weights = get_att_weights(model, encoded_toks)

    # Index of the first padding id (0). A sentence that fills all `maxlen`
    # positions has no padding; use the full length instead of failing on
    # the minimum of an empty array.
    pad_positions = np.flatnonzero(np.array(encoded_toks) == 0)
    pad_start_idx = int(pad_positions[0]) if pad_positions.size else len(encoded_toks)

    toks = [tokenizer.decode([tok_id]) for tok_id in encoded_toks[:pad_start_idx]]
    atts = [
        torch.FloatTensor(att[:, :, :pad_start_idx, :pad_start_idx].numpy())
        for att in att_weights
    ]
    return toks, atts


#### Attention visualization
Give your opinion about a movie and visualize the attention.

In [24]:
# Example review to visualize; it is lower-cased before encoding.
sentence = "I Love this movie because the actors are funny and professional"
# Decoded tokens and one attention tensor per encoder layer.
toks, atts = get_att_tok(model, sentence.lower())
# Emit the require.js configuration, then render the head view.
call_html()
head_view(atts, toks)

<IPython.core.display.Javascript object>