**Unlike the bert_tokenizer notebook (which used a homemade embedding layer), this notebook uses BERT itself as the embedding layer.**

The weights (variables) of BERT are frozen and will not be trained.

**Stage 1 : Importing dependencies**

In [None]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-tf2
  Downloading bert-for-tf2-0.14.9.tar.gz (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 KB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py-params>=0.9.6
  Downloading py-params-0.10.2.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting params-flow>=0.8.0
  Downloading params-flow-0.8.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bert-for-tf2, params-flow, py-params
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2-0.14.9-py3-none-any.whl size=30534 sha256=c6f982a4686446dd092a7034d40f2c8fffde0f924e78505744df5aa7fe2bf8a3
  Stored in directory: /root/.cache/pip/wheels/ab/a4/72/df07592cea3ae06b5e846f5e52262

In [None]:
try:
    %tensorflow_version 2.x
except:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow import keras
from keras import layers
import bert

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


**Stage 2 : Data preprocessing**

**Loading files**

Load the data files from a personal Google Drive.

In [None]:
# `drive` is already imported in the dependencies cell at the top of the
# notebook, so it is not imported a second time here.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# The CSV is read with header=None, so the column names are supplied explicitly.
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/bert/train.csv",
                   names=cols,
                   header=None,
                   encoding="latin1",
                   engine="python")

In [None]:
# Keep only the label and the tweet text.
# Re-binding `data` (instead of inplace=True) together with errors="ignore"
# makes this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [None]:
# Preview the first rows (head() returns 5 rows by default)
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


**Preprocessing**

cleaning

In [None]:
def clean_tweet(tweet):
    """Normalize one raw tweet into plain text ready for tokenization.

    Steps, in order:
      1. strip HTML markup / decode entities with BeautifulSoup,
      2. replace @mentions with a space,
      3. replace http(s) URLs with a space,
      4. replace every character that is not a letter, '.', '!', '?' or an
         apostrophe with a space,
      5. collapse runs of spaces into a single space.

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned tweet as a string.
    """
    # BeautifulSoup.get_text() returns a string containing only the unicode text
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Remove mentions: '@' followed by one or more handle characters.
    # '_' is included because handles may contain underscores; without it
    # "@some_user" would leave the fragment "user" behind in the text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Remove URL links ('s' is optional -> matches http and https).
    # The class also covers '-', '_', '?', '=', '&', '%', '#', '~', '+', ':' so
    # that paths and query strings are removed completely instead of leaving
    # letter fragments (e.g. "?id=ab" -> "id ab") in the tweet.
    tweet = re.sub(r"https?://[A-Za-z0-9./?=&%#~_+:-]+", ' ', tweet)
    # Keep only letters, '.', '!', '?' and apostrophes ('^' negates the class)
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Collapse repeated spaces into one
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Run clean_tweet over every tweet in the text column
data_clean = list(map(clean_tweet, data.text))

In [None]:
# Labels in the file are 0 and 4; map 4 -> 1 so the labels are binary.
# np.where builds a new array, so the DataFrame column is not silently modified
# through the `.values` view, and the cell also works when `.values` is
# read-only (e.g. with pandas Copy-on-Write enabled).
sentiment_raw = data.sentiment.values
data_labels = np.where(sentiment_raw == 4, 1, sentiment_raw)

**Tokenization**

We need to create a BERT layer to get access to the metadata required by the tokenizer (such as the vocabulary file).

In [None]:
# Tokenizer class provided by bert-for-tf2
FullTokenizer = bert.bert_tokenization.FullTokenizer

# Download the BERT module from TF-Hub.
# trainable=False: the BERT weights will not be fine-tuned.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)

# The loaded module carries the tokenizer settings it expects
bert_assets = bert_layer.resolved_object
# path of the vocabulary file for the BERT tokenizer
vocab_file = bert_assets.vocab_file.asset_path.numpy()
# flag stored with the module telling whether text must be lower-cased
do_lower_case = bert_assets.do_lower_case.numpy()

# Build the tokenizer from those settings
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [None]:
# tokenize every sentences
def encode_sentence(sent):
    """Tokenize one sentence and wrap it in BERT's special tokens.

    "[CLS]" (the token used for classification) is placed at the beginning of
    the sentence and "[SEP]" (the sentence separator) at the end.

    Args:
        sent: the sentence text to tokenize.

    Returns:
        A list of tokens: "[CLS]", the tokenizer's tokens, then "[SEP]".
    """
    sentence_tokens = tokenizer.tokenize(sent)
    return ["[CLS]", *sentence_tokens, "[SEP]"]

In [None]:
# data_inputs: one encoded token list per cleaned sentence
data_inputs = list(map(encode_sentence, data_clean))

**Dataset Creation**

create padded batches (to pad sentences for each batch independently)

add the minimum of padding tokens possible

for that, we sort sentences by length, apply padded_batches and then shuffle

In [None]:
# BERT needs 3 inputs

# 1. tokenized version of the sentence
def get_ids(tokens):
    """Convert a list of tokens into their vocabulary ids using the BERT tokenizer."""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

# 2. list of mask : indicates to BERT where the values of the sentences are
def get_mask(tokens):
    """Build the input mask for a token sequence.

    Returns:
        An integer NumPy array with 1 where the token differs from "[PAD]"
        and 0 where the token is "[PAD]".
    """
    is_real_token = np.char.not_equal(tokens, "[PAD]")
    return is_real_token.astype(int)

# 3. sequence of 1s and 0s
# 0 : indicate that we are currently in the first sentence -> correspond to the tokens of the first sentence
# 1 : corresponds to the tokens of the second sentence
def get_segments(tokens):
    """Build the segment ids for a token sequence.

    Every token receives the id of the segment it belongs to, starting at 0.
    The id flips between 0 and 1 after each "[SEP]" token; the "[SEP]" itself
    still carries the id of the segment it closes.

    Returns:
        A list of 0/1 ints, one per token.
    """
    segment_ids = []
    segment = 0
    for token in tokens:
        segment_ids.append(segment)
        if token == "[SEP]":
            # tokens after this separator belong to the other segment
            segment = 0 if segment else 1
    return segment_ids

Create padded batches, so that the sentences of each batch are padded independently.

This way the minimum possible number of padding tokens is added.

To do that, we sort the sentences by length, apply `padded_batch`, and then shuffle the batches.

In [None]:
# One record per sentence: [tokens, label, number of tokens].
# The index from enumerate is used to look up the matching label.
data_with_len = []
for i, sent in enumerate(data_inputs):
    data_with_len.append([sent, data_labels[i], len(sent)])

# Shuffle first: per the original note, the input file is ordered by sentiment label
random.shuffle(data_with_len)

# Then order the records by sentence length (third field of each record)
data_with_len.sort(key = lambda record: record[2])

# Build ([ids, mask, segments], label) pairs,
# keeping only sentences with more than 7 tokens
sorted_all = []
for tokens, label, n_tokens in data_with_len:
    if n_tokens > 7:
        bert_inputs = [get_ids(tokens),
                       get_mask(tokens),
                       get_segments(tokens)]
        sorted_all.append((bert_inputs, label))

In [None]:
# Sentences have different lengths, so the dataset is built from a generator
# callable rather than from one rectangular tensor.
# Each element is an (inputs, label) pair, both read as int32.
def _sorted_examples():
    return sorted_all

all_dataset = tf.data.Dataset.from_generator(
    _sorted_examples,
    output_types = (tf.int32, tf.int32),
)

In [None]:
BATCH_SIZE = 32
# padded_batch : converts variable-size input data so that it can be read with a uniform size
# https://kyoungseop.tistory.com/entry/tensorflow-dataset-paddedbatch-%ED%95%A8%EC%88%98
all_batched = all_dataset.padded_batch(
    BATCH_SIZE,
    # inputs: 3 rows padded to the longest sequence of the batch; label: scalar
    padded_shapes = ((3, None), ()),
    padding_values = (0, 0),
)

In [None]:
# ceil() : smallest integer that is >= the number we pass
# len(sorted_all) is the number of inputs
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
# hold out one tenth of the batches for testing
NB_BATCHES_TEST = NB_BATCHES // 10

# all_batched is ordered from the shortest to the longest sentences, so the
# batches must be shuffled before splitting.
# Dataset.shuffle() returns a NEW dataset; calling it without keeping the
# result leaves the data unshuffled.
# reshuffle_each_iteration=False keeps the order fixed across epochs, so the
# take/skip split below always selects the same batches and test batches never
# leak into training on a later epoch.
all_shuffled = all_batched.shuffle(NB_BATCHES, reshuffle_each_iteration = False)

# First NB_BATCHES_TEST batches -> test set; all remaining batches -> train set.
# (take() for both would make the train and test sets identical.)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

**Stage 3 : Model Building**

In [None]:
# Smoke test: run one sentence through the BERT layer.
my_sent = ["[CLS]", *tokenizer.tokenize("Roses are red."), "[SEP]"]
# Build the three inputs (ids, mask, segments), each cast to int32 and given a
# leading batch dimension of size 1.
sample_inputs = [
    tf.expand_dims(tf.cast(make_input(my_sent), tf.int32), 0)
    for make_input in (get_ids, get_mask, get_segments)
]
bert_layer(sample_inputs)

[<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
 array([[-9.27935660e-01, -4.10335362e-01, -9.65754867e-01,
          9.07317877e-01,  8.12913775e-01, -1.74174383e-01,
          9.11234617e-01,  3.41952175e-01, -8.74521255e-01,
         -9.99989271e-01, -7.78409779e-01,  9.69385147e-01,
          9.86160517e-01,  6.36962950e-01,  9.48631287e-01,
         -7.51193106e-01, -4.58339304e-01, -7.08104551e-01,
          4.62098330e-01, -6.57927275e-01,  7.60414660e-01,
          9.99994814e-01, -3.96860719e-01,  3.44166040e-01,
          6.16488695e-01,  9.94400144e-01, -7.76633799e-01,
          9.38316584e-01,  9.59452271e-01,  7.32879400e-01,
         -6.93436861e-01,  2.93080509e-01, -9.93785441e-01,
         -1.64551824e-01, -9.67019558e-01, -9.95549619e-01,
          5.32935441e-01, -6.88061237e-01,  1.34714758e-02,
          2.98194177e-02, -9.18356538e-01,  4.20526206e-01,
          9.99989092e-01,  2.52676457e-01,  6.06235445e-01,
         -3.50750148e-01, -1.00000000e+00,  4.975

In [None]:
class DCNNBERTEmbedding(tf.keras.Model):
    """Sentence classifier: frozen BERT embeddings followed by a small CNN head.

    The BERT layer (not trainable) turns the token inputs into embeddings.
    Three Conv1D layers with kernel sizes 2, 3 and 4 run in parallel over those
    embeddings, each followed by global max pooling; the pooled features are
    concatenated and passed through a dense layer, dropout and the final
    classification layer.
    """

    def __init__(self,
                 nb_filters = 50,
                 # number of hidden units
                 FFN_units = 512,
                 nb_classes = 2,
                 dropout_rate = 0.1,
                 training = False,
                 name = "dcnn"):
        """Create the layers of the model.

        Args:
            nb_filters: number of filters of each of the three Conv1D layers.
            FFN_units: number of units of the hidden dense layer.
            nb_classes: 2 builds a single sigmoid output unit; any other value
                builds a softmax layer with `nb_classes` units.
            dropout_rate: rate of the dropout layer.
            training: unused. Kept so existing callers that pass it keep
                working; the training flag is the one given to `call`.
            name: name of the Keras model.
        """
        super(DCNNBERTEmbedding, self).__init__(name = name)

        # Embedding layer: BERT downloaded from TF-Hub, with frozen weights
        # Embedding : vectorize words to map them into a semantic geometric space
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable = False
        )

        # Conv1D : shifts feature detectors along one dimension only and
        # extracts local features with its filters.
        # bigram focuses on 2 consecutive tokens, trigram on 3, fourgram on 4.
        self.bigram = layers.Conv1D(filters = nb_filters,
                                    kernel_size = 2,
                                    padding = "valid",
                                    activation = "relu")
        self.trigram = layers.Conv1D(filters = nb_filters,
                                     kernel_size = 3,
                                     padding = "valid",
                                     activation = "relu")
        self.fourgram = layers.Conv1D(filters = nb_filters,
                                      kernel_size = 4,
                                      padding = "valid",
                                      activation = "relu")
        # GlobalMaxPooling1D : keeps, for every filter, its largest activation
        # over the whole sequence
        self.pool = layers.GlobalMaxPooling1D()
        self.dense_1 = layers.Dense(units = FFN_units, activation = "relu")
        # dropout layer to limit overfitting
        self.dropout = layers.Dropout(rate = dropout_rate)

        if nb_classes == 2:
            # binary case: 1 unit with a sigmoid activation (output between 0 and 1)
            self.last_dense = layers.Dense(units = 1, activation = "sigmoid")
        else:
            # multi-class case: nb_classes units with a softmax activation
            self.last_dense = layers.Dense(units = nb_classes,
                                           activation = "softmax")

    def embed_with_bert(self, all_tokens):
        """Run the BERT layer on a batch and return its second output.

        Args:
            all_tokens: tensor indexed as [batch, input kind, position]. Index
                0, 1 and 2 of the second axis are passed to BERT as its three
                inputs; in this notebook they are built as token ids, mask and
                segment ids, in that order.

        Returns:
            The second of the two values returned by the BERT layer, which
            `call` feeds to the Conv1D layers (presumably the per-token
            embeddings of shape (batch, sequence, hidden) - confirm against
            the TF-Hub module documentation).
        """
        _, embs = self.bert_layer([all_tokens[:, 0, :],
                                   all_tokens[:, 1, :],
                                   all_tokens[:, 2, :]])
        return embs

    def call(self, inputs, training = False):
        """Forward pass.

        Args:
            inputs: batch of BERT inputs as expected by `embed_with_bert`.
            training: True while training -> dropout is applied (to limit
                overfitting); False while predicting -> dropout is disabled.
                Keras supplies this flag when the model is run through
                fit/evaluate/predict; the default only matters when the model
                is called directly.

        Returns:
            The output of the last dense layer (sigmoid or softmax
            probabilities, depending on `nb_classes`).
        """
        # embeddings produced by BERT
        x = self.embed_with_bert(inputs)

        # Each convolution is followed by global max pooling: every one of the
        # nb_filters feature detectors yields 1 number, its maximum activation.
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)

        # Concatenate the three pooled results along the last axis:
        # x_1, x_2, x_3 : (batch_size, nb_filters)
        # merged        : (batch_size, 3 * nb_filters)
        merged = tf.concat([x_1, x_2, x_3], axis = -1)
        merged = self.dense_1(merged)
        # dropout is only active when training is True
        merged = self.dropout(merged, training = training)
        output = self.last_dense(merged)

        return output

**Stage 4 : Training**

In [None]:
# --- model hyper-parameters ---
NB_CLASSES = 2        # number of target classes
NB_FILTERS = 100      # filters per convolution
FFN_UNITS = 256       # units of the hidden dense layer
DROPOUT_RATE = 0.2

# --- training hyper-parameters ---
NB_EPOCHS = 5
BATCH_SIZE = 32

In [None]:
# Instantiate the model with the hyper-parameters defined above
model_config = {
    "nb_filters": NB_FILTERS,
    "FFN_units": FFN_UNITS,
    "nb_classes": NB_CLASSES,
    "dropout_rate": DROPOUT_RATE,
}
Dcnn = DCNNBERTEmbedding(**model_config)

In [None]:
# Pick the loss and metric that match the output layer:
# binary (single sigmoid unit) vs. multi-class (softmax with integer labels)
if NB_CLASSES == 2:
    loss_name = "binary_crossentropy"
    metric_names = ["accuracy"]
else:
    loss_name = "sparse_categorical_crossentropy"
    metric_names = ["sparse_categorical_accuracy"]

Dcnn.compile(loss = loss_name,
             optimizer = "adam",
             metrics = metric_names)

In [None]:
# Directory on Drive where checkpoints are written
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/bert/ckpt_bert_embedding"

# Track the model in a checkpoint; the manager keeps only the most recent one
ckpt = tf.train.Checkpoint(Dcnn = Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 1)

# Resume from the most recent checkpoint when one exists
latest_ckpt = ckpt_manager.latest_checkpoint
if latest_ckpt:
    ckpt.restore(latest_ckpt)
    print("Latest checkpoint restored!")

Latest checkpoint restored!


In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint through `ckpt_manager` after every epoch."""

    def on_epoch_end(self, epoch, logs = None):
        """Save a checkpoint and report the checkpoint directory.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict passed by Keras (unused).
        """
        ckpt_manager.save()
        # Message fixed: the original printed "saved as/content/..." with the
        # wrong word and no separating space.
        print("Checkpoint saved at {}.".format(checkpoint_path))

**Result**

In [None]:
# Train the model; the callback saves a checkpoint at the end of each epoch
epoch_callbacks = [MyCustomCallback()]
Dcnn.fit(train_dataset, epochs = NB_EPOCHS, callbacks = epoch_callbacks)

Epoch 1/5
   4513/Unknown - 137s 27ms/step - loss: 0.2543 - accuracy: 0.8922Checkpoint saved as/content/drive/MyDrive/Colab Notebooks/bert/ckpt_bert_embedding.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f33aeb79fa0>

**Stage 5 : Evaluation**

In [None]:
# Evaluate on the test dataset; `results` holds the loss followed by the
# metrics configured in compile()
results = Dcnn.evaluate(x = test_dataset)
print(results)

[0.18308138847351074, 0.9245443940162659]
