In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import bert
import matplotlib.pyplot as plt
import os

In [2]:
device = '/GPU:0' if len(tf.config.list_physical_devices('GPU')) > 0 else '/CPU:0'
device

'/GPU:0'

# Load Data

In [3]:
dataset_path = 'data/training_dataset.csv'
dataset_df = pd.read_csv(dataset_path)
print("Shape:", dataset_df.shape)

Shape: (8795, 55)


In [4]:
text_df = dataset_df['Title']
features = dataset_df.columns[1:]
label_df = dataset_df[features]
num_classes = len(features)
print("Number of classes:", num_classes)

Number of classes: 54


# Transform

## Load Bert Preprocess and Embedding

In [5]:
# load input into preprocessor model
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessing_model = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
bert_preprocess = hub.KerasLayer(preprocessing_model)
encoder_inputs = bert_preprocess(text_input)

In [6]:
# load preprocessed input into bert encoder
bert_model = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
bert_layer = hub.KerasLayer(bert_model, trainable=True)
outputs = bert_layer(encoder_inputs)
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

## Convert to Bert Input

In [7]:
# create tokenizer instance
BertTokenizer = bert.bert_tokenization.FullTokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocab_file, do_lower_case)

In [8]:
max_seq_length = 128

In [9]:
# See BERT paper: https://arxiv.org/pdf/1810.04805.pdf
# And BERT implementation convert_single_example() at https://github.com/google-research/bert/blob/master/run_classifier.py

def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [10]:
input_word_ids, input_mask, input_type_ids = [], [], []
for sequence in text_df:
    sequence_tokens = tokenizer.tokenize(sequence)
    sequence_tokens = ["[CLS]"] + sequence_tokens + ["[SEP]"]
    
    sequence_ids = get_ids(sequence_tokens, tokenizer, max_seq_length)
    input_word_ids.append(sequence_ids)
    
    sequence_mask = get_masks(sequence_tokens, max_seq_length)
    input_mask.append(sequence_mask)
    
    sequence_segments = get_segments(sequence_tokens, max_seq_length)
    input_type_ids.append(sequence_segments)

transformed_seq = dict(
    input_word_ids= tf.convert_to_tensor(np.asarray(input_word_ids).astype('int32'), dtype=tf.int32),
    input_mask= tf.convert_to_tensor(np.asarray(input_mask).astype('int32'), dtype=tf.int32),
    input_type_ids= tf.convert_to_tensor(np.asarray(input_type_ids).astype('int32'), dtype=tf.int32)
)

In [72]:
transformed_seq['input_word_ids'].shape

TensorShape([8795, 128])

# Model

## Create Model

In [312]:
inputs = dict(
    input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
    input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
    input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
)

In [313]:
x = bert_layer(inputs)['pooled_output']
outputs = tf.keras.layers.Dense(num_classes, activation='sigmoid', name='output')(x)
model = tf.keras.Model(inputs, outputs)
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_81 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_82 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_80 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      multiple             109482241   input_81[0][0]                   
                                                                 input_82[0][0]            

In [14]:
# compile model
optimizer = tf.keras.optimizers.Adam(3e-5)
metrics = ['accuracy', tf.keras.metrics.AUC(name='auc')]
# model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics)

## Training

In [15]:
# reduce learning rate and save model
saved_model_path = 'bert_kaidah_model.h5'
callbacks = [tf.keras.callbacks.ReduceLROnPlateau(), 
             tf.keras.callbacks.ModelCheckpoint(saved_model_path, save_best_only=True)]

In [317]:
num_epoch = 100
batch_size = 128
history = model.fit(transformed_text.values,
                    label_df,
                    batch_size=batch_size,
                    epochs=num_epoch,
                    validation_split=0.3,
                    callbacks=callbacks)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type dict).

In [122]:
testing = text_preprocessed[:2]
# testing = testing.to_numpy(dtype=np.float32)
# testing = tf.data.Dataset.from_tensor_slices(testing)
# testing = np.asarray(testing).astype(np.float32)
type(testing.values[0])

dict

In [117]:
encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3', trainable=True, name='encoder')

In [11]:
def build_classifier_model(num_classes):
    inputs = dict(
      input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
      input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
      input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
    )

    encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3', trainable=True, name='encoder')
    net = encoder(inputs)['sequence_output']
    net = tf.keras.layers.GlobalAveragePooling1D()(net)
    net = tf.keras.layers.Dropout(rate=0.1)(net)
    net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
    return tf.keras.Model(inputs, net, name='prediction')

In [172]:
def bad(x, num_classes):
#     inputs = dict(
#       input_word_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
#       input_mask=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
#       input_type_ids=tf.keras.layers.Input(shape=(None,), dtype=tf.int32),
#     )

#     encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='encoder')
#     print(x)
    net = encoder(x)['pooled_output']
    print(net.shape)
    net = tf.keras.layers.Dropout(rate=0.1)(net)
    print(net.shape)
    net = tf.keras.layers.Dense(num_classes, activation='sigmoid', name='classifier')(net)
    print(net.shape)
    return net

In [143]:
bad(testing.values[0], 54)

(1, 768)
(1, 768)
(1, 768)
(1, 54)


<tf.Tensor: shape=(1, 54), dtype=float32, numpy=
array([[0.83549744, 0.5201568 , 0.63167685, 0.24671842, 0.58465695,
        0.3932768 , 0.13925646, 0.48468375, 0.54390746, 0.44662276,
        0.4522498 , 0.5936609 , 0.36753014, 0.3918841 , 0.63395387,
        0.44995788, 0.5953264 , 0.42487493, 0.33414108, 0.39370912,
        0.25651643, 0.41655833, 0.6877387 , 0.5466085 , 0.21995077,
        0.6463429 , 0.42815667, 0.3562347 , 0.45436433, 0.63951087,
        0.45244047, 0.40392876, 0.67751366, 0.49266887, 0.37938863,
        0.32924724, 0.42324358, 0.3748405 , 0.65886796, 0.42843848,
        0.3422416 , 0.47810945, 0.41968876, 0.5282222 , 0.7022111 ,
        0.24793096, 0.76028544, 0.58536035, 0.7315845 , 0.3678004 ,
        0.18701774, 0.5103993 , 0.6609004 , 0.22424951]], dtype=float32)>

In [21]:
test_classifier_model = build_classifier_model(54)
# bert_raw_result = test_classifier_model.predict(testing.values[0])
# print(tf.sigmoid(bert_raw_result))

In [22]:
test_classifier_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=metrics)

In [23]:
test_classifier_model.fit(ds, epochs=1, batch_size=256)

ValueError: in user code:

    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\training.py:838 run_step  **
        outputs = model.train_step(data)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\training.py:796 train_step
        loss = self.compiled_loss(
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\losses.py:155 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\losses.py:259 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\losses.py:1712 sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\keras\backend.py:4979 sparse_categorical_crossentropy
        res = nn.sparse_softmax_cross_entropy_with_logits_v2(
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\ops\nn_ops.py:4228 sparse_softmax_cross_entropy_with_logits_v2
        return sparse_softmax_cross_entropy_with_logits(
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\BlackFeather\anaconda3\envs\tf\lib\site-packages\tensorflow\python\ops\nn_ops.py:4133 sparse_softmax_cross_entropy_with_logits
        raise ValueError("Shape mismatch: The shape of labels (received %s) "

    ValueError: Shape mismatch: The shape of labels (received (54,)) should equal the shape of logits except for the last dimension (received (128, 54)).


In [16]:
ds = tf.data.Dataset.from_tensor_slices((transformed_seq, label_df.values))
ds

<TensorSliceDataset shapes: ({input_word_ids: (128,), input_mask: (128,), input_type_ids: (128,)}, (54,)), types: ({input_word_ids: tf.int32, input_mask: tf.int32, input_type_ids: tf.int32}, tf.int64)>

In [18]:
for data in ds.take(1):
    print(data)

({'input_word_ids': <tf.Tensor: shape=(128,), dtype=int32, numpy=
array([  101,  4070, 16360,  6072, 26044,  2953,  3913,  2319,  6643,
       18317,   102,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,    

In [None]:
result = tf.sigmoid(bert_raw_result)
index = np.asarray(result[0]).argmax()
print(index, result[0][index])

## Plot Metric

In [None]:
directory = 'metric/'
if not os.path.exists(directory):
    os.mkdir(directory)

In [None]:
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.savefig(directory+'bert_'+string+'.png')
    plt.show()

for metric in model.metrics_names:
    plot_graphs(history, metric)

# Save Class

In [None]:
# add id as new column
class_df.insert(0, 'id', class_df.index+1)
class_df.pop('count')
class_df.pop('weight')
class_df.columns

In [None]:
# save to directory
directory = 'data/'
if not os.path.exists(directory):
    os.mkdir(directory)

saved_data_path = os.path.join(directory, 'label.csv')
class_df.to_csv(saved_data_path, index=False)

Learning rate -> Coursera Course<br>
[Model architecture](https://ieeexplore.ieee.org/abstract/document/8723320)<br>
[Layer concatenation](https://keras.io/api/layers/merging_layers/concatenate/)<br>
https://stackoverflow.com/questions/38387913/reason-of-having-high-auc-and-low-accuracy-in-a-balanced-dataset