In [1]:
# !pip install tensorflow_text
# !pip install transformers
# !pip install keras-tuner

In [2]:
## Usual Imports

## Math and Arrays
import numpy as np
# import pandas as pd
from statistics import mean

# OS and Utilities
import sys
import datetime

## File and String Handling
import re
import json
import string

# Visualizations
import matplotlib.pyplot as plt

# BERT
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig

# Tensorflow 2 core - preprocessing no longer needed as we are using BERT
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras

#Import Tuining toolkit for automatic Hyperparameter search
import kerastuner as kt

# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# Add local path to .py modules and add utilities
sys.path.insert(0, '../python')

import debug

from jbyrne_utils import load_data
# from jbyrne_utils import tokenize_sentences
# from jbyrne_utils import embed_matrix
# from jbyrne_utils import run_model

# Set message level

# debug.off()
# debug.on()
debug.show_detail()


*************** DEBUG DETAILS TURNED ON *****************


In [3]:
### Parameters for the base model

# maximum number of tokens to look at.
max_len = 100



## Step 1:  Load the ClaimBuster datafile

In [4]:
d = load_data("../data/3xNCS.json")

# Randomize the order as the data is sorted by class
np.random.seed(42)
np.random.shuffle(d)


Loaded 11056 data records.


In [5]:
## View an random example entry
d[512]

{'sentence_id': 9703,
 'label': 1,
 'text': 'President Obama was right, he said that that was outrageous to have deficits as high as half a trillion dollars under the Bush years.'}

## Step 2:  Tokenize the sentences using BERT tokenizer



In [6]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [7]:
input_sentences=[]
input_ids=[]
attention_masks=[]  # used so BERT can discount padding in the fixed-length token list

# avoid big output for just this cell
debug.on()

for sentence in [ i["text"] for i in d ]:
    input_sentences.append(sentence)
    bert_input = bert_tokenizer.encode_plus(sentence,
                                            add_special_tokens=True,  # adds the CLS etc.
                                            max_length=max_len,
                                            truncation=True,          # truncate sentences over max_len
                                            padding = 'max_length',   # add padding ids (0) up to max_len
                                            return_attention_mask=True)
    input_ids.append(bert_input['input_ids'])
    attention_masks.append(bert_input['attention_mask'])
    debug.detail(bert_input)

    
input_ids = np.asarray(input_ids)
attention_masks = np.asarray(attention_masks)
input_sentences = np.asarray(input_sentences)
labels = np.array( [i["label"] for i in d] )

# check lengths of arrays
debug.msg(len(input_ids), len(attention_masks), len(labels), len(input_sentences))

# reset to previous debugging level
debug.last()

****************** DEBUG TURNED ON **********************
11056 11056 11056 11056
*************** DEBUG DETAILS TURNED ON *****************


In [8]:
## Verify the tokenization of the previous sample sentence

debug.msg(d[512]["text"])
debug.msg(input_ids[512])
bert_tokenizer.convert_ids_to_tokens(input_ids[512])



President Obama was right, he said that that was outrageous to have deficits as high as half a trillion dollars under the Bush years.
[  101  2343  8112  2001  2157  1010  2002  2056  2008  2008  2001 25506
  2000  2031 15074  2015  2004  2152  2004  2431  1037 23458  6363  2104
  1996  5747  2086  1012   102     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


['[CLS]',
 'president',
 'obama',
 'was',
 'right',
 ',',
 'he',
 'said',
 'that',
 'that',
 'was',
 'outrageous',
 'to',
 'have',
 'deficit',
 '##s',
 'as',
 'high',
 'as',
 'half',
 'a',
 'trillion',
 'dollars',
 'under',
 'the',
 'bush',
 'years',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

## Step 3a: Split into training, validation and test datasets

In [9]:
train_len = int(0.8 * len(d))
val_len = int(0.2 * len(d))

train_ids, val_ids             = np.split(input_ids, [train_len])
train_attn, val_attn           = np.split(attention_masks, [train_len])
train_sentences, val_sentences = np.split(input_sentences, [train_len])
train_labels, val_labels       = np.split(labels, [train_len])

debug.msg(f"proportion of checkable claims in training data  : {np.count_nonzero(train_labels == 1)/len(train_labels):.4f}")
debug.msg(f"proportion of checkable claims in validation data: {np.count_nonzero(val_labels == 1)/len(val_labels):.4f}")

debug.detail(len(train_ids), len(train_attn), len(train_sentences), len(train_labels))
debug.detail(len(val_ids), len(val_attn), len(val_sentences), len(val_labels))

proportion of checkable claims in training data  : 0.2495
proportion of checkable claims in validation data: 0.2518
8844 8844 8844 8844
2212 2212 2212 2212


In [10]:
train_labels

array([1, 0, 0, ..., 0, 0, 0])

## Step 3b: Even out the checkable and non-checkable classes.

The intention here is to equalize the number of checkable and non-checkable sentences. In the raw dataset, approximately 25% of the statements are labelled as checkable claims.

As we are looking at detailed text and whether it includes a checkable claim, there is no reliable equivalent of the data enhancement techniques that exist for image or sound data.  

We are presented with the choice, therefore, of removing $\frac{2}{3}$ of the non-checkable claims - as the source dataset has provided, or adding two copies of each checkable claim to reach approximately a 1:1 ratio of classes in the training data. The second method has proved especially successful in the CNN examples, so we will do the same for the BERT case.


In [11]:
## Ideally we could rerandomize the training set, but
## for the moment, we will try just adding copies of 
## the positive records to the end.

pos_train_ids = train_ids[ train_labels == 1 ]
pos_train_attn = train_attn[ train_labels == 1 ]
pos_train_sentences = train_sentences[ train_labels == 1 ]
pos_train_labels = train_labels[ train_labels == 1 ]  # kinda redundant, but an easy way to get the right length.

print(train_ids.shape)
print(train_attn.shape)
print(train_sentences.shape)
print(train_labels.shape)


## concatenate two copies of the positive cases to each of the training datasets

train_ids       = np.concatenate( (train_ids, pos_train_ids, pos_train_ids) )
train_attn      = np.concatenate( (train_attn, pos_train_attn, pos_train_attn) )
train_sentences = np.concatenate( (train_sentences, pos_train_sentences, pos_train_sentences) )
train_labels    = np.concatenate( (train_labels, pos_train_labels, pos_train_labels) )

(8844, 100)
(8844, 100)
(8844,)
(8844,)


In [12]:
print(pos_train_ids.shape)
print(pos_train_attn.shape)
print(pos_train_sentences.shape)
print(pos_train_labels.shape)

print(train_ids.shape)
print(train_attn.shape)
print(train_sentences.shape)
print(train_labels.shape)

(2207, 100)
(2207, 100)
(2207,)
(2207,)
(13258, 100)
(13258, 100)
(13258,)
(13258,)


## Step 4: Set up the Bert model

Claim detection is a sentence classification task, so for the first runs, I will base this on the `TFBertForSequenceClassification` class from the huggungface Tensorflow implementation of Bert. 

To keep it simple, the model build is packaged into a build_bert_model function that returns a compiled model
with the search space already added.



In [13]:
# Function to build the model with hyperparameter search spaces included

def build_bert_model(hp):
    
    ################################################
    ####  DEFINE THE HYPERPARAMETER SEARCH SPACE ###
    ################################################
    bert_trainable = hp.Choice('bert_trainable', values=[True, False])
                               
    optimizer      = keras.optimizers.Adam( hp.Choice('learning_rate',
                                                      values=[5e-4, 2e-4, 1e-4, 5e-5, 2e-5, 1e-5, \
                                                              5e-6, 2e-6, 1e-6]),
                                            hp.Choice('epsilon',
                                                      values= [1e-8, 1e-7, 1e-6, 1e-5]))
    
    loss           = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric         = keras.metrics.SparseCategoricalAccuracy('accuracy')

    
    ###############################
    ####  CREATE THE BERT MODEL ###
    ###############################
    bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                                 num_labels=2,
                                                                 trainable=bert_trainable)
    print('\nBert Model',bert_model.summary())

    ############################################
    ####  COMPILE THE MODEL WITH THE CHOICES ###
    ############################################
    bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric] )

    return(bert_model)

## Step 5: Set up the Keras Tuner

From https://github.com/keras-team/keras-tuner/README.md:

***
Next, instantiate a tuner. You should specify the model-building function, the name of the objective to optimize (whether to minimize or maximize is automatically inferred for built-in metrics), the total number of trials (max_trials) to test, and the number of models that should be built and fit for each trial (executions_per_trial).

Available tuners are `RandomSearch` and `Hyperband`.

Note: the purpose of having multiple executions per trial is to reduce results variance and therefore be able to more accurately assess the performance of a model. If you want to get results faster, you could set executions_per_trial=1 (single round of training for each model configuration).
***

More reading shows that there are additional tuners in `kt.tuners` like `kt.tuners.BayesianOptimization`.

In [14]:
# Create the tuner object
#
# There are two options, RandomSearch and Hyperband.

runtag = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
log_dir='./tb_bert_tuner/'+ runtag

tuner = kt.RandomSearch(
    build_bert_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory=log_dir,
    project_name='BertTunerRandom')


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None


## Step 6: Create Callbacks

Adding a new callback here, keras.callbacks.EarlyStopping(). This stops after the monitored metric (usually loss) stops improving.  The $patience$ term is the number of epochs that will be executed without improvement before stopping to allow for possible oscilation.

In [15]:
runtag = datetime.datetime.now().strftime("%y%m%d-%H%M%S")

log_dir='./tb_bert/'+ runtag

model_save_path='../models/bert_keras_tuner/' + runtag

## Create Callback list
callbacks = [keras.callbacks.ModelCheckpoint(filepath=model_save_path,
                                             save_weights_only=False,
                                             monitor='val_loss',
                                             mode='min',
                                             save_best_only=True),
             keras.callbacks.EarlyStopping(monitor='loss', patience=3),
             keras.callbacks.TensorBoard(log_dir=log_dir)]


## Step 7: Run the Optimizer

Tuner.search() has the same signature (parameters) as keras.Model.fit().

In [17]:
%%time
tuner.search([train_ids,train_attn],
             train_labels,
             batch_size=32,
             epochs=10,
             validation_data=([val_ids,val_attn],val_labels),
             callbacks=callbacks)

Trial 5 Complete [00h 52m 56s]
val_accuracy: 0.7481916546821594

Best val_accuracy So Far: 0.93098251024882
Total elapsed time: 03h 34m 14s
INFO:tensorflow:Oracle triggered exit


In [25]:
best_model = tuner.get_best_models(1)[0]
best_model.save("../models/best_model_bert_tuner_random")
best_model

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None




INFO:tensorflow:Assets written to: ../models/best_model_bert_tuner_random/assets


INFO:tensorflow:Assets written to: ../models/best_model_bert_tuner_random/assets


<transformers.models.bert.modeling_tf_bert.TFBertForSequenceClassification at 0x7f471c207940>

In [22]:
best_hyperparameters = tuner.get_best_hyperparameters(1)[0]
best_hyperparameters

<kerastuner.engine.hyperparameters.HyperParameters at 0x7f471c317730>

In [27]:
tuner.results_summary()


Results summary
Results in ./tb_bert_tuner/210404-165233/BertTunerRandom
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
bert_trainable: 1
learning_rate: 1e-05
epsilon: 1e-07
Score: 0.93098251024882
Trial summary
Hyperparameters:
bert_trainable: 1
learning_rate: 0.0002
epsilon: 1e-06
Score: 0.7481916546821594
Trial summary
Hyperparameters:
bert_trainable: 1
learning_rate: 0.0002
epsilon: 1e-08
Score: 0.7481916546821594
Trial summary
Hyperparameters:
bert_trainable: 0
learning_rate: 0.0005
epsilon: 1e-06
Score: 0.5827305416266123
Trial summary
Hyperparameters:
bert_trainable: 0
learning_rate: 1e-06
epsilon: 1e-05
Score: 0.28375527262687683


## References used
BERT Text Classification using Keras https://swatimeena989.medium.com/bert-text-classification-using-keras-903671e0207d#2f06
Keras Tuner blog: https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html
Keras Tuner Git:  https://github.com/keras-team/keras-tuner

@misc{omalley2019kerastuner,
	title        = {Keras {Tuner}},
	author       = {O'Malley, Tom and Bursztein, Elie and Long, James and Chollet, Fran\c{c}ois and Jin, Haifeng and Invernizzi, Luca and others},
	year         = 2019,
	howpublished = {\url{https://github.com/keras-team/keras-tuner}}
}

TypeError: cannot pickle 'KeyedRef' object