In [1]:
# !pip install tensorflow_text
# !pip install transformers

In [2]:
## Usual Imports

## Math and Arrays
import numpy as np
# import pandas as pd
from statistics import mean

# OS and Utilities
import sys
import datetime

## File and String Handling
import re
import json
import string

# Visualizations
import matplotlib.pyplot as plt

# BERT
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig

# Tensorflow 2 core - preprocessing no longer needed as we are using BERT
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# Add local path to .py modules and add utilities.
# The project's helper modules live in ../python (relative to this notebook);
# the path must be registered before the two local imports below can resolve.
sys.path.insert(0, '../python')

# Project-local messaging helper; this notebook calls debug.msg(),
# debug.detail(), debug.on(), debug.last() and debug.show_detail() on it.
import debug

# Project-local loader used to read the data file in the next step.
from jbyrne_utils import load_data
# from jbyrne_utils import tokenize_sentences
# from jbyrne_utils import embed_matrix
# from jbyrne_utils import run_model

# Set message level
# NOTE(review): presumably off() silences all messages, on() enables
# debug.msg() output only, and show_detail() additionally enables
# debug.detail() output -- confirm against ../python/debug.py.

# debug.off()
# debug.on()
debug.show_detail()


*************** DEBUG DETAILS TURNED ON *****************


In [3]:
### Parameters for the base model

# Fixed length of every token sequence fed to BERT.
# It is handed to the tokenizer as max_length together with
# add_special_tokens=True, truncation=True and padding='max_length', so each
# encoded sentence is cut or padded to exactly this many ids, with the
# [CLS]/[SEP] markers counted inside the limit.
max_len = 100



## Step 1:  Load the ClaimBuster datafile

In [4]:
# load_data is the project-local helper imported from jbyrne_utils; the
# returned records are indexed by position in the cells that follow.
d = load_data("../data/3xNCS.json")

# Randomize the order as the data is sorted by class.
# The fixed seed keeps the shuffled order -- and therefore the later
# train/validation split -- reproducible. np.random.shuffle reorders d in place.
np.random.seed(42)
np.random.shuffle(d)


Loaded 11056 data records.


In [5]:
## View one arbitrary example entry (fixed index, so the same record is shown on every run)
sample_index = 512
d[sample_index]

{'sentence_id': 9703,
 'label': 1,
 'text': 'President Obama was right, he said that that was outrageous to have deficits as high as half a trillion dollars under the Bush years.'}

## Step 2:  Tokenize the sentences using BERT tokenizer



In [6]:
# Tokenizer matching the pretrained checkpoint that is fine-tuned further down.
pretrained_name = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(pretrained_name)

In [7]:
# Switch to the plain debug level for this cell only; presumably this keeps the
# per-sentence debug.detail() call below from flooding the output.
debug.on()

# Raw sentence text, in the (already shuffled) dataset order.
input_sentences = [record["text"] for record in d]

# Encode sentence by sentence so that every encoding can be handed to debug.detail().
encodings = []
for sentence in input_sentences:
    bert_input = bert_tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,     # adds the CLS etc.
        max_length=max_len,
        truncation=True,             # truncate sentences over max_len
        padding='max_length',        # add padding ids (0) up to max_len
        return_attention_mask=True,  # mask lets BERT discount the padding positions
    )
    debug.detail(bert_input)
    encodings.append(bert_input)

# Convert everything to numpy arrays so it can be sliced and masked later.
input_ids = np.asarray([enc['input_ids'] for enc in encodings])
attention_masks = np.asarray([enc['attention_mask'] for enc in encodings])
input_sentences = np.asarray(input_sentences)
labels = np.array([record["label"] for record in d])

# All four arrays must describe the same number of records.
debug.msg(len(input_ids), len(attention_masks), len(labels), len(input_sentences))

# Restore the debugging level that was active before this cell.
debug.last()

****************** DEBUG TURNED ON **********************
11056 11056 11056 11056
*************** DEBUG DETAILS TURNED ON *****************


In [8]:
## Verify the tokenization of the previous sample sentence

sample_index = 512                    # same record that was displayed after loading
sample_ids = input_ids[sample_index]  # its padded id sequence

debug.msg(d[sample_index]["text"])
debug.msg(sample_ids)

# Map the ids back to tokens; as the last expression this is the cell's displayed output.
bert_tokenizer.convert_ids_to_tokens(sample_ids)



President Obama was right, he said that that was outrageous to have deficits as high as half a trillion dollars under the Bush years.
[  101  2343  8112  2001  2157  1010  2002  2056  2008  2008  2001 25506
  2000  2031 15074  2015  2004  2152  2004  2431  1037 23458  6363  2104
  1996  5747  2086  1012   102     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


['[CLS]',
 'president',
 'obama',
 'was',
 'right',
 ',',
 'he',
 'said',
 'that',
 'that',
 'was',
 'outrageous',
 'to',
 'have',
 'deficit',
 '##s',
 'as',
 'high',
 'as',
 'half',
 'a',
 'trillion',
 'dollars',
 'under',
 'the',
 'bush',
 'years',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

## Step 3a: Split into training and validation datasets

In [9]:
# 80/20 train/validation split. The data was shuffled right after loading, so
# cutting at a fixed position gives a random split.
train_len = int(0.8 * len(d))
# The validation set is everything after the split point. Deriving the length
# this way keeps it equal to the real size of the val_* arrays
# (int(0.2 * len(d)) can be one short because both products are rounded down).
val_len = len(d) - train_len

# np.split with a single cut index returns [first train_len rows, the rest].
train_ids, val_ids             = np.split(input_ids, [train_len])
train_attn, val_attn           = np.split(attention_masks, [train_len])
train_sentences, val_sentences = np.split(input_sentences, [train_len])
train_labels, val_labels       = np.split(labels, [train_len])

# Sanity check: the split sizes match the lengths computed above.
assert len(train_labels) == train_len and len(val_labels) == val_len

# Class balance of each split (label 1 == checkable claim).
debug.msg(f"proportion of checkable claims in training data  : {np.count_nonzero(train_labels == 1)/len(train_labels):.4f}")
debug.msg(f"proportion of checkable claims in validation data: {np.count_nonzero(val_labels == 1)/len(val_labels):.4f}")

debug.detail(len(train_ids), len(train_attn), len(train_sentences), len(train_labels))
debug.detail(len(val_ids), len(val_attn), len(val_sentences), len(val_labels))

proportion of checkable claims in training data  : 0.2495
proportion of checkable claims in validation data: 0.2518
8844 8844 8844 8844
2212 2212 2212 2212


## Step 3b: Even out the checkable and non-checkable classes.

The intention here is to equalize the number of checkable and non-checkable sentences in the training dataset. In the raw dataset, approximately 25% of the statements are labelled as checkable claims.

Because we are working with detailed text and whether it contains a checkable claim, there is no reliable equivalent of the data augmentation techniques that exist for image or sound data.

We are therefore presented with a choice: either remove $\frac{2}{3}$ of the non-checkable claims (as the source dataset has provided), or add two copies of each checkable claim. Either way we reach approximately a 1:1 ratio of classes in the training data. The second method has proved especially successful in the CNN examples, so we will do the same for the BERT case.


In [10]:
## Oversample the checkable (label == 1) training records by appending two
## extra copies of each one to the end of every training array.
## NOTE(review): the copies are appended unshuffled. Keras Model.fit shuffles
## array inputs each epoch by default (shuffle=True), which should make this
## harmless -- confirm.
## NOTE: this cell rebinds train_* from themselves, so re-running it without
## re-running the split cell first appends the positives again.

is_checkable = train_labels == 1   # boolean mask, computed once and shared below

pos_train_ids       = train_ids[is_checkable]
pos_train_attn      = train_attn[is_checkable]
pos_train_sentences = train_sentences[is_checkable]
pos_train_labels    = train_labels[is_checkable]   # all ones; kept so the label array grows in step

# Shapes before oversampling.
for original_array in (train_ids, train_attn, train_sentences, train_labels):
    print(original_array.shape)


## original rows first, followed by two copies of the positive rows

train_ids       = np.concatenate([train_ids] + 2 * [pos_train_ids])
train_attn      = np.concatenate([train_attn] + 2 * [pos_train_attn])
train_sentences = np.concatenate([train_sentences] + 2 * [pos_train_sentences])
train_labels    = np.concatenate([train_labels] + 2 * [pos_train_labels])

(8844, 100)
(8844, 100)
(8844,)
(8844,)


In [11]:
# Shapes after oversampling: the training arrays have grown, the validation
# arrays are untouched. The report is assembled line by line and emitted with
# a single print.
shape_report = [
    f"train_ids.shape:       {train_ids.shape}",
    f"train_attn.shape:      {train_attn.shape}",
    f"train_sentences.shape: {train_sentences.shape}",
    f"train_labels.shape:    {train_labels.shape}\n\n",   # blank lines separate the two groups
    f"val_ids.shape:         {val_ids.shape}",
    f"val_attn.shape:        {val_attn.shape}",
    f"val_sentences.shape:   {val_sentences.shape}",
    f"val_labels.shape:      {val_labels.shape}",
]
print("\n".join(shape_report))

train_ids.shape:       (13258, 100)
train_attn.shape:      (13258, 100)
train_sentences.shape: (13258,)
train_labels.shape:    (13258,)


val_ids.shape:         (2212, 100)
val_attn.shape:        (2212, 100)
val_sentences.shape:   (2212,)
val_labels.shape:      (2212,)


## Step 4: Set up the BERT model

Claim detection is a sentence classification task, so I will base this on the `TFBertForSequenceClassification` class from the Hugging Face TensorFlow implementation of BERT.

In [12]:
bert_trainable = True   # sets whether we can update the bert model during training.

# Pretrained BERT encoder plus a freshly initialised classification head with
# two outputs (checkable / non-checkable).
# TFBertForSequenceClassification is in scope via `from transformers import *`.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    trainable=bert_trainable,
)

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "Bert Model None".
bert_model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None


In [13]:
## Compile the model

# The model was created with num_labels=2, so the classifier head has two
# outputs, and from_logits=True tells the loss to treat them as raw logits.
# Switching to this loss from "binary_crossentropy" seemed to make a huge
# difference.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Reported under the name 'accuracy' (and 'val_accuracy' for validation data).
metric = keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# Adam with a small learning rate for fine-tuning.
optimizer = keras.optimizers.Adam(epsilon=1e-08, learning_rate=2e-5)

bert_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


In [14]:
# Timestamp tag (yymmdd-HHMMSS) shared by the TensorBoard log directory and the
# checkpoint path, so the artefacts of one run can be matched up.
runtag = datetime.datetime.now().strftime("%y%m%d-%H%M%S")

log_dir = './tb_bert/' + runtag
model_save_path = '../models/bert_model/' + runtag

# Checkpoint the full model (save_weights_only=False) each time the validation
# loss reaches a new minimum.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=model_save_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

# Write training logs for TensorBoard.
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

callbacks = [checkpoint_callback, tensorboard_callback]


## Step 5: Train the BERT model and check it against the validation set

In [15]:
# The model takes two aligned inputs per example: token ids first, then the
# attention mask that marks which positions are real tokens rather than padding.
train_inputs = [train_ids, train_attn]
val_inputs = [val_ids, val_attn]

# Fine-tune for six epochs; the callbacks checkpoint the best model (by
# val_loss) and write the TensorBoard logs.
history = bert_model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),
    epochs=6,
    batch_size=32,
    callbacks=callbacks,
)

Epoch 1/6
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


With this initial configuration we already have reasonable numbers, achieving a val_accuracy of 92.6% after only a couple of epochs. The next step is to tune the model's hyperparameters to home in on the best setup.

## References used
BERT Text Classification using Keras https://swatimeena989.medium.com/bert-text-classification-using-keras-903671e0207d#2f06