In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="1"
import re
import json
import string
import numpy as np
import spacy
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import RobertaTokenizer, RobertaTokenizerFast, TFRobertaModel

tf.random.set_seed(1234)
np.random.seed(1234)

seq_len = 450
max_len = 512


In [2]:
train_path = "../data/squad/train-v2.0.json"
eval_path = "../data/squad/dev-v2.0.json"

# Save the slow pretrained tokenizer
slow_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
save_path = "roberta_base/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [3]:
class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers, seq_len, max_len):
        
        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        self.question = str(question)
        self.context = str(context)
        self.start_char_idx = start_char_idx
        self.answer_text = str(answer)
        self.all_answers = all_answers
        self.max_len = max_len
        self.seq_len = seq_len
        self.skip_doc = False
        
        self.input_ids = None
        self.attention_mask = None
        self.token_type_ids = None
        self.start_token_idx = None
        self.end_token_idx = None
        self.skip = None
        
    def __str__(self):
        print(np.stack(self.input_ids).shape)
        print(np.stack(self.token_type_ids).shape)
        print(np.stack(self.attention_mask).shape)
        return "<SquadExample>"
        
    def __repr__(self):
        return repr({"input_ids":self.input_ids, 
                     "token_type_ids":self.token_type_ids, 
                     "attention_mask":self.attention_mask,
                     "start_token_idx":self.start_token_idx,
                     "end_token_idx":self.end_token_idx,
                     "skip":self.skip})

    def preprocess(self):
        context = self.context
        question = self.question
        answer = self.answer_text
        start_char_idx = self.start_char_idx

        # Find end character index of answer in context
        end_char_idx = start_char_idx + len(answer)
        if (end_char_idx >= len(context)) or (start_char_idx < 0):
            self.skip_doc = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1
            
        # Tokenize context
        tokenized_context = tokenizer(context, return_offsets_mapping=True)
        
        context_input_ids = tokenized_context.input_ids
        context_offset_mapping = tokenized_context.offset_mapping
        context_attention_mask = tokenized_context.attention_mask
        
        self.context_input_ids = context_input_ids
        self.context_offset_mapping = context_offset_mapping
        
        # Find tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(context_offset_mapping):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

#         if len(ans_token_idx) == 0:
#             self.skip_doc = True
#             return
        if (len(ans_token_idx) == 0):
            ans_token_idx = [-1]

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]
        
        self.start_token_idx_master = start_token_idx
        self.end_token_idx_master = end_token_idx
        
        # Tokenize question
        tokenized_question = tokenizer("</s> "+question, return_offsets_mapping=True)
        
        ## Crop start and end tokens
        question_input_ids = tokenized_question.input_ids[1:]
        context_input_ids = context_input_ids[1:-1]

        ##
        ## SPLIT UP CONTEXT INTO MULTIPLE QUESTIONS OF max_len
        ##
        
        if seq_len >= len(context_input_ids):
            offsets = [0]
        else:
            ii = 0
            offsets = []
            while (ii+seq_len) <= len(context_input_ids):
                offsets.append(ii)
                ii = ii + round(seq_len/2)
            offsets = offsets + [len(context_input_ids)-seq_len]
        
        list_input_ids = []
        list_start_token_idx = []
        list_end_token_idx = []
        list_attention_mask = []
        list_token_type_ids = []
        list_skip = []
        
        for ii in offsets:
            subcontext_input_ids = [0]+context_input_ids[ii:(ii+seq_len)]+question_input_ids
            subcontext_start_token_idx = start_token_idx - ii
            subcontext_end_token_idx = end_token_idx - ii
            subcontext_padding = [0] * (self.max_len - len(subcontext_input_ids))
            
            subcontext_attention_mask = [1] * len(subcontext_input_ids) + subcontext_padding
            subcontext_token_type_ids = [0] + [0]*len(context_input_ids[ii:(ii+seq_len)]) + [1]*len(question_input_ids) + (np.array(subcontext_padding)+1).tolist()
            subcontext_input_ids = subcontext_input_ids + (np.array(subcontext_padding)+1).tolist()
            
            if (subcontext_start_token_idx >= 0) and (subcontext_end_token_idx < seq_len):
                skip = False
            else:
                subcontext_start_token_idx = 0
                subcontext_end_token_idx = 0
                skip = False
                
            list_input_ids.append(subcontext_input_ids)
            list_attention_mask.append(subcontext_attention_mask)
            list_token_type_ids.append(subcontext_token_type_ids)
            list_start_token_idx.append(subcontext_start_token_idx)
            list_end_token_idx.append(subcontext_end_token_idx)
            list_skip.append(skip)
        
        self.input_ids = (list_input_ids)
        self.attention_mask = (list_attention_mask)
        self.token_type_ids = (list_token_type_ids)
        self.start_token_idx = (list_start_token_idx)
        self.end_token_idx = (list_end_token_idx)
        self.skip = list_skip
        self.example_offset = offsets
        
    def train_examples(self, include_impossible=False):
        for idx, skip_ex in enumerate(self.skip):
            if include_impossible is False:
                if skip_ex is False:
                    yield  {"input_ids":self.input_ids[idx],
                            "token_type_ids":self.token_type_ids[idx],
                            "attention_mask":self.attention_mask[idx],
                            "start_token_idx":self.start_token_idx[idx],
                            "end_token_idx":self.end_token_idx[idx]}
            else:
                yield  {"input_ids":self.input_ids[idx],
                            "token_type_ids":self.token_type_ids[idx],
                            "attention_mask":self.attention_mask[idx],
                            "start_token_idx":self.start_token_idx[idx],
                            "end_token_idx":self.end_token_idx[idx]}
    
    
    def inference_from_onehot(self, pred_start, pred_end):

#         if force_answer == False:
        if (np.max(np.argmax(pred_start, axis=1)) == 0) and (np.max(np.argmax(pred_end, axis=1)) == 0):
            return("", -1,-1,-1,-1)
        
        seq_len = min(self.seq_len, len(self.context_input_ids))
        
        pred_start_matrix = np.zeros((len(self.input_ids), len(self.context_input_ids)))
        pred_end_matrix = np.zeros((len(self.input_ids), len(self.context_input_ids)))
        
        for idx, value in enumerate(pred_start):
            offset = self.example_offset[idx]+1
            pred_start_sub = pred_start[idx][1:seq_len]
            pred_end_sub   = pred_end[idx][1:seq_len]
            pred_start_matrix[idx,(offset):(offset+seq_len-1)] = pred_start_sub
            pred_end_matrix[idx,(offset):(offset+seq_len-1)] = pred_end_sub
            
        highest_prob = np.argmax(np.max(pred_start_matrix, axis=1) + np.max(pred_end_matrix, axis=1))
        
        top_start = np.argmax(pred_start_matrix[highest_prob,:])
        top_end   = np.argmax(pred_end_matrix[highest_prob,:])
        
#         pred_start = np.max(pred_start_matrix, axis=0)
#         pred_end   = np.max(pred_end_matrix, axis=0)
        
#         top_start = np.argmax(pred_start)
#         top_end = np.argmax(pred_end)
        
        start_char = self.context_offset_mapping[top_start][0]
        end_char = self.context_offset_mapping[top_end][1]
        
        return (self.context[start_char:end_char], top_start, top_end, start_char, end_char)
    
                
    def model_inference(self, model):
        pred = model.predict([np.stack(self.input_ids),
                      np.stack(self.attention_mask),
                      np.stack(self.token_type_ids)], batch_size=8)
        pred_start = pred[0]
        pred_end   = pred[1]
        
        return self.inference_from_onehot(pred_start, pred_end)
    
    def fake_inference(self):
        pred_start_mat = []
        pred_end_mat = []
        for idx, val in enumerate(self.start_token_idx):
            pred_start = np.zeros_like(np.array(self.input_ids[idx]))
            pred_end = np.zeros_like(np.array(self.input_ids[idx]))
            if self.skip[idx] == False:
                pred_start[val] = 1.0
                pred_end[self.end_token_idx[idx]] = 1.0
            pred_start_mat.append(pred_start)
            pred_end_mat.append(pred_end)
            
        pred_start_mat = np.array(pred_start_mat)
        pred_end_mat = np.array(pred_end_mat)
        
        return self.inference_from_onehot(pred_start_mat,pred_end_mat)
        

def create_squad_examples(raw_data, seq_len, max_len):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                if len(qa["answers"]) > 0:
                    question = qa["question"]
                    answer_text = qa["answers"][0]["text"]
                    all_answers = [_["text"] for _ in qa["answers"]]
                    start_char_idx = qa["answers"][0]["answer_start"]
                    squad_eg = SquadExample(
                        question, context, start_char_idx, answer_text, all_answers, seq_len, max_len
                    )
                    squad_eg.preprocess()
                    squad_examples.append(squad_eg)
                else:
                    question = qa["question"]
                    answer_text = ""
                    all_answers = [""]
                    start_char_idx = 0
                    squad_eg = SquadExample(
                        question, context, start_char_idx, answer_text, all_answers, seq_len, max_len
                    )
                    squad_eg.preprocess()
                    squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples, include_impossible=False):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if item.skip_doc is False:
            for example in item.train_examples(include_impossible):
                for key in dataset_dict:
                    dataset_dict[key].append(np.array(example[key]))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = (
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"])
    y = (dataset_dict["start_token_idx"], dataset_dict["end_token_idx"])
    return x, y

def merge_squad_results(squad_examples, start_preds, end_preds):
    ii = 0
    tally = []
    for ex in squad_examples:
        if ex.skip_doc is False:
            n_sub = len(ex.skip)
            pred_out = ex.inference_from_onehot(start_preds[ii:(ii+n_sub),:], end_preds[ii:(ii+n_sub),:])[0]
            tally.append(pred_out in ex.all_answers)
            ii = ii + n_sub
    return np.mean(tally)


with open(train_path) as f:
    raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)


print("Here")

eval_squad_examples = create_squad_examples(raw_eval_data, seq_len, max_len)
x_eval, y_eval = create_inputs_targets(eval_squad_examples, include_impossible=True)
print(f"{len(eval_squad_examples)} evaluation points created.")


train_squad_examples = create_squad_examples(raw_train_data, seq_len, max_len)
x_train, y_train = create_inputs_targets(train_squad_examples, include_impossible=False)
print(f"{len(train_squad_examples)} examples. {x_train[0].shape} training points created.")


Here


Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors


11873 evaluation points created.
130319 examples. (130236, 512) training points created.


In [4]:
def create_model():
    ## BERT encoder
    encoder = TFRobertaModel.from_pretrained("roberta-base")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder.roberta(
        input_ids, 
        token_type_ids=token_type_ids, 
        attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, 
                token_type_ids, 
                attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

In [5]:
strategy = tf.distribute.MirroredStrategy()
print("Number of devices: {}".format(strategy.num_replicas_in_sync))

use_tpu = False
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    with strategy.scope():
        model = create_model()

model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Number of devices: 2


Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
roberta (TFRobertaMainLayer)    TFBaseModelOutputWit 124645632   input_1[0][0]                    
______________________________________________________________________________________________

In [6]:
# exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs=3,  # For demonstration, 3 epochs are recommended
    verbose=1,
    batch_size=12,
    validation_split=0.1,
#     callbacks=[exact_match_callback],
)

Epoch 1/3
INFO:tensorflow:batch_all_reduce: 196 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/devi

<tensorflow.python.keras.callbacks.History at 0x7f6a18227780>

In [7]:
model.save("roberta_base_squad2_512in")

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: roberta_base_squad2_512in/assets


In [8]:
eval_preds = model.predict(x_eval, batch_size=36, verbose=True)
merge_squad_results(eval_squad_examples, eval_preds[0], eval_preds[1])



0.736495611073599

In [9]:
train_preds = model.predict(x_train, batch_size=36, verbose=1)
merge_squad_results(train_squad_examples, train_preds[0], train_preds[1])



0.782126177227677