## Imports

In [3]:
import h5py
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from transformers import TFBertModel, TFBertForQuestionAnswering
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from sklearn.metrics import f1_score

## Import Raw SQuAD data

In [2]:
# Open the SQuAD v2 HDF5 files read-only. Forward slashes are used so the
# relative paths resolve on Windows as well as on POSIX systems (the original
# raw-string backslash paths only worked on Windows).
# NOTE(review): both handles stay open because a later cell reads from them.
train_data = h5py.File('../SQuADv2/train_386.h5', 'r')
dev_data = h5py.File('../SQuADv2/dev_386.h5', 'r')

In [14]:
max_seq_length = 386

# Build one random permutation of the training rows and apply it to every
# training array below, so inputs and span labels stay aligned row-for-row.
# The row count is read from the file instead of being hard-coded (131911).
n_train = train_data['input_ids'].shape[0]
indices = np.arange(n_train, dtype = int)
# np.random.shuffle permutes in place and returns None, so there is nothing
# useful to assign from it.
# NOTE(review): no seed is set, so the permutation differs on every run.
np.random.shuffle(indices)

train_ids = np.array(train_data['input_ids'], dtype = np.int32)[indices]
train_masks = np.array(train_data['attention_mask'], dtype = np.int32)[indices]
train_tokens = np.array(train_data['token_type_ids'], dtype = np.int32)[indices]

# The dev arrays are kept in their on-disk order.
dev_ids = np.array(dev_data['input_ids'], dtype = np.int32)
dev_masks = np.array(dev_data['attention_mask'], dtype = np.int32)
dev_tokens = np.array(dev_data['token_type_ids'], dtype = np.int32)

# Span labels, permuted with the same indices as the inputs.
train_input_start = np.array(train_data['input_start'], dtype = np.int32)[indices]
train_input_end = np.array(train_data['input_end'], dtype = np.int32)[indices]

# Dev span labels are currently not loaded:
#dev_input_start = np.array(dev_data['input_start'], dtype = np.int32)
#dev_input_end = np.array(dev_data['input_end'], dtype = np.int32)


## Get BERT model with head

In [6]:
def get_base_bert_model(max_seq_length = 386, model_name = 'bert-large-uncased'):
    """Build a Keras model: pretrained BERT encoder plus a span-prediction head.

    Args:
        max_seq_length: Fixed length of every input sequence. Defaults to 386,
            the value that was previously hard-coded.
        model_name: Checkpoint identifier passed to
            ``TFBertModel.from_pretrained``. Defaults to 'bert-large-uncased'.

    Returns:
        A ``Model`` taking ``[input_ids, input_masks, input_tokens]`` (each
        int32 of shape ``(batch, max_seq_length)``) and returning
        ``[start_logits, end_logits]``, each with one logit per token position.
    """
    bert_layer = TFBertModel.from_pretrained(model_name)

    input_ids = Input((max_seq_length,), dtype = tf.int32, name = 'input_ids')
    input_masks = Input((max_seq_length,), dtype = tf.int32, name = 'input_masks')
    input_tokens = Input((max_seq_length,), dtype = tf.int32, name = 'input_tokens')

    # Index 0 is the per-token sequence output (index 1 would be the pooled
    # output); the head needs one vector per token.
    sequence_output = bert_layer([input_ids, input_masks, input_tokens])[0]

    # One Dense layer yields two values per token; split them into the start
    # and end logits and drop the trailing size-1 axis from each.
    logits = Dense(2)(sequence_output)
    start_logits, end_logits = tf.split(logits, 2, axis=-1)
    start_logits = tf.squeeze(start_logits, axis=-1)
    end_logits = tf.squeeze(end_logits, axis=-1)

    model = Model(inputs = [input_ids, input_masks, input_tokens], outputs = [start_logits, end_logits])
    return model

In [7]:
# Build the span-detection model; this loads the pretrained
# 'bert-large-uncased' encoder via TFBertModel.from_pretrained.
bert_base = get_base_bert_model()

In [8]:
# Print the layer summary to check input shapes and parameter counts.
bert_base.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 386)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 386)]        0                                            
__________________________________________________________________________________________________
input_tokens (InputLayer)       [(None, 386)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 386, 1024),  335141888   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [9]:
# Adam optimizer with a 1e-5 learning rate.
opt = keras.optimizers.Adam(1e-5)

# The model emits raw logits, hence from_logits=True. The same loss object is
# applied to both outputs (start logits and end logits).
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits = True)

bert_base.compile(
    optimizer = opt,
    loss = [loss_fn] * 2,
    metrics = ['accuracy'],
)

## Partial epoch fine-tuning for the first epoch

In [31]:
# Run the first epoch in 10 consecutive slices of `indices`, saving the
# weights after each slice so intermediate checkpoints are available.
n_parts = 10
n_rows = len(indices)
for part in range(n_parts):
    lo = part * n_rows // n_parts
    hi = (part + 1) * n_rows // n_parts  # equals n_rows on the last part
    idx = indices[lo:hi]

    part_inputs = [train_ids[idx], train_masks[idx], train_tokens[idx]]
    part_labels = [train_input_start[idx], train_input_end[idx]]
    # Validation is disabled; it was previously sketched as:
    #validation_data=[[dev_ids, dev_masks],
    #[dev_input_start, dev_input_end]],
    bert_base.fit(part_inputs,
                  part_labels,
                  epochs = 1,
                  batch_size = 4,
                  shuffle = True)
    bert_base.save_weights('bert_squadv2_span_detection_weights_epoch_0_first_%i.h5' % part)
              



## Fine-tuning for 6 full epochs

In [15]:
# Train one epoch at a time so a checkpoint can be written after every epoch.
full_inputs = [train_ids, train_masks, train_tokens]
full_labels = [train_input_start, train_input_end]
for epoch in range(6):
    # Validation is disabled; it was previously sketched as:
    #validation_data=[[dev_ids, dev_masks],
    #[dev_input_start, dev_input_end]],
    bert_base.fit(full_inputs,
                  full_labels,
                  epochs = 1,
                  batch_size = 4)
    bert_base.save_weights('bert_squadv2_span_detection_weights_epoch_%d.h5' % epoch)
              



##  Extracting the embeddings

In [4]:
from transformers.data.processors.squad import SquadV2Processor
from transformers.data.processors.squad import squad_convert_examples_to_features
from transformers import BertConfig
from BERTVision.utils.evaluation import Squad2Config

class UntrainedBertSquad2Faster(object):
    """BERT encoder without a QA head that exposes its hidden states.

    Attributes:
        weights: Path of the weights file loaded into the encoder.
        tokenizer: Tokenizer taken from the supplied configuration.
        named_model: Checkpoint identifier taken from the configuration.
        model: Keras ``Model`` built by ``bert_large_uncased_for_squad2``.
    """

    def __init__(self, weights,
                 config = None):
        """Store the settings and build the Keras model.

        Args:
            weights: Path to the encoder weights file to load.
            config: Object providing ``tokenizer``, ``named_model`` and
                ``max_seq_length``. When omitted, a fresh ``Squad2Config()``
                is created.
        """
        # Create the default lazily: a `Squad2Config()` default argument is
        # evaluated once at class-definition time and then shared by every
        # instance.
        if config is None:
            config = Squad2Config()
        self.weights = weights
        self.tokenizer = config.tokenizer
        self.named_model = config.named_model
        self.model = self.bert_large_uncased_for_squad2(config.max_seq_length)

    def bert_large_uncased_for_squad2(self, max_seq_length):
        """Build a model returning BERT's hidden states and sequence output.

        Args:
            max_seq_length: Fixed length of every input sequence.

        Returns:
            A ``Model`` taking ``[input_ids, input_masks, input_tokens]`` and
            returning ``[embeddings, outputs]``: the encoder's hidden states
            followed by its final sequence output.
        """
        input_ids = Input((max_seq_length,), dtype = tf.int32, name = 'input_ids')
        input_masks = Input((max_seq_length,), dtype = tf.int32, name = 'input_masks')
        input_tokens = Input((max_seq_length,), dtype = tf.int32, name = 'input_tokens')

        # Load the architecture config from the same checkpoint name as the
        # weights so the two cannot disagree; hidden states must be enabled
        # because they are this model's main output.
        config = BertConfig.from_pretrained(self.named_model, output_hidden_states=True)
        bert_layer = TFBertModel.from_pretrained(self.named_model, config = config)
        bert_layer.load_weights(self.weights)

        # Positional indexing mirrors get_base_bert_model: 0 = sequence
        # output, 1 = pooled output (unused), 2 = hidden states.
        # NOTE(review): index 2 assumes the pooled output is present — confirm
        # against the installed transformers version.
        bert_outputs = bert_layer([input_ids, input_masks, input_tokens])
        outputs = bert_outputs[0]
        embeddings = bert_outputs[2]

        model = Model(inputs = [input_ids, input_masks, input_tokens], outputs = [embeddings, outputs])
        return model

# Weights for the encoder only, without the question-answering head.
# The BERT weight files are too large to store on GitHub, so this .h5 file
# has to be present in the working directory.
model = UntrainedBertSquad2Faster('bert_squadv2_span_detection_weights_epoch_1_BERT_ONLY.h5')

In [5]:
# Load the training arrays in their on-disk (unshuffled) order.
# The `with` block closes the file even if one of the reads raises.
with h5py.File('../SQuADv2/train_386.h5', 'r') as train_data:
    train_ids = np.array(train_data['input_ids'], dtype = np.int32)
    train_masks = np.array(train_data['attention_mask'], dtype = np.int32)
    train_tokens = np.array(train_data['token_type_ids'], dtype = np.int32)
    train_input_start = np.array(train_data['input_start'], dtype = np.int32)
    train_input_end = np.array(train_data['input_end'], dtype = np.int32)

### Write the hidden states to disk in batches of 8 examples

In [None]:
# Training data, no padding of the final batch: one HDF5 file per batch of 8.
data_dir = '../data/train_bert_1_epoch_fine_tuned_full386/'
def write_file(directory, idx, embeddings):
    """Write one batch of hidden states to ``<directory><idx>.h5``.

    Args:
        directory: Output directory prefix; it is concatenated directly with
            the file name, so it must end with a path separator.
        idx: Value used as the file name (the caller passes the index of the
            first example in the batch).
        embeddings: Array stored under the 'hidden_state_activations' dataset.
    """
    with h5py.File(directory + str(idx) + '.h5', 'w') as f:
        f.create_dataset('hidden_state_activations', data = embeddings)

# Reusable buffer: (batch, sequence position, hidden size, hidden-state index).
embeddings = np.zeros((8,386,1024,25), dtype = np.float16)
# NOTE(review): the range starts at 12697, presumably to resume an interrupted
# run — confirm before re-running from scratch.
for i in range(12697,16489):
    batch = slice(i*8, (i+1)*8)
    e, _ = model.model.predict([train_ids[batch], train_masks[batch], train_tokens[batch]])

    # The final batch can hold fewer than 8 rows. Fill and write only the rows
    # actually returned; assigning a short batch to the full 8-row buffer
    # raises a broadcasting error.
    n_rows = e[0].shape[0]
    for j in range(25):
        embeddings[:n_rows, :, :, j] = e[j]

    write_file(data_dir, i*8, embeddings[:n_rows])
    if not i%1000:
        print(i)