# Imports

In [1]:
import ast
import pickle
import time

import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from sklearn.metrics import f1_score, accuracy_score
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Flatten, Activation, GlobalMaxPool1D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import multi_gpu_model, to_categorical
from tqdm.notebook import tqdm, tnrange
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import TFBertModel, TFBertForQuestionAnswering
from transformers.data.processors.squad import SquadV2Processor
from transformers.data.processors.squad import squad_convert_examples_to_features

## Build Model from Weights

In [2]:
# Request the hidden states as an extra encoder output, then instantiate the
# encoder layer from the same checkpoint name using that configuration.
config = BertConfig.from_pretrained(
    "bert-large-uncased",
    output_hidden_states = True,
)
bert_layer = TFBertModel.from_pretrained(
    'bert-large-uncased',
    config = config,
)

In [3]:
def get_base_bert_model(bert_layer, max_seq_length = 386):
    """Wrap a BERT encoder in a Keras model with a 2-unit classification head.

    Parameters
    ----------
    bert_layer : callable
        Encoder layer, called once on ``[input_ids, input_masks, input_tokens]``.
        It must return exactly three values; the first is discarded, the second
        (``cls_output``) feeds the dense head, and the third (``embeddings``) is
        passed through as a model output.
        NOTE(review): presumably a ``TFBertModel`` configured with
        ``output_hidden_states=True`` -- confirm against the caller.
    max_seq_length : int, optional
        Length of each of the three int32 input sequences. Defaults to 386,
        the value that used to be hard-coded here.

    Returns
    -------
    Model
        Keras model named ``'BERT_SQuADv2_BinaryClassification'`` whose inputs
        are ``[input_ids, input_masks, input_tokens]`` and whose outputs are
        ``[dense_2 output, cls_output, embeddings]``.
    """
    input_shape = (max_seq_length,)

    input_ids = Input(input_shape, dtype = tf.int32, name = 'input_ids')
    input_masks = Input(input_shape, dtype = tf.int32, name = 'input_masks')
    input_tokens = Input(input_shape, dtype = tf.int32, name = 'input_tokens')

    # The encoder's first return value is not used; only the second
    # (cls_output) and third (embeddings) are kept.
    _, cls_output, embeddings = bert_layer([input_ids, input_masks, input_tokens])

    # Linear 2-unit head (no activation). The layer name is kept stable in case
    # saved weights are matched against it -- TODO confirm.
    x = Dense(2, name = 'dense_2', kernel_initializer = 'he_normal')(cls_output)

    model = Model(inputs = [input_ids, input_masks, input_tokens],
                  outputs = [x, cls_output, embeddings],
                  name = 'BERT_SQuADv2_BinaryClassification')

    return model

In [4]:
# Build the classification model around the encoder layer, then restore the
# saved weights from the checkpoint file into it.
bert_base = get_base_bert_model(bert_layer)
bert_base.load_weights(
    './weights/bert_squadv2_binary_classification_weights_epoch_3.h5'
)

## Load Training & Dev Data

In [5]:
# Open the train and dev HDF5 files read-only. The handles are intentionally
# left open: the next cell reads datasets out of them.
train_data = h5py.File(r'C:/w266/cris/BERTVision/data/squad_train.h5', 'r')
dev_data = h5py.File(r'C:/w266/cris/BERTVision/data/squad_dev.h5', 'r')

# The first line of indices.txt is parsed as a Python literal and used to
# index the training arrays. ast.literal_eval accepts literals only, so unlike
# eval() it cannot execute arbitrary code from the file; the `with` block
# closes the file handle once the line has been read.
with open('indices.txt', 'r') as indices_file:
    indices = np.array(ast.literal_eval(indices_file.readline()))

In [6]:
max_seq_length = 386

# Training inputs: read each dataset fully as int32, then keep the rows
# selected by `indices`.
train_ids, train_masks, train_tokens = (
    np.array(train_data[key], dtype = np.int32)[indices]
    for key in ('input_ids', 'attention_mask', 'token_type_ids')
)

# Dev inputs: same datasets, no row selection.
dev_ids, dev_masks, dev_tokens = (
    np.array(dev_data[key], dtype = np.int32)
    for key in ('input_ids', 'attention_mask', 'token_type_ids')
)

# Start / end values for the selected training rows.
train_input_start, train_input_end = (
    np.array(train_data[key], dtype = np.int32)[indices]
    for key in ('input_start', 'input_end')
)

# Label is 1 where start + end is not positive, 0 otherwise; the labels are
# then one-hot encoded and stored as uint8.
answer_no_answer = to_categorical(
    np.where(train_input_start + train_input_end <= 0, 1, 0)
).astype(np.uint8)

In [7]:
# Read the dev examples from the data directory.
processor = SquadV2Processor()
data_raw = processor.get_dev_examples("C:/w266/cris/BERTVision/data")

# Map every question id to its is_impossible flag, stored as uint8.
dev_answers = {
    example.qas_id: np.uint8(example.is_impossible)
    for example in data_raw
}

# Convert the examples to features with the tokenizer for the same checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
dd_raw = squad_convert_examples_to_features(
    examples = data_raw,
    tokenizer = tokenizer,
    max_seq_length = 386,
    doc_stride = 128,
    max_query_length = 64,
    is_training = False,
)

# Question id of each generated feature, in feature order.
dev_predict_qasids = [feature.qas_id for feature in dd_raw]

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:04<00:00,  7.17it/s]
convert squad examples to features: 100%|███████████████████████████████████████| 11873/11873 [01:47<00:00, 110.20it/s]
add example index and unique id: 100%|███████████████████████████████████████| 11873/11873 [00:00<00:00, 379851.96it/s]


## Generate Embeddings for Dev

In [9]:
# One (1024, 26) float16 slab per dev row: column 0 holds cls_output and the
# remaining columns hold position 0 of each array in `embeddings`.
data = np.zeros((len(dev_ids), 1024, 26), dtype = np.float16)

for i in tqdm(range(len(dev_ids))):
    # Indexing with [[i]] keeps a leading axis of length 1 on each input.
    _, cls_output, embeddings = bert_base.predict([dev_ids[[i]], dev_masks[[i]], dev_tokens[[i]]])

    # Slice position 0 out of every layer *before* stacking. This yields the
    # same values as stacking all layers, transposing to (1, 2, 3, 0) and then
    # taking [:, 0], but avoids copying every sequence position of every layer.
    first_token_states = np.stack([np.asarray(layer)[:, 0, :] for layer in embeddings], axis = -1)

    # The float32 result is cast to float16 on assignment into `data`.
    data[i] = np.concatenate([np.expand_dims(cls_output, axis = 2), first_token_states], axis = 2)

with open('./data/dev_embeddings_3_epoch.pkl', 'wb') as handle:
    pickle.dump(data, handle, protocol = pickle.HIGHEST_PROTOCOL)

print(f"dev embeddings saved to './data/dev_embeddings_3_epoch.pkl' with shape: {data.shape}")

HBox(children=(FloatProgress(value=0.0, max=12227.0), HTML(value='')))


dev embeddings saved to './data/dev_embeddings_3_epoch.pkl' with shape: (12227, 1024, 26)


## Generate Embeddings for Train

In [8]:
# One (1024, 26) float16 slab per training row: column 0 holds cls_output and
# the remaining columns hold position 0 of each array in `embeddings`.
# Stored under its own name so the open HDF5 handle `train_data` is not
# overwritten; rebinding it made the earlier extraction cell fail on re-run.
train_embeddings = np.zeros((len(train_ids), 1024, 26), dtype = np.float16)

for i in tqdm(range(len(train_ids))):
    # Indexing with [[i]] keeps a leading axis of length 1 on each input.
    _, cls_output, embeddings = bert_base.predict([train_ids[[i]], train_masks[[i]], train_tokens[[i]]])

    # Slice position 0 out of every layer *before* stacking. This yields the
    # same values as stacking all layers, transposing to (1, 2, 3, 0) and then
    # taking [:, 0], but avoids copying every sequence position of every layer.
    first_token_states = np.stack([np.asarray(layer)[:, 0, :] for layer in embeddings], axis = -1)

    # The float32 result is cast to float16 on assignment into the array.
    train_embeddings[i] = np.concatenate([np.expand_dims(cls_output, axis = 2), first_token_states], axis = 2)

with open('./data/train_embeddings_3_epoch.pkl', 'wb') as handle:
    pickle.dump(train_embeddings, handle, protocol = pickle.HIGHEST_PROTOCOL)

print(f"train embeddings saved to './data/train_embeddings_3_epoch.pkl' with shape: {train_embeddings.shape}")

HBox(children=(FloatProgress(value=0.0, max=131911.0), HTML(value='')))


train embeddings saved to './data/train_embeddings_3_epoch.pkl' with shape: (131911, 1024, 26)
