# Set up

In [1]:
%%capture
!pip install transformers

In [2]:
import os, re, json, requests, io, string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig
import pandas as pd
import numpy as np
import pickle
import random

MAX_LEN = 384
random.seed(42)
configuration = BertConfig()

In [3]:
# Load the Drive helper and mount -> we could use this to save the weights of the models
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)

Mounted at /content/gdrive


# Set up tokenizer

In [4]:
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

# Load data

In [12]:

url = "https://raw.githubusercontent.com/amrlnic/SQuAD/main/data/training_set.json" # Make sure the url is the raw version of the file on GitHub
download = requests.get(url).content
data = json.loads(download)

def load_dataset(file, record_path = ['data', 'paragraphs', 'qas', 'answers'], verbose = True):

  """
  parse the SQUAD dataset into a dataframe
  """

  if verbose:
      print("Reading the json file")

  if verbose:
      print("[INFO] processing...")

  # parsing different level's in the json file
  js = pd.json_normalize(file , record_path )
  m = pd.json_normalize(file, record_path[:-1] )
  r = pd.json_normalize(file, record_path[:-2])
  t = pd.json_normalize(file, record_path[0])

  title = pd.json_normalize(file['data'], record_path = ['paragraphs'], meta = 'title')

  #combining it into single dataframe
  idx = np.repeat(r['context'].values, r.qas.str.len())
  ndx  = np.repeat(m['id'].values, m['answers'].str.len())
  m['context'] = idx
  m['title'] = np.repeat(title['title'].values, r.qas.str.len())
  js['q_idx'] = ndx
  main = pd.concat([ m[['id','question','context', 'title']].set_index('id'), js.set_index('q_idx')], 1, sort = False).reset_index()
  main['c_id'] = main['context'].factorize()[0]
  if verbose:
      print(f"[INFO] there are {main.shape[0]} questions with single answer")
      print(f"[INFO] there are {main.groupby('c_id').sum().shape[0]} different contexts")
      print(f"[INFO] there are {len(t)} unrelated subjects")
      print("[INFO] Done")
  return main

squad_dataset = load_dataset(data)

Reading the json file
[INFO] processing...
[INFO] there are 87599 questions with single answer
[INFO] there are 18891 different contexts
[INFO] there are 442 unrelated subjects
[INFO] Done


In [None]:
squad_dataset.head()

Unnamed: 0,index,question,context,title,answer_start,text,c_id
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,515,Saint Bernadette Soubirous,0
1,5733be284776f4190066117f,What is in front of the Notre Dame Main Building?,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,188,a copper statue of Christ,0
2,5733be284776f41900661180,The Basilica of the Sacred heart at Notre Dame...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,279,the Main Building,0
3,5733be284776f41900661181,What is the Grotto at Notre Dame?,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,381,a Marian place of prayer and reflection,0
4,5733be284776f4190066117e,What sits on top of the Main Building at Notre...,"Architecturally, the school has a Catholic cha...",University_of_Notre_Dame,92,a golden statue of the Virgin Mary,0


# Pre - processing

In [5]:
SAMPLES = squad_dataset.shape[0]

def preprocess_sentence(text):

  """
  lowercase and strip the given text
  """

  text = text.lower()
  text = text.strip()
  return text

def clean_dataset(dataset):

  """
  preprocess the dataset
  """

  _dataset = dataset.copy()

  cleaned_questions = _dataset['question'].apply(preprocess_sentence)
  cleaned_texts = _dataset['text'].apply(preprocess_sentence)

  # we process only different contexts and then we duplicate them
  unique_context = pd.Series(_dataset['context'].unique())
  count_c = _dataset.groupby('c_id').count()['text']
  cleaned_contexts = unique_context.apply(preprocess_sentence)

  _dataset['question'] = cleaned_questions
  _dataset['text'] = cleaned_texts
  _dataset['context'] = pd.Series(np.repeat(cleaned_contexts, count_c).tolist())

  return _dataset

In [13]:
squad_dataset = clean_dataset(squad_dataset)

# Split

In [7]:
def split(dataset, train_size = 0.8):

  """
  split the dataset in two part: the training and the validation
  """

  # find unique titles
  titles = squad_dataset['title']
  unique_titles = titles.unique()


  n_titles = len(unique_titles)
  titles_seq = list(range(n_titles))

  train_len = int(n_titles*train_size)

  # sample train indexes
  train_ind = random.sample(titles_seq, train_len)
  test_ind = list(set(titles_seq) - set(train_ind))

  train_titles = unique_titles[train_ind]
  test_titles = unique_titles[test_ind]

  squad_columns = list(squad_dataset.columns)

  # initialize empty train and test df
  train_data = pd.DataFrame(columns = squad_columns)
  test_data = pd.DataFrame(columns = squad_columns)

  for train_title in train_titles:

    train_section = squad_dataset[squad_dataset['title'] == train_title]
    train_data = train_data.append(train_section)

  for test_title in test_titles:

    test_section = squad_dataset[squad_dataset['title'] == test_title]
    test_data = test_data.append(test_section)


  return train_data, test_data

In [14]:
tr_df, vl_df = split(squad_dataset)
tr_df.shape[0],vl_df.shape[0]

(69129, 18470)

# Filter rows

In [9]:
def skip(row):

  """
  Create the input sequences and find the rows that we have to skip
  """

  answer = row['text']
  context = row['context']
  start_char_idx = row['answer_start']
  question = row['question']

  # initialize skip column
  row['skip'] = False


  # Find end character index of answer in context
  end_char_idx = start_char_idx + len(answer)
  if end_char_idx >= len(context):
    row['skip'] = True
    return row

  # Mark the character indexes in context that are in answer
  is_char_in_ans = [0] * len(context)
  for idx in range(start_char_idx, end_char_idx):
    is_char_in_ans[idx] = 1

  # Tokenize context
  tokenized_context = tokenizer.encode(context)
  row['tokenized context'] = tokenized_context

  # Find tokens that were created from answer characters
  ans_token_idx = []
  for idx, (start, end) in enumerate(tokenized_context.offsets):
    if sum(is_char_in_ans[start:end]) > 0:
      ans_token_idx.append(idx)

  if len(ans_token_idx) == 0:
    row['skip'] = True
    return row

  # Find start and end token index for tokens from answer
  start_token_idx = ans_token_idx[0]
  end_token_idx = ans_token_idx[-1]

  row['start token idx'] = start_token_idx
  row['end token idx'] = end_token_idx

  # Tokenize question
  tokenized_question = tokenizer.encode(question)
  row['tokenized question'] = tokenized_question

  # Inputs of the model: here are used to determine whether to skip the row or not
  input_ids = tokenized_context.ids + tokenized_question.ids[1:]
  token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
  attention_mask = [1] * len(input_ids)

  padding_length = MAX_LEN - len(input_ids)

  if padding_length > 0:  # pad
    input_ids = input_ids + ([0] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)
  elif padding_length < 0:
    row['skip'] = True
  
  row['input ids'] = np.array(input_ids)
  row['token type ids'] = np.array(token_type_ids)
  row['attention mask'] = np.array(attention_mask)

  return row


In [15]:
# takes a while
tr_df = tr_df.apply(skip, axis = 1)
vl_df = vl_df.apply(skip, axis = 1)

len(tr_df[tr_df['skip']]), len(vl_df[vl_df['skip']])

(1031, 421)

In [16]:
# we get rid of samples where the answer doesn't match the context

tr_df = tr_df[tr_df['skip'] == False]
vl_df = vl_df[vl_df['skip'] == False]

len(tr_df), len(vl_df)

(68098, 18049)

# Save datasets as json

In [76]:
def df_to_json(df, path):

  """
  parse the given dataframe into the SQUAD json format
  """
  
  data = []

  for title, articles in df.groupby('title'):
    chapter = {'title': title}
    paragraphs = []
    for context, contents in articles.groupby('context'):
      paragraph = {'context': context}
      qas = []
      for i, content in contents.iterrows():
        qa = {'answers': [{'answer_start': content['answer_start'], 'text': content['text']}], 'question': content['question'], 'id': content['index']}
        qas.append(qa)
      paragraph.update({'qas': qas})
      paragraphs.append(paragraph)
    chapter.update({'paragraphs': paragraphs})
    data.append(chapter)
  raw_data = {'data': data}

  with open(path, 'w') as handle:
    json.dump(raw_data, handle)

  print(f'dataset saved in {path}')

In [77]:
# save datasets in json format
path_to_train_set = os.path.join(os.getcwd(), 'BERT_train_set.json')
df_to_json(tr_df, path_to_train_set)

path_to_valid_set = os.path.join(os.getcwd(), 'BERT_valid_set.json')
df_to_json(vl_df, path_to_valid_set)

dataset saved in /content/BERT_train_set.json
dataset saved in /content/BERT_valid_set.json


# Define input and output

In [8]:
train_path = "/content/gdrive/My Drive/Colab Notebooks/SQUAD_project/train_df"
val_path = "/content/gdrive/My Drive/Colab Notebooks/SQUAD_project/val_df"

In [6]:
# Save dataframes on drive 

pickle.dump( tr_df, open(train_path, "wb" ) )
pickle.dump( vl_df, open(val_path, "wb" ) )

In [10]:
# Load dataframes

tr_df = pickle.load( open(train_path, "rb" ) )  
vl_df = pickle.load( open(val_path, "rb" ) )  

In [16]:
def create_inputs_targets(squad_examples):

  '''
  Function to create inputs for the model

  squad_examples (df)
  '''

  dataset_dict = {
      "input ids": [],
      "token type ids": [],
      "attention mask": [],
      "start token idx": [],
      "end token idx": [],
  }

  n_items = len(squad_examples)
  for i in range(n_items):
    item = squad_examples.iloc[i]

    for key in dataset_dict:
      dataset_dict[key].append(getattr(item, key))

  for key in dataset_dict:
    dataset_dict[key] = np.array(dataset_dict[key])

  x = [
       dataset_dict["input ids"],
       dataset_dict["token type ids"],
       dataset_dict["attention mask"],
      ]
  y = [dataset_dict["start token idx"], dataset_dict["end token idx"]]
  return x, y

In [17]:
x_train, y_train = create_inputs_targets(tr_df)
x_eval, y_eval = create_inputs_targets(vl_df)

# Model

In [23]:
def create_model(enc_dec = True, enc_dim = 128, dec_dim = 64, 
                 rec_mod = 'biLSTM', bert_ft = True, 
                 dropout = False, drop_prob = 0.5):

    """ 
    Returns a keras model for predicting the start and the end of the answer

    enc_dec (boolean): whether to use the encoder decoder model or not. If False, the base model will be used
    enc_dim (int): encoding dimension
    dec_dim (int): decoding dimension
    rec_mod (string): type of recurrent modules // 'biLSTM' or 'GRU'
    bert_ft (boolean): whether or not the bert will be fine - tuned
    dropout (boolean): whether or not using the dropout
    drop_prob (double): dropout probability
    """

    # use pre - trained BERT for creating the embeddings
    bert_model = TFBertModel.from_pretrained("bert-base-uncased")
    if not bert_ft:
      for layer in bert_model.layers:
        layer.trainable = False

    # input
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    embeddings = bert_model(
        input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask
    )[0]


    if enc_dec: # model with encoder - decoder


      if rec_mod == 'biLSTM':

        encoder = layers.Bidirectional(layers.LSTM(enc_dim, return_sequences = True), 
                                          merge_mode = 'concat')(embeddings)

        decoder = layers.Bidirectional(layers.LSTM(dec_dim, return_sequences = True), 
                                                      merge_mode = 'concat')(encoder)

        high_dim = dec_dim*2 # number of units of the dense layers of the highway network

    
      else:

        encoder = layers.GRU(enc_dim, return_sequences = True)(embeddings)

        decoder = layers.GRU(dec_dim, return_sequences = True)(encoder)

        high_dim = dec_dim


      # highway network
      x_proj = layers.Dense(units = high_dim, activation = 'relu')(decoder)
      x_gate = layers.Dense(units = high_dim, activation = 'sigmoid')(decoder)

      x = (x_proj * x_gate) + (1 - x_gate) * decoder
    

    else: # base model

      x = embeddings

    # dropout
    if dropout:
      x = layers.Dropout(drop_prob)(x)

    # output

    start_logits = layers.Dense(1, use_bias = False)(x)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, use_bias = False)(x)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)


    model = keras.Model(
        inputs = [input_ids, token_type_ids, attention_mask],
        outputs = [start_probs, end_probs]
    )

    loss = keras.losses.SparseCategoricalCrossentropy(from_logits = False)
    optimizer = keras.optimizers.Adam(lr = 5e-5)
    model.compile(optimizer = optimizer, loss = [loss, loss])

    return model

In [None]:
rec_mod = 'GRU'
ft = True
dropout = True
drop_prob = 0.5


use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        #model = create_model(enc_dec = False, dropout = True, drop_prob = 0.5)
        model = create_model(enc_dec = True, rec_mod = rec_mod, bert_ft = ft, 
                              dropout = dropout, drop_prob = drop_prob)
else:
    model = create_model(enc_dec = True, rec_mod = rec_mod, bert_ft = ft, 
                          dropout = dropout, drop_prob = drop_prob)

In [None]:
model.summary()

## Create evaluation Callback

This callback will compute the exact match score using the validation data
after every epoch.


In [26]:
def normalize_text(text):
    text = text.lower()

    # Remove punctuations
    exclude = set(string.punctuation)
    text = "".join(ch for ch in text if ch not in exclude)

    # Remove articles
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    text = re.sub(regex, " ", text)

    # Remove extra white space
    text = " ".join(text.split())
    return text

In [27]:

class ExactMatch(keras.callbacks.Callback):
    """
    Each `SquadExample` object contains the character level offsets for each token
    in its input paragraph. We use them to get back the span of text corresponding
    to the tokens between our predicted start and end tokens.
    All the ground-truth answers are also present in each `SquadExample` object.
    We calculate the percentage of data points where the span of text obtained
    from model predictions matches one of the ground-truth answers.
    """

    def __init__(self, x_eval, y_eval):
        self.x_eval = x_eval
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs=None):
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0

        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
          squad_eg = vl_df.iloc[idx]
          offsets = squad_eg['tokenized context'].offsets
          start = np.argmax(start)
          end = np.argmax(end)
          if start >= len(offsets):
              continue
          pred_char_start = offsets[start][0]
          if end < len(offsets):
            pred_char_end = offsets[end][1]
            pred_ans = squad_eg.context[pred_char_start:pred_char_end]
          else:
            pred_ans = squad_eg.context[pred_char_start:]

          normalized_pred_ans = normalize_text(pred_ans)
          normalized_true_ans = normalize_text(squad_eg['text'])
          #normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers]
          #if normalized_pred_ans in normalized_true_ans:
          if normalized_pred_ans == normalized_true_ans:
                count += 1
        acc = count / len(self.y_eval[0])
        print(f"\nepoch = {epoch+1}, exact match score = {acc:.2f}")



## Training

In [35]:
# weights path
filepath = '/content/gdrive/My Drive/Colab Notebooks/bert_encDec_weights.h5'

# checkpoint callback 
checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath = filepath,
        save_weights_only = True,
        )

exact_match_callback = ExactMatch(x_eval, y_eval)
model.fit(
    x_train,
    y_train,
    epochs = 1,
    verbose = 1,
    batch_size = 256,
    callbacks = [exact_match_callback, checkpoint],
)







































epoch = 1, exact match score = 0.62


<tensorflow.python.keras.callbacks.History at 0x7f747373b690>

## Evaluation

In [36]:
! git clone https://github.com/amrlnic/SQuAD.git

fatal: destination path 'SQuAD' already exists and is not an empty directory.


In [37]:
predictions = model.predict(x_eval) 

predictions2 = {}
for i in range(len(predictions[0])):
  start=np.argmax(predictions[0][i])
  end=np.argmax(predictions[1][i])
  tokenized_answer = x_eval[0][i:i+1][0][start:end+1]

  decoded = tokenizer.decode(tokenized_answer)

  predictions2[vl_df.iloc[i]['index']] = decoded

##### Save model predictions on val set as a .JSON file  #####

import json

with open('pred.json', 'w') as fp:
    json.dump(predictions2, fp)

In [38]:
!python3 SQuAD/evaluation/evaluate.py SQuAD/BERT/BERT_valid_set.json pred.json

{
  "exact": 55.482298188265275,
  "f1": 70.70630423648977,
  "total": 18049,
  "HasAns_exact": 55.482298188265275,
  "HasAns_f1": 70.70630423648977,
  "HasAns_total": 18049
}
