In [None]:
import requests
import json
import torch
import os
from tqdm import tqdm

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Connecting Google Drive in order to save the model**

In [None]:
# Create the Drive folder that will hold the fine-tuned model.
# makedirs(..., exist_ok=True) is idempotent (safe under Restart & Run All),
# creates missing parent folders and avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs('/content/drive/MyDrive/BERT-SQuAD', exist_ok=True)

In [None]:
!pip install transformers



In [None]:
# Load the training dataset and take a look at it
with open('train.json', 'rb') as f:
  squad = json.load(f)

In [None]:
# Inspect the keys of the first entry of 'data'. In this dataset each entry
# holds only 'paragraphs' (there is no 'title' key, unlike the original SQuAD files).
squad['data'][0].keys()

dict_keys(['paragraphs'])

### **Get data 📁**

Now that we have a feel for the JSON file format, let's extract our data and store it in some data structures.

In [None]:
def read_data(path):
  """Flatten a SQuAD-style JSON file into three parallel lists.

  Args:
    path: location of a JSON file laid out as
      data -> paragraphs -> (context, qas -> (question, answers)).

  Returns:
    A tuple (contexts, questions, answers). One entry is produced per
    answer, so a context/question is repeated for every answer it has and
    questions without answers contribute nothing.
  """
  with open(path, 'rb') as handle:
    dataset = json.load(handle)

  def _triples():
    # Walk the nested structure and yield one (context, question, answer)
    # triple per answer, in file order.
    for entry in dataset['data']:
      for paragraph in entry['paragraphs']:
        text = paragraph['context']
        for item in paragraph['qas']:
          prompt = item['question']
          for reply in item['answers']:
            yield text, prompt, reply

  rows = list(_triples())
  contexts = [row[0] for row in rows]
  questions = [row[1] for row in rows]
  answers = [row[2] for row in rows]
  return contexts, questions, answers

Put the contexts, questions and answers for training and validation into the appropriate lists.

In [None]:
# Flatten both JSON files into parallel lists (one entry per answer).
# Note: 'test.json' is used as the validation split in the rest of the notebook.
train_contexts, train_questions, train_answers = read_data('train.json')
valid_contexts, valid_questions, valid_answers = read_data('test.json')

As you can see above, the answers are dictionaries with the answer text and an integer which indicates the character index at which the answer starts in the context. As SQuAD does not give us the end index of the answer in the context, we have to find it ourselves. So, let's get the character position at which the answer ends in the passage. Note that sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.

In [None]:
def add_end_idx(answers, contexts):
  """Add the exclusive character end index 'answer_end' to every answer, in place.

  The labelled 'answer_start' is sometimes one or two characters too far to
  the right, so the span is also tried shifted left by 1 and by 2 characters.
  When a shifted span matches, 'answer_start' is corrected as well.

  Args:
    answers: list of dicts with keys 'text' and 'answer_start'; each dict is
      mutated to gain 'answer_end' (and possibly a corrected 'answer_start').
    contexts: list of context strings, aligned with `answers`.

  Returns:
    None. The dicts in `answers` are updated in place.
  """
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    end_idx = start_idx + len(gold_text)

    # Only try shifts that keep the start index non-negative: a negative
    # slice start would wrap around to the end of the context and could
    # produce a bogus match with negative indices.
    for shift in range(min(2, start_idx) + 1):
      if context[start_idx - shift:end_idx - shift] == gold_text:
        answer['answer_start'] = start_idx - shift
        answer['answer_end'] = end_idx - shift
        break
    else:
      # No alignment found: keep the labelled span so that 'answer_end' is
      # always present and later steps do not fail with a KeyError.
      answer['answer_end'] = end_idx

add_end_idx(train_answers, train_contexts)
add_end_idx(valid_answers, valid_contexts)

### **Tokenization 🔢**

As we know, we have to tokenize our data into a form that is acceptable for the BERT model. We are going to use `BertTokenizerFast` instead of `BertTokenizer`, as the former is much faster. Since we are going to train our model in batches, we need to set `padding=True`.

In [None]:
from transformers import BertTokenizerFast

# Load the fast tokenizer that matches the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Encode every (context, question) pair, context first. padding=True is needed
# because the model is trained in batches; truncation=True shortens pairs that
# are too long for the model (presumably to tokenizer.model_max_length — confirm).
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)

Let's see what we got after tokenizing our data.

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
no_of_encodings = len(train_encodings['input_ids'])
print(f'We have {no_of_encodings} context-question pairs')

We have 51 context-question pairs


In [None]:
train_encodings['input_ids'][0]

[101,
 3653,
 25918,
 8082,
 11616,
 1024,
 1010,
 22822,
 17062,
 24552,
 1012,
 1010,
 2695,
 25918,
 8082,
 11616,
 1024,
 1010,
 22822,
 17062,
 24552,
 1012,
 1010,
 7709,
 1024,
 1010,
 5001,
 10464,
 24895,
 14405,
 8586,
 23518,
 14405,
 29107,
 3367,
 7277,
 20996,
 5602,
 1011,
 4372,
 1011,
 1061,
 3806,
 12412,
 11826,
 2007,
 25212,
 2050,
 9617,
 16033,
 15530,
 2483,
 1012,
 1010,
 2019,
 25344,
 1024,
 1010,
 2236,
 2007,
 2203,
 4140,
 22648,
 20192,
 2140,
 20014,
 19761,
 3508,
 1012,
 1010,
 12407,
 2005,
 7709,
 1024,
 1010,
 2023,
 2003,
 1037,
 2382,
 1011,
 2095,
 1011,
 2214,
 2931,
 1010,
 2040,
 2038,
 2042,
 2058,
 11179,
 2005,
 2116,
 2086,
 1012,
 2016,
 2038,
 2699,
 2116,
 2367,
 8738,
 2015,
 1010,
 2021,
 2003,
 7736,
 1012,
 2016,
 2038,
 2042,
 2000,
 2256,
 22466,
 4017,
 7277,
 5970,
 18014,
 1010,
 2363,
 2070,
 2192,
 12166,
 1010,
 1998,
 2772,
 1996,
 9619,
 1012,
 1996,
 10831,
 1998,
 6666,
 1997,
 1996,
 7709,
 2031,
 2042,
 4541,
 2000,
 1

Let's decode the first encoded context-question pair and look into it.

In [None]:
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] preoperative diagnosis :, morbid obesity., postoperative diagnosis :, morbid obesity., procedure :, laparoscopic antecolic antegastric roux - en - y gastric bypass with eea anastomosis., anesthesia :, general with endotracheal intubation., indication for procedure :, this is a 30 - year - old female, who has been overweight for many years. she has tried many different diets, but is unsuccessful. she has been to our bariatric surgery seminar, received some handouts, and signed the consent. the risks and benefits of the procedure have been explained to the patient., procedure in detail :, the patient was taken to the operating room and placed supine on the operating room table. all pressure points were carefully padded. she was given general anesthesia with endotracheal intubation. scd stockings were placed on both legs. foley catheter was placed for bladder decompression. the abdomen was then prepped and draped in standard sterile surgical fashion. marcaine was then injected thro

We can see that each token (a word or a piece of a word) is assigned a number.

For example,

beyonce $\rightarrow$ 20773  
[CLS] $\rightarrow$ 101  
[SEP] $\rightarrow$ 102   
[PAD] $\rightarrow$ 0  

We see that the above form matches the one in the image we saw in the Data preprocessing section before.

Next we need to convert our character start/end positions to token start/end positions. Why is that? Because our words were converted into tokens, the answer start/end needs to be the index of the first/last token that contains the answer, not the character offsets in the context.

In [None]:
def add_token_positions(encodings, answers):
  """Store token-level answer boundaries in `encodings`, in place.

  For every answer the character offsets 'answer_start' and
  'answer_end' - 1 (the last character of the answer) are mapped to token
  indices with `encodings.char_to_token`. Offsets that map to no token
  (for example because the passage was truncated) are replaced by
  `tokenizer.model_max_length`, read from the notebook-level tokenizer.

  Args:
    encodings: tokenizer output offering `char_to_token` and `update`.
    answers: list of dicts with 'answer_start' and 'answer_end', aligned
      with the examples in `encodings`.

  Returns:
    None. 'start_positions' and 'end_positions' are added to `encodings`.
  """
  def _locate(sample_idx, char_idx):
    # Token index for a character offset, or the sentinel when there is none.
    token_idx = encodings.char_to_token(sample_idx, char_idx)
    return tokenizer.model_max_length if token_idx is None else token_idx

  start_positions, end_positions = [], []
  for sample_idx, answer in enumerate(answers):
    start_positions.append(_locate(sample_idx, answer['answer_start']))
    end_positions.append(_locate(sample_idx, answer['answer_end'] - 1))

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(valid_encodings, valid_answers)

In [None]:
train_encodings['start_positions'][:10]

[78, 512, 88, 42, 7, 512, 312, 86, 512, 46]

### **Dataset definition 🗄️**

We have to define our dataset using the PyTorch `Dataset` class from `torch.utils.data` in order to create our dataloaders afterwards.

In [None]:
class SQuAD_Dataset(torch.utils.data.Dataset):
  """Map-style dataset over tokenizer encodings.

  Each item is a dict with one tensor per field stored in the encodings
  (every field is indexed with the requested example index).
  """

  def __init__(self, encodings):
    # Keep a reference to the encodings; nothing is copied.
    self.encodings = encodings

  def __len__(self):
    # Number of examples = number of encoded input sequences.
    return len(self.encodings.input_ids)

  def __getitem__(self, idx):
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])
    return sample

In [None]:
# Wrap the encoded training and validation splits as PyTorch datasets.
train_dataset = SQuAD_Dataset(train_encodings)
valid_dataset = SQuAD_Dataset(valid_encodings)

### **Dataloaders 🔁**

In [None]:
from torch.utils.data import DataLoader

# Define the dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16)

## **Fine-Tuning ⚙️**

In [None]:
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Training 🏋️‍♂️**

My choices for some parameters:

* Use of `AdamW`, a stochastic optimization method that modifies the typical implementation of weight decay in Adam by decoupling weight decay from the gradient update. This helps to avoid overfitting, which is necessary in this case where the model is very complex.

* Set the `lr=5e-5` as I read that this is the best value for the learning rate for this task.

In [None]:
# Check on the available device - use GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cuda


In [None]:
# Fine-tune the model on the training set.
#
# `transformers.AdamW` is deprecated and has been removed from recent releases
# of the library, so the PyTorch implementation is used instead. eps and
# weight_decay are set to the defaults of the old transformers class so the
# optimisation behaves as before (torch's own defaults differ).
# NOTE(review): the tokenizer also returns 'token_type_ids', which are not
# passed to the model here nor in the evaluation cell; if they are added,
# add them in both places.
N_EPOCHS = 30
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

model.to(device)
model.train()

for epoch in range(N_EPOCHS):
  # Label the bar up front so the first batch is already shown under the right epoch.
  loop = tqdm(train_loader, desc=f'Epoch {epoch+1}', leave=True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    # Passing the gold positions makes the model return the loss as first output.
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    loss.backward()
    optim.step()

    loop.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 4/4 [00:05<00:00,  1.25s/it, loss=1.18]
Epoch 2: 100%|██████████| 4/4 [00:05<00:00,  1.28s/it, loss=0.543]
Epoch 3: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it, loss=0.617]
Epoch 4: 100%|██████████| 4/4 [00:05<00:00,  1.31s/it, loss=1.06]
Epoch 5: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it, loss=0.28]
Epoch 6: 100%|██████████| 4/4 [00:05<00:00,  1.34s/it, loss=0.0868]
Epoch 7: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it, loss=0.412]
Epoch 8: 100%|██████████| 4/4 [00:05<00:00,  1.31s/it, loss=0.0186]
Epoch 9: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it, loss=0.061]
Epoch 10: 100%|██████████| 4/4 [00:05<00:00,  1.28s/it, loss=0.751]
Epoch 11: 100%|██████████| 4/4 [00:05<00:00,  1.27s/it, loss=0.0822]
Epoch 12: 100%|██████████| 4/4 [00:05<00:00,  1.25s/it, loss=0.446]
Epoch 13: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it, loss=0.0177]
Epoch 14: 100%|██████████| 4/4 [00:04<00:00,  1.24s/it, loss=0.143]
Epoch 15: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it, 

**Save the model in my drive in order not to run it each time**

In [None]:
#model_path = '/content/drive/MyDrive/BERT-SQuAD'
#model.save_pretrained(model_path)
#tokenizer.save_pretrained(model_path)

**Respectively, load the saved model**

In [None]:
#from transformers import BertForQuestionAnswering, BertTokenizerFast

#model_path = '/content/drive/MyDrive/BERT-SQuAD'
#model = BertForQuestionAnswering.from_pretrained(model_path)
#tokenizer = BertTokenizerFast.from_pretrained(model_path)

#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#print(f'Working on {device}')

#model = model.to(device)

### **Testing ✅**

We are evaluating the model on the validation set by checking the model's predictions for the answer's start and end indexes and comparing with the true ones.

In [None]:
from sklearn.metrics import f1_score

# Evaluate on the validation set: compare the predicted start/end token
# indices with the gold ones and report accuracy, token-overlap F1 and exact match.
model.eval()

# Counts are accumulated per example (not per batch) so that a smaller last
# batch does not get the same weight as a full one.
n_examples = 0
start_hits = 0
end_hits = 0
f1_scores = []
exact_matches = []

for batch in tqdm(valid_loader):
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        # Most likely start / end token for every example in the batch.
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Accuracy tallies
        start_hits += (start_pred == start_true).sum().item()
        end_hits += (end_pred == end_true).sum().item()
        n_examples += len(start_true)

        # Span-level metrics, one example at a time
        for i in range(len(start_true)):
            true_span = set(range(start_true[i].item(), end_true[i].item() + 1))
            # An end before the start yields an empty predicted span (F1 = 0).
            pred_span = set(range(start_pred[i].item(), end_pred[i].item() + 1))

            # F1 score over the token indices covered by both spans
            intersection = len(true_span.intersection(pred_span))
            precision = intersection / len(pred_span) if len(pred_span) > 0 else 0
            recall = intersection / len(true_span) if len(true_span) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
            f1_scores.append(f1)

            # Exact match
            exact_match = 1 if true_span == pred_span else 0
            exact_matches.append(exact_match)

if n_examples == 0:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError('valid_loader produced no examples to evaluate')

# Mean of start-position and end-position accuracy over all examples.
acc = (start_hits + end_hits) / (2 * n_examples)
f1_avg = sum(f1_scores) / len(f1_scores)
exact_match_percentage = sum(exact_matches) / len(exact_matches)

print(f"Average Accuracy: {acc}")
print(f"Average F1 Score: {f1_avg}")
print(f"Exact Match Percentage: {exact_match_percentage}")


100%|██████████| 1/1 [00:00<00:00,  2.15it/s]

Average Accuracy: 0.625
Average F1 Score: 0.5150013762730525
Exact Match Percentage: 0.5



