<a href="https://colab.research.google.com/github/alexpod1000/SQuAD-QA/blob/main/ModelTrainExperimentalCode_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#%%bash
#[[ ! -e /colabtools ]] && exit  # Continue only if running on Google Colab

# Clone repository
# https://sysadmins.co.za/clone-a-private-github-repo-with-personal-access-token/
# For cloning the main branch:
#!git clone https://<GITHUB_TOKEN>:x-oauth-basic@github.com/alexpod1000/SQuAD-QA.git
# For cloning the "evaluation-features" branch
#!git clone --branch evaluation-features https://<GITHUB_TOKEN>:x-oauth-basic@github.com/alexpod1000/SQuAD-QA.git
# Change current working directory to match project
#%cd SQuAD-QA/
#!pwd

#!pip install transformers

In [2]:
# External imports
import copy
import nltk
import numpy as np
import pandas as pd
import string
import torch
import json

from functools import partial
from nltk.tokenize import TreebankWordTokenizer, SpaceTokenizer
from transformers import AutoTokenizer
from typing import Tuple, List, Dict, Any, Union

# Project imports
from data_loading.utils import bert_padder_collate_fn_eval
from data_loading.qa_dataset import CustomQADatasetBERT_eval
from squad_data.parser import SquadFileParser
from squad_data.utils import build_mappers_and_dataframe_bert_eval
from evaluation.evaluate import evaluate_predictions
from evaluation.utils import build_evaluation_dict_bert
from utils import split_dataframe

In [3]:
# Parameters dictionary

def prepare_input_distilbert(inputs, device):
    """Select the tensors the DistilBERT backbone consumes and move them to `device`.

    Args:
        inputs: Mapping that contains at least "input_ids" and "attention_mask";
            each value must expose a `.to(device)` method.
        device: Target device handed to `.to(...)`.

    Returns:
        A new dict holding only "input_ids" and "attention_mask" (in that order),
        each moved to `device`. Any other key of `inputs` is ignored.
    """
    return {key: inputs[key].to(device) for key in ("input_ids", "attention_mask")}

def prepare_input_albert(inputs, device):
    """Build the backbone keyword arguments for ALBERT-style models.

    For now this mirrors the DistilBERT preparation, since that works: only
    "input_ids" and "attention_mask" are forwarded (no other key of `inputs`
    is passed on).

    Args:
        inputs: Mapping that contains at least "input_ids" and "attention_mask";
            each value must expose a `.to(device)` method.
        device: Target device handed to `.to(...)`.

    Returns:
        A new dict with "input_ids" and "attention_mask" moved to `device`.
    """
    ids_on_device = inputs["input_ids"].to(device)
    mask_on_device = inputs["attention_mask"].to(device)
    return {"input_ids": ids_on_device, "attention_mask": mask_on_device}

def _make_train_params(batch_size):
    """Return the fine-tuning hyper-parameters shared by every experiment.

    Only the batch size differs between models; the same value is used for the
    train, validation and test splits.
    """
    return {
        "epochs": 2,
        "initial_lr": 3e-5,
        "batch_size_train": batch_size,
        "batch_size_val": batch_size,
        "batch_size_test": batch_size,
        "weight_decay": 0.01,
        "dropout_rate": 0.1,
    }


def _make_experiment(checkpoint, prepare_fn, batch_size):
    """Return one experiment entry; model and tokenizer share the same checkpoint name."""
    return {
        "model_url": checkpoint,
        "tokenizer_url": checkpoint,
        "tokenizer_max_length": 384,
        "prepare_model_input_fn": prepare_fn,
        "train_params": _make_train_params(batch_size),
    }


# Registry of the supported backbones, keyed by experiment name.
possible_models_dict = {
    "distilbert": _make_experiment("distilbert-base-uncased", prepare_input_distilbert, 32),
    "albert": _make_experiment("albert-base-v2", prepare_input_albert, 8),
    "distilroberta": _make_experiment("distilroberta-base", prepare_input_albert, 8),
    "bert": _make_experiment("bert-base-uncased", prepare_input_albert, 8),
}

# Experiment used by the rest of the notebook.
# Other available choices: "distilbert", "bert", "albert".
current_selected_experiment = "distilroberta"
params_dict = possible_models_dict[current_selected_experiment]

### Parse the json and get the data

In [4]:
# Relative paths of the two SQuAD-format JSON files read by this notebook.
train_file_json = "squad_data/data/training_set.json"
test_file_json = "squad_data/data/dev-v1.1.json"

# One parser instance per file.
train_parser = SquadFileParser(train_file_json)
test_parser = SquadFileParser(test_file_json)

# Parse both files. `test_data` feeds the evaluation dataframe built further down;
# `train_data` is not referenced by any later cell visible in this notebook.
train_data = train_parser.parse_documents()
test_data = test_parser.parse_documents()

########################### DEBUG
# reduce size for faster testing
#full_data = data
#data = []
#for i in range(1): # use only the first 1 documents
#  data.append(full_data[i])

### Prepare the mappers and dataframe

In [5]:
def bert_tokenizer_fn(question, paragraph, tokenizer, max_length=384, doc_stride=128):
    """Tokenize a (question, paragraph) pair into fixed-length, overlapping windows.

    Args:
        question: Question text, always passed to the tokenizer as the first sequence.
        paragraph: Context text, always passed to the tokenizer as the second sequence.
        tokenizer: Callable tokenizer accepting the keyword arguments used below
            (in this notebook it is created with `AutoTokenizer.from_pretrained`).
        max_length: Length every returned window is truncated/padded to.
        doc_stride: Value forwarded as `stride`, i.e. the overlap requested between
            consecutive windows when the paragraph does not fit in one window.

    Returns:
        The tokenizer output for the pair, requested with overflowing windows
        and offset mappings.
    """
    # The paragraph is the second sequence regardless of the tokenizer's padding
    # side, so it is the only sequence that may be truncated. Selecting
    # "only_first" for left-padding tokenizers (as was done before) would have
    # truncated the question instead of the paragraph.
    return tokenizer(
        question,
        paragraph,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

In [6]:
# Load the tokenizer that belongs to the selected experiment's checkpoint.
tokenizer = AutoTokenizer.from_pretrained(params_dict["tokenizer_url"])
# Two variants of bert_tokenizer_fn with the tokenizer pre-bound:
#  - "preprocess" windows are 3 tokens shorter than the configured maximum
#    (NOTE(review): presumably headroom for special tokens - confirm why 3);
#  - "train" windows use the configured maximum length unchanged.
tokenizer_fn_preprocess = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"]-3)
tokenizer_fn_train = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"])

# Model train

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers

from timeit import default_timer as timer
from tqdm import tqdm
from transformers.optimization import AdamW

from models.utils import SpanExtractor

In [8]:
# Runtime switches: mixed precision flag and the device every later cell runs on.
use_amp = True
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Report both settings, one per line.
print(
    f"The device is {device}",
    f"Automatic Mixed Precision Enabled: {use_amp}",
    sep="\n",
)

The device is cuda
Automatic Mixed Precision Enabled: True


Model:

(input_ids, attention_mask) -> (start_logits, end_logits), i.e. one answer-start score and one answer-end score for each token in input_ids

In [9]:
class ParametricBertModelQA(torch.nn.Module):
    """Span-extraction QA model: a pretrained transformer encoder plus a linear head.

    The encoder is chosen through `config_dict["model_url"]`; the head maps every
    token representation to `num_labels` scores, which `forward` unpacks as
    (start, end) logits.
    """

    def __init__(self, hidden_size, num_labels, config_dict, dropout_rate=0.3):
        """Create the encoder, the dropout layer and the linear QA head.

        Args:
            hidden_size: Input width of the linear head; it has to equal the size of
                the encoder's token representations.
            num_labels: Number of scores produced per token (`forward` unpacks
                exactly two: start and end).
            config_dict: Experiment entry providing "model_url" (checkpoint to load)
                and "prepare_model_input_fn" (callable `(inputs, device) -> dict`
                of encoder keyword arguments).
            dropout_rate: Dropout probability applied to the encoder output.
        """
        super().__init__()
        self.hidden_size, self.num_labels = hidden_size, num_labels
        # Sub-modules are registered in this order: encoder, dropout, head.
        self.bert = transformers.AutoModel.from_pretrained(config_dict["model_url"])
        self.bert_drop = torch.nn.Dropout(dropout_rate)
        self.qa_outputs = torch.nn.Linear(self.hidden_size, self.num_labels)
        self.prepare_input_fn = config_dict["prepare_model_input_fn"]

    # Note: decorating forward with torch.cuda.amp.autocast() went OOM, so it is not used.
    def forward(self, inputs):
        """Compute per-token start and end logits.

        Args:
            inputs: Batch dictionary handed to `prepare_input_fn`, which selects the
                encoder inputs and moves them to the encoder's device.

        Returns:
            Tuple `(start_logits, end_logits)`, each of shape (batch, seq_len).
        """
        # Encoder inputs are placed on whatever device the encoder lives on.
        backbone_kwargs = self.prepare_input_fn(inputs, self.bert.device)
        # First element of the encoder output: (batch, seq_len, hidden_size).
        hidden_states = self.bert(**backbone_kwargs)[0]
        # Dropout, then the linear head: (batch, seq_len, num_labels).
        token_scores = self.qa_outputs(self.bert_drop(hidden_states))
        # Split the last axis into single-column slices and drop that axis.
        start_logits, end_logits = (
            column.squeeze(-1) for column in token_scores.split(1, dim=-1)
        )
        return start_logits, end_logits

In [10]:
# Define baseline model
# 768 is the width given to the linear QA head and 2 the number of scores per token
# (start / end). NOTE(review): 768 presumably matches the hidden size of every
# checkpoint listed in possible_models_dict - confirm before adding a different one.
model = ParametricBertModelQA(768, 2, params_dict, dropout_rate=params_dict["train_params"]["dropout_rate"]).to(device)
# Gradient scaler tied to the mixed-precision flag; it is passed to
# build_evaluation_dict_bert in the quantitative evaluation cell.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# Evaluation

In [11]:
# Load the fine-tuned weights from disk. The checkpoint file is the distilroberta one,
# so it only matches the model when current_selected_experiment is "distilroberta".
# map_location remaps the stored tensors onto the active device, which lets a
# checkpoint saved on a GPU be restored when the notebook runs on CPU.
model.load_state_dict(torch.load("trained_models/distilroberta_google_2_epochs.pt", map_location=device))

<All keys matched successfully>

## Quantitative evaluation

In [12]:
# Build the paragraph mapper and the evaluation dataframe from the parsed dev set,
# using the "preprocess" tokenizer variant.
test_paragraphs_mapper, test_df = build_mappers_and_dataframe_bert_eval(tokenizer, tokenizer_fn_preprocess, test_data)

In [13]:
# Wrap the evaluation dataframe in the project dataset (tokenized with the "train"
# tokenizer variant) and batch it with the project's evaluation collate function.
dataset_test_QA = CustomQADatasetBERT_eval(tokenizer_fn_train, test_df, test_paragraphs_mapper)
test_data_loader = torch.utils.data.DataLoader(
    dataset_test_QA,
    collate_fn=bert_padder_collate_fn_eval,
    batch_size=params_dict["train_params"]["batch_size_test"],
    # Evaluation gains nothing from shuffling; a fixed batch order keeps runs reproducible.
    shuffle=False,
)

In [14]:
# Reload the raw dev-set JSON; it is handed to the scorer together with the predictions.
with open(test_file_json, "r") as f:
    dataset_json = json.load(f)
# Run the model over the whole test loader and collect its predictions.
# NOTE(review): pred_dict presumably maps question ids to answer text - confirm in evaluation.utils.
pred_dict = build_evaluation_dict_bert(model, scaler, test_data_loader, test_paragraphs_mapper, tokenizer, device, show_progress=True)
# Score the predictions against the dataset and print the resulting metrics.
eval_results = evaluate_predictions(dataset_json, pred_dict)
print(eval_results)

100%|██████████| 1350/1350 [00:50<00:00, 26.53it/s]


{
  "exact": 80.34058656575213,
  "f1": 87.68012325520641,
  "total": 10570,
  "HasAns_exact": 80.34058656575213,
  "HasAns_f1": 87.68012325520641,
  "HasAns_total": 10570
}


## Simple qualitative evaluation

In [None]:
def get_answer_span_helper(context, question, model, tokenizer_fn, tokenizer, device="cpu"):
    """Answer a single question about `context` and return the predicted text.

    Inference runs with the model in evaluation mode and with gradient tracking
    disabled, so dropout cannot make the answer change between calls. The model's
    previous train/eval mode is restored before returning.

    Args:
        context: Passage in which the answer is searched.
        question: Question to answer.
        model: Callable taking a dict with "input_ids" and "attention_mask" and
            returning `(start_logits, end_logits)`.
        tokenizer_fn: Callable `(question, context) -> tokenized pair` exposing
            "input_ids" and "attention_mask".
        tokenizer: Tokenizer whose `decode` turns token ids back into text.
        device: Device the input tensors are moved to.

    Returns:
        The decoded answer string (special tokens skipped).
    """
    tokenized_input = tokenizer_fn(question, context)
    model_input = {
        "input_ids": torch.tensor(tokenized_input["input_ids"]).to(device),
        "attention_mask": torch.tensor(tokenized_input["attention_mask"]).to(device),
    }
    # Switch to eval mode only if needed, and remember to switch back afterwards.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            output_span = model(model_input)
    finally:
        if was_training:
            model.train()
    start, end = SpanExtractor.extract_most_probable(output_span[0], output_span[1])
    start = start.item()
    end = end.item()
    # Only the first tokenized window is decoded.
    # NOTE(review): the slice excludes position `end`; confirm whether
    # SpanExtractor returns an inclusive or exclusive end index.
    return tokenizer.decode(tokenized_input["input_ids"][0][start:end], skip_special_tokens=True)

In [None]:
context = "This is a test message, written to see if our model can correctly predict its outputs."
question = "Who needs to predict its outputs?"
# Use the notebook-wide `device` instead of a hard-coded "cuda", so this cell also
# runs on machines without a GPU.
pred_answer = get_answer_span_helper(context, question, model, tokenizer_fn_train, tokenizer, device=device)
print(pred_answer)