<a href="https://colab.research.google.com/github/alexpod1000/SQuAD-QA/blob/main/ModelTrainExperimentalCode_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#%%bash
#[[ ! -e /colabtools ]] && exit  # Continue only if running on Google Colab

# Clone repository
# https://sysadmins.co.za/clone-a-private-github-repo-with-personal-access-token/
# For cloning the main branch:
#!git clone https://<GITHUB_TOKEN>:x-oauth-basic@github.com/alexpod1000/SQuAD-QA.git
# For cloning the "evaluation-features" branch
#!git clone --branch evaluation-features https://<GITHUB_TOKEN>:x-oauth-basic@github.com/alexpod1000/SQuAD-QA.git
# Change current working directory to match project
#%cd SQuAD-QA/
#!pwd

#!pip install transformers

In [2]:
# External imports
import copy
import nltk
import numpy as np
import pandas as pd
import string
import torch
import json

from functools import partial
from nltk.tokenize import TreebankWordTokenizer, SpaceTokenizer
from transformers import AutoTokenizer
from typing import Tuple, List, Dict, Any, Union

# Project imports
from squad_data.parser import SquadFileParser
from squad_data.utils import build_mappers_and_dataframe_bert
from evaluation.evaluate import evaluate_predictions
from evaluation.utils import build_evaluation_dict_bert
from utils import split_dataframe


In [3]:
# Parameters dictionary

def prepare_input_distilbert(inputs, device):
    """Pick the entries DistilBERT consumes from `inputs` and move them to `device`.

    Only the "input_ids" and "attention_mask" entries are read; any other key
    in `inputs` is ignored. Each selected value must expose `.to(device)`.

    Args:
        inputs: mapping that contains at least "input_ids" and "attention_mask".
        device: target passed unchanged to each value's `.to(...)`.

    Returns:
        A new dict holding exactly "input_ids" and "attention_mask".
    """
    return {
        key: inputs[key].to(device)
        for key in ("input_ids", "attention_mask")
    }

def prepare_input_albert(inputs, device):
    """Pick the entries a BERT/ALBERT encoder consumes and move them to `device`.

    "input_ids" and "attention_mask" are always required. "token_type_ids" is
    forwarded as well when the batch provides it: these encoders accept segment
    ids for (question, paragraph) pairs, and dropping them makes the model
    treat every token as belonging to the first segment. Batches without
    segment ids keep working exactly as before.

    Args:
        inputs: mapping with "input_ids" and "attention_mask", optionally
            "token_type_ids". Each used value must expose `.to(device)`.
        device: target passed unchanged to each value's `.to(...)`.

    Returns:
        A new dict with "input_ids", "attention_mask" and, when available,
        "token_type_ids".
    """
    model_input = {
        "input_ids": inputs["input_ids"].to(device),
        "attention_mask": inputs["attention_mask"].to(device),
    }
    # Segment ids are optional: only forward them when the batch carries them.
    if "token_type_ids" in inputs and inputs["token_type_ids"] is not None:
        model_input["token_type_ids"] = inputs["token_type_ids"].to(device)
    return model_input

# Registry of the available experiments, keyed by a short model name.
# Every entry carries the checkpoint and tokenizer identifiers, the tokenizer
# window length, the function that builds the encoder inputs from a batch,
# and the training hyper-parameters.
possible_models_dict = dict(
    distilbert=dict(
        model_url="distilbert-base-uncased",
        tokenizer_url="distilbert-base-uncased",
        tokenizer_max_length=384,
        prepare_model_input_fn=prepare_input_distilbert,
        train_params=dict(
            epochs=4,
            initial_lr=5e-5,
            batch_size_train=16,
            batch_size_val=32,
            batch_size_test=32,
            weight_decay=0.01,
            dropout_rate=0.1,
        ),
    ),
    albert=dict(
        model_url="albert-base-v2",
        tokenizer_url="albert-base-v2",
        tokenizer_max_length=384,
        prepare_model_input_fn=prepare_input_albert,
        train_params=dict(
            epochs=3,
            initial_lr=5e-5,
            batch_size_train=8,
            batch_size_val=8,
            batch_size_test=8,
            weight_decay=0.01,
            dropout_rate=0.1,
        ),
    ),
    # The "bert" entry reuses the ALBERT input-preparation function.
    bert=dict(
        model_url="bert-base-uncased",
        tokenizer_url="bert-base-uncased",
        tokenizer_max_length=384,
        prepare_model_input_fn=prepare_input_albert,
        train_params=dict(
            epochs=3,
            initial_lr=5e-5,
            batch_size_train=8,
            batch_size_val=8,
            batch_size_test=8,
            weight_decay=0.01,
            dropout_rate=0.1,
        ),
    ),
)

# Experiment used by the rest of the notebook; change the key to switch model.
current_selected_experiment = "distilbert"
params_dict = possible_models_dict[current_selected_experiment]

### Parse the json and get the data

In [4]:
# Paths of the training and test JSON files (relative to the working directory).
train_file_json = "squad_data/data/training_set.json"
test_file_json = "squad_data/data/dev-v1.1.json"

# One project-level parser (squad_data.parser.SquadFileParser) per file.
train_parser = SquadFileParser(train_file_json)
test_parser = SquadFileParser(test_file_json)

# Parsed documents; these are later handed to build_mappers_and_dataframe_bert.
train_data = train_parser.parse_documents()
test_data = test_parser.parse_documents()

########################### DEBUG
# reduce size for faster testing
#full_data = data
#data = []
#for i in range(1): # use only the first 1 documents
#  data.append(full_data[i])

### Prepare the mappers and dataframe

In [5]:
def bert_tokenizer_fn(question, paragraph, tokenizer, max_length=384, doc_stride=128):
    """Tokenize a (question, paragraph) pair into fixed-length, overlapping windows.

    The tokenizer is always called with the question as first sequence and the
    paragraph as second sequence, so the paragraph is the only sequence that is
    allowed to be truncated, whatever padding side the tokenizer uses.
    (Selecting "only_first" for left-padding tokenizers, as was done before,
    would have asked the tokenizer to truncate the question instead.)

    Args:
        question: first sequence passed to `tokenizer`.
        paragraph: second sequence passed to `tokenizer`; the one that may be
            truncated and split into overflowing windows.
        tokenizer: callable accepting the keyword arguments used below
            (e.g. a Hugging Face fast tokenizer).
        max_length: window length, also used as padding length.
        doc_stride: forwarded as `stride`, i.e. the overlap requested between
            consecutive windows.

    Returns:
        Whatever `tokenizer` returns for the pair, requested with overflowing
        windows and offset mappings, padded to `max_length`.
    """
    return tokenizer(
        question,
        paragraph,
        # The paragraph is always the second sequence (see docstring).
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

In [6]:
# Load the pretrained tokenizer configured for the selected experiment.
tokenizer = AutoTokenizer.from_pretrained(params_dict["tokenizer_url"])
# Tokenizer used while building the dataframe: windows are 3 tokens shorter
# than the configured maximum.
# NOTE(review): the "-3" presumably leaves slack so preprocessed samples always
# fit in a single training window — confirm against build_mappers_and_dataframe_bert.
tokenizer_fn_preprocess = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"]-3)
# Tokenizer used by the datasets and at inference time: full configured length.
tokenizer_fn_train = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"])

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Build the paragraph lookup and the samples dataframe from the training documents
# (project helper; limit_answers=1 is forwarded as-is).
paragraphs_mapper, df = build_mappers_and_dataframe_bert(tokenizer, tokenizer_fn_preprocess, train_data, limit_answers=1)
# Show the entry stored under the first key of the mapper as a quick sanity check.
print(paragraphs_mapper[next(iter(paragraphs_mapper))])
# Preview the first rows of the dataframe.
df.head()

In [None]:
# Split the samples into a training and a validation dataframe (project helper,
# called with train_ratio=0.7).
df_train, df_val = split_dataframe(df, train_ratio=0.7)

In [None]:
# Report how many samples ended up in each partition.
print(f"Total samples: {len(df)}, Train samples: {len(df_train)}, Validation samples: {len(df_val)}")

### DataConverter and CustomQADataset

In [None]:
from data_loading.utils import bert_padder_collate_fn
from data_loading.qa_dataset import CustomQADatasetBERT

# Smoke test of the data pipeline: build a dataset/loader over the training
# split and pull a single batch of (up to) 10 samples.
datasetQA = CustomQADatasetBERT(tokenizer_fn_train, df_train, paragraphs_mapper)
data_loader = torch.utils.data.DataLoader(datasetQA, collate_fn = bert_padder_collate_fn, batch_size=10, shuffle=True)

# Inspect the shapes of the model inputs and of the ground-truth spans.
test_batch = next(iter(data_loader))
print(test_batch["input_ids"].shape)
print(test_batch["y_gt"].shape)

In [None]:
"""
NOTE: this logic is used for sample creation only, such that each sample is "short enough" for BERT; 
      a duplicate of this logic will need to be used in QADataset Dataloader class when we'll take
      short samples' text, tokenize them again, and find the correct index
ALTERNATIVE: for BERT models we could directly get the answer spans, and pass them in dataframe to another QADataset
             built specifically for BERT, that will just take the data from dataframe (way nicer and faster solution).
SUGGESTION: we could also use specific dict keys and in QADataset pick stuff from these keys: 
                - if these keys are absent then don't use BERT logic (eg span_start and span_end) and use previous logic
                - if these keys are present, then just use them and gather the BERT samples.
                Call these keys like "tokenizer_span_idx" (to make them kinda unique)
"""

# Model train

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers

from timeit import default_timer as timer
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup
from transformers.optimization import AdamW

from models.utils import SpanExtractor

In [None]:
# Automatic Mixed Precision switch, consumed when the gradient scaler is built.
use_amp = True

# Prefer the GPU whenever PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"The device is {device}")
print(f"Automatic Mixed Precision Enabled: {use_amp}")

Model:

(input_ids, attention_mask) -> (answer_start, answer_end) // for each token in input_ids

In [None]:
def train_step(model, scaler, optimizer, loss_function, dataloader, scheduler=None, device="cpu", show_progress=False):
    """Train `model` for one full pass over `dataloader` and report averaged metrics.

    For every batch the gradients are cleared, the forward pass and the loss run
    under autocast (enabled exactly when `scaler.is_enabled()`), the scaled loss
    is back-propagated and the optimizer is stepped through `scaler`. The loss is
    the mean of `loss_function` applied to the start scores and to the end scores.

    Args:
        model: called as `model(batch)` and expected to return
            `(start_scores, end_scores)`; must provide `train()` and `zero_grad()`.
        scaler: gradient scaler exposing `is_enabled`, `scale`, `step`, `update`.
        optimizer: optimizer stepped via `scaler.step`.
        loss_function: callable `(scores, targets) -> loss tensor`.
        dataloader: iterable of batch dicts; `batch["y_gt"][:, 0]` and
            `batch["y_gt"][:, 1]` are used as start and end targets.
        scheduler: optional LR scheduler, stepped once per batch.
        device: device the targets are moved to. The batch itself is passed to
            the model untouched.
        show_progress: wrap the dataloader in `tqdm` when True.

    Returns:
        Dict with "loss", "accuracy_start", "accuracy_end" (means over the
        batches) and "time" (difference between two `timer()` readings).

    Raises:
        ZeroDivisionError: if `dataloader` yields no batch.
    """
    loss_total = 0
    start_acc_total = 0
    end_acc_total = 0
    n_batches = 0

    started_at = timer()

    model.train()
    batches = tqdm(dataloader) if show_progress else dataloader
    for batch in batches:
        # Ground-truth span boundaries, moved next to the model outputs.
        targets = batch["y_gt"]
        gt_start = targets[:, 0].to(device)
        gt_end = targets[:, 1].to(device)

        # Start from clean gradients.
        model.zero_grad()

        # Forward pass and loss, under mixed precision when the scaler is active.
        with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
            start_scores, end_scores = model(batch)
            start_loss = loss_function(start_scores, gt_start)
            end_loss = loss_function(end_scores, gt_end)
            loss = (start_loss + end_loss) / 2.0

        # Backward pass and parameter update, both routed through the scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # The LR schedule advances once per batch.
        if scheduler is not None:
            scheduler.step()

        # Per-batch accuracy of the predicted start / end indices.
        pred_start, pred_end = SpanExtractor.extract_most_probable(start_scores, end_scores)
        start_hits = gt_start.cpu().detach() == pred_start
        end_hits = gt_end.cpu().detach() == pred_end
        start_accuracy = torch.sum(start_hits) / len(pred_start)
        end_accuracy = torch.sum(end_hits) / len(pred_end)

        loss_total += loss.item()
        start_acc_total += start_accuracy.item()
        end_acc_total += end_accuracy.item()
        n_batches += 1

    finished_at = timer()
    return {
        "loss": loss_total / n_batches,
        "accuracy_start": start_acc_total / n_batches,
        "accuracy_end": end_acc_total / n_batches,
        "time": finished_at - started_at
    }

In [None]:
@torch.no_grad()
def validation_step(model, scaler, loss_function, dataloader, device="cpu", show_progress=False):
    """Evaluate `model` on every batch of `dataloader` without tracking gradients.

    The model is switched to eval mode, then for each batch the forward pass and
    the loss run under autocast (enabled exactly when `scaler.is_enabled()`).
    The loss is the mean of `loss_function` applied to the start scores and to
    the end scores. `scaler` is only queried for its enabled flag.

    Args:
        model: called as `model(batch)` and expected to return
            `(start_scores, end_scores)`; must provide `eval()`.
        scaler: object exposing `is_enabled()`.
        loss_function: callable `(scores, targets) -> loss tensor`.
        dataloader: iterable of batch dicts; `batch["y_gt"][:, 0]` and
            `batch["y_gt"][:, 1]` are used as start and end targets.
        device: device the targets are moved to.
        show_progress: wrap the dataloader in `tqdm` when True.

    Returns:
        Dict with "loss", "accuracy_start", "accuracy_end" (means over the
        batches) and "time" (difference between two `timer()` readings).

    Raises:
        ZeroDivisionError: if `dataloader` yields no batch.
    """
    loss_total = 0
    start_acc_total = 0
    end_acc_total = 0
    n_batches = 0

    started_at = timer()
    batches = tqdm(dataloader) if show_progress else dataloader

    model.eval()
    for batch in batches:
        # Ground-truth span boundaries, moved next to the model outputs.
        targets = batch["y_gt"]
        gt_start = targets[:, 0].to(device)
        gt_end = targets[:, 1].to(device)

        # Forward pass and loss, under mixed precision when the scaler is active.
        with torch.cuda.amp.autocast(enabled=scaler.is_enabled()):
            start_scores, end_scores = model(batch)
            start_loss = loss_function(start_scores, gt_start)
            end_loss = loss_function(end_scores, gt_end)
            loss = (start_loss + end_loss) / 2.0

        # Per-batch accuracy of the predicted start / end indices.
        pred_start, pred_end = SpanExtractor.extract_most_probable(start_scores, end_scores)
        start_hits = gt_start.cpu().detach() == pred_start
        end_hits = gt_end.cpu().detach() == pred_end
        start_accuracy = torch.sum(start_hits) / len(pred_start)
        end_accuracy = torch.sum(end_hits) / len(pred_end)

        loss_total += loss.item()
        start_acc_total += start_accuracy.item()
        end_acc_total += end_accuracy.item()
        n_batches += 1

    finished_at = timer()
    return {
        "loss": loss_total / n_batches,
        "accuracy_start": start_acc_total / n_batches,
        "accuracy_end": end_acc_total / n_batches,
        "time": finished_at - started_at
    }

In [None]:
def get_params_for_optimizer(model, no_decay, weight_decay=0.0001):
    """Split the parameters of `model` into a decayed and a non-decayed group.

    A parameter is exempt from weight decay when its name contains at least
    one of the substrings listed in `no_decay`.

    Args:
        model: object exposing `named_parameters()` yielding (name, parameter).
        no_decay: collection of name substrings marking exempt parameters.
        weight_decay: decay assigned to the non-exempt group.

    Returns:
        A two-element list of optimizer parameter groups: first the decayed
        parameters (with `weight_decay`), then the exempt ones (with 0.0).
    """
    def _is_exempt(param_name):
        # Substring match against every marker in `no_decay`.
        return any(marker in param_name for marker in no_decay)

    named_params = list(model.named_parameters())
    decayed = [param for name, param in named_params if not _is_exempt(name)]
    exempt = [param for name, param in named_params if _is_exempt(name)]
    return [
        {'params': decayed, 'weight_decay': weight_decay},
        {'params': exempt, 'weight_decay': 0.0},
    ]

In [None]:
class ParametricBertModelQA(torch.nn.Module):
    """Question-answering head on top of a pretrained encoder.

    The encoder is loaded with `transformers.AutoModel.from_pretrained` from
    `config_dict["model_url"]`. Its first output is passed through dropout and
    a linear layer of `num_labels` outputs; the forward pass unpacks that last
    axis into exactly two tensors (start scores, end scores), so `num_labels`
    has to be 2 for `forward` to succeed.
    """

    def __init__(self, hidden_size, num_labels, config_dict, dropout_rate=0.3):
        """
        Args:
            hidden_size: input width of the linear scoring layer.
            num_labels: output width of the linear scoring layer.
            config_dict: mapping providing "model_url" (encoder checkpoint) and
                "prepare_model_input_fn" (callable `(inputs, device) -> dict`
                of keyword arguments for the encoder).
            dropout_rate: dropout probability applied to the encoder output.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        # Pretrained backbone selected by the experiment configuration.
        self.bert = transformers.AutoModel.from_pretrained(config_dict["model_url"])
        self.bert_drop = torch.nn.Dropout(dropout_rate)
        # Maps every position's representation to its per-label scores.
        self.qa_outputs = torch.nn.Linear(self.hidden_size, self.num_labels)
        self.prepare_input_fn = config_dict["prepare_model_input_fn"]

    def forward(self, inputs):
        """Score every input position as answer start and as answer end.

        Args:
            inputs: batch mapping handed to `prepare_input_fn` together with
                the device of the encoder.

        Returns:
            Tuple `(start_logits, end_logits)`, each being the linear layer's
            output with its last axis removed.
        """
        # Build the encoder keyword arguments on the encoder's own device.
        encoder_kwargs = self.prepare_input_fn(inputs, self.bert.device)
        # First encoder output: one representation per input position.
        hidden_states = self.bert(**encoder_kwargs)[0]
        logits = self.qa_outputs(self.bert_drop(hidden_states))
        # One slice per label along the last axis, each with that axis squeezed away.
        start_logits, end_logits = (
            piece.squeeze(-1) for piece in logits.split(1, dim=-1)
        )
        return (start_logits, end_logits)

In [None]:
# Define baseline model (768 is the linear head's input width, 2 = start/end scores)
model = ParametricBertModelQA(768, 2, params_dict, dropout_rate=params_dict["train_params"]["dropout_rate"]).to(device)

# Parameters whose name contains one of these substrings are excluded from L2 decay.
# Besides the "LayerNorm" naming, the "layer_norm" spelling is listed as well:
# some encoders name their normalisation layers differently (e.g. DistilBERT's
# "sa_layer_norm"/"output_layer_norm", ALBERT's "full_layer_layer_norm"), and
# their weights would otherwise be decayed.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.bias", "layer_norm.weight"]
if params_dict["train_params"]["weight_decay"] > 0.0:
    model_params_optimizer = get_params_for_optimizer(model, no_decay, weight_decay=params_dict["train_params"]["weight_decay"])
else:
    # No decay requested: a single parameter group is enough.
    model_params_optimizer = model.parameters()

# Define optimizer
optimizer = AdamW(
    model_params_optimizer, 
    lr=params_dict["train_params"]["initial_lr"], 
    correct_bias=False
)

In [None]:
# Estimate the number of train steps for LR scheduler:
# (training samples / train batch size) * epochs, truncated to an integer.
# NOTE(review): this assumes the training dataset has len(df_train) items and
# ignores a partial last batch — confirm against len(train_data_loader).
num_train_steps = int(
    (len(df_train) / params_dict["train_params"]["batch_size_train"]) * params_dict["train_params"]["epochs"]
)

num_warmup_steps = int(num_train_steps * 0.1) # 10% of warmup steps

# LR scheduler: linear schedule with warmup, stepped once per batch in train_step.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps
)

In [None]:
# Cross-entropy loss, applied separately to the start and to the end scores.
loss_function = nn.CrossEntropyLoss()
# Gradient scaler for mixed precision; enabled according to `use_amp`.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

In [None]:
# Datasets over the training and validation splits; both use the training
# tokenizer function and the same paragraph mapper.
dataset_train_QA = CustomQADatasetBERT(tokenizer_fn_train, df_train, paragraphs_mapper)
dataset_val_QA = CustomQADatasetBERT(tokenizer_fn_train, df_val, paragraphs_mapper)

In [None]:
# Training batches are reshuffled at every epoch.
train_data_loader = torch.utils.data.DataLoader(
    dataset_train_QA, collate_fn = bert_padder_collate_fn, 
    batch_size=params_dict["train_params"]["batch_size_train"], shuffle=True
)
# Validation batches keep a fixed order: validation metrics are averaged per
# batch, so a stable batch composition keeps them reproducible across epochs.
val_data_loader = torch.utils.data.DataLoader(
    dataset_val_QA, collate_fn = bert_padder_collate_fn, 
    batch_size=params_dict["train_params"]["batch_size_val"], shuffle=False
)

In [None]:
# Per-epoch metric history: one list per tracked quantity.
history = {
    metric_name: []
    for metric_name in (
        "train_loss", "train_acc_start", "train_acc_end",
        "val_loss", "val_acc_start", "val_acc_end",
    )
}

loop_start = timer()
for epoch in range(params_dict["train_params"]["epochs"]):
    # One optimisation pass followed by one evaluation pass.
    train_dict = train_step(
        model, scaler, optimizer, loss_function, train_data_loader,
        scheduler=scheduler, device=device, show_progress=True,
    )
    val_dict = validation_step(
        model, scaler, loss_function, val_data_loader,
        device=device, show_progress=True,
    )

    # Learning rate of the first parameter group after this epoch's updates.
    cur_lr = optimizer.param_groups[0]['lr']
    print(f'Epoch: {epoch}, '
          f'lr: {cur_lr}, '
          f'Train loss: {train_dict["loss"]:.4f}, '
          f'Train acc start: {train_dict["accuracy_start"]:.4f}, '
          f'Train acc end: {train_dict["accuracy_end"]:.4f}, '
          f'Val loss: {val_dict["loss"]:.4f}, '
          f'Val acc start: {val_dict["accuracy_start"]:.4f}, '
          f'Val acc end: {val_dict["accuracy_end"]:.4f}, '
          f'Time: {train_dict["time"]:.4f}')

    # Record this epoch's training metrics ...
    history["train_loss"].append(train_dict["loss"])
    history["train_acc_start"].append(train_dict["accuracy_start"])
    history["train_acc_end"].append(train_dict["accuracy_end"])
    # ... and its validation metrics.
    history["val_loss"].append(val_dict["loss"])
    history["val_acc_start"].append(val_dict["accuracy_start"])
    history["val_acc_end"].append(val_dict["accuracy_end"])
    #scheduler.step(val_dict["loss"])
    #print(f"Evaluation Results: {eval_results}")
loop_end = timer()
print(f"Elapsed time: {(loop_end - loop_start):.4f}")

In [None]:
# Uncomment below line to save model to disk
#torch.save(model.state_dict(), "distilbert_mdl.pt")

# Evaluation

In [None]:
# Uncomment below line to load model from disk
#model.load_state_dict(torch.load("distilbert_mdl.pt"))

## Quantitative evaluation

In [None]:
# Build the paragraph lookup and the samples dataframe for the test documents,
# with the same helper and settings used for the training data.
test_paragraphs_mapper, test_df = build_mappers_and_dataframe_bert(tokenizer, tokenizer_fn_preprocess, test_data, limit_answers=1)

In [None]:
# Dataset and loader over the test samples. The order is kept fixed
# (shuffle=False): evaluation gains nothing from shuffling, and a stable order
# makes the run reproducible.
dataset_test_QA = CustomQADatasetBERT(tokenizer_fn_train, test_df, test_paragraphs_mapper)
test_data_loader = torch.utils.data.DataLoader(
    dataset_test_QA, collate_fn = bert_padder_collate_fn, 
    batch_size=params_dict["train_params"]["batch_size_test"], shuffle=False
)

In [None]:
# Load the raw test JSON; it is handed to the evaluation routine next to the predictions.
# The encoding is explicit so the file is not decoded with a platform-dependent
# default (JSON text is UTF-8).
with open(test_file_json, "r", encoding="utf-8") as f:
    dataset_json = json.load(f)
# Collect the model predictions over the test loader, then score them against the dataset.
pred_dict = build_evaluation_dict_bert(model, scaler, test_data_loader, test_paragraphs_mapper, tokenizer, device, show_progress=True)
eval_results = evaluate_predictions(dataset_json, pred_dict)
print(eval_results)

In [None]:
#OLD VERSION (EVALUATION V1) Evaluation Results: {'exact_match': 61.64616840113529, 'f1': 77.83523815041448}

## Simple qualitative evaluation

In [None]:
def get_answer_span_helper(context, question, model, tokenizer_fn, tokenizer, device="cpu"):
    """Predict the answer to `question` inside `context` and return it as text.

    The pair is tokenized with `tokenizer_fn(question, context)`, scored by
    `model`, and the tokens of the first window between the predicted start and
    end indices are decoded back to a string.

    Inference runs without gradient tracking. If `model` reports
    `training == True` it is switched to eval mode for the call (so dropout is
    inactive) and switched back to training mode afterwards.

    Args:
        context: text searched for the answer.
        question: question text.
        model: called with a dict holding "input_ids" and "attention_mask";
            its first two outputs are used as start and end scores.
        tokenizer_fn: callable `(question, context)` returning a mapping whose
            "input_ids" and "attention_mask" can be turned into tensors.
        tokenizer: object exposing `decode(ids, skip_special_tokens=True)`.
        device: device the input tensors are moved to.

    Returns:
        The decoded answer string.

    Note:
        `.item()` is called on the predicted indices, so the helper only works
        when a single start/end index is produced.
        NOTE(review): presumably this means the context must fit in one
        tokenizer window — confirm against SpanExtractor.
        NOTE(review): the slice `[start:end]` excludes the token at `end`;
        confirm whether the predicted end index is meant to be inclusive.
    """
    tokenized_input = tokenizer_fn(question, context)
    model_inputs = {
        "input_ids": torch.tensor(tokenized_input["input_ids"]).to(device), 
        "attention_mask": torch.tensor(tokenized_input["attention_mask"]).to(device)
    }
    # Temporarily leave training mode (if any) and skip autograd bookkeeping.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            output_span = model(model_inputs)
    finally:
        if was_training:
            model.train()
    start, end = SpanExtractor.extract_most_probable(output_span[0], output_span[1])
    start = start.item()
    end = end.item()
    # Only the first window's token ids are decoded.
    return tokenizer.decode(tokenized_input["input_ids"][0][start:end], skip_special_tokens=True)

In [None]:
# Tiny hand-written example for a qualitative check of the trained model.
context = "This is a test message, written to see if our model can correctly predict its outputs."
question = "Who needs to predict its outputs?"
# Use the notebook-wide `device` instead of a hard-coded "cuda", so the demo
# also runs on machines without a GPU.
pred_answer = get_answer_span_helper(context, question, model, tokenizer_fn_train, tokenizer, device=device)
print(pred_answer)