<a href="https://colab.research.google.com/github/alexpod1000/SQuAD-QA/blob/main/Evaluate_model_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only bootstrap: the body runs only when the IPython shell reports a
# google.colab environment; elsewhere this cell does nothing.
if 'google.colab' in str(get_ipython()):
    # Clone the project repository into the current directory
    !git clone https://github.com/alexpod1000/SQuAD-QA.git
    # Move into the repository root: later cells import project modules and
    # open data files through paths relative to it
    %cd SQuAD-QA/
    !pwd

    # NOTE(review): the transformers version is not pinned, so the installed
    # release depends on when this cell is run.
    !pip install transformers

In [2]:
# External imports
import copy
import nltk
import numpy as np
import pandas as pd
import string
import torch
import json

from functools import partial
from nltk.tokenize import TreebankWordTokenizer, SpaceTokenizer
from transformers import AutoTokenizer
from typing import Tuple, List, Dict, Any, Union

# Project imports
from data_loading.utils import bert_padder_collate_fn_eval
from data_loading.qa_dataset import CustomQADatasetBERT_eval
from models import possible_models_dict
from squad_data.parser import SquadFileParser
from squad_data.utils import build_mappers_and_dataframe_bert_eval
from evaluation.evaluate import evaluate_predictions
from evaluation.utils import build_evaluation_dict_bert
from utils import split_dataframe

In [3]:
# Key of the experiment configuration to evaluate. Other keys that were
# selectable here (previously left as commented-out assignments):
#   "distilbert", "bert", "albert", "distilroberta"
current_selected_experiment = "distilroberta_extra_linear"
# Configuration of the chosen experiment; later cells read "tokenizer_url",
# "tokenizer_max_length", "span_model" and "train_params" from it.
params_dict = possible_models_dict[current_selected_experiment]

### Parse the JSON and get the data

In [4]:
# Locations of the SQuAD JSON files, relative to the repository root.
train_file_json = "squad_data/data/training_set.json"
test_file_json = "squad_data/data/dev-v1.1.json"

# One parser per file.
train_parser = SquadFileParser(train_file_json)
test_parser = SquadFileParser(test_file_json)

# NOTE(review): train_parser / train_data are not referenced by any later cell
# visible in this notebook; only test_data feeds the evaluation. Confirm
# whether parsing the training set is still needed here.
train_data = train_parser.parse_documents()
test_data = test_parser.parse_documents()

### Prepare the mappers and dataframe

In [5]:
def bert_tokenizer_fn(question, paragraph, tokenizer, max_length=384, doc_stride=128):
    """Tokenize a (question, paragraph) pair into fixed-length windows.

    Args:
        question: Question text, passed to the tokenizer as the first sequence.
        paragraph: Context text, passed to the tokenizer as the second sequence.
        tokenizer: Callable tokenizer accepting the keyword arguments used
            below (the notebook passes a Hugging Face ``AutoTokenizer``).
        max_length: Length every returned window is truncated/padded to.
        doc_stride: Forwarded as ``stride``; with a Hugging Face tokenizer this
            is the number of tokens shared by consecutive overflow windows.

    Returns:
        The tokenizer's output for the pair, requested with overflowing
        windows and offset mappings included.
    """
    return tokenizer(
        question,
        paragraph,
        # The paragraph is always the second sequence, so it is the only one
        # that may be truncated, whatever the tokenizer's padding side is.
        # (Choosing "only_first" for left-padding tokenizers without also
        # swapping the arguments would truncate the question instead.)
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

In [6]:
# Load the pretrained tokenizer named by the selected experiment.
tokenizer = AutoTokenizer.from_pretrained(params_dict["tokenizer_url"])
# Tokenizer function handed to build_mappers_and_dataframe_bert_eval; it uses a
# window 3 tokens shorter than the configured maximum.
# NOTE(review): the reason for the -3 margin is not visible in this notebook;
# presumably it leaves headroom for special tokens — confirm.
tokenizer_fn_preprocess = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"]-3)
# Tokenizer function handed to the evaluation dataset (full configured length).
# Despite the "_train" suffix it is the one used for evaluation in this notebook.
tokenizer_fn_train = partial(bert_tokenizer_fn, tokenizer=tokenizer, max_length=params_dict["tokenizer_max_length"])

# Model setup

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import transformers

from timeit import default_timer as timer
from tqdm import tqdm
from transformers.optimization import AdamW

from models.utils import SpanExtractor

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Automatic Mixed Precision is driven through torch.cuda.amp in this notebook,
# which only takes effect on CUDA; enable it only when actually running on GPU
# so the reported status matches what really happens.
use_amp = device == "cuda"
print(f"The device is {device}")
print(f"Automatic Mixed Precision Enabled: {use_amp}")

The device is cuda
Automatic Mixed Precision Enabled: True


Model:

(input_ids, attention_mask) -> (answer_start, answer_end) // for each token in input_ids

In [9]:
# Instantiate the span model class configured for the selected experiment and
# move it to the chosen device.
# NOTE(review): 768 and 2 are passed positionally; presumably the encoder
# hidden size and the number of outputs per token (start/end) — confirm
# against the span model's constructor.
model = params_dict["span_model"](768, 2, params_dict, dropout_rate=params_dict["train_params"]["dropout_rate"]).to(device)
# Gradient scaler tied to the AMP flag; it is later passed to
# build_evaluation_dict_bert together with the model.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# Evaluation

In [10]:
# Put the model in inference mode (disables dropout) before evaluating it.
model.eval()
# Load the fine-tuned weights from disk. map_location remaps tensors that were
# saved from a GPU onto the current device, so the checkpoint also loads on
# CPU-only machines.
# NOTE(review): the checkpoint path is fixed to the distilroberta extra-linear
# run; it must be changed by hand if another experiment is selected above.
model.load_state_dict(
    torch.load("trained_models/distilroberta_extralinear_google_2_epochs.pt", map_location=device)
)

<All keys matched successfully>

## Quantitative evaluation

In [11]:
# Build the paragraph mapper and the evaluation dataframe from the parsed test
# data, using the shorter-window preprocessing tokenizer function.
test_paragraphs_mapper, test_df = build_mappers_and_dataframe_bert_eval(tokenizer, tokenizer_fn_preprocess, test_data)

In [12]:
# Wrap the evaluation dataframe and paragraph mapper in the project's BERT
# evaluation dataset.
dataset_test_QA = CustomQADatasetBERT_eval(tokenizer_fn_train, test_df, test_paragraphs_mapper)
# Evaluation gains nothing from a random order: iterate deterministically so
# that repeated runs visit exactly the same batches.
test_data_loader = torch.utils.data.DataLoader(
    dataset_test_QA,
    collate_fn=bert_padder_collate_fn_eval,
    batch_size=params_dict["train_params"]["batch_size_test"],
    shuffle=False,
)

In [13]:
# Load the raw test-set JSON that the scorer compares predictions against.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(test_file_json, "r", encoding="utf-8") as f:
    dataset_json = json.load(f)
# Run the model over the test loader and collect its predictions.
# NOTE(review): pred_dict is presumably keyed by question id — confirm against
# build_evaluation_dict_bert.
pred_dict = build_evaluation_dict_bert(model, scaler, test_data_loader, test_paragraphs_mapper, tokenizer, device, show_progress=True)
# Score the predictions against the reference answers and show the metrics.
eval_results = evaluate_predictions(dataset_json, pred_dict)
print(eval_results)

100%|██████████| 1350/1350 [00:50<00:00, 26.68it/s]


{
  "exact": 80.91769157994324,
  "f1": 88.06750112914818,
  "total": 10570,
  "HasAns_exact": 80.91769157994324,
  "HasAns_f1": 88.06750112914818,
  "HasAns_total": 10570
}


## Simple qualitative evaluation

In [None]:
def get_answer_span_helper(context, question, model, tokenizer_fn, tokenizer, device="cpu"):
    """Predict an answer for ``question`` over ``context`` and return it as text.

    Args:
        context: Paragraph text the answer is extracted from.
        question: Question text.
        model: Span model (a ``torch.nn.Module``), called with a dict holding
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        tokenizer_fn: Callable invoked as ``tokenizer_fn(question, context)``;
            its result must expose ``"input_ids"`` and ``"attention_mask"``.
        tokenizer: Tokenizer used to decode the selected token ids.
        device: Device the input tensors are moved to; it should be the device
            the model lives on.

    Returns:
        The decoded text of tokens ``[start:end]`` of the first tokenized
        window, with special tokens skipped.
    """
    tokenized_input = tokenizer_fn(question, context)
    model_inputs = {
        "input_ids": torch.tensor(tokenized_input["input_ids"]).to(device),
        "attention_mask": torch.tensor(tokenized_input["attention_mask"]).to(device),
    }
    # Pure inference: disable dropout and gradient tracking for the forward
    # pass, then restore whatever mode the model was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output_span = model(model_inputs)
    finally:
        model.train(was_training)
    start, end = SpanExtractor.extract_most_probable(output_span[0], output_span[1])
    # NOTE(review): .item() needs single-element tensors and only window 0 is
    # decoded below, so this presumably assumes the pair fits in one window —
    # confirm the behavior for contexts that overflow into several windows.
    start = start.item()
    end = end.item()
    # NOTE(review): the slice excludes index `end`; confirm whether
    # extract_most_probable returns an inclusive or exclusive end index.
    return tokenizer.decode(tokenized_input["input_ids"][0][start:end], skip_special_tokens=True)

In [None]:
# Toy context/question pair for a quick sanity check of the loaded model.
context = "This is a test message, written to see if our model can correctly predict its outputs."
question = "Who needs to predict its outputs?"
# Reuse the notebook-wide `device` (the one the model was moved to) instead of
# hard-coding "cuda", so the demo also runs on CPU-only machines.
pred_answer = get_answer_span_helper(context, question, model, tokenizer_fn_train, tokenizer, device=device)
print(pred_answer)