Extractive question answering with Hugging Face Transformers: a pretrained model is fine-tuned on the SQuAD dataset to predict answer spans within passages of text with high accuracy.

In [None]:
# download and cache the SQuAD dataset
from datasets import load_dataset

# the returned object is indexed by split name ("train", "validation") in the cells below
raw_datasets = load_dataset("squad")

SQuAD (Stanford Question Answering Dataset) is used as an academic benchmark for extractive question answering

In [None]:
# viewing the dataset object
# (kept as a bare expression on the last line of the cell so the notebook displays it)
raw_datasets

In [None]:
# show the context, question and answers of the first training example
first_example = raw_datasets["train"][0]
print("Context:\n", first_example["context"], "\n")
print("Question:\n", first_example["question"], "\n")
print("Answer:\n", first_example["answers"])

The answer_start field contains the starting character index of each answer in the context

In [None]:
# during training, there should be only one possible answer
# keep only the training examples whose number of answer texts is not exactly one;
# the displayed result shows how many such examples exist
def _answer_count_is_not_one(example):
    return len(example["answers"]["text"]) != 1


raw_datasets["train"].filter(_answer_count_is_not_one)

In [None]:
# evaluation examples can carry several reference answers, identical or different;
# print the answers of validation examples 0 and 2
validation_split = raw_datasets["validation"]
for example_index in (0, 2):
    print(validation_split[example_index]["answers"])

In [None]:
# the context and question of validation example 2 show that each of the
# reference answers printed above is indeed acceptable
third_validation_example = raw_datasets["validation"][2]
print(third_validation_example["context"])
print(third_validation_example["question"])

Encoder-only models like BERT tend to be great at extracting answers to factoid questions like “Who invented the Transformer architecture?” but fare poorly when given open-ended questions like “Why is the sky blue?” In these more challenging cases, encoder-decoder models like T5 and BART are typically used to synthesize the information in a way that’s quite similar to text summarization.

In [None]:
# using a tokenizer to convert the text in the input into IDs the model can understand
from transformers import AutoTokenizer

# any model can be used that has a fast tokenizer implemented
# refer https://huggingface.co/docs/transformers/index#supported-frameworks
# model_checkpoint is reused further down to load the model that gets fine-tuned
model_checkpoint = "bert-base-cased"
# module-level tokenizer: the preprocessing functions below read this name directly
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The Fast Tokenizer is a component of Hugging Face's Transformers library that efficiently tokenizes text data into numerical representations, optimizing speed and memory usage for large-scale natural language processing tasks

In [None]:
# checking for fast tokenizer
# (the cells below rely on offset mappings and sequence_ids(), hence the check)
tokenizer.is_fast

When we pass to our tokenizer the question and the context together it properly inserts the special tokens to form a sentence like this:
[CLS] question [SEP] context [SEP]

The labels will then be the indices of the tokens starting and ending the answer, and the model will be tasked with predicting one start logit and one end logit per token in the input.

In [None]:
# tokenize the first training example as a (question, context) pair and
# decode it again to see where the special tokens were inserted
first_train_example = raw_datasets["train"][0]
context = first_train_example["context"]
question = first_train_example["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])

To handle contexts that are too long, the tokenizer call below splits the context into several overlapping chunks using these arguments:
- max_length to set the maximum length (here 100)
- truncation="only_second" to truncate the context (which is in the second position) when the question with its context is too long
- stride to set the number of overlapping tokens between two successive chunks (here 50)
- return_overflowing_tokens=True to let the tokenizer know we want the overflowing tokens

In [None]:
# split the long context into overlapping chunks of at most 100 tokens
chunking_kwargs = {
    "max_length": 100,
    "truncation": "only_second",
    "stride": 50,
    "return_overflowing_tokens": True,
}
inputs = tokenizer(question, context, **chunking_kwargs)

# decode every chunk to inspect where the context was cut
for chunk_ids in inputs["input_ids"]:
    print(tokenizer.decode(chunk_ids))

In [None]:
# same chunked tokenization, additionally asking for the character offsets of every token
chunking_kwargs_with_offsets = {
    "max_length": 100,
    "truncation": "only_second",
    "stride": 50,
    "return_overflowing_tokens": True,
    "return_offsets_mapping": True,
}
inputs = tokenizer(question, context, **chunking_kwargs_with_offsets)
inputs.keys()

In [None]:
# for every produced feature, the index of the input example it was cut from
inputs["overflow_to_sample_mapping"]

In [None]:
#  when we tokenize more examples "overflow_to_sample_mapping" will become more useful
train_slice = raw_datasets["train"][2:6]
inputs = tokenizer(
    train_slice["question"],
    train_slice["context"],
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

# several features can originate from the same example
print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

This information will be useful to map each feature we get to its corresponding label, which is:
- (0, 0) if the answer is not in the corresponding span of the context
- (start_position, end_position) if the answer is in the corresponding span of the context, with start_position being the index of the token (in the input IDs) at the start of the answer and end_position being the index of the token (in the input IDs) where the answer ends

In [None]:
# sequence_ids() tells which tokens of a feature belong to the context (id 1)
# for every feature, find the token indices that start and end the answer
answers = raw_datasets["train"][2:6]["answers"]
start_positions, end_positions = [], []

feature_sources = zip(inputs["offset_mapping"], inputs["overflow_to_sample_mapping"])
for feature_idx, (offsets, sample_idx) in enumerate(feature_sources):
    gold = answers[sample_idx]
    start_char = gold["answer_start"][0]
    end_char = start_char + len(gold["text"][0])
    sequence_ids = inputs.sequence_ids(feature_idx)

    # Locate the first and the last context token of this feature
    context_start = 0
    while sequence_ids[context_start] != 1:
        context_start += 1
    context_end = context_start
    while sequence_ids[context_end] == 1:
        context_end += 1
    context_end -= 1

    answer_in_chunk = (
        offsets[context_start][0] <= start_char
        and offsets[context_end][1] >= end_char
    )
    if not answer_in_chunk:
        # The answer is not fully inside this chunk of the context: label is (0, 0)
        start_positions.append(0)
        end_positions.append(0)
        continue

    # Walk forward past every token that starts at or before the answer start;
    # the token just before the stopping point is the first answer token
    token_idx = context_start
    while token_idx <= context_end and offsets[token_idx][0] <= start_char:
        token_idx += 1
    start_positions.append(token_idx - 1)

    # Walk backward past every token that ends at or after the answer end;
    # the token just after the stopping point is the last answer token
    token_idx = context_end
    while token_idx >= context_start and offsets[token_idx][1] >= end_char:
        token_idx -= 1
    end_positions.append(token_idx + 1)

start_positions, end_positions

In [None]:
# compare the theoretical answer with the decoded span of tokens from 83 to 85
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

# decode the token span delimited by the computed labels (end index is inclusive)
start, end = start_positions[idx], end_positions[idx]
labeled_tokens = inputs["input_ids"][idx][start : end + 1]
labeled_answer = tokenizer.decode(labeled_tokens)

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")

In [None]:
# feature 4 was labelled (0, 0): decode the whole feature to compare it with the answer text
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

feature_token_ids = inputs["input_ids"][idx]
decoded_example = tokenizer.decode(feature_token_ids)
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")

In [None]:
# preprocessing function for the complete dataset
# padding every feature to the maximum length

# both constants are read as module-level names by the preprocessing functions below
max_length = 384  # passed as the tokenizer's max_length (and features are padded to it)
stride = 128  # passed as the tokenizer's stride (overlap between successive chunks)


def preprocess_training_examples(examples):
    """Tokenize a batch of training examples and label each feature with its answer span.

    Questions are stripped of surrounding whitespace and tokenized together with
    their contexts using the module-level ``tokenizer``, ``max_length`` and
    ``stride``; only the context is truncated, overflowing tokens become extra
    features, and every feature is padded to ``max_length``.

    Args:
        examples: batch with "question", "context" and "answers" columns; only
            the first entry of each answer's "answer_start" / "text" is used.

    Returns:
        The tokenizer output without "offset_mapping" and
        "overflow_to_sample_mapping", extended with "start_positions" and
        "end_positions" (both 0 when the answer is not fully inside the feature).
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, (offsets, sample_idx) in enumerate(zip(offset_mapping, sample_map)):
        gold = answers[sample_idx]
        start_char = gold["answer_start"][0]
        end_char = start_char + len(gold["text"][0])
        sequence_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and the last context token (sequence id 1)
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = context_start
        while sequence_ids[context_end] == 1:
            context_end += 1
        context_end -= 1

        answer_in_chunk = (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        )
        if not answer_in_chunk:
            # The answer is not fully inside this chunk: label is (0, 0)
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward past every token starting at or before the answer start
        token_idx = context_start
        while token_idx <= context_end and offsets[token_idx][0] <= start_char:
            token_idx += 1
        start_positions.append(token_idx - 1)

        # Walk backward past every token ending at or after the answer end
        token_idx = context_end
        while token_idx >= context_start and offsets[token_idx][1] >= end_char:
            token_idx -= 1
        end_positions.append(token_idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# apply function to complete training set
# the original columns are dropped because one example can yield several features,
# so the mapped dataset no longer lines up row-for-row with the raw one
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
# a bare tuple expression is only displayed when it is the last statement of a
# cell, so the two sizes are printed explicitly instead of being discarded
num_train_examples, num_train_features = len(raw_datasets["train"]), len(train_dataset)
print("No. of examples: ", num_train_examples, "| No. of features: ", num_train_features)
print("No. of features added: ", num_train_features - num_train_examples)

In [None]:
# preprocessing validation set
def preprocess_validation_examples(examples):
    """Tokenize a batch of validation examples, keeping what post-processing needs.

    Questions are stripped and tokenized together with their contexts using the
    module-level ``tokenizer``, ``max_length`` and ``stride`` (same settings as
    the training preprocessing).

    Args:
        examples: batch with "id", "question" and "context" columns.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", with an
        "example_id" entry naming the source example of every feature and with
        "offset_mapping" entries set to None for tokens outside the context.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")

    # Remember which example every feature was cut from
    source_ids = examples["id"]
    example_ids = [source_ids[sample_idx] for sample_idx in sample_map]

    # Blank out the offsets of every token that is not part of the context
    # (sequence id 1) so they can be recognised and skipped later
    for feature_idx, offsets in enumerate(inputs["offset_mapping"]):
        sequence_ids = inputs.sequence_ids(feature_idx)
        inputs["offset_mapping"][feature_idx] = [
            span if seq_id == 1 else None
            for span, seq_id in zip(offsets, sequence_ids)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [None]:
# apply function to validation dataset
# the original columns are dropped because one example can yield several features
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
# a bare tuple expression is only displayed when it is the last statement of a
# cell, so the two sizes are printed explicitly instead of being discarded
num_val_examples, num_val_features = len(raw_datasets["validation"]), len(validation_dataset)
print("No. of examples: ", num_val_examples, "| No. of features: ", num_val_features)
print("No. of features added: ", num_val_features - num_val_examples)

The model will output logits for the start and end positions of the answer in the input IDs
- Mask the start and end logits corresponding to tokens outside of the context
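- Attribute a score to each (start_token, end_token) pair built from the n_best highest start logits and the n_best highest end logits (with n_best=20); in terms of probabilities, this score would be the product of the two corresponding probabilities
- In practice the scores are logit scores, obtained by summing the start and end logits rather than multiplying probabilities, because $\log(ab) = \log(a) + \log(b)$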
- Look for the pair with the maximum score that yielded a valid answer (e.g., a start_token lower than end_token).

In [None]:
# generate predictions on a small part of the validation set with an already
# trained question-answering checkpoint
trained_checkpoint = "distilbert-base-cased-distilled-squad"
small_eval_set = raw_datasets["validation"].select(range(100))

# preprocess_validation_examples reads the module-level `tokenizer`, so it is
# temporarily rebound to the trained checkpoint's tokenizer before mapping
tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
validation_columns = raw_datasets["validation"].column_names
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_columns,
)

In [None]:
# restore the tokenizer of the checkpoint that will be fine-tuned (model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# run the trained model once on the whole small evaluation set
import torch
from transformers import AutoModelForQuestionAnswering

# drop the bookkeeping columns that are not expected by the model
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

# use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# a single batch holding every remaining column, moved to the chosen device
model_columns = eval_set_for_model.column_names
batch = {name: eval_set_for_model[name].to(device) for name in model_columns}

trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)
trained_model = trained_model.to(device)

# inference only: no gradients needed
with torch.no_grad():
    outputs = trained_model(**batch)

In [None]:
# Trainer gives us predictions as NumPy arrays, so bring the start and end
# logits back to the CPU and convert them to the same format
start_logits, end_logits = (
    logits.cpu().numpy() for logits in (outputs.start_logits, outputs.end_logits)
)

In [None]:
# one example may have been split into several features in eval_set, so build a
# lookup from each example id to the indices of the features cut from it
import collections

example_to_features = collections.defaultdict(list)
for feature_position, source_example_id in enumerate(eval_set["example_id"]):
    example_to_features[source_example_id].append(feature_position)

In [None]:
# looping through all the examples and associated features
# pick the one with the best logit score

import numpy as np

n_best = 20  # how many of the highest start / end logits are combined per feature
max_answer_length = 30  # longest accepted answer, counted in tokens
predicted_answers = []

# read the offsets column once instead of re-reading it for every feature
all_offsets = eval_set["offset_mapping"]

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = all_offsets[feature_index]

        # indices of the n_best largest logits, best first
        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                # (offsets were set to None outside the context during preprocessing)
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    # max() raises ValueError on an empty list, so fall back to an empty
    # prediction when no candidate span survived the filters
    if answers:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
    else:
        predicted_answers.append({"id": example_id, "prediction_text": ""})

In [None]:
# load the metric format for SQuAD
import evaluate

# module-level metric object; compute_metrics below reads this name directly
metric = evaluate.load("squad")

# This metric expects the predicted answers as a list of dictionaries with one key for the ID of the example and one key for the predicted text
# and the theoretical answers as a list of dictionaries with one key for the ID of the example and one key for the possible answers

In [None]:
# reference answers in the format the metric expects: the example id plus its possible answers
theoretical_answers = [
    dict(id=reference["id"], answers=reference["answers"])
    for reference in small_eval_set
]

In [None]:
# sanity check: show the first prediction followed by the first reference
for answer_list in (predicted_answers, theoretical_answers):
    print(answer_list[0])

In [None]:
# compute the metric
# (last expression of the cell, so the notebook displays the returned scores)
metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
# function to apply to the complete dataset to compute metrics

from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    """Turn start/end logits into text predictions and score them with ``metric``.

    Reads the module-level ``n_best``, ``max_answer_length`` and ``metric``.

    Args:
        start_logits: per-feature start logits, indexable by feature position.
        end_logits: per-feature end logits, indexable by feature position.
        features: preprocessed features; each provides "example_id" and
            "offset_mapping" (None for tokens outside the context).
        examples: original examples providing "id", "context" and "answers".

    Returns:
        The result of ``metric.compute`` on the predicted and reference answers.
    """
    # Group feature positions by the example they were cut from
    features_of_example = collections.defaultdict(list)
    for feature_pos, feature in enumerate(features):
        features_of_example[feature["example_id"]].append(feature_pos)

    predicted_answers = []
    for example in tqdm(examples):
        example_id, context = example["id"], example["context"]
        candidates = []

        # Collect candidate spans from every feature of this example
        for feature_pos in features_of_example[example_id]:
            feature_start_logits = start_logits[feature_pos]
            feature_end_logits = end_logits[feature_pos]
            offsets = features[feature_pos]["offset_mapping"]

            # Positions of the n_best largest logits, best first
            top_starts = np.argsort(feature_start_logits)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(feature_end_logits)[-1 : -n_best - 1 : -1].tolist()
            for start_index in top_starts:
                # A start outside the context can never yield a valid span
                if offsets[start_index] is None:
                    continue
                for end_index in top_ends:
                    if offsets[end_index] is None:
                        continue
                    # Reject spans that end before they start or are too long
                    span_length = end_index - start_index + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    candidates.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": feature_start_logits[start_index]
                            + feature_end_logits[end_index],
                        }
                    )

        # Keep the highest-scoring candidate, or an empty string when there is none
        if candidates:
            prediction_text = max(candidates, key=lambda c: c["logit_score"])["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
# sanity check: score the logits computed earlier on the small evaluation set with the function
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

In [None]:
# fine-tuning the model
# create the model
# (loaded from model_checkpoint, the same name the tokenizer was restored from)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
# # Login to HuggingFace to push model
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
# set training arguments

from transformers import TrainingArguments

# NOTE(review): the output directory is named "distilbert-base-cased" although
# model_checkpoint is "bert-base-cased" - confirm the name is intended
args = TrainingArguments(
    output_dir="distilbert-base-cased",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="no",
    save_strategy="epoch",
    push_to_hub=False,
    # fp16=True,  # uncomment for mixed-precision training on GPUs
)

In [None]:
# training the model

from transformers import Trainer

# wire together the model, the training arguments and the preprocessed datasets
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
# evaluation_strategy is "no" in args, so scoring is done separately after training
trainer.train()

In [None]:
# predict() returns the model predictions together with other outputs;
# for this model the predictions are the pair (start logits, end logits)

prediction_output = trainer.predict(validation_dataset)
start_logits, end_logits = prediction_output.predictions
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])

In [None]:
# push latest version of model to hub
# NOTE(review): this runs although args sets push_to_hub=False and the
# notebook_login() cell is commented out - presumably it needs a previously
# stored Hub credential; confirm before running
trainer.push_to_hub(commit_message="Training complete")
# we can use the inference widget on the Model Hub to test the model

Custom Training Loop

In [None]:
# # build the DataLoaders from our datasets

# from torch.utils.data import DataLoader
# from transformers import default_data_collator

# train_dataset.set_format("torch")
# validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
# validation_set.set_format("torch")

# train_dataloader = DataLoader(
#     train_dataset,
#     shuffle=True,
#     collate_fn=default_data_collator,
#     batch_size=8,
# )
# eval_dataloader = DataLoader(
#     validation_set, collate_fn=default_data_collator, batch_size=8
# )

In [None]:
# reinstantiate model
# model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
# # AdamW = Adam + fix in weight decay
# from torch.optim import AdamW

# optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# # prepare the accelerator
# from accelerate import Accelerator

# accelerator = Accelerator(fp16=False)
# model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
#     model, optimizer, train_dataloader, eval_dataloader
# )

In [None]:
# # using a linear schedule
# from transformers import get_scheduler

# num_train_epochs = 3
# num_update_steps_per_epoch = len(train_dataloader)
# num_training_steps = num_train_epochs * num_update_steps_per_epoch

# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )

In [None]:
# # training loop
# # forward pass through the model, then backward pass and optimizer step

# from tqdm.auto import tqdm
# import torch

# progress_bar = tqdm(range(num_training_steps))

# for epoch in range(num_train_epochs):
#     # Training
#     model.train()
#     for step, batch in enumerate(train_dataloader):
#         outputs = model(**batch)
#         loss = outputs.loss
#         accelerator.backward(loss)

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

#     # Evaluation
#     model.eval()
#     start_logits = []
#     end_logits = []
#     accelerator.print("Evaluation!")
#     for batch in tqdm(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(**batch)

#         start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
#         end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

#     start_logits = np.concatenate(start_logits)
#     end_logits = np.concatenate(end_logits)
#     start_logits = start_logits[: len(validation_dataset)]
#     end_logits = end_logits[: len(validation_dataset)]

#     metrics = compute_metrics(
#         start_logits, end_logits, validation_dataset, raw_datasets["validation"]
#     )
#     print(f"epoch {epoch}:", metrics)

#     # # Save and upload
#     # accelerator.wait_for_everyone()
#     # unwrapped_model = accelerator.unwrap_model(model)
#     # unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
#     # if accelerator.is_main_process:
#     #     tokenizer.save_pretrained(output_dir)
#     #     repo.push_to_hub(
#     #         commit_message=f"Training in progress epoch {epoch}", blocking=False
#     #     )

In [None]:
# accelerator.wait_for_everyone()
# unwrapped_model = accelerator.unwrap_model(model)
# unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [None]:
# using the fine-tuned model

from transformers import pipeline

# Replace this with your own checkpoint
# (note: this rebinds model_checkpoint, which earlier cells used for the base model)
model_checkpoint = "huggingface-course/bert-finetuned-squad"
question_answerer = pipeline("question-answering", model=model_checkpoint)

# a short passage and a question whose answer is contained in it
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)