In [1]:
# Step 1: Install dependencies
!pip install datasets transformers evaluate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Step 2: Import necessary libraries
import torch
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator
from transformers import pipeline

In [13]:
# Step 3: Choose the pretrained checkpoint and load its tokenizer and QA model
model_name = "bert-large-uncased-whole-word-masking"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
# The question-answering head (qa_outputs) is not part of this checkpoint and is
# newly initialised, so the model has to be fine-tuned before it is useful.
model = BertForQuestionAnswering.from_pretrained(model_name)

# Step 4: Load the SQuAD v2 dataset
dataset = load_dataset("squad_v2")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-large-uncased-whole-word-masking and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Step 5: Preprocess dataset for QA task
def prepare_train_features(examples, max_length=384, doc_stride=128):
    """Tokenize a batch of QA examples and label every window with its answer span.

    Each question/context pair is tokenized with only the context being
    truncated; contexts that do not fit are split into several overlapping
    windows, so one example can produce more than one feature. For every
    window the character span of the first listed answer is converted into
    start/end token indices. Windows that do not fully contain the answer,
    and examples that have no answer, are labelled with the index of the
    [CLS] token for both start and end.

    Args:
        examples: Batch mapping with "question", "context" and "answers"
            entries (lists of equal length), as supplied by
            ``Dataset.map(..., batched=True)``. Each item of "answers" is a
            mapping with "text" and "answer_start" lists.
        max_length: Maximum number of tokens per window; every window is
            padded to this length.
        doc_stride: Value passed to the tokenizer as ``stride`` (the overlap
            between consecutive windows of the same context).

    Returns:
        The tokenizer output for the batch, with "start_positions" and
        "end_positions" added and the "overflow_to_sample_mapping" and
        "offset_mapping" entries removed.
    """
    # Leading whitespace in a question carries no information. It is removed,
    # as in the Hugging Face reference preprocessing, so it can never count
    # against the length budget. A new list is built so the caller's batch is
    # left unmodified.
    questions = [question.lstrip() for question in examples["question"]]

    tokenized_examples = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # For each window: index of the example it was produced from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # For each window: per-token (start_char, end_char) offsets.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        answers = examples["answers"][sample_mapping[i]]

        # Unanswerable example: point both labels at [CLS].
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Only the first listed answer is used as the training target.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of this window that belong to the context
        # (sequence id 1).
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        answer_in_window = (
            offsets[token_start_index][0] <= start_char
            and offsets[token_end_index][1] >= end_char
        )
        if not answer_in_window:
            # The answer lies (partly) outside this window: label with [CLS].
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Advance past every token that starts at or before the answer start;
        # the token just before the stopping point is the answer's first token.
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk backwards past every token that ends at or after the answer
        # end; the token just after the stopping point is the answer's last token.
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Work on a fixed random subset of 10k training examples; the seed keeps the
# selection the same from run to run.
small_train = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(10000))
)

# Tokenize the subset in batches and drop the raw columns, leaving only the
# features produced by prepare_train_features.
train_dataset = small_train.map(
    prepare_train_features,
    batched=True,
    remove_columns=small_train.column_names,
)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [15]:
# Step 6: Setup training arguments
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./bert-qa",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best validation loss when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Step 7: Setup Trainer object
# Tokenize the validation split with the same preprocessing as the training
# data. It is kept in its own variable so it can be inspected and reused.
eval_dataset = dataset["validation"].map(
    prepare_train_features,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is
    # its replacement and receives the same tokenizer object.
    processing_class=tokenizer,
    data_collator=default_data_collator,
)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

  trainer = Trainer(


In [16]:
# Step 8: Fine-tune the model
# Runs the training loop configured on `trainer`. Because the call is the last
# expression in the cell, the returned TrainOutput (global step, training loss
# and runtime metrics) is shown as the cell's output.
trainer.train()


Epoch,Training Loss,Validation Loss
1,1.549,1.594494
2,0.9915,1.711702


TrainOutput(global_step=3372, training_loss=1.4985477773037386, metrics={'train_runtime': 6702.1283, 'train_samples_per_second': 3.017, 'train_steps_per_second': 0.503, 'total_flos': 1.4085234596109312e+16, 'train_loss': 1.4985477773037386, 'epoch': 2.0})

In [17]:
# Step 9: Function for inference on custom context and question
def answer_question(question: str, context: str, handle_impossible_answer: bool = False) -> dict:
    """Answer a question about a passage using the fine-tuned model.

    Args:
        question: The question to ask.
        context: The passage in which to look for the answer.
        handle_impossible_answer: Forwarded to the question-answering
            pipeline. When True the pipeline is allowed to report that the
            passage contains no answer (an empty answer string), which matches
            how SQuAD v2 is labelled. Defaults to False, the pipeline's own
            default, so existing callers get the same result as before.

    Returns:
        The pipeline's result dict, with "score", "start", "end" and "answer"
        entries.
    """
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
    return qa_pipeline(
        question=question,
        context=context,
        handle_impossible_answer=handle_impossible_answer,
    )

# Example test: ask about a short hand-written passage
custom_context = (
    "BERT stands for Bidirectional Encoder Representations from Transformers. "
    "It is a transformer-based model for natural language processing tasks."
)
custom_question = "What does BERT stand for?"
print(answer_question(question=custom_question, context=custom_context))


Device set to use cuda:0


{'score': 0.8972708479850553, 'start': 16, 'end': 71, 'answer': 'Bidirectional Encoder Representations from Transformers'}
