In [None]:
# Installations
! pip install -q transformers[torch] datasets


In [None]:
from datasets import load_dataset

# Load the dataset
squad = load_dataset("squad")

In [None]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
example = squad['train'][10]
for key in example:
    print(key, ":", example[key])

id : 5733bed24776f41900661188
title : University_of_Notre_Dame
context : The university is the major seat of the Congregation of Holy Cross (albeit not its official headquarters, which are in Rome). Its main seminary, Moreau Seminary, is located on the campus across St. Joseph lake from the Main Building. Old College, the oldest building on campus and located near the shore of St. Mary lake, houses undergraduate seminarians. Retired priests and brothers reside in Fatima House (a former retreat center), Holy Cross House, as well as Columba Hall near the Grotto. The university through the Moreau Seminary has ties to theologian Frederick Buechner. While not Catholic, Buechner has praised writers from Notre Dame and Moreau Seminary created a Buechner Prize for Preaching.
question : Where is the headquarters of the Congregation of the Holy Cross?
answers : {'text': ['Rome'], 'answer_start': [119]}


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def prepare_train_features(examples):

    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    offset_mapping = tokenized_examples.pop("offset_mapping")
    #print(offset_mapping[0])
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):

        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
# Apply the function to our data
tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)


In [None]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 88524
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10784
    })
})

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"finetune-BERT-squad",
    evaluation_strategy = "epoch",
    learning_rate=4e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08
)

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"].select(range(1000)),
    eval_dataset=tokenized_datasets["validation"].select(range(100)),
    data_collator=data_collator,
    tokenizer=tokenizer,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# Run the trainer
import torch

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.370839
2,No log,2.043788
3,No log,2.296517


TrainOutput(global_step=375, training_loss=2.0398819986979166, metrics={'train_runtime': 226.1114, 'train_samples_per_second': 13.268, 'train_steps_per_second': 1.658, 'total_flos': 587917702656000.0, 'train_loss': 2.0398819986979166, 'epoch': 3.0})

In [None]:
instance = squad['train'][20]
context = instance['context']
question = instance['question']

In [None]:
context

"All of Notre Dame's undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. The First Year of Studies program was established in 1962 to guide incoming freshmen in their first year at the school before they have declared a major. Each student is given an academic advisor from the program who helps them to choose classes that give them exposure to any major in which they are interested. The program also includes a Learning Resource Center which provides time management, collaborative learning, and subject tutoring. This program has been recognized previously, by U.S. News & World Report, as outstanding."

In [None]:
instance['answers']

{'text': ['Learning Resource Center'], 'answer_start': [496]}

In [None]:
given_answer = instance['answers']['text'][0]  # Assuming the first answer is the correct one
given_answer_start = instance['answers']['answer_start'][0]
given_answer, given_answer_start

('Learning Resource Center', 496)

In [None]:
# Tokenize the data
inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [None]:
inputs = {k: v.to(device) for k, v in inputs.items()}

In [None]:
# Get model's output
with torch.no_grad():
    output = model(**inputs)

In [None]:
# Get the predicted answer
start_idx = torch.argmax(output.start_logits)
end_idx = torch.argmax(output.end_logits)

predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))


In [None]:
predicted_answer, start_idx, end_idx, start_idx.item(), end_idx.item()

('learning resource center',
 tensor(111, device='cuda:0'),
 tensor(113, device='cuda:0'),
 111,
 113)

In [None]:
correct = (predicted_answer.lower() == given_answer.lower())
evaluation = 'Correct' if correct else f'Incorrect (Predicted: {predicted_answer}, Given: {given_answer})'

print(evaluation)

Correct


In [None]:
from tqdm import tqdm
import numpy as np
import collections

def evaluate_instance(instance, device):
    context = instance['context']
    question = instance['question']
    given_answer = instance['answers']['text'][0]  # Assuming the first answer is the correct one

    # Tokenize the data
    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Apply the BERT model
    with torch.no_grad():  # No need to calculate gradients
        output = model(**inputs)

    # Get the predicted answer
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))

    return predicted_answer.lower(), given_answer.lower()

# Evaluate a number of instances
correct_count = 0
f1_scores = []
recalls = []
em_scores = []
reciprocal_ranks = []

total_count = 100

for i in tqdm(range(total_count)):
    predicted_answer, given_answer = evaluate_instance(squad['train'][i], device)

    # Compute F1 score
    common = collections.Counter(predicted_answer.split()) & collections.Counter(given_answer.split())
    num_same = sum(common.values())
    if num_same == 0:
        f1_scores.append(0)
    else:
        precision = 1.0 * num_same / len(predicted_answer.split())
        recall = 1.0 * num_same / len(given_answer.split())
        f1_scores.append((2 * precision * recall) / (precision + recall))

    # Compute recall
    recalls.append(num_same / len(given_answer.split()))

    # Compute EM score
    em_scores.append(int(predicted_answer == given_answer))

    # Compute reciprocal rank
    reciprocal_rank = 0
    if num_same > 0:
        predicted_tokens = predicted_answer.split()
        given_tokens = given_answer.split()
        ranks = [next((i for i, v in enumerate(predicted_tokens) if v == token), 0) + 1 for token in given_tokens]
        reciprocal_rank = np.mean([1 / rank for rank in ranks if rank > 0])
    reciprocal_ranks.append(reciprocal_rank)

# Calculate and output the metrics
f1_score = np.mean(f1_scores)
recall = np.mean(recalls)
em_score = np.mean(em_scores)
mrr = np.mean(reciprocal_ranks)

print(f'F1 Score: {f1_score:.4f}')
print(f'Recall: {recall:.4f}')
print(f'EM Score: {em_score * 100:.2f}%')
print(f'MRR: {mrr:.4f}')


100%|██████████| 100/100 [00:01<00:00, 60.77it/s]


F1 Score: 0.7214
Recall: 0.7416
EM Score: 63.00%
MRR: 0.5692
