In [7]:
import torch
import evaluate
from transformers import (
    BertTokenizerFast,
    BertForQuestionAnswering,
)
from datasets import load_dataset
from utils import preprocess_and_tokenize, make_predictions, make_predictions_with_swa, post_processing_function
from swag_transformers.swag_bert import SwagBertForQuestionAnswering

In [2]:
# Run on the GPU when torch can see one; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_name = "bert-base-uncased"

# The fast tokenizer is used because it supports offsets mapping.
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Wrap the base model for SWAG (no_cov_mat=False). The wrapper is built
# from `model` before either object is moved to `device`.
swag_model = SwagBertForQuestionAnswering.from_base(
    model,
    no_cov_mat=False,
)

model = model.to(device)
swag_model = swag_model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Fetch SQuAD 2.0 from the Hugging Face hub.
squad_dataset = load_dataset("squad_v2")

# The official test set is hidden, so carve a dev split out of the training
# data (10%, no shuffling) and use the published validation set as the test set.
split = squad_dataset["train"].train_test_split(test_size=0.1, shuffle=False)

# Tiny subsets for testing: 1 train example, 10 dev examples, 1 test example.
train_set = split["train"].select(range(1))
dev_set = split["test"].select(range(10))
test_set = squad_dataset["validation"].select(range(1))

# Tokenize the three custom splits with the tokenizer loaded earlier.
tokenized_train, tokenized_dev, tokenized_test = preprocess_and_tokenize(
    train_set,
    dev_set,
    test_set,
    tokenizer,
)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [8]:
# SQuAD v2 metric; its results include exact/F1 plus HasAns/NoAns breakdowns.
metric = evaluate.load(path="squad_v2")

In [9]:
# Evaluate the fine-tuned baseline and the SWA model from their checkpoints
# on the dev subset, using one shared evaluation routine.

def evaluate_on_dev(predict_fn, qa_model, label, references):
    """Predict on the dev subset with ``qa_model``, score it, and print the scores.

    Reads the notebook-level ``dev_set``, ``tokenized_dev`` and ``metric``.

    Args:
        predict_fn: Callable invoked as ``predict_fn(qa_model, tokenized_dev)``;
            here ``make_predictions`` or ``make_predictions_with_swa``.
        qa_model: The model object handed to ``predict_fn``.
        label: Prefix for the printed metrics line (e.g. ``"Baseline"``).
        references: List of ``{"id": ..., "answers": ...}`` dicts passed to
            ``metric.compute`` as the gold references.

    Returns:
        Tuple ``(predictions, final_predictions, results)``: the output of
        ``predict_fn``, the output of ``post_processing_function``, and the
        value returned by ``metric.compute``.
    """
    predictions = predict_fn(qa_model, tokenized_dev)
    final_predictions = post_processing_function(dev_set, tokenized_dev, predictions)
    results = metric.compute(
        predictions=final_predictions.predictions,
        references=references,
    )
    print(f"{label}: ", results)
    return predictions, final_predictions, results


output_dir = "./model_1"

# The gold answers depend only on dev_set, so build them once and reuse them
# for both evaluations.
references = [{"id": ex['id'], "answers": ex['answers']} for ex in dev_set]

# --- Baseline ---------------------------------------------------------------
checkpoint_dir = f"{output_dir}/checkpoint-14825" # epoch 1
model = BertForQuestionAnswering.from_pretrained(checkpoint_dir)
model = model.to(device)
# NOTE(review): tokenized_dev was produced earlier with the previously loaded
# tokenizer; this reload rebinds `tokenizer` but nothing in this cell uses it.
# Confirm both tokenizers are equivalent, or re-tokenize after this point.
tokenizer = BertTokenizerFast.from_pretrained(output_dir)

predictions, final_predictions, results = evaluate_on_dev(
    make_predictions, model, "Baseline", references
)

# --- SWA --------------------------------------------------------------------
checkpoint_dir = f"{output_dir}/checkpoint-swag-epoch-1"
swag_model = SwagBertForQuestionAnswering.from_pretrained(checkpoint_dir)
swag_model = swag_model.to(device)

# NOTE(review): the recorded run of this cell printed "No parameters collected
# yet, you should first run collect_model!" and scored 0.0 exact / 0.0 F1 for
# SWA. Presumably the loaded checkpoint holds no collected SWAG parameters;
# verify the checkpoint before trusting the SWA numbers.
predictions, final_predictions, results = evaluate_on_dev(
    make_predictions_with_swa, swag_model, "SWA", references
)

  0%|          | 0/10 [00:00<?, ?it/s]

Baseline:  {'exact': 70.0, 'f1': 76.66666666666667, 'total': 10, 'HasAns_exact': 62.5, 'HasAns_f1': 70.83333333333333, 'HasAns_total': 8, 'NoAns_exact': 100.0, 'NoAns_f1': 100.0, 'NoAns_total': 2, 'best_exact': 70.0, 'best_exact_thresh': 0.0, 'best_f1': 76.66666666666667, 'best_f1_thresh': 0.0}


No parameters collected yet, you should first run collect_model!


  0%|          | 0/10 [00:00<?, ?it/s]

SWA:  {'exact': 0.0, 'f1': 0.0, 'total': 10, 'HasAns_exact': 0.0, 'HasAns_f1': 0.0, 'HasAns_total': 8, 'NoAns_exact': 0.0, 'NoAns_f1': 0.0, 'NoAns_total': 2, 'best_exact': 20.0, 'best_exact_thresh': 0.0, 'best_f1': 20.0, 'best_f1_thresh': 0.0}
