In [None]:
# !pip install bert_score
# !pip install evaluate

In [None]:
# !pip install transformers

In [10]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from bert_score import score
import json
from tqdm import tqdm

def evaluate_bertscore(model_path, data_path):
    model = BertForQuestionAnswering.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    with open(data_path, "r", encoding="utf-8") as f:
        squad_data = json.load(f)["data"]

    references = []
    hypotheses = []

    for article in tqdm(squad_data):
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]

            for qa in paragraph["qas"]:
                question = qa["question"]

                if len(qa["answers"]) == 0:
                    continue

                reference = qa["answers"][0]["text"]

                encoded_inputs = tokenizer.encode_plus(question, context, padding="max_length", truncation=True,
                                                       max_length=512, return_tensors="pt")
                input_ids = encoded_inputs["input_ids"].to(device)
                attention_mask = encoded_inputs["attention_mask"].to(device)

                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=attention_mask)
                    start_scores = outputs.start_logits
                    end_scores = outputs.end_logits

                # Convert the predicted answer indices to tokens
                all_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
                answer = " ".join(all_tokens[torch.argmax(start_scores): torch.argmax(end_scores) + 1])
                answer = answer.replace("[CLS]", "").replace("[SEP]", "").replace(" ##", "")

                references.append(reference)
                hypotheses.append(answer)

    _, _, bert_scores = score(hypotheses, references, lang="en", model_type="bert-base-uncased")

    return bert_scores.mean().item()

model_path = "bert-large-uncased-whole-word-masking-finetuned-squad"
data_path = "dev-v2.0.json"

bert_score = evaluate_bertscore(model_path, data_path)
print("BERTScore:", bert_score)


 26%|██▌       | 9/35 [02:57<08:54, 20.54s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for 

BERTScore: 0.8711325526237488


