In [None]:
!pip install transformers
!pip install torch

In [20]:
import json
import torch
from tqdm import tqdm
from transformers import BertForQuestionAnswering, BertTokenizer
from transformers.data.metrics.squad_metrics import squad_evaluate
from transformers import pipeline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open('dev-v2.0.json', 'r') as f:
    squad_data = json.load(f)

eval_examples = squad_data['data']

model_name = 'bert-base-uncased'
model = BertForQuestionAnswering.from_pretrained(model_name).to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

all_predictions = []
all_references = []

pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 if str(device) == "cuda" else -1
)

for example in tqdm(eval_examples, desc="Evaluating"):
    for paragraph in example['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            qas_id = qa['id']

            inputs = {
                "question": question,
                "context": context
            }
            result = pipeline(inputs)

            answer = result["answer"]
            all_predictions.append({'id': qas_id, 'prediction_text': answer})
            all_references.append({'id': qas_id, 'answers': qa['answers']})

    print("Example evaluation completed.")



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

Example evaluation completed.


Evaluating:   6%|▌         | 2/35 [01:56<33:41, 61.26s/it]

Example evaluation completed.


Evaluating:   9%|▊         | 3/35 [02:52<31:30, 59.07s/it]

Example evaluation completed.


Evaluating:  11%|█▏        | 4/35 [03:29<26:03, 50.45s/it]

Example evaluation completed.


Evaluating:  14%|█▍        | 5/35 [04:15<24:25, 48.84s/it]

Example evaluation completed.


Evaluating:  17%|█▋        | 6/35 [05:32<28:10, 58.29s/it]

Example evaluation completed.


Evaluating:  20%|██        | 7/35 [06:52<30:35, 65.57s/it]

Example evaluation completed.


Evaluating:  23%|██▎       | 8/35 [08:09<31:02, 69.00s/it]

Example evaluation completed.


Evaluating:  26%|██▌       | 9/35 [08:55<26:46, 61.80s/it]

Example evaluation completed.


Evaluating:  29%|██▊       | 10/35 [10:23<29:09, 69.96s/it]

Example evaluation completed.


Evaluating:  31%|███▏      | 11/35 [11:15<25:44, 64.35s/it]

Example evaluation completed.


Evaluating:  34%|███▍      | 12/35 [12:16<24:22, 63.58s/it]

Example evaluation completed.


Evaluating:  37%|███▋      | 13/35 [13:08<21:56, 59.86s/it]

Example evaluation completed.


Evaluating:  40%|████      | 14/35 [14:20<22:16, 63.62s/it]

Example evaluation completed.


Evaluating:  43%|████▎     | 15/35 [15:01<18:56, 56.80s/it]

Example evaluation completed.


Evaluating:  46%|████▌     | 16/35 [16:09<19:04, 60.22s/it]

Example evaluation completed.


Evaluating:  49%|████▊     | 17/35 [16:57<16:56, 56.49s/it]

Example evaluation completed.


Evaluating:  51%|█████▏    | 18/35 [18:00<16:31, 58.33s/it]

Example evaluation completed.


Evaluating:  54%|█████▍    | 19/35 [18:43<14:21, 53.87s/it]

Example evaluation completed.


Evaluating:  57%|█████▋    | 20/35 [19:29<12:53, 51.59s/it]

Example evaluation completed.


Evaluating:  60%|██████    | 21/35 [20:20<11:59, 51.42s/it]

Example evaluation completed.


Evaluating:  63%|██████▎   | 22/35 [20:57<10:12, 47.11s/it]

Example evaluation completed.


Evaluating:  66%|██████▌   | 23/35 [22:39<12:40, 63.34s/it]

Example evaluation completed.


Evaluating:  69%|██████▊   | 24/35 [23:48<11:55, 65.02s/it]

Example evaluation completed.


Evaluating:  71%|███████▏  | 25/35 [25:16<11:59, 71.91s/it]

Example evaluation completed.


Evaluating:  74%|███████▍  | 26/35 [26:47<11:38, 77.65s/it]

Example evaluation completed.


Evaluating:  77%|███████▋  | 27/35 [27:30<08:59, 67.40s/it]

Example evaluation completed.


Evaluating:  80%|████████  | 28/35 [28:31<07:37, 65.38s/it]

Example evaluation completed.


Evaluating:  83%|████████▎ | 29/35 [30:10<07:32, 75.40s/it]

Example evaluation completed.


Evaluating:  86%|████████▌ | 30/35 [31:20<06:09, 73.87s/it]

Example evaluation completed.


Evaluating:  89%|████████▊ | 31/35 [32:37<04:59, 74.83s/it]

Example evaluation completed.


Evaluating:  91%|█████████▏| 32/35 [33:50<03:43, 74.41s/it]

Example evaluation completed.


Evaluating:  94%|█████████▍| 33/35 [35:29<02:43, 81.62s/it]

Example evaluation completed.


Evaluating:  97%|█████████▋| 34/35 [36:49<01:21, 81.23s/it]

Example evaluation completed.


Evaluating: 100%|██████████| 35/35 [38:06<00:00, 65.33s/it]

Example evaluation completed.





In [25]:
print(all_predictions)

[{'id': '56ddde6b9a695914005b9628', 'prediction_text': 'Denmark, Iceland'}, {'id': '56ddde6b9a695914005b9629', 'prediction_text': 'Denmark, Iceland'}, {'id': '56ddde6b9a695914005b962a', 'prediction_text': 'Denmark, Iceland'}, {'id': '56ddde6b9a695914005b962b', 'prediction_text': 'Denmark, Iceland'}, {'id': '56ddde6b9a695914005b962c', 'prediction_text': 'Denmark, Iceland'}, {'id': '5ad39d53604f3c001a3fe8d1', 'prediction_text': 'Denmark, Iceland'}, {'id': '5ad39d53604f3c001a3fe8d2', 'prediction_text': 'Denmark, Iceland'}, {'id': '5ad39d53604f3c001a3fe8d3', 'prediction_text': 'Denmark, Iceland'}, {'id': '5ad39d53604f3c001a3fe8d4', 'prediction_text': 'Denmark, Iceland'}, {'id': '56dddf4066d3e219004dad5f', 'prediction_text': 'their prince Bohemond I'}, {'id': '56dddf4066d3e219004dad60', 'prediction_text': 'their duke, William the Conqueror, led to the Norman conquest of'}, {'id': '56dddf4066d3e219004dad61', 'prediction_text': 'their duke, William the Conqueror, led to the Norman conquest of

In [27]:
print(all_references)



In [26]:
predictions_dict = {}

for prediction in all_predictions:
    qas_id = prediction['id']
    prediction_text = prediction['prediction_text']
    predictions_dict[qas_id] = prediction_text

# Save the predictions to a JSON file
with open('predictions.json', 'w') as f:
    json.dump(predictions_dict, f)
