In [1]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from evaluate import load
import pandas as pd
import numpy as np
from datasets import load_dataset
import torch
import json

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Hugging Face Hub checkpoint id; reused below for the QA pipeline and for the
# standalone model/tokenizer loads.
model_name = "deepset/roberta-base-squad2"

In [13]:
# Build the question-answering pipeline; model weights and tokenizer are both
# resolved from the same checkpoint name.
nlp = pipeline(
    'question-answering',
    model=model_name,
    tokenizer=model_name,
)

In [12]:
# Load the checkpoint's QA model and tokenizer as standalone objects (model
# first, then tokenizer), independently of the `nlp` pipeline.
model, tokenizer = (
    AutoModelForQuestionAnswering.from_pretrained(model_name),
    AutoTokenizer.from_pretrained(model_name),
)

Downloading: 100%|██████████| 744/744 [00:00<00:00, 175kB/s]
Downloading: 100%|██████████| 266M/266M [00:28<00:00, 9.38MB/s] 
Downloading: 100%|██████████| 323/323 [00:00<00:00, 72.7kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 519kB/s]  
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 827kB/s]  
Downloading: 100%|██████████| 112/112 [00:00<00:00, 37.3kB/s]


In [14]:
# Put the model in evaluation (inference) mode. As the cell's last expression,
# the returned module is also rendered as the cell output.
model.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [15]:
# Load every split of SQuAD v2 through the `datasets` library.
raw_datasets = load_dataset("squad_v2")

# Show the context, question and reference answers of the first validation example.
first_validation_example = raw_datasets["validation"][0]
for label, field in (
    ("Context: ", "context"),
    ("Question: ", "question"),
    ("Answer: ", "answers"),
):
    print(label, first_validation_example[field])

Found cached dataset squad_v2 (C:/Users/carlos/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)
100%|██████████| 2/2 [00:00<00:00, 15.01it/s]

Context:  The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
Question:  In what country is Normandy located?
Answer:  {'text': ['France', 'France', 'France', 'France'], 'answer_start': [159, 159, 159, 159]}





In [16]:
# `Dataset.filter` returns a new dataset instead of modifying the split in
# place, so the result has to be kept; otherwise the prints below would still
# show the unfiltered validation split.
# Keeps the examples whose reference-answer list does not contain exactly one
# entry (i.e. zero answers or several annotator answers).
validation_not_single_answer = raw_datasets["validation"].filter(
    lambda x: len(x["answers"]["text"]) != 1
)
print(validation_not_single_answer[0]["answers"])
print(validation_not_single_answer[2]["answers"])

Loading cached processed dataset at C:\Users\carlos\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d\cache-a729e31eeecedd01.arrow


{'text': ['France', 'France', 'France', 'France'], 'answer_start': [159, 159, 159, 159]}
{'text': ['Denmark, Iceland and Norway', 'Denmark, Iceland and Norway', 'Denmark, Iceland and Norway', 'Denmark, Iceland and Norway'], 'answer_start': [256, 256, 256, 256]}


In [7]:
def get_predictions(question, context, **pipeline_kwargs):
    """Run the question-answering pipeline on one question/context pair.

    Args:
        question: Question text to answer.
        context: Passage the answer should be extracted from.
        **pipeline_kwargs: Optional keyword arguments forwarded unchanged to
            the pipeline call (for example ``handle_impossible_answer=True``,
            which allows an empty answer for SQuAD v2 questions that cannot be
            answered from the context). With none given, the call is identical
            to ``nlp({'question': ..., 'context': ...})``.

    Returns:
        Whatever the module-level ``nlp`` pipeline returns for this input.
    """
    QA_input = {
        'question': question,
        'context': context
    }
    return nlp(QA_input, **pipeline_kwargs)

In [10]:
validation_ds = raw_datasets["validation"]

# Map each example id to the pipeline output for its question/context pair.
predictions = {}
for item in validation_ds:
    predictions[item["id"]] = get_predictions(item["question"], item["context"])

# Persist all predictions as a single JSON object keyed by example id.
with open("predictions.json", "w") as outfile:
    json.dump(predictions, outfile)