In [4]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
import time
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [5]:
path = Path("squad/dev-v2.0.json")

with open(path, 'rb') as f:
    squad_dict = json.load(f)

texts = []
queries = []
answers = []
qid = []

# Search for each passage, its question and its answer
for group in squad_dict['data']:
    for passage in group['paragraphs']:
        context = passage['context']
        for qa in passage['qas']:
            question = qa['question']
            id = qa["id"]
            
            if(len(qa['answers']) == 0):
                texts.append(context)
                queries.append(question)
                qid.append(id)
                answers.append("")
            else:
                for answer in qa['answers']:
                    # Store every passage, query and its answer to the lists
                    texts.append(context)
                    queries.append(question)
                    qid.append(id)
                    answers.append(answer)


val_texts, val_queries, val_answers, val_qid = texts, queries, answers, qid

In [6]:
tokenizer = AutoTokenizer.from_pretrained("timpal0l/mdeberta-v3-base-squad2")

Downloading: 100%|██████████| 453/453 [00:00<00:00, 227kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 16.3M/16.3M [00:00<00:00, 20.1MB/s]
Downloading: 100%|██████████| 23.0/23.0 [00:00<00:00, 11.5kB/s]
Downloading: 100%|██████████| 173/173 [00:00<00:00, 59.7kB/s]


In [7]:
model = AutoModelForQuestionAnswering.from_pretrained("timpal0l/mdeberta-v3-base-squad2")
model.eval()

Downloading: 100%|██████████| 879/879 [00:00<00:00, 438kB/s]
Downloading: 100%|██████████| 1.11G/1.11G [00:41<00:00, 26.7MB/s]


DebertaV2ForQuestionAnswering(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      

In [8]:
device = torch.device('cuda:0' if torch.cuda.is_available()
                      else 'cpu')
print(device)

cuda:0


In [9]:

def predict(context,query):
  inputs = tokenizer.encode_plus(query, context, max_length= 256, return_tensors='pt').to(device)
  
  model_device = model.to(device)
  outputs = model_device(**inputs)
  answer_start = torch.argmax(outputs[0]) # get the most likely beginning of answer with the argmax of the score
  answer_end = torch.argmax(outputs[1]) + 1 

  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

  return answer

def normalize_text(s):
  """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
  import string, re

  def remove_articles(text):
    regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    return re.sub(regex, " ", text)

  def white_space_fix(text):
    return " ".join(text.split())

  def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)

  def lower(text):
    return text.lower()

  return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()
  
  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)
  
  common_tokens = set(pred_tokens) & set(truth_tokens)
  
  # if there are no common tokens then f1 = 0
  if len(common_tokens) == 0:
    return 0
  
  prec = len(common_tokens) / len(pred_tokens)
  rec = len(common_tokens) / len(truth_tokens)
  
  return 2 * (prec * rec) / (prec + rec)

In [10]:
def give_an_answer(context,query):
    prediction = predict(context,query)
    return prediction

In [11]:
predictions = {}

for i in range(len(val_answers)):
    text = val_texts[i]
    query = val_queries[i]
    answer = val_answers[i]
    key = val_qid[i]

    prediction = give_an_answer(text, query)

    predictions[key] = prediction

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
predictions = {}

for i in range(10):
    text = val_texts[i]
    query = val_queries[i]
    answer = val_answers[i]
    key = val_qid[i]

    prediction = give_an_answer(text, query)

    predictions[key] = prediction

In [13]:
with open("predictions-bert.json", "w") as outfile:
    json.dump(predictions, outfile)