In [None]:
# Transformers installation
! pip install transformers
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers.data.processors.squad import SquadV2Processor
import json
from pprint import pprint
import regex as re
import collections
from transformers.data.metrics.squad_metrics import squad_evaluate


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 8.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 14.0MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 30.7MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |██

In [None]:
! mkdir squad
! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

--2020-11-14 14:40:19--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2020-11-14 14:40:20 (104 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2020-11-14 14:40:20--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2020-11-14 14:40:20 (41.8 MB/s) - ‘squad/dev-v2.0.json’ saved [4370528/4370528]



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def display_example(qid):    
  from pprint import pprint

  idx = qid_to_example_index[qid]
  q = examples[idx].question_text
  c = examples[idx].context_text
  a = [answer['text'] for answer in examples[idx].answers]
  
  print(f'Example {idx} of {len(examples)}\n---------------------')
  print(f"Q: {q}\n")
  print("Context:")
  pprint(c)
  print(f"\nTrue Answers:\n{a}")

In [None]:
question_answer_pair = {
    "when": ['before', 'after', 'about', 'around', 'from', 'during', 'in'],
    "where": ['in', 'at', 'on', 'behind', 'from', 'through', 'between', 'throughout'],
    "whose": ["'s"],
    "which": ["the"]
}

In [None]:
def question_type(question):
  q_type = "other"
  for q in question_answer_pair.keys():
    temp_type = re.findall(q, question)
    if len(temp_type) > 0:
      q_type = temp_type[0]
  return q_type

In [None]:
def answer_probability(question, answer, start_logit):
  start_text = answer.split()[0]
  q_type = question_type(question.lower())
  #print(q_type)
  if q_type == "other":
    return start_logit
  else:
    probable_answers = question_answer_pair[q_type]
    if q_type == "whose":
      if re.findall(probable_answers[0],start_text):
        return start_logit + 1.5        
    else:      
      if start_text in probable_answers:
        return start_logit + 1.5
  
  return start_logit

In [None]:
def to_list(tensor):
  return tensor.detach().cpu().tolist()

In [None]:
def get_prediction(qid, model, tokenizer, examples):
  # given a question id (qas_id or qid), load the example, get the model outputs and generate an answer
  question = examples[qid_to_example_index[qid]].question_text
  
  context = examples[qid_to_example_index[qid]].context_text

  inputs = tokenizer.encode_plus(question, context, return_tensors='pt', truncation=True, max_length=384)
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
  model.to(device)
  start_logits, end_logits = model(**inputs.to(device))

  # convert our start and end logit tensors to lists
  start_logits = to_list(start_logits)[0]
  end_logits = to_list(end_logits)[0]

  # sort our start and end logits from largest to smallest, keeping track of the index
  start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
  end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)

  # select the top n (in this case, 5)
  start_indexes = [idx for idx, logit in start_idx_and_logit[:5]]
  end_indexes = [idx for idx, logit in end_idx_and_logit[:5]]

  # convert the token ids from a tensor to a list
  tokens = to_list(inputs['input_ids'])[0]

  # question tokens are defined as those between the CLS token (101, at position 0) and first SEP (102) token 
  question_indexes = [i+1 for i, token in enumerate(tokens[1:tokens.index(102)])]

  # keep track of all preliminary predictions
  PrelimPrediction = collections.namedtuple( 
      "PrelimPrediction", ["start_index", "end_index", "start_logit", "end_logit"]
  )

  prelim_preds = []
  for start_index in start_indexes:
    for end_index in end_indexes:
      # throw out invalid predictions
      if start_index in question_indexes:
        continue
      if end_index in question_indexes:
        continue
      if end_index < start_index:
        continue
      prelim_preds.append(
        PrelimPrediction(
            start_index = start_index,
            end_index = end_index,
            start_logit = start_logits[start_index],
            end_logit = end_logits[end_index]
        )
      )


    # keep track of all best predictions
  BestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
      "BestPrediction", ["text", "start_logit", "end_logit"]
  )

  nbest = []
  seen_predictions = []
  for pred in prelim_preds:
      
      # for now we only care about the top 5 best predictions
      if len(nbest) >= 5: 
          break
          
      # loop through predictions according to their start index
      if pred.start_index > 0: # non-null answers have start_index > 0

          text = tokenizer.convert_tokens_to_string(
              tokenizer.convert_ids_to_tokens(
                  tokens[pred.start_index:pred.end_index+1]
              )
          )
          # clean whitespace
          text = text.strip()
          text = " ".join(text.split())

          if text in seen_predictions:
              continue
          # flag this text as being seen -- if we see it again, don't add it to the nbest list
          seen_predictions.append(text) 

          # add this text prediction to a pruned list of the top 5 best predictions
          nbest.append(BestPrediction(text=text, start_logit=pred.start_logit, end_logit=pred.end_logit))

  # and don't forget -- include the null answer!
  nbest.append(BestPrediction(text="", start_logit=start_logits[0], end_logit=end_logits[0]))

  # compute the null score as the sum of the [CLS] token logits
  score_null = start_logits[0] + end_logits[0]

  # compute the difference between the null score and the best non-null score
  score_diff = score_null - nbest[0].start_logit - nbest[0].end_logit
  
  return score_diff, nbest[0].text

In [None]:
def get_prediction_postprocessing(qid, model, tokenizer, examples):
  # given a question id (qas_id or qid), load the example, get the model outputs and generate an answer
  question = examples[qid_to_example_index[qid]].question_text
  
  context = examples[qid_to_example_index[qid]].context_text


  inputs = tokenizer.encode_plus(question, context, return_tensors='pt', truncation=True, max_length=384)
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
  model.to(device)
  start_logits, end_logits = model(**inputs.to(device))

  # convert our start and end logit tensors to lists
  start_logits = to_list(start_logits)[0]
  end_logits = to_list(end_logits)[0]

  # sort our start and end logits from largest to smallest, keeping track of the index
  start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
  end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)

  # select the top n (in this case, 5)
  start_indexes = [idx for idx, logit in start_idx_and_logit[:5]]
  end_indexes = [idx for idx, logit in end_idx_and_logit[:5]]

  # convert the token ids from a tensor to a list
  tokens = to_list(inputs['input_ids'])[0]
  question_indexes = [i+1 for i, token in enumerate(tokens[1:tokens.index(102)])]

  # keep track of all preliminary predictions
  PrelimPrediction = collections.namedtuple( 
      "PrelimPrediction", ["start_index", "end_index", "start_logit", "end_logit"]
  )

  prelim_preds = []
  for start_index in start_indexes:
    for end_index in end_indexes:
      # throw out invalid predictions
      if start_index in question_indexes:
        continue
      if end_index in question_indexes:
        continue
      if end_index < start_index:
        continue
      prelim_preds.append(
        PrelimPrediction(
            start_index = start_index,
            end_index = end_index,
            start_logit = start_logits[start_index],
            end_logit = end_logits[end_index]
        )
      )

    # keep track of all best predictions
  BestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
      "BestPrediction", ["text", "start_logit", "end_logit"]
  )

  nbest = []
  seen_predictions = []
  for pred in prelim_preds:
    if pred.start_index > 0: # non-null answers have start_index > 0

      text = tokenizer.convert_tokens_to_string(
          tokenizer.convert_ids_to_tokens(
              tokens[pred.start_index:pred.end_index+1]
          )
      )
      # clean whitespace
      text = text.strip()
      text = " ".join(text.split())

      if text in seen_predictions:
          continue

      # flag this text as being seen -- if we see it again, don't add it to the nbest list
      seen_predictions.append(text) 

      # add this text prediction to a pruned list of the top 5 best predictions
      new_start_logit = answer_probability(question, text, pred.start_logit)
      if new_start_logit == pred.start_logit:
        nbest.append(BestPrediction(text=text, start_logit=pred.start_logit, end_logit=pred.end_logit))
      else:
        nbest.append(BestPrediction(text=text, start_logit=new_start_logit, end_logit=pred.end_logit))
      

  nbest = sorted(nbest, key=lambda x: x.start_logit+x.end_logit, reverse=True)
  nbest = nbest[:5]
  # and don't forget -- include the null answer!
  nbest.append(BestPrediction(text="", start_logit=start_logits[0], end_logit=end_logits[0]))
  
  # compute the null score as the sum of the [CLS] token logits
  score_null = start_logits[0] + end_logits[0]
  # compute the difference between the null score and the best non-null score
  score_diff = score_null - nbest[0].start_logit - nbest[0].end_logit
  
  return score_diff, nbest[0].text

In [None]:
def dev_set_testing(file_dir, model, tokenizer, examples, answer_qids, no_answer_qids):
  score_diff_dev1 = {}
  predictions1 = {}
  score_diff_dev2 = {}
  predictions2 = {}

  for i in range(len(answer_qids)):
    score_diff_dev1[answer_qids[i]], predictions1[answer_qids[i]] = (get_prediction(answer_qids[i], model, tokenizer, examples))
    score_diff_dev2[answer_qids[i]], predictions2[answer_qids[i]] = (get_prediction_postprocessing(answer_qids[i], model, tokenizer, examples))
  
  for i in range(len(no_answer_qids)):
    score_diff_dev1[no_answer_qids[i]], predictions1[no_answer_qids[i]] = (get_prediction(no_answer_qids[i], model, tokenizer, examples))
    score_diff_dev2[no_answer_qids[i]], predictions2[no_answer_qids[i]] = (get_prediction_postprocessing(no_answer_qids[i], model, tokenizer, examples))

  filename_null_odds_1 = file_dir + 'null_odds_1.json'
  filename_predictions_1 = file_dir + 'predictions_1.json'
  filename_null_odds_2 = file_dir + 'null_odds_2.json'
  filename_predictions_2 = file_dir + 'predictions_2.json'

  with open(filename_null_odds_1, 'w') as outfile:
    json.dump(score_diff_dev1, outfile)

  with open(filename_predictions_1, 'w') as outfile:
      json.dump(predictions1, outfile)

  with open(filename_null_odds_2, 'w') as outfile:
      json.dump(score_diff_dev2, outfile)

  with open(filename_predictions_2, 'w') as outfile:
      json.dump(predictions2, outfile)



In [None]:
def evaluate_model(file_dir, model, tokenizer, examples, answer_qids, no_answer_qids):
  filename_null_odds_1 = file_dir + 'null_odds_1.json'
  filename_predictions_1 = file_dir + 'predictions_1.json'
  filename_null_odds_2 = file_dir + 'null_odds_2.json'
  filename_predictions_2 = file_dir + 'predictions_2.json'
  null_odds1 = json.load(open(filename_null_odds_1, 'rb'))
  predictions1 = json.load(open(filename_predictions_1, 'rb'))
  null_odds2 = json.load(open(filename_null_odds_2, 'rb'))
  predictions2 = json.load(open(filename_predictions_2, 'rb'))

  # the default threshold is set to 1.0 -- we'll leave it there for now
  results_default_thresh = get_evaluation_metrics(dev_examples, predictions2, null_odds2, 1.0)
  best_f1_thresh = results_default_thresh['best_f1_thresh']

  results_f1_thresh = get_evaluation_metrics(dev_examples, predictions2, null_odds2, best_f1_thresh)
  print('METRICS FOR DEV SET WITHOUT POSTPROCESSING')
  pprint(results_f1_thresh)
  
  # the default threshold is set to 1.0 -- we'll leave it there for now
  results_default_thresh = get_evaluation_metrics(dev_examples, predictions1, null_odds1, 1.0)
  best_f1_thresh = results_default_thresh['best_f1_thresh']
  results_f1_thresh = get_evaluation_metrics(dev_examples, predictions1, null_odds1, best_f1_thresh)
  print('METRICS FOR DEV SET WITH POSTPROCESSING')
  pprint(results_f1_thresh)


In [None]:
def get_evaluation_metrics(examples, predictions, null_odds, prob_threshhold=1):
  return squad_evaluate(examples, predictions, no_answer_probs=null_odds, 
                                        no_answer_probability_threshold=prob_threshhold)


Main

In [None]:
bert_tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/nlp_dataset/bert-base-uncased")
bert_model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/My Drive/nlp_dataset/bert-base-uncased")

In [None]:
distilbert_tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/nlp_dataset/distilbert-base-uncased")
distilbert_model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/My Drive/nlp_dataset/distilbert-base-uncased")

In [None]:
# this processor loads the SQuAD2.0 dev set examples
processor = SquadV2Processor()
dev_examples = processor.get_dev_examples("/content/squad", filename="dev-v2.0.json")
print(len(dev_examples))

100%|██████████| 35/35 [00:03<00:00,  8.85it/s]

11873





In [None]:
# generate some maps to help us identify examples of interest
qid_to_example_index = {example.qas_id: i for i, example in enumerate(dev_examples)}
qid_to_has_answer = {example.qas_id: bool(example.answers) for example in dev_examples}
answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if has_answer]
no_answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if not has_answer]

BERT TESTING

In [None]:
dev_set_testing("/content/drive/My Drive/nlp_dataset/bert-base-uncased/", 
                bert_model, bert_tokenizer, dev_examples, answer_qids, no_answer_qids)

In [None]:
evaluate_model("/content/drive/My Drive/nlp_dataset/bert-base-uncased/", 
                bert_model, bert_tokenizer, dev_examples, answer_qids, no_answer_qids)

METRICS FOR DEV SET WITHOUT POSTPROCESSING
OrderedDict([('exact', 69.34220500294786),
             ('f1', 73.80082311322276),
             ('total', 11873),
             ('HasAns_exact', 60.239541160593795),
             ('HasAns_f1', 69.16956356668244),
             ('HasAns_total', 5928),
             ('NoAns_exact', 78.4188393608074),
             ('NoAns_f1', 78.4188393608074),
             ('NoAns_total', 5945),
             ('best_exact', 69.52749936831466),
             ('best_exact_thresh', -6.801867485046387),
             ('best_f1', 73.8008231132222),
             ('best_f1_thresh', -3.7661972045898438)])
METRICS FOR DEV SET WITH POSTPROCESSING
OrderedDict([('exact', 69.43485218563126),
             ('f1', 73.8232516483557),
             ('total', 11873),
             ('HasAns_exact', 60.35762483130904),
             ('HasAns_f1', 69.14700857303089),
             ('HasAns_total', 5928),
             ('NoAns_exact', 78.4861227922624),
             ('NoAns_f1', 78.486122792262

DISTILBERT TESTING

In [None]:
dev_set_testing("/content/drive/My Drive/nlp_dataset/distilbert-base-uncased/", 
                distilbert_model, distilbert_tokenizer, dev_examples, answer_qids, no_answer_qids)

In [None]:
evaluate_model("/content/drive/My Drive/nlp_dataset/distilbert-base-uncased/", 
                bert_model, bert_tokenizer, dev_examples, answer_qids, no_answer_qids)

METRICS FOR DEV SET WITHOUT POSTPROCESSING
OrderedDict([('exact', 64.57508633032931),
             ('f1', 68.54706579411912),
             ('total', 11873),
             ('HasAns_exact', 53.9136302294197),
             ('HasAns_f1', 61.86897978636555),
             ('HasAns_total', 5928),
             ('NoAns_exact', 75.20605550883096),
             ('NoAns_f1', 75.20605550883096),
             ('NoAns_total', 5945),
             ('best_exact', 64.71826833993093),
             ('best_exact_thresh', -5.177046895027161),
             ('best_f1', 68.54706579411919),
             ('best_f1_thresh', -3.7676496505737305)])
METRICS FOR DEV SET WITH POSTPROCESSING
OrderedDict([('exact', 64.62562115724754),
             ('f1', 68.53062384308238),
             ('total', 11873),
             ('HasAns_exact', 53.93049932523617),
             ('HasAns_f1', 61.75170325386568),
             ('HasAns_total', 5928),
             ('NoAns_exact', 75.2901597981497),
             ('NoAns_f1', 75.2901597981

TESTING ON AUGMENTED DATASET

In [None]:
# this load augmented data
processor = SquadV2Processor()
dev_examples = processor.get_dev_examples("/content/drive/My Drive/nlp_dataset/", filename="augmented_dev.json")
print(len(dev_examples))

100%|██████████| 35/35 [00:04<00:00,  7.80it/s]


11873


In [None]:
# generate some maps to help us identify examples of interest
qid_to_example_index = {example.qas_id: i for i, example in enumerate(dev_examples)}
qid_to_has_answer = {example.qas_id: bool(example.answers) for example in dev_examples}
answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if has_answer]
no_answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if not has_answer]

In [None]:
dev_set_testing("/content/drive/My Drive/nlp_dataset/bert-base-uncased/", 
                bert_model, bert_tokenizer, dev_examples, answer_qids, no_answer_qids)

In [None]:
evaluate_model("/content/drive/My Drive/nlp_dataset/bert-base-uncased/", 
                bert_model, bert_tokenizer, dev_examples, answer_qids, no_answer_qids)

METRICS FOR DEV SET WITHOUT POSTPROCESSING
OrderedDict([('exact', 51.96664701423398),
             ('f1', 57.508129920756645),
             ('total', 11873),
             ('HasAns_exact', 18.387314439946017),
             ('HasAns_f1', 29.48617182003078),
             ('HasAns_total', 5928),
             ('NoAns_exact', 85.44995794785534),
             ('NoAns_f1', 85.44995794785534),
             ('NoAns_total', 5945),
             ('best_exact', 53.128948033352984),
             ('best_exact_thresh', -9.303107738494873),
             ('best_f1', 57.474440036144685),
             ('best_f1_thresh', -3.6766037940979004)])
METRICS FOR DEV SET WITH POSTPROCESSING
OrderedDict([('exact', 51.98349195654005),
             ('f1', 57.48830322603369),
             ('total', 11873),
             ('HasAns_exact', 18.42105263157895),
             ('HasAns_f1', 29.44646157265462),
             ('HasAns_total', 5928),
             ('NoAns_exact', 85.44995794785534),
             ('NoAns_f1', 85.4499

In [None]:
dev_set_testing("/content/drive/My Drive/nlp_dataset/distilbert-base-uncased/", 
                distilbert_model, distilbert_tokenizer, dev_examples, answer_qids, no_answer_qids)

In [None]:
evaluate_model("/content/drive/My Drive/nlp_dataset/distilbert-base-uncased/", 
                bert_model, bert_tokenizer, dev_examples, answer_qids, no_answer_qids)

METRICS FOR DEV SET WITHOUT POSTPROCESSING
OrderedDict([('exact', 51.06544260085909),
             ('f1', 55.48120675166491),
             ('total', 11873),
             ('HasAns_exact', 15.24966261808367),
             ('HasAns_f1', 24.093854210950795),
             ('HasAns_total', 5928),
             ('NoAns_exact', 86.77880571909168),
             ('NoAns_f1', 86.77880571909168),
             ('NoAns_total', 5945),
             ('best_exact', 52.08456161037648),
             ('best_exact_thresh', -8.188229084014893),
             ('best_f1', 55.455939338206),
             ('best_f1_thresh', -3.447478547692299)])
METRICS FOR DEV SET WITH POSTPROCESSING
OrderedDict([('exact', 51.579213341194304),
             ('f1', 55.44090730463834),
             ('total', 11873),
             ('HasAns_exact', 14.018218623481781),
             ('HasAns_f1', 21.752680908901812),
             ('HasAns_total', 5928),
             ('NoAns_exact', 89.03280067283431),
             ('NoAns_f1', 89.0328006