In [4]:
%load_ext autoreload
%autoreload 2
import json
import os
import sys
import logging
import re
import pickle
from collections import defaultdict

sys.path.append('..')
sys.path.append('/home/gpsnest/anaconda3/envs/qa-eval/lib/python3.6/site-packages')
from tqdm import tqdm
from utils.text_utils import simplify_nq_example

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
#curl -O https://raw.githubusercontent.com/google-research-datasets/natural-questions/master/text_utils.py

## Load Data 

In [16]:
# Location of the NQ dev-set JSON Lines file (one JSON record per line),
# given relative to the notebook's working directory.
jsonfilename = "../data/nq/v1.0-simplified_nq-dev-all.jsonl"

def convert_nq_dev_to_squad_format(filepath):
    '''
    Load the NQ dev set from disk, simplify each record, and convert it to SQuAD format.

    Every line of the file is parsed as one JSON record, passed through
    simplify_nq_example, and turned into an NQSquadExample. Records for which
    any annotation indicates a yes/no answer are skipped (and reported via print).

    Note: relies on get_short_answers_from_span, extract_wiki_title and
    NQSquadExample being defined before this function is called.

    Args:
        filepath (string) - path to the NQ dev set in JSON Lines format

    Returns:
        nq_examples (list of NQSquadExample) - one entry per retained record;
            is_impossible is True when no short answer was extracted
    '''

    nq_examples = []
    # Read from the path that was passed in (not from a notebook-level global),
    # so the function works for any file the caller supplies.
    with open(filepath, 'rb') as f:
        for i, line in enumerate(tqdm(f)):

            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            simp_example = simplify_nq_example(json.loads(line.decode('utf-8')))
            answers, yes_no_flag = get_short_answers_from_span(simp_example)

            if yes_no_flag:
                # exclude questions with any annotation indicating yes/no question
                print(f'Found a yes/no: {i}')
                continue

            clean_record = {'qas_id': simp_example['example_id'],
                            'title': extract_wiki_title(simp_example['document_url']),
                            'question_text': simp_example['question_text'],
                            'answers': answers,
                            'is_impossible': len(answers) == 0}

            nq_examples.append(NQSquadExample(**clean_record))

    return nq_examples

In [35]:
def get_short_answers_from_span(simplified_example):
    '''
    Collect the short-answer strings of a simplified NQ example and report
    whether any of its annotations marks a yes/no answer.

    The answer text is recovered by splitting the document text on single
    spaces and joining the tokens inside the annotated
    [start_token, end_token) span.

    Note:
        1. Only annotations with exactly one short answer contribute; annotations
            with no short answer or a multipart answer are ignored.
        2. Answers with many tokens often resemble extractive snippets rather than
            canonical answers, so answers longer than 5 tokens are discarded.
            (https://arxiv.org/pdf/1906.00300.pdf)

    Returns:
        (answers, yes_no_flag) - list of answer strings, and True if any
            annotation has a yes_no_answer other than 'NONE'
    '''

    max_answer_tokens = 5
    collected = []
    saw_yes_no = False

    for ann in simplified_example['annotations']:

        # a single yes/no annotation is enough to flag the whole example
        if ann['yes_no_answer'] != 'NONE':
            saw_yes_no = True

        spans = ann['short_answers']
        if len(spans) != 1:
            # nothing to extract, or a multipart answer
            continue

        span = spans[0]
        doc_tokens = simplified_example['document_text'].split(" ")
        answer_tokens = doc_tokens[span['start_token']:span['end_token']]

        # keep only answers within the token limit
        if len(answer_tokens) <= max_answer_tokens:
            collected.append(" ".join(answer_tokens))

    return collected, saw_yes_no


def extract_wiki_title(document_url):
    '''
    Pull the article title out of a Wikipedia article URL.

    The title is taken to be the text between 'title=' and the next '&amp'
    in the URL.

    Args:
        document_url (string)

    Returns:
        title (string) - article title, or 'No Title Found' when the URL
            does not contain the expected 'title=...&amp' segment
    '''
    found = re.search('title=(.*?)&amp', document_url)
    if found is None:
        return 'No Title Found'
    return found.group(1)

class NQSquadExample(object):
    """
    A single dev example for the NQ dataset represented in SQuAD format

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        title: The title of the Wikipedia article
        answers: None by default, this is used during evaluation. Holds the
            reference answers for the question.
        is_impossible: False by default, set to True if the example has no possible answer.
    """

    def __init__(
        self,
        qas_id,
        question_text,
        title,
        answers=None,
        is_impossible=False,
    ):
        # The defaults match what the docstring has always promised; callers
        # that pass all five arguments are unaffected.
        self.qas_id = qas_id
        self.question_text = question_text
        self.title = title
        self.is_impossible = is_impossible
        self.answers = answers


In [36]:
# Convert the dev set; each yes/no question that gets skipped is reported in the output below.
nq_examples = convert_nq_dev_to_squad_format(jsonfilename)

30it [00:00, 83.15it/s]

Found a yes/no: 10
Found a yes/no: 15


128it [00:01, 80.82it/s]

Found a yes/no: 113


137it [00:01, 73.23it/s]

Found a yes/no: 132


191it [00:02, 73.64it/s]

Found a yes/no: 182


229it [00:02, 82.73it/s]

Found a yes/no: 217
Found a yes/no: 220


251it [00:03, 72.98it/s]

Found a yes/no: 242


295it [00:03, 86.30it/s]

Found a yes/no: 283


367it [00:04, 73.57it/s]

Found a yes/no: 353


399it [00:05, 58.33it/s]

Found a yes/no: 391


427it [00:05, 55.35it/s]

Found a yes/no: 415
Found a yes/no: 424


442it [00:05, 59.93it/s]

Found a yes/no: 433


518it [00:06, 72.66it/s]

Found a yes/no: 508


572it [00:07, 73.52it/s]

Found a yes/no: 569


794it [00:10, 85.42it/s]

Found a yes/no: 780
Found a yes/no: 785


829it [00:11, 57.09it/s]

Found a yes/no: 819


867it [00:12, 61.71it/s]

Found a yes/no: 858
Found a yes/no: 871


904it [00:12, 72.27it/s]

Found a yes/no: 895


966it [00:13, 67.08it/s]

Found a yes/no: 956


1011it [00:14, 63.19it/s]

Found a yes/no: 995


1127it [00:15, 60.46it/s]

Found a yes/no: 1117


1175it [00:16, 79.74it/s]

Found a yes/no: 1155


1185it [00:16, 75.84it/s]

Found a yes/no: 1182


1225it [00:17, 64.22it/s]

Found a yes/no: 1214


1256it [00:17, 74.11it/s]

Found a yes/no: 1243
Found a yes/no: 1256


1314it [00:18, 69.25it/s]

Found a yes/no: 1303


1393it [00:19, 78.47it/s]

Found a yes/no: 1374


1478it [00:20, 76.52it/s]

Found a yes/no: 1461
Found a yes/no: 1475


1558it [00:21, 67.30it/s]

Found a yes/no: 1542


1566it [00:22, 58.89it/s]

Found a yes/no: 1560


1609it [00:22, 63.13it/s]

Found a yes/no: 1593


1629it [00:23, 69.38it/s]

Found a yes/no: 1612


1650it [00:23, 80.10it/s]

Found a yes/no: 1639


1717it [00:24, 73.32it/s]

Found a yes/no: 1708
Found a yes/no: 1714


1781it [00:25, 76.07it/s]

Found a yes/no: 1766


1799it [00:25, 70.70it/s]

Found a yes/no: 1786


1824it [00:25, 60.37it/s]

Found a yes/no: 1815


1839it [00:26, 60.10it/s]

Found a yes/no: 1830


1862it [00:26, 63.87it/s]

Found a yes/no: 1848
Found a yes/no: 1853


1887it [00:26, 83.90it/s]

Found a yes/no: 1871
Found a yes/no: 1875


1968it [00:27, 68.18it/s]

Found a yes/no: 1955


2000it [00:28, 65.80it/s]

Found a yes/no: 1988


2061it [00:29, 68.72it/s]

Found a yes/no: 2052


2077it [00:29, 63.11it/s]

Found a yes/no: 2065


2169it [00:30, 76.39it/s]

Found a yes/no: 2152


2229it [00:31, 77.70it/s]

Found a yes/no: 2220
Found a yes/no: 2226
Found a yes/no: 2233


2269it [00:31, 87.61it/s]

Found a yes/no: 2256
Found a yes/no: 2265


2287it [00:32, 80.40it/s]

Found a yes/no: 2279
Found a yes/no: 2282


2331it [00:32, 64.39it/s]

Found a yes/no: 2321


2345it [00:33, 59.83it/s]

Found a yes/no: 2333


2464it [00:34, 77.62it/s]

Found a yes/no: 2455


2588it [00:36, 63.35it/s]

Found a yes/no: 2578


2803it [00:39, 86.96it/s]

Found a yes/no: 2783
Found a yes/no: 2797


2869it [00:40, 76.12it/s]

Found a yes/no: 2857


2941it [00:41, 64.44it/s]

Found a yes/no: 2930


2981it [00:41, 64.75it/s]

Found a yes/no: 2972


3038it [00:42, 65.10it/s]

Found a yes/no: 3032


3128it [00:44, 61.47it/s]

Found a yes/no: 3120


3168it [00:44, 66.71it/s]

Found a yes/no: 3157


3202it [00:45, 70.37it/s]

Found a yes/no: 3189


3270it [00:46, 48.71it/s]

Found a yes/no: 3262


3292it [00:46, 56.85it/s]

Found a yes/no: 3286


3381it [00:48, 63.72it/s]

Found a yes/no: 3368


3400it [00:48, 68.59it/s]

Found a yes/no: 3387


3420it [00:48, 76.51it/s]

Found a yes/no: 3403


3463it [00:48, 87.23it/s]

Found a yes/no: 3447


3498it [00:49, 67.46it/s]

Found a yes/no: 3495
Found a yes/no: 3498


3617it [00:51, 84.04it/s]

Found a yes/no: 3609


3770it [00:53, 68.68it/s]

Found a yes/no: 3750


3790it [00:53, 77.26it/s]

Found a yes/no: 3780


3853it [00:54, 63.95it/s]

Found a yes/no: 3848


3905it [00:55, 45.41it/s]

Found a yes/no: 3903


3921it [00:56, 39.27it/s]

Found a yes/no: 3914


3986it [00:57, 46.68it/s]

Found a yes/no: 3975
Found a yes/no: 3976


3998it [00:58, 51.03it/s]

Found a yes/no: 3992


4049it [00:59, 53.71it/s]

Found a yes/no: 4033


4091it [00:59, 44.05it/s]

Found a yes/no: 4084


4164it [01:01, 61.06it/s]

Found a yes/no: 4160


4243it [01:02, 52.47it/s]

Found a yes/no: 4233


4335it [01:04, 61.69it/s]

Found a yes/no: 4325
Found a yes/no: 4333


4349it [01:04, 59.98it/s]

Found a yes/no: 4338
Found a yes/no: 4344


4365it [01:04, 61.57it/s]

Found a yes/no: 4361


4387it [01:05, 65.62it/s]

Found a yes/no: 4379


4402it [01:05, 59.25it/s]

Found a yes/no: 4389


4574it [01:08, 47.49it/s]

Found a yes/no: 4569


4591it [01:08, 52.23it/s]

Found a yes/no: 4581
Found a yes/no: 4583


4614it [01:09, 51.54it/s]

Found a yes/no: 4602


4662it [01:10, 57.74it/s]

Found a yes/no: 4655


4824it [01:12, 69.88it/s]

Found a yes/no: 4814
Found a yes/no: 4820


4870it [01:13, 65.01it/s]

Found a yes/no: 4860


4934it [01:14, 58.37it/s]

Found a yes/no: 4927


4948it [01:14, 60.06it/s]

Found a yes/no: 4936


4982it [01:15, 58.17it/s]

Found a yes/no: 4974


4996it [01:15, 61.77it/s]

Found a yes/no: 4989
Found a yes/no: 4990


5019it [01:15, 69.48it/s]

Found a yes/no: 5003


5132it [01:17, 66.84it/s]

Found a yes/no: 5123


5149it [01:17, 66.67it/s]

Found a yes/no: 5138
Found a yes/no: 5141


5202it [01:18, 74.02it/s]

Found a yes/no: 5186
Found a yes/no: 5200


5231it [01:18, 74.67it/s]

Found a yes/no: 5223


5279it [01:19, 77.86it/s]

Found a yes/no: 5273


5313it [01:19, 64.33it/s]

Found a yes/no: 5306
Found a yes/no: 5311


5403it [01:20, 72.72it/s]

Found a yes/no: 5393
Found a yes/no: 5396


5439it [01:21, 67.50it/s]

Found a yes/no: 5427


5460it [01:21, 74.14it/s]

Found a yes/no: 5447


5504it [01:22, 77.36it/s]

Found a yes/no: 5497
Found a yes/no: 5502


5565it [01:23, 70.54it/s]

Found a yes/no: 5552


5620it [01:23, 61.32it/s]

Found a yes/no: 5615
Found a yes/no: 5623


5681it [01:24, 74.41it/s]

Found a yes/no: 5668


5709it [01:25, 74.94it/s]

Found a yes/no: 5693


5852it [01:27, 58.54it/s]

Found a yes/no: 5840


5942it [01:28, 67.65it/s]

Found a yes/no: 5928


6016it [01:30, 67.31it/s]

Found a yes/no: 6007
Found a yes/no: 6010


6024it [01:30, 65.34it/s]

Found a yes/no: 6020


6052it [01:30, 58.15it/s]

Found a yes/no: 6040


6082it [01:31, 71.13it/s]

Found a yes/no: 6065


6282it [01:34, 59.76it/s]

Found a yes/no: 6275


6315it [01:34, 60.13it/s]

Found a yes/no: 6306
Found a yes/no: 6317


6368it [01:35, 83.29it/s]

Found a yes/no: 6354


6406it [01:36, 56.59it/s]

Found a yes/no: 6400


6567it [01:38, 72.70it/s]

Found a yes/no: 6554


6662it [01:40, 80.50it/s]

Found a yes/no: 6645
Found a yes/no: 6653


6688it [01:40, 72.97it/s]

Found a yes/no: 6685


6751it [01:41, 59.96it/s]

Found a yes/no: 6744
Found a yes/no: 6749


6779it [01:41, 61.82it/s]

Found a yes/no: 6771


6895it [01:43, 68.82it/s]

Found a yes/no: 6882
Found a yes/no: 6887


6922it [01:44, 71.48it/s]

Found a yes/no: 6910


6938it [01:44, 69.54it/s]

Found a yes/no: 6933


7054it [01:45, 75.92it/s]

Found a yes/no: 7036


7119it [01:47, 62.89it/s]

Found a yes/no: 7111
Found a yes/no: 7126


7136it [01:47, 63.10it/s]

Found a yes/no: 7127
Found a yes/no: 7131


7175it [01:47, 65.40it/s]

Found a yes/no: 7156


7206it [01:48, 76.59it/s]

Found a yes/no: 7190


7244it [01:48, 58.50it/s]

Found a yes/no: 7235


7383it [01:50, 76.76it/s]

Found a yes/no: 7370


7402it [01:51, 70.61it/s]

Found a yes/no: 7395


7459it [01:52, 67.56it/s]

Found a yes/no: 7450
Found a yes/no: 7464


7489it [01:52, 77.11it/s]

Found a yes/no: 7479
Found a yes/no: 7492


7602it [01:53, 67.33it/s]

Found a yes/no: 7597


7676it [01:54, 64.29it/s]

Found a yes/no: 7667


7693it [01:55, 60.10it/s]

Found a yes/no: 7683


7734it [01:55, 72.56it/s]

Found a yes/no: 7724


7771it [01:56, 69.92it/s]

Found a yes/no: 7755


7795it [01:56, 64.24it/s]

Found a yes/no: 7785


7830it [01:57, 66.85it/s]


In [38]:
# Number of examples retained after the yes/no questions were skipped.
len(nq_examples)

7651

In [42]:
# Tally the examples flagged as impossible, i.e. those for which
# no short answer was extracted.
impossible_count = 0
for ex in nq_examples:
    impossible_count += int(ex.is_impossible == True)
    

In [43]:
# Show how many retained examples are flagged is_impossible.
impossible_count

4470

In [44]:
# Fraction of retained examples with no extracted short answer. Uses the computed
# count rather than a pasted-in number so the value stays correct on re-runs.
impossible_count / len(nq_examples)

0.5842373545941707

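About 58% of the retained NQ dev questions (i.e. after yes/no questions are excluded) have no short answer under the filters above (single-span annotations of at most 5 tokens).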

In [41]:
# Spot-check the first converted example: its is_impossible flag.
nq_examples[0].is_impossible

False

In [11]:
# Spot-check the first converted example: the short answers extracted for it.
nq_examples[0].answers

['the therefore sign', 'therefore sign']