# Natural Questions - Data Preparation for Retriever Evaluation

In [2]:
import json
import sys
import re
import pickle
from collections import defaultdict
from tqdm.notebook import tqdm

### Load Data

In [3]:
jsonfilename = "../data/v1.0-simplified_simplified-nq-train.jsonl"

# The file is JSON Lines: every line is one complete JSON record.
# It is read in binary mode and each line is decoded as UTF-8 before parsing.
with open(jsonfilename, 'rb') as f:
    data = [json.loads(raw_line.decode('utf-8')) for raw_line in tqdm(f)]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




### Filter Short Answers

**In order to properly evaluate the retriever portion of an end-to-end QA system, we will make use of *short answer questions* only.**

In [4]:
def separate_short_answer_only(data):
    '''
    Keep only the NQ simplified records whose first annotation carries at least
    one short answer.
    
    Args:
        data (list) - a list of jsonl records from NQ simplified dataset
        
    Returns:
        short_data (list) - the matching records (same objects, original order)
    
    '''
    
    # only annotation index 0 is inspected; further annotations are not consulted
    return [
        record
        for record in tqdm(data)
        if len(record['annotations'][0]['short_answers']) > 0
    ]


In [5]:
# filter out records that don't have at least one short answer
# (the check looks at the first annotation of each record only)
short_data = separate_short_answer_only(data)

HBox(children=(FloatProgress(value=0.0, max=307373.0), HTML(value='')))




In [10]:
# this is validated in the paper: 
# https://storage.googleapis.com/pub-tools-public-publication-data/pdf/1f7b46b5378d757553d3e92ead36bda2e4254244.pdf
# NOTE: this cell reads `data`, so it has to run before the cell that executes `del data`;
# running it afterwards raises NameError: name 'data' is not defined.

print(f'{len(short_data)} out of {len(data)} (or {round(len(short_data)/len(data),3)}) NQ questions contain at least one short answer.')


NameError: name 'data' is not defined

In [7]:
# drop full data from memory
# (after this cell the name `data` no longer exists; records kept in short_data are unaffected)
del data

### Reduce Multiple Short Answers in Single Annotation

For some NQ questions, an annotation will contain multiple short_answers. [Google's reference implementation](https://github.com/google-research/language/blob/master/language/question_answering/decatt_docreader/preprocessing/create_nq_short_pipeline_examples.py) (line 62) suggests that during training, they only utilize the first answer. For our purposes of evaluating the retriever, we will do the same.

In [11]:
def remove_multiple_short_answers(data):
    '''
    This function loops through a list of NQ simplified records and, for any record whose
    first annotation contains multiple short answers, keeps only the first short answer.
    
    The input records are never modified: a record that needs trimming is replaced in the
    output by a shallow copy whose first annotation holds a one-element 'short_answers'
    list. Records that need no change are passed through as the same objects.
    
    Args:
        data (list) - a list of jsonl records from NQ simplified dataset
        
    Returns:
        dedup_data (list) - a list of jsonl records with multiple short answers removed
    
    '''
    
    dedup_data = []
    num_effected = 0
    
    for rec in tqdm(data):
        
        annotations = rec['annotations']
        short_answers = annotations[0]['short_answers']
        
        if len(short_answers) > 1:
            
            num_effected += 1
            
            # rebuild only the containers on the path to the changed field
            # (record -> annotations list -> first annotation) so that the
            # caller's record keeps its full list of short answers
            first_annotation = dict(annotations[0])
            first_annotation['short_answers'] = [short_answers[0]]
            
            trimmed = dict(rec)
            trimmed['annotations'] = [first_annotation] + list(annotations[1:])
            
            dedup_data.append(trimmed)
        
        else:
            dedup_data.append(rec)
            
    print(f'{num_effected} out of {len(data)} questions had multiple short answers that were effected.')
            
    return dedup_data
    

In [12]:
# drop multiple short answers
# (only the first short answer of each record's first annotation is kept)
dedup_data = remove_multiple_short_answers(short_data)

HBox(children=(FloatProgress(value=0.0, max=106926.0), HTML(value='')))


10427 out of 106926 questions had multiple short answers that were effected.


In [14]:
# the filtered list is not used again once dedup_data has been built
del short_data

### Extract & Clean Data Needed for Retriever Evaluation

**For the purposes of evaluating retriever performance, the data items we will need are:**
- question_text
- document_text (cleaned by removing all HTML tags)
- short_answer (from annotations)
- example_id
- document_url

**While the "simplified" version of NQ does clean a vast majority of the HTML content from the Wikipedia page, the "document_text" field does still contain several types of HTML tags that are used for long_answer span identification.**

Namely, those tags are:
- Heading tags
- Table tags
- Table row tags
- List tags

**For simplicity, we will use regex to remove any elements contained inside angle brackets like this `<any_text_here>`**

In [34]:
def get_short_answer_from_span(example):
    '''
    Look up the first short answer span of the first annotation of an NQ record
    and return the document text that span covers.
    
    Args:
        example - a jsonl record from NQ simplified dataset
        
    Returns:
        ans (string) - the tokens of 'document_text' from 'start_token' (inclusive)
            to 'end_token' (exclusive), joined with single spaces
    
    '''
    
    span = example['annotations'][0]['short_answers'][0]
    
    # tokens are obtained by splitting the raw document text on single spaces
    tokens = example['document_text'].split(" ")
    
    return " ".join(tokens[span['start_token']:span['end_token']])


def clean_document_text(text):
    '''
    Strip HTML-like tags from a string by deleting every '<...>' group.
    
    Only the bracketed groups themselves are removed; whitespace around them is
    left in place. A group is not matched across a newline.
    
    Args:
        text (string)
        
    Returns:
        text (string) - cleaned text
    
    '''
    
    # non-greedy, so each tag is removed on its own instead of everything
    # between the first '<' and the last '>'
    return re.sub('<.*?>', '', text)
    
    
def extract_data(data):
    '''
    Reduce each NQ simplified record to the fields needed for retriever evaluation
    (example_id, document_url, question_text, short_answer, document_text_clean) and
    drop records whose short answer text cannot be found in the cleaned document.
    
    Args:
        data (list) - a list of jsonl records from NQ simplified dataset
        
    Returns:
        extracted_data (list) - a list of dicts, one per retained record
    
    '''
    
    extracted_data = []
    
    for rec in tqdm(data):
        
        example_id = rec['example_id']
        document_url = rec['document_url']
        question_text = rec['question_text']
        answer_text = get_short_answer_from_span(rec)
        cleaned_text = clean_document_text(rec['document_text'])
        
        # The answer is cut from the raw document text, so it can still contain tags
        # when an annotator selected a span that crosses markup, e.g.
        # 'Mickey Hart </Li> <Li> Bill Kreutzmann </Li> <Li> John Mayer </Li>'.
        # Such an answer does not occur in the tag-free text; keeping only records
        # whose answer is present there keeps the dataset solvable.
        if answer_text in cleaned_text:
            extracted_data.append({
                'example_id': example_id,
                'document_url': document_url,
                'question_text': question_text,
                'short_answer': answer_text,
                'document_text_clean': cleaned_text,
            })
        
    print(f'{len(extracted_data)} of the {len(data)} records are complete and solvable.')
    
    return extracted_data
    

In [35]:
# extract data needed for retriever
# (records whose short answer is not found in the tag-free document text are dropped)
extracted_data = extract_data(dedup_data)

HBox(children=(FloatProgress(value=0.0, max=106926.0), HTML(value='')))


106848 of the 106926 records are complete and solvable.


#### Save data object 

In [48]:
file_path = '../data/eval_data/'
file_name = 'extracted_data_records.pkl'

# serialize the full list of extracted records (including the cleaned document text)
with open(file_path+file_name, 'wb') as f:
    
    pickle.dump(extracted_data, f)

#### Load data object

In [5]:
file_path = '../data/eval_data/'
file_name = 'extracted_data_records.pkl'

# reload the records written by the save cell above
# NOTE: pickle.load can execute arbitrary code from the file; only load files you created yourself
with open(file_path+file_name, 'rb') as f:
    
    extracted_data = pickle.load(f)

### Format QA Data For Evaluation

**For evaluating the retriever performance, we can subset records to exclude the document text**

In [18]:
def remove_doc_text_field(extracted_data):
    '''
    Build a lighter copy of every record in extracted_data that omits the
    'document_text_clean' field. The input records are left as they are.
    
    Args:
        extracted_data (list) 
        
    Returns:
        slim_data (list)
    '''
    
    return [
        {field: value for field, value in rec.items() if field != 'document_text_clean'}
        for rec in tqdm(extracted_data)
    ]


In [19]:
# question/answer records without the document text
slim_data = remove_doc_text_field(extracted_data)

HBox(children=(FloatProgress(value=0.0, max=106848.0), HTML(value='')))




#### Save data object

In [23]:
file_path = '../data/eval_data/'
file_name = 'slim_extracted_data_records.pkl'

# serialize the slim records (no document text)
with open(file_path+file_name, 'wb') as f:
    
    pickle.dump(slim_data, f)

### Format Evidence Data For ElasticSearch Indexing

**For indexing, let's get a unique set of documents and experiment with different ways of segmenting large articles into indexes**

In [24]:
def compile_evidence_corpus(extracted_data):
    '''
    Collect the unique Wikipedia documents found in extracted_data into a dictionary
    that maps each 'document_url' to its 'document_text_clean'. When several records
    share a URL, the text of the first one encountered is kept.
    
    Args:
        extracted_data (list) 
        
    Returns:
        wiki_docs (dict)
    
    
    '''
    
    wiki_docs = {}
    
    for rec in tqdm(extracted_data):
        
        url = rec['document_url']
        
        # already collected from an earlier record
        if url in wiki_docs:
            continue
            
        wiki_docs[url] = rec['document_text_clean']
            
    print(f'Of the {len(extracted_data)} records, there are {len(wiki_docs)} unique Wikipedia articles.')
        
    return wiki_docs


In [25]:
# document_url -> cleaned document text, one entry per unique URL
wiki_docs = compile_evidence_corpus(extracted_data)

HBox(children=(FloatProgress(value=0.0, max=106848.0), HTML(value='')))


Of the 106848 records, there are 87108 unique Wikipedia articles.


#### Save data object

In [26]:
file_path = '../data/eval_data/'
file_name = 'wiki_docs.pkl'

# serialize the url -> cleaned text dictionary
with open(file_path+file_name, 'wb') as f:
    
    pickle.dump(wiki_docs, f)

#### Load data object

In [3]:
file_path = '../data/eval_data/'
file_name = 'wiki_docs.pkl'

# reload the dictionary written by the save cell above
# NOTE: pickle.load can execute arbitrary code from the file; only load files you created yourself
with open(file_path+file_name, 'rb') as f:
    
    wiki_docs = pickle.load(f)

In [30]:
# spot-check one article; the key is the document URL exactly as stored (note the HTML-escaped '&amp;')
wiki_docs['https://en.wikipedia.org//w/index.php?title=List_of_Premier_League_broadcasters&amp;oldid=808611916']

"List of Premier League broadcasters - wikipedia  List of Premier League broadcasters  Jump to : navigation , search  This is a list of television broadcasters which provide coverage of the Premier League , English football 's top level competition , which is the most watched league in the world . The main broadcasters in the United Kingdom are Sky Sports , who broadcast 126 of the 168 televised games in the UK , and BT Sport . The BBC shows weekly highlights of the Premier League on its Match of the Day and Match of the Day 2 programmes on Saturdays and Sundays .   The 168 UK televised games are also broadcast across the world ; the remaining 212 matches that are n't broadcast live in the UK are all broadcast elsewhere around the world . English - speaking countries ( excluding the UK ) are able to carry what is known as the ' International feed ' or ' World feed ' audio ; this is full match commentary provided by the Premier League . In Asia , and select other countries around the wo

#### Save as JSONL

In [13]:
def extract_wiki_title(document_url):
    '''
    Pull the article title out of a Wikipedia document URL.
    
    Args:
        document_url (string) - URL containing a 'title=...&amp' query fragment
        
    Returns:
        title (string) - the text between 'title=' and the first following '&amp',
            or 'No Title Found' when the URL has no such fragment
    
    '''
    
    found = re.search('title=(.*?)&amp', document_url)
    
    if found is None:
        return 'No Title Found'
        
    return found.group(1)

**Create and save unique Wikipedia articles as jsonl for elastic search to read**

In [17]:
# one record per unique article: its URL, the title parsed from that URL, and the cleaned body
ES_wiki_docs_full = [
    {
        'document_url': url,
        'document_title': extract_wiki_title(url),
        'document_text_clean': body,
    }
    for url, body in wiki_docs.items()
]

In [24]:
file_path = '../data/eval_data/'
file_name = 'ES_wiki_docs_full.jsonl'

# JSON Lines output: each article is serialized on its own line
with open(file_path + file_name, 'w') as outfile:
    for article in ES_wiki_docs_full:
        outfile.write(json.dumps(article) + '\n')

In [53]:

# Read back the first six lines (i = 0..5) of the JSONL file as a quick check that it parses.
# NOTE: file_path and file_name are whatever the most recently executed cell assigned to them.
test = []
with open(file_path+file_name, 'rb') as f:
    for i, line in enumerate(tqdm(f)):
        test.append(json.loads(line.decode('utf-8')))
    
        if i ==5: break


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [27]:
# number of article records prepared for indexing
len(ES_wiki_docs_full)

87108

## Manual check to see how many records have the short answer in the cleaned text

In [31]:
# inspect a single extracted record (the output includes the full cleaned document text)
extracted_data[8]

{'example_id': -341902602485554777,
 'document_url': 'https://en.wikipedia.org//w/index.php?title=President_of_the_United_Nations_General_Assembly&amp;oldid=807429262',
 'question_text': 'who is the current president of un general assembly',
 'short_answer': 'Miroslav Lajčák of Slovakia',
 'document_text_clean': "President of the United Nations General Assembly - wikipedia  President of the United Nations General Assembly  Jump to : navigation , search    President of the United Nations General Assembly     Emblem of the United Nations     Incumbent Miroslav Lajčák     Appointer   United Nations General Assembly     Term length   1 year     Inaugural holder   Paul - Henri Spaak     Formation   1946     Website   List of Presidents of the UN General Assembly     The President of the United Nations General Assembly is a position voted for by representatives in the United Nations General Assembly ( UNGA ) on a yearly basis . The President presides over the sessions of the General Assembly

In [32]:
# is this record's short answer a substring of its cleaned document text?
extracted_data[8]['short_answer'] in extracted_data[8]['document_text_clean']

True

In [33]:
# 1 when a record's short answer occurs in its cleaned document text, otherwise 0
sanity_check = [
    int(record['short_answer'] in record['document_text_clean'])
    for record in tqdm(extracted_data)
]

HBox(children=(FloatProgress(value=0.0, max=106848.0), HTML(value='')))




In [34]:
# fraction of records whose short answer was found in the cleaned text
sanity_check.count(1)/len(sanity_check)

1.0

In [36]:
# number of records whose short answer was NOT found in the cleaned text
sanity_check.count(0)

0