# Natural Questions - Data Preparation for Retriever Evaluation

In [64]:
import json
import os
import sys
import logging
import re
import pickle
from collections import defaultdict
from tqdm.notebook import tqdm

In [3]:
os.listdir()

['2_NQ_Retriever_ES_Setup.ipynb',
 'temp',
 '.DS_Store',
 '0_NQ_Data_Exploration.ipynb',
 '1_NQ_Retriever_Data_Prep.ipynb',
 '.ipynb_checkpoints']

In [4]:
os.getcwd()

'/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/notebooks'

### Load Data

In [65]:
jsonfilename = "../data/raw_data/v1.0-simplified_simplified-nq-train.jsonl"

# Read the simplified NQ training file one line at a time. The file is opened in
# binary mode and each line is decoded as UTF-8 before being parsed as JSON.
data = []
with open(jsonfilename, 'rb') as jsonl_file:
    for raw_line in tqdm(jsonl_file):
        record = json.loads(raw_line.decode('utf-8'))
        data.append(record)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




KeyboardInterrupt: 

In [67]:
data[0].keys()

dict_keys(['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])

In [73]:
# NOTE: 'annotations' is a list of annotation dicts, so indexing it with a string
# key raises the TypeError shown below. Select an element first, e.g.
# data[0]['annotations'][0]['short_answers'].
data[0]['annotations']['short_answers']

TypeError: list indices must be integers or slices, not str

In [74]:
# Find the index of the first record whose first annotation has no short answers
# and print it (nothing is printed when every record has one).
first_without_short = next(
    (idx for idx, rec in enumerate(data)
     if len(rec['annotations'][0]['short_answers']) == 0),
    None,
)
if first_without_short is not None:
    print(first_without_short)

2


In [76]:
data[2]['annotations']

[{'yes_no_answer': 'NONE',
  'long_answer': {'start_token': 319, 'candidate_index': 24, 'end_token': 438},
  'short_answers': [],
  'annotation_id': 10527123009892725162}]

### Prep NQ Train Data

**In order to properly evaluate the retriever portion of an end-to-end QA system, we will make use of *short answer questions* only.**

For some NQ questions, an annotation will contain multiple short_answers. [Google's reference implementation](https://github.com/google-research/language/blob/master/language/question_answering/decatt_docreader/preprocessing/create_nq_short_pipeline_examples.py) (line 62) suggests that during training, they only utilize the first answer. For our purposes of evaluating the retriever, we will do the same.

In [5]:
def filter_nq_train_data(raw_data):
    '''
    This function takes the full corpus of NQ training data and filters examples that
    are not relevant for proper retriever evaluation, including: a.) records that do not have at
    least one short answer are discarded b.) records that have more than one short answer
    are truncated to only use the first short answer.

    Only the first annotation of each record is inspected. Records in raw_data are never
    modified: a truncated record is a new dict with a new annotations list and a new first
    annotation dict, while records that need no truncation are passed through as-is
    (same object).

    These filters are in line with standard retriever evaluation techniques as well as
    Google's suggested reference implementation:

    https://github.com/google-research/language/blob/master/language/question_answering/
    decatt_docreader/preprocessing/create_nq_short_pipeline_examples.py

    Args:
        raw_data (list) - python object representation of the raw jsonl file

    Returns:
        filtered_data (list) - a refined version of the raw jsonl file

    '''

    multi_count = 0
    filtered_data = []

    for rec in raw_data:

        annotations = rec['annotations']

        # ignore questions that dont have at least one short answer
        # (a record with no annotations at all has no short answer either)
        if not annotations or len(annotations[0]['short_answers']) == 0:
            continue

        short_answers = annotations[0]['short_answers']

        # if an annotation contains multiple short answers, keep only the first
        if len(short_answers) > 1:

            multi_count += 1

            # rec.copy() is shallow, so writing through new_rec['annotations'][0]
            # would also overwrite the caller's record. Build a fresh annotation
            # dict and a fresh annotations list instead.
            truncated_annotation = dict(annotations[0])
            truncated_annotation['short_answers'] = [short_answers[0]]

            new_rec = rec.copy()
            new_rec['annotations'] = [truncated_annotation] + list(annotations[1:])

            filtered_data.append(new_rec)

        else:
            filtered_data.append(rec)


    print(f'{len(raw_data)-len(filtered_data)} records (out of {len(raw_data)}) did not have at least one short answer and were dropped.')
    print(f'{multi_count} questions had multiple short answers that were effected by truncation.')

    return filtered_data

In [4]:
%%time 

filtered_data = filter_nq_train_data(data)

200447 records (out of 307373) did not have at least one short answer and were dropped.
10427 out of 307373 questions had multiple short answers that were effected by truncation.
CPU times: user 2.03 s, sys: 9.25 s, total: 11.3 s
Wall time: 34.5 s


In [6]:
len(filtered_data)

106926

### Extract & Clean Data Needed for Retriever Evaluation

**For the purposes of evaluating retriever performance, the data items we will need are:**
- question_text
- document_text (cleaned by removing all HTML tags)
- short_answer (from annotations)
- example_id
- document_url

**While the "simplified" version of NQ does clean a vast majority of the HTML content from the Wikipedia page, the "document_text" field does still contain several types of HTML tags that are used for long_answer span identification.**

Namely, those tags are:
- Heading tags
- Table tags
- Table row tags
- List tags

**For simplicity, we will use regex to remove any elements contained inside angle brackets like this `<any_text_here>`**

In [8]:
def get_short_answer_from_span(example):
    '''
    Look up the first short answer span of the first annotation in an NQ record
    and return the text it covers.

    The document text is split on single spaces, the tokens from 'start_token'
    (inclusive) to 'end_token' (exclusive) are selected, and they are joined
    back together with single spaces.

    Args:
        example - a jsonl record from NQ simplified dataset

    Returns:
        (string) - the text covered by the short answer span

    '''

    span = example['annotations'][0]['short_answers'][0]
    tokens = example['document_text'].split(" ")
    start, end = span['start_token'], span['end_token']

    return " ".join(tokens[start:end])


def clean_document_text(text):
    '''
    Strip anything wrapped in angle brackets from a string, with the goal of
    removing HTML tags.

    The match is non-greedy, so each <...> group is removed on its own and the
    text between tags is kept as-is (surrounding whitespace included).

    Args:
        text (string)

    Returns:
        (string) - the input with every <...> group deleted

    '''

    tag_pattern = re.compile('<.*?>')

    return tag_pattern.sub('', text)


def extract_wiki_title(document_url):
    '''
    Pull the article title out of a wikipedia article URL.

    The title is whatever sits between 'title=' and the next '&amp' in the URL.
    When the URL does not contain that pattern, the placeholder string
    'No Title Found' is returned instead.

    Args:
        document_url (string)

    Returns:
        title (string) - article title
    '''

    match = re.search('title=(.*?)&amp', document_url)

    if match is None:
        return 'No Title Found'

    return match.group(1)
    
    
def extract_data(data):
    '''
    Build the slimmed-down records needed for retriever evaluation from a list of
    NQ simplified records.

    For every input record the example id, URL, question, short answer text,
    tag-stripped document text and article title are collected. A record is kept
    only when its short answer text appears verbatim in the cleaned document text.

    Args:
        data (list) - a list of filtered jsonl records from NQ simplified dataset

    Returns:
        extracted_data (list) - a list of cleaned jsonl records

    '''

    extracted_data = []

    for rec in tqdm(data):

        candidate = {'example_id': rec['example_id'],
                     'document_title': extract_wiki_title(rec['document_url']),
                     'document_url': rec['document_url'],
                     'question_text': rec['question_text'],
                     'short_answer': get_short_answer_from_span(rec),
                     'document_text_clean': clean_document_text(rec['document_text'])}

        # Keep only examples that are solvable from the cleaned text. A span that
        # crosses HTML tags (e.g. several <Li> items labelled as one short answer)
        # no longer matches once the tags are stripped, so it is skipped.
        if candidate['short_answer'] in candidate['document_text_clean']:
            extracted_data.append(candidate)

    print(f'{len(extracted_data)} of the {len(data)} records are complete and solvable.')

    return extracted_data
    

In [9]:
# extract data needed for retriever
extracted_data = extract_data(filtered_data)

HBox(children=(FloatProgress(value=0.0, max=106926.0), HTML(value='')))


106848 of the 106926 records are complete and solvable.


#### Save data object 

In [48]:
file_path = '../data/eval_data/'
file_name = 'extracted_data_records.pkl'

# Persist the extracted records so later sessions can skip the extraction step.
# file_path ends with '/', so joining yields the same path as concatenation.
with open(os.path.join(file_path, file_name), 'wb') as pkl_out:
    pickle.dump(extracted_data, pkl_out)

#### Load data object

In [5]:
file_path = '../data/eval_data/'
file_name = 'extracted_data_records.pkl'

# Reload the extracted records written by the save cell above.
# file_path ends with '/', so joining yields the same path as concatenation.
with open(os.path.join(file_path, file_name), 'rb') as pkl_in:
    extracted_data = pickle.load(pkl_in)

### Drop records that have >5 words in the short answer

**"Answers with many tokens often resemble extractive snippets rather than canonical answers, so we discard answers with more than 5 tokens"** 

https://arxiv.org/pdf/1906.00300.pdf

In [26]:
def drop_longer_answers(data, max_tokens=5):
    '''
    This function loops through a list of NQ simplified records and drops any records where the short answer
    contains more than max_tokens tokens. Tokens are counted by splitting the short answer on single spaces.

    Answers with many tokens often resemble extractive snippets rather than canonical answers, so we discard
    answers with more than 5 tokens: https://arxiv.org/pdf/1906.00300.pdf

    Args:
        data (list) - a list of cleaned jsonl records from NQ simplified dataset
        max_tokens (int) - largest number of tokens a short answer may have and still be kept.
            Defaults to 5, the threshold used in the paper linked above.

    Returns:
        slim_data (list) - a new list holding only the records whose short answer is short enough;
            the input list is left unchanged

    '''

    slim_data = [rec for rec in tqdm(data)
                 if len(rec['short_answer'].split(' ')) <= max_tokens]

    print(f'{len(data) - len(slim_data)} records were "long" short-answers and were dropped.')
    print(f'{len(slim_data)} records remain.')

    return slim_data
            

In [27]:
slim_data = drop_longer_answers(extracted_data)

HBox(children=(FloatProgress(value=0.0, max=106848.0), HTML(value='')))


19441 records were "long" short-answers and were dropped.
87407 records remain.


### Format Evidence Data For ElasticSearch Indexing

**For indexing, let's get a unique set of documents and experiment with different ways of segmenting large articles into index entries**
- deduplicate Wikipedia articles based on: Article Title

In [22]:
file_path = '../data/stage_data/'
file_name = 'extracted_clean_data.pkl'

# NOTE(review): no cell in this notebook writes this file. The record count shown
# below (87407) matches what drop_longer_answers reported, so this is presumably a
# saved copy of slim_data — confirm where it comes from before relying on a re-run.
# file_path ends with '/', so joining yields the same path as concatenation.
with open(os.path.join(file_path, file_name), 'rb') as pkl_in:
    extracted_data = pickle.load(pkl_in)

In [23]:
len(extracted_data)

87407

In [28]:
extracted_data[5]

{'example_id': -341902602485554777,
 'document_title': 'President_of_the_United_Nations_General_Assembly',
 'document_url': 'https://en.wikipedia.org//w/index.php?title=President_of_the_United_Nations_General_Assembly&amp;oldid=807429262',
 'question_text': 'who is the current president of un general assembly',
 'short_answer': 'Miroslav Lajčák of Slovakia',
 'document_text_clean': "President of the United Nations General Assembly - wikipedia  President of the United Nations General Assembly  Jump to : navigation , search    President of the United Nations General Assembly     Emblem of the United Nations     Incumbent Miroslav Lajčák     Appointer   United Nations General Assembly     Term length   1 year     Inaugural holder   Paul - Henri Spaak     Formation   1946     Website   List of Presidents of the UN General Assembly     The President of the United Nations General Assembly is a position voted for by representatives in the United Nations General Assembly ( UNGA ) on a yearly b

In [39]:
def compile_evidence_corpus(extracted_data):
    '''
    This function compiles all unique wikipedia documents into a list, deduplicating
    on the article title. When several records share a title, the fields of the first
    record encountered are the ones kept, and output order follows first appearance.

    Args:
        extracted_data (list) - records with 'document_title', 'document_url' and
            'document_text_clean' keys

    Returns:
        evidence_docs (list) - one dict per unique title holding the three keys above

    '''

    # A set gives constant-time membership checks. Scanning a growing list for
    # every record made this loop quadratic in the number of unique titles.
    seen_titles = set()
    evidence_docs = []

    for rec in tqdm(extracted_data):

        title = rec['document_title']

        if title in seen_titles:
            continue

        seen_titles.add(title)

        evidence_docs.append({'document_title': title,
                              'document_url': rec['document_url'],
                              'document_text_clean': rec['document_text_clean']})


    print(f'Of the {len(extracted_data)} records, there are {len(evidence_docs)} unique Wikipedia articles.')

    return evidence_docs


In [40]:
evidence_corpus = compile_evidence_corpus(extracted_data)

HBox(children=(FloatProgress(value=0.0, max=87407.0), HTML(value='')))


Of the 87407 records, there are 40971 unique Wikipedia articles.


#### Save as JSONL

**Create and save unique Wikipedia articles as JSONL for Elasticsearch to read**

In [48]:
file_path = '../data/eval_data/'
file_name = 'evidence_corpus.jsonl'

# Write one JSON document per line (JSONL).
# file_path ends with '/', so joining yields the same path as concatenation.
with open(os.path.join(file_path, file_name), 'w') as jsonl_out:
    for doc in evidence_corpus:
        jsonl_out.write(json.dumps(doc))
        jsonl_out.write('\n')

In [49]:
len(evidence_corpus)

40971

In [52]:
file_path = '../data/eval_data/'
file_name = 'evidence_corpus.pkl'

# Also keep a pickled copy of the evidence corpus alongside the JSONL file.
# file_path ends with '/', so joining yields the same path as concatenation.
with open(os.path.join(file_path, file_name), 'wb') as pkl_out:
    pickle.dump(evidence_corpus, pkl_out)

### Format question/answer records for evaluation

In [47]:
def remove_doc_text_field(extracted_data):
    '''
    Return copies of the extracted records without their 'document_text_clean' field.

    Each output record is a new dict, so the input records are left untouched.
    Records that do not carry the field are copied unchanged.

    Args:
        extracted_data (list)

    Returns:
        slim_data (list)
    '''

    slim_data = []

    for rec in tqdm(extracted_data):

        trimmed = dict(rec)
        trimmed.pop('document_text_clean', None)
        slim_data.append(trimmed)

    return slim_data

In [50]:
qa_records = remove_doc_text_field(extracted_data)

HBox(children=(FloatProgress(value=0.0, max=87407.0), HTML(value='')))




In [51]:
file_path = '../data/eval_data/'
file_name = 'qa_records.pkl'

# Persist the question/answer records (document text removed) for evaluation.
# file_path ends with '/', so joining yields the same path as concatenation.
with open(os.path.join(file_path, file_name), 'wb') as pkl_out:
    pickle.dump(qa_records, pkl_out)

In [56]:
os.path.exists('/notebooks')

False

In [54]:
!pwd

/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/notebooks


In [59]:
os.listdir('../qa_retriever_evaluation')

FileNotFoundError: [Errno 2] No such file or directory: '../qa_retriever_evaluation'

In [60]:
os.getcwd()

'/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/notebooks'

In [63]:
os.path.exists('/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/')

True

In [None]:
# NOTE(review): unexecuted scratch cell. os.path.join() needs at least one path
# argument, so this call would raise a TypeError if run — fill it in or remove it.
os.path.join()