# Question Answering Retriever Evaluation

In [204]:
import json
import tqdm
import sys

sys.path.append('../natural-questions/')
import text_utils

## Discussion Points

1. The `document_text` **still contains** HTML elements, but only those used to identify `long_answer` spans (heading, paragraph, table, and list tags). How do we want to handle this?
    - Implications for search
    - Implications for the reader
    
    
    
2. What is the ideal format for the data so that it is compatible with SQuAD training? How should multiple short answers for a single question be formatted?

## Load Data 

#### Train Set

In [3]:
# Location of the simplified NQ training set (one JSON record per line).
jsonfilename = "../data/v1.0-simplified_simplified-nq-train.jsonl"

# The file is read as raw bytes and each line is decoded explicitly as UTF-8
# before parsing; tqdm wraps the file handle to report per-line progress.
data = []
with open(jsonfilename, 'rb') as f:
    data.extend(json.loads(raw_line.decode('utf-8')) for raw_line in tqdm.tqdm(f))

307373it [04:08, 1236.59it/s]


In [4]:
len(data)

307373

#### Dev Set

In [143]:
# Location of the NQ dev set (one JSON record per line).
jsonfilename = "../data/v1.0-simplified_nq-dev-all.jsonl"

# The file is read as raw bytes and each line is decoded explicitly as UTF-8
# before parsing; tqdm wraps the file handle to report per-line progress.
dev_data = []
with open(jsonfilename, 'rb') as f:
    dev_data.extend(json.loads(raw_line.decode('utf-8')) for raw_line in tqdm.tqdm(f))

7830it [03:00, 43.33it/s]


In [223]:
len(dev_data)

7830

## Explore Train Data

### General Exploration

In [97]:
example = data[11]

In [98]:
example.keys()

dict_keys(['document_text', 'long_answer_candidates', 'question_text', 'annotations', 'document_url', 'example_id'])

In [99]:
example['document_text']

'Bangko Sentral ng Pilipinas - wikipedia <H1> Bangko Sentral ng Pilipinas </H1> <P> </P> <Table> Bangko Sentral ng Pilipinas <Tr> <Td_colspan="2"> <Table> <Tr> <Td> Logo </Td> <Td> The BSP complex in Manila </Td> </Tr> </Table> </Td> </Tr> <Tr> <Th> Headquarters </Th> <Td> Manila , Philippines </Td> </Tr> <Tr> <Th> Coordinates </Th> <Td> 14 ° 33 ′ 39 \'\' N 120 ° 59 ′ 18 \'\' E \ufeff / \ufeff 14.56083 ° N 120.98833 ° E \ufeff / 14.56083 ; 120.98833 Coordinates : 14 ° 33 ′ 39 \'\' N 120 ° 59 ′ 18 \'\' E \ufeff / \ufeff 14.56083 ° N 120.98833 ° E \ufeff / 14.56083 ; 120.98833 </Td> </Tr> <Tr> <Th> Established </Th> <Td> January 3 , 1949 ; 69 years ago ( January 3 , 1949 ) ( original , bankrupt ) July 3 , 1993 ; 25 years ago ( July 3 , 1993 ) ( re-established due to the New Central Bank Act ) </Td> </Tr> <Tr> <Th> Governor </Th> <Td> Nestor Espenilla , Jr . </Td> </Tr> <Tr> <Th> Central bank of </Th> <Td> Philippines </Td> </Tr> <Tr> <Th> Currency </Th> <Td> Philippine peso PHP ( ISO 421

In [100]:
example['long_answer_candidates']

[{'start_token': 14, 'top_level': True, 'end_token': 227},
 {'start_token': 19, 'top_level': False, 'end_token': 37},
 {'start_token': 21, 'top_level': False, 'end_token': 35},
 {'start_token': 22, 'top_level': False, 'end_token': 34},
 {'start_token': 37, 'top_level': False, 'end_token': 47},
 {'start_token': 47, 'top_level': False, 'end_token': 112},
 {'start_token': 112, 'top_level': False, 'end_token': 162},
 {'start_token': 162, 'top_level': False, 'end_token': 174},
 {'start_token': 174, 'top_level': False, 'end_token': 184},
 {'start_token': 184, 'top_level': False, 'end_token': 198},
 {'start_token': 198, 'top_level': False, 'end_token': 208},
 {'start_token': 208, 'top_level': False, 'end_token': 218},
 {'start_token': 218, 'top_level': False, 'end_token': 226},
 {'start_token': 227, 'top_level': True, 'end_token': 286},
 {'start_token': 393, 'top_level': True, 'end_token': 444},
 {'start_token': 394, 'top_level': False, 'end_token': 443},
 {'start_token': 444, 'top_level': Tr

In [101]:
example['question_text']

'bangko sentral ng pilipinas (central bank of the philippines)'

In [103]:
example['annotations']

[{'yes_no_answer': 'NONE',
  'long_answer': {'start_token': -1, 'candidate_index': -1, 'end_token': -1},
  'short_answers': [],
  'annotation_id': 16194740675034335824}]

In [104]:
example['document_url']

'https://en.wikipedia.org//w/index.php?title=Bangko_Sentral_ng_Pilipinas&amp;oldid=856719956'

In [105]:
example['example_id']

3008113339755798094

### The data isn't consistently good.

In [52]:
# example of a non-sensical question
ex11 = data[11]
print(ex11['question_text'])

bangko sentral ng pilipinas (central bank of the philippines)


In [84]:
# example of typo in question
ex1 = data[1]
print(ex1['question_text'])

how i.met your mother who is the mother


### The "clean" full text still contains some HTML tags (tags used for long_answer designation)

In [92]:
ex1['document_text']

'The Mother ( How I Met Your Mother ) - wikipedia <H1> The Mother ( How I Met Your Mother ) </H1> Jump to : navigation , search <Table> <Tr> <Th_colspan="2"> Tracy McConnell </Th> </Tr> <Tr> <Td_colspan="2"> How I Met Your Mother character </Td> </Tr> <Tr> <Td_colspan="2"> The Mother appearing in `` The Locket \'\' </Td> </Tr> <Tr> <Th> First appearance </Th> <Td> `` Lucky Penny ( unseen ) \'\' `` Something New \'\' ( seen ) </Td> </Tr> <Tr> <Th> Last appearance </Th> <Td> `` Last Forever \'\' </Td> </Tr> <Tr> <Th> Created by </Th> <Td> Carter Bays Craig Thomas </Td> </Tr> <Tr> <Th> Portrayed by </Th> <Td> Cristin Milioti </Td> </Tr> <Tr> <Th_colspan="2"> Information </Th> </Tr> <Tr> <Th> Aliases </Th> <Td> The Mother </Td> </Tr> <Tr> <Th> Gender </Th> <Td> Female </Td> </Tr> <Tr> <Th> Spouse ( s ) </Th> <Td> Ted Mosby </Td> </Tr> <Tr> <Th> Significant other ( s ) </Th> <Td> Max ( deceased former boyfriend ) Louis ( ex-boyfriend ) </Td> </Tr> <Tr> <Th> Children </Th> <Td> Penny Mosby (

### Questions can have multiple short answers

In [117]:
ex45 = data[45]

In [119]:
ex45['annotations']

[{'yes_no_answer': 'NONE',
  'long_answer': {'start_token': 22, 'candidate_index': 0, 'end_token': 235},
  'short_answers': [{'start_token': 204, 'end_token': 210},
   {'start_token': 211, 'end_token': 217},
   {'start_token': 218, 'end_token': 224},
   {'start_token': 226, 'end_token': 233}],
  'annotation_id': 12271164458330946017}]

In [218]:
def get_answers_from_span(example):
    '''
    Print the question of an NQ record together with the long answer and
    every short answer of its first annotation, recovered from token spans.

    The document text is tokenised by splitting on single spaces, and each
    span is read as the half-open slice [start_token, end_token) of that
    token list. Only example['annotations'][0] is inspected; any further
    annotations on the record are ignored. A span of (-1, -1), as found on
    records without a long answer, slices to nothing and is printed as an
    empty answer.

    Args:
        example - a jsonl record from NQ simplified dataset; must provide the
            'question_text', 'document_text' and 'annotations' keys

    Returns:
        None - all results are written to stdout
    '''
    print('Question:', example['question_text'], '\n')

    annotation = example['annotations'][0]

    # Tokenise once and reuse: documents are long, and re-splitting the full
    # text for the long answer and again for every short answer is wasted work.
    tokens = example['document_text'].split(" ")

    def span_text(span):
        # Join the tokens covered by a {'start_token', 'end_token'} span.
        return " ".join(tokens[span['start_token']:span['end_token']])

    print('Long Answer:', span_text(annotation['long_answer']), '\n')

    for i, short_answer_span in enumerate(annotation['short_answers']):
        print(f'Answer {i}:', span_text(short_answer_span))

In [221]:
get_answers_from_span(ex45)

Question: where are the upcoming olympics to be held 

Long Answer: <P> This is a list of host cities of the Olympic Games , both summer and winter , since the modern Olympics began in 1896 . Since then , summer games have usually -- but not always -- celebrated a four - year period known as an Olympiad . There have been 28 Summer Olympic Games held in 24 cities , and 23 Winter Olympic Games held in 20 cities . In addition , three summer and two winter editions of the Games were scheduled to take place but later cancelled due to war : Berlin ( summer ) in 1916 ; Tokyo / Helsinki ( summer ) and Sapporo / Garmisch - Partenkirchen ( winter ) in 1940 ; and London ( summer ) and Cortina d'Ampezzo , Italy ( winter ) in 1944 . The 1906 Summer Olympics were officially sanctioned and held in Athens . However , in 1949 , the International Olympic Committee ( IOC ) , decided to unrecognize the 1906 Games . Four cities have been chosen by the IOC to host upcoming Olympic Games : Tokyo for the 2020

## Explore Dev Data

#### General

In [205]:
dev_example = dev_data[0]

In [206]:
dev_example.keys()

dict_keys(['annotations', 'document_html', 'document_title', 'document_tokens', 'document_url', 'example_id', 'long_answer_candidates', 'question_text', 'question_tokens'])

#### Google's provided simplify function works

In [207]:
simplified_dev_example = text_utils.simplify_nq_example(dev_example)

In [208]:
simplified_dev_example.keys()

dict_keys(['question_text', 'example_id', 'document_url', 'document_text', 'long_answer_candidates', 'annotations'])

#### Dev examples carry multiple annotations (5 for this example)

In [209]:
simplified_dev_example['annotations']

[{'annotation_id': 13591449469826568799,
  'long_answer': {'candidate_index': 92, 'end_token': 925, 'start_token': 808},
  'short_answers': [{'end_token': 837, 'start_token': 816}],
  'yes_no_answer': 'NONE'},
 {'annotation_id': 6237931520544082939,
  'long_answer': {'candidate_index': 92, 'end_token': 925, 'start_token': 808},
  'short_answers': [{'end_token': 819, 'start_token': 816}],
  'yes_no_answer': 'NONE'},
 {'annotation_id': 12127791536449879527,
  'long_answer': {'candidate_index': 92, 'end_token': 925, 'start_token': 808},
  'short_answers': [],
  'yes_no_answer': 'NONE'},
 {'annotation_id': 6421980561691125452,
  'long_answer': {'candidate_index': 92, 'end_token': 925, 'start_token': 808},
  'short_answers': [{'end_token': 837, 'start_token': 826}],
  'yes_no_answer': 'NONE'},
 {'annotation_id': 5015853435362506856,
  'long_answer': {'candidate_index': 92, 'end_token': 925, 'start_token': 808},
  'short_answers': [{'end_token': 819, 'start_token': 817}],
  'yes_no_answer': 

In [210]:
text_utils.get_nq_tokens(simplified_dev_example)

['Therefore',
 'sign',
 '-',
 'wikipedia',
 '<H1>',
 'Therefore',
 'sign',
 '</H1>',
 'Jump',
 'to',
 ':',
 'navigation',
 ',',
 'search',
 '<Table>',
 '<Tr>',
 '<Th_colspan="2">',
 '∴',
 '</Th>',
 '</Tr>',
 '<Tr>',
 '<Td_colspan="2">',
 'Therefore',
 'sign',
 '</Td>',
 '</Tr>',
 '<Tr>',
 '<Td_colspan="2">',
 '<Table>',
 '<Tr>',
 '<Th_colspan="2">',
 'Punctuation',
 '</Th>',
 '</Tr>',
 '<Tr>',
 '<Td>',
 'apostrophe',
 '</Td>',
 '<Td>',
 "'",
 "'",
 '</Td>',
 '</Tr>',
 '<Tr>',
 '<Td>',
 'brackets',
 '</Td>',
 '<Td>',
 '(',
 ')',
 '(',
 ')',
 '(',
 ')',
 '⟨',
 '⟩',
 '</Td>',
 '</Tr>',
 '<Tr>',
 '<Td>',
 'colon',
 '</Td>',
 '<Td>',
 ':',
 '</Td>',
 '</Tr>',
 '<Tr>',
 '<Td>',
 'comma',
 '</Td>',
 '<Td>',
 ',',
 '،',
 '、',
 '</Td>',
 '</Tr>',
 '<Tr>',
 '<Td>',
 'dash',
 '</Td>',
 '<Td>',
 '‒',
 '--',
 '--',
 '―',
 '</Td>',
 '</Tr>',
 '<Tr>',
 '<Td>',
 'ellipsis',
 '</Td>',
 '<Td>',
 '...',
 '...',
 '⋯',
 '᠁',
 'ฯ',
 '</Td>',
 '</Tr>',
 '<Tr>',
 '<Td>',
 'exclamation',
 'mark',
 '</Td>',
 '<

## Simplify Dev Data

In [224]:
len(dev_data)

7830

In [225]:
type(dev_data)

list

In [226]:
dev_data[0]

{'annotations': [{'annotation_id': 13591449469826568799,
   'long_answer': {'candidate_index': 92,
    'end_token': 925,
    'start_token': 808},
   'short_answers': [{'end_token': 837, 'start_token': 816}],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 6237931520544082939,
   'long_answer': {'candidate_index': 92,
    'end_token': 925,
    'start_token': 808},
   'short_answers': [{'end_token': 819, 'start_token': 816}],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 12127791536449879527,
   'long_answer': {'candidate_index': 92,
    'end_token': 925,
    'start_token': 808},
   'short_answers': [],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 6421980561691125452,
   'long_answer': {'candidate_index': 92,
    'end_token': 925,
    'start_token': 808},
   'short_answers': [{'end_token': 837, 'start_token': 826}],
   'yes_no_answer': 'NONE'},
  {'annotation_id': 5015853435362506856,
   'long_answer': {'candidate_index': 92,
    'end_token': 925,
    'start_token': 808},
   'sho

In [None]:
def simplify_data(full_data, simplify_fn=None):
    '''
    Apply the NQ simplification step to every record of a dataset.

    Args:
        full_data - iterable of full-format NQ records (e.g. dev_data)
        simplify_fn - optional callable applied to each record; when omitted,
            text_utils.simplify_nq_example is used, the same helper this
            notebook applies to a single dev example

    Returns:
        list holding one simplified record per input record, in input order
    '''
    if simplify_fn is None:
        # Resolved at call time so that defining this function does not
        # require text_utils to be imported yet.
        simplify_fn = text_utils.simplify_nq_example

    return [simplify_fn(record) for record in full_data]

## Filter For Retriever Evaluation

For our evaluation purposes, we will only use questions that have a short answer or no answer (null). Thus, we are excluding any questions that have only a long answer or only a yes/no answer.

**Identifying Exclusions**

The following field/value logic is used to identify short answer and no answer types.
- `len(short_answers) > 0` — **QUESTION:** can there be more than one short answer? (Yes; see "Questions can have multiple short answers" above.)
- `yes_no_answer != 'NONE'`
- `long_answer`: **TODO** — rule not yet specified.

In [None]:
def get_answer_type(annotation):
    '''
    Label a single NQ annotation as 'no_answer' or 'short_answer'.

    Rules:
        - yes_no_answer == 'NONE' and no short answers      -> 'no_answer'
        - yes_no_answer == 'NONE' and >= 1 short answer     -> 'short_answer'
        - anything else (a yes/no answer is present)        -> None

    NOTE(review): an annotation that has a long answer but no short answer is
    labelled 'no_answer' by these rules, because long_answer is not consulted.
    TODO: confirm whether long-answer-only annotations should be separated.

    Args:
        annotation - one entry of a record's 'annotations' list; must provide
            the 'short_answers' and 'yes_no_answer' keys

    Returns:
        'no_answer', 'short_answer', or None
    '''
    if annotation['yes_no_answer'] != 'NONE':
        return None

    if len(annotation['short_answers']) > 0:
        return 'short_answer'
    return 'no_answer'

In [None]:
!wget https://storage.cloud.google.com/natural_questions/v1.0-simplified/simplified-nq-train.jsonl.gz

In [2]:
import urllib

In [6]:
import os
import urllib.request  # a bare `import urllib` does not guarantee the `request` submodule is loaded

# Object endpoint for the simplified NQ train set. This replaces a hard-coded
# googleusercontent.com download link carrying a `qk=` token, which looks like
# a session-specific signed URL and should not be kept in the notebook.
url = 'https://storage.googleapis.com/natural_questions/v1.0-simplified/simplified-nq-train.jsonl.gz'

# urlretrieve does not create missing parent directories.
os.makedirs('temp', exist_ok=True)

urllib.request.urlretrieve(url, 'temp/TESTING.jsonl.gz')


KeyboardInterrupt: 

In [3]:
!pwd

/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/notebooks


In [8]:
'hello.py.gz'[:-3]

'hello.py'

In [10]:
import os

In [None]:
os.g