In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import re
import pandas as pd
from tqdm.notebook import tqdm
from elasticsearch import Elasticsearch

module_path = os.path.dirname(os.getcwd())
if module_path not in sys.path:
    sys.path.append(module_path)

from utils.data_utils import load_pkl_file
from utils.elastic_utils import connect_es, create_es_index, load_es_index

# Demo Notebook

The purpose of this notebook is to demonstrate how to work with NQ data in Elasticsearch and assumes that `prepare_data.py` has been executed per the repo's README.md.

## Connect to Local ElasticSearch Instance

In [15]:
# assumes running local on port 9200
es = connect_es()

In [7]:
es.ping()

True

## Evaluate a Baseline

#### Load baseline data

In [4]:
# load data
evidence_corpus = load_pkl_file('../data/eval_data/evidence_corpus.pkl')
qa_records = load_pkl_file('../data/eval_data/qa_records.pkl')

In [5]:
len(evidence_corpus), len(qa_records)

(13, 13)

#### Create the index

In [43]:
# create index

index_name = 'baseline'

settings = {
    "mappings": {
        "dynamic": "strict",        
        "properties": {
            "document_title": {"type": "text"},
            "document_url": {"type": "text"},
            "document_text_clean": {"type": "text"}
            }
        }
    }


create_es_index(es_obj=es, 
                settings=settings, 
                index_name=index_name)

In [12]:
es.indices.get_alias('*')

{'baseline': {'aliases': {}},
 'nq_wiki_data_test': {'aliases': {}},
 'nq_wiki_data': {'aliases': {}},
 'cases': {'aliases': {}}}

In [14]:
es.count(index='demo_index')['count']

46

In [11]:
es.indices.delete(index='demo_index', ignore=[400, 404])

{'acknowledged': True}

#### Populate index with baseline records

In [49]:
load_es_index(es_obj=es, 
              index_name=index_name,
              evidence_corpus=evidence_corpus)

100%|██████████| 40971/40971 [12:06<00:00, 56.40it/s]  


In [50]:
es.count(index=index_name)['count']

40971

#### Evaluate Recall

In [66]:
def analyze_retriever_recall(es_obj, index_name, qa_data):
    
    results = []
    
    for i, qa in enumerate(tqdm(qa_data)):
        
        question = qa['question_text']
        answer = qa['short_answer']
        ex_id = qa['example_id']
        truth_url = qa['document_url']
        
        # construct query
        query = {
            'query': {
                'query_string': {
                    'query': re.sub('[^A-Za-z0-9]+', ' ', question),
                    'default_field': 'document_text_clean'
                    }
                }
            }
        
        # execute query
        res = es_obj.search(index=index_name, body=query, size=5)
        
        # extract response info
        n_hits = res['hits']['total']['value']
        max_score = res['hits']['max_score']
        duration = res['took']
        
        # check if answer is present in results
        ans_in_res = int(any([answer.lower() in doc['_source']['document_text_clean'].lower() for doc in res['hits']['hits']]))
        
#         ans_in_res = int(any([truth_url == doc['_source']['document_url'] for doc in res['hits']['hits']]))
        
        
        rec = (ex_id, question, answer, n_hits, max_score, duration, ans_in_res)
        results.append(rec)
    
    # format results
    cols = ['example_id', 'question', 'answer', 'number_records_returned', 'max_score', 'query_duration', 'answer_present']
    test_results_df = pd.DataFrame(results, columns=cols)
        
    return test_results_df



In [67]:
baseline_results_df = analyze_retriever_recall(es_obj=es,
                                               index_name=index_name,
                                               qa_data=qa_records)

HBox(children=(FloatProgress(value=0.0, max=87407.0), HTML(value='')))




In [68]:
baseline_results_df.head()

Unnamed: 0,example_id,question,answer,number_records_returned,max_score,query_duration,answer_present
0,5328212470870865242,how i.met your mother who is the mother,Tracy McConnell,10000,14.74769,15,1
1,5289242154789678439,who had the most wins in the nfl,Tom Brady,10000,12.293189,12,1
2,-2500044561429484630,who played mantis guardians of the galaxy 2,Pom Klementieff,10000,28.207392,9,1
3,-1706790511507651062,what channel is the premier league on in france,SFR Sport,10000,17.24178,8,1
4,-7491001389340565191,god's not dead a light in the darkness release...,"March 30 , 2018",10000,19.870369,9,1


In [69]:
baseline_results_df.answer_present.value_counts(normalize=True)

1    0.844623
0    0.155377
Name: answer_present, dtype: float64

**84.5% Recall is pretty good (BERTserini got 85.8 at paragraph level). Is this driven by:**
- Only 40k documents to search over
- Full article is being used (not paragraphs or passages)
- These wikipedia articles contain the full text, tables included... I dont think BERTserini included all the extra HTML data which will be necessary for NQ

In [77]:
print(f'hello/my/name/{""}is/andrew')

hello/my/name/is/andrew


In [83]:
retriever_eval_only = False

In [84]:
if not retriever_eval_only:
    ext = ""
else:
    ext = "_fullsys"

outfile = f'../data/stage_data/extracted_clean_data{ext}.pkl' ## TO-DO: Make this implicit!   

In [85]:
outfile

'../data/stage_data/extracted_clean_data.pkl'

In [87]:
x = 5 if 3>2 else 9

In [95]:
!pwd

/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/notebooks


In [99]:
os.path.exists('/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/')

True

In [90]:
 a = "x" if not retriever_eval_only else "_fullsys"

In [93]:
bool(0)

False

## Next Steps - where to focus?

**Planned Experiments: to evaluate recall**
1. Chunking articles into passages of different lengths/approaches (expected drop in recall)
    - 100 tokens
    - 366 BERT tokens
    - Consideration: these articles include table/list data from wikipedia... chunks will be nonsensical??
2. Experiment with different forms of query expansion/enrichnment (expected to improve recall)
    - NER to extract things/people/organizaions and pass to query as an entity rather than multiple words
    - abbreviation expansion 
    - non-entity synonym enrichment
3. Maybe - getting full wikipedia extract to work with since we only have 40k unique articles...

**Concerns / Considerations / Next Steps:**
- only 40k unique articles
- There's nothing to baseline against, other than myself... need to look further for BM-25 performance on NQ evaluated by recall
- Qualitative analysis on caselaw data: train n-gram language model?
- Put NQ together with BERT to understand end-to-end system performance?



**What next: How does this fit in to the next blog post + the overall effort (paper/webinar)?**
- I think we ultimately want to try to show that small enhancements to a simple Retriever approach can [hopefully] have significant impact on end-to-end QA system on NQ (but this requires us to get all of Wikipedia + train BERT on NQ). I think this *might* be do-able considering ORQA only used short answers from BERT. So the heavy lifts would be:
    - Setting up Wikipedia search (which I think is doable)
    - Training BERT on NQ (short answers only) - not sure LOE here
    - Evaluating the end-to-end system - not sure the LOE here
    
- Do we focus away from open-domain QA and more towards case-law?
    - Qualitative analysis showing improvements in end to end system based solely on query enrichment


**Similar work**
- https://sigir.org/wp-content/uploads/2019/01/p040.pdf


In [103]:
def hi(a=True, b):
    
    return 

SyntaxError: non-default argument follows default argument (<ipython-input-103-c6db6d5f026f>, line 1)

In [102]:
hi(2)

## Testing

In [218]:
from routines import DataPreprocessingRoutine, DataCompilationRoutine

In [233]:
raw_data_path = '/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/data/raw_data/TESTING_v1.0-simplified_simplified-nq-train.jsonl'

dpr = DataPreprocessingRoutine(raw_data_path=raw_data_path,
                               retriever_eval_only=False)


In [234]:
dpr.run()

Exception: ('This file has already been created. Please delete it if you wish to recreate.:', '/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/data/stage_data/extracted_clean_data_fullsys.pkl')

In [231]:
clean_data_path = '/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/data/stage_data/extracted_clean_data_fullsys.pkl'

dcr = DataCompilationRoutine(clean_data_path=clean_data_path,
                             retriever_eval_only=False)


In [232]:
dcr.run()

Exception: These files have already been created. Please delete both to re-run.

In [230]:
bool('0')

True

In [140]:
dcr.evidence_corpus

[{'document_title': 'The_Mother_(How_I_Met_Your_Mother)',
  'document_url': 'https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471',
  'document_text_clean': "The Mother ( How I Met Your Mother ) - wikipedia  The Mother ( How I Met Your Mother )  Jump to : navigation , search    Tracy McConnell     How I Met Your Mother character     The Mother appearing in `` The Locket ''     First appearance   `` Lucky Penny ( unseen ) '' `` Something New '' ( seen )     Last appearance   `` Last Forever ''     Created by   Carter Bays Craig Thomas     Portrayed by   Cristin Milioti     Information     Aliases   The Mother     Gender   Female     Spouse ( s )   Ted Mosby     Significant other ( s )   Max ( deceased former boyfriend ) Louis ( ex-boyfriend )     Children   Penny Mosby ( daughter , born in 2015 , played by Lyndsy Fonseca ) Luke Mosby ( son , born in 2017 , played by David Henrie )     Nationality   American     Tracy McConnell , better known

In [141]:
dcr.qa_records

[{'example_id': 5328212470870865242,
  'document_title': 'The_Mother_(How_I_Met_Your_Mother)',
  'document_url': 'https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471',
  'question_text': 'how i.met your mother who is the mother',
  'short_answer': 'Tracy McConnell'},
 {'example_id': 4435104480114867852,
  'document_title': 'Human_fertilization',
  'document_url': 'https://en.wikipedia.org//w/index.php?title=Human_fertilization&amp;oldid=831042507',
  'question_text': 'what type of fertilisation takes place in humans',
  'short_answer': ''},
 {'example_id': 5289242154789678439,
  'document_title': 'List_of_National_Football_League_career_quarterback_wins_leaders',
  'document_url': 'https://en.wikipedia.org//w/index.php?title=List_of_National_Football_League_career_quarterback_wins_leaders&amp;oldid=818143757',
  'question_text': 'who had the most wins in the nfl',
  'short_answer': 'Tom Brady'},
 {'example_id': 5489863933082811018,
  'docu

In [None]:
dcr.evidence_corpus

### Chunker

In [154]:
dcr.evidence_corpus

[{'document_title': 'The_Mother_(How_I_Met_Your_Mother)',
  'document_url': 'https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471',
  'document_text_clean': "The Mother ( How I Met Your Mother ) - wikipedia  The Mother ( How I Met Your Mother )  Jump to : navigation , search    Tracy McConnell     How I Met Your Mother character     The Mother appearing in `` The Locket ''     First appearance   `` Lucky Penny ( unseen ) '' `` Something New '' ( seen )     Last appearance   `` Last Forever ''     Created by   Carter Bays Craig Thomas     Portrayed by   Cristin Milioti     Information     Aliases   The Mother     Gender   Female     Spouse ( s )   Ted Mosby     Significant other ( s )   Max ( deceased former boyfriend ) Louis ( ex-boyfriend )     Children   Penny Mosby ( daughter , born in 2015 , played by Lyndsy Fonseca ) Luke Mosby ( son , born in 2017 , played by David Henrie )     Nationality   American     Tracy McConnell , better known

In [159]:
dcr.evidence_corpus[0]

{'document_title': 'The_Mother_(How_I_Met_Your_Mother)',
 'document_url': 'https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471',
 'document_text_clean': "The Mother ( How I Met Your Mother ) - wikipedia  The Mother ( How I Met Your Mother )  Jump to : navigation , search    Tracy McConnell     How I Met Your Mother character     The Mother appearing in `` The Locket ''     First appearance   `` Lucky Penny ( unseen ) '' `` Something New '' ( seen )     Last appearance   `` Last Forever ''     Created by   Carter Bays Craig Thomas     Portrayed by   Cristin Milioti     Information     Aliases   The Mother     Gender   Female     Spouse ( s )   Ted Mosby     Significant other ( s )   Max ( deceased former boyfriend ) Louis ( ex-boyfriend )     Children   Penny Mosby ( daughter , born in 2015 , played by Lyndsy Fonseca ) Luke Mosby ( son , born in 2017 , played by David Henrie )     Nationality   American     Tracy McConnell , better known as

In [161]:
reader.tokenize(dcr.evidence_corpus[0])

TypeError: tokenize() missing 1 required positional argument: 'text'

In [155]:
from utils.model_utils import DocumentReader

reader = DocumentReader("deepset/bert-base-cased-squad2")



In [156]:
question = "Who ruled Macedonia"

context = """Macedonia was an ancient kingdom on the periphery of Archaic and Classical Greece, 
and later the dominant state of Hellenistic Greece. The kingdom was founded and initially ruled 
by the Argead dynasty, followed by the Antipatrid and Antigonid dynasties. Home to the ancient 
Macedonians, it originated on the northeastern part of the Greek peninsula. Before the 4th 
century BC, it was a small kingdom outside of the area dominated by the city-states of Athens, 
Sparta and Thebes, and briefly subordinate to Achaemenid Persia."""

In [158]:
reader.tokenize(question, context)
print(f"Answer: {reader.get_answer()}")

Answer: the Argead dynasty


In [111]:
import json

In [112]:
data = []
filepath = '/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/data/raw_data/v1.0-simplified_simplified-nq-train.jsonl'
with open(filepath, 'rb') as f:
    for i, line in enumerate(tqdm(f)):
        data.append(json.loads(line.decode('utf-8')))
        if i == 50: break

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [114]:
filepath = '/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/data/raw_data/TESTING_v1.0-simplified_simplified-nq-train.jsonl'
with open(filepath, 'w') as outfile:
    for entry in data:
        json.dump(entry, outfile)
        outfile.write('\n')

## Downloader

In [180]:
import requests
from selenium import webdriver
from bs4 import BeautifulSoup, SoupStrainer

In [164]:
nq_url = 'https://ai.google.com/research/NaturalQuestions/download'

In [176]:
# get url response
r = requests.get(nq_url)

In [214]:
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
#options.add_argument('--incognito')
#options.add_argument('--headless')
driver = webdriver.Chrome("/usr/local/bin/chromedriver", chrome_options=options)


  driver = webdriver.Chrome("/usr/local/bin/chromedriver", chrome_options=options)


In [215]:
driver.get(nq_url)

In [216]:
download = driver.find_element_by_class_name('mat-button-wrapper')

In [217]:
download.click()

In [209]:
driver

TypeError: 'WebDriver' object is not subscriptable

In [204]:
driver.

<selenium.webdriver.chrome.webdriver.WebDriver (session="74807b257a49091382e2c2b5ccc3b803")>

In [197]:
links = driver.find_element_by_link_text('Download')

In [200]:
links.

<selenium.webdriver.remote.webelement.WebElement (session="97f088496c5880714d371d4c78606949", element="5531f773-b860-4aa4-89cd-be91b083fbe4")>

In [None]:
# brew tap homebrew/cask && brew cask install chromedriver

In [179]:
# create beautiful-soup object 
soup = BeautifulSoup(r.content,'html.parser') 

In [178]:
soup



In [173]:
soup.find_all('a')

[]

In [None]:
# module_path = os.path.dirname(os.path.realpath(__file__))
# if module_path not in sys.path:
#     sys.path.append(module_path)

# print(module_path)