In [59]:
%load_ext autoreload
%autoreload 2

import sys
import os
import re
import pandas as pd
from tqdm.notebook import tqdm
from elasticsearch import Elasticsearch

module_path = os.path.dirname(os.getcwd())
if module_path not in sys.path:
    sys.path.append(module_path)

from utils.data_utils import load_pkl_file
from utils.elastic_utils import connect_es, create_es_index, load_es_index

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Connect to Local ElasticSearch Instance

In [44]:
# Connection details for the Elasticsearch node running on this machine.
local_config = [dict(host='localhost', port=9200)]
es = Elasticsearch(hosts=local_config)

In [45]:
es.ping()

True

## Evaluate a Baseline

#### Load baseline data

In [16]:
# Read the pickled evaluation inputs: the documents that get indexed below
# and the question/answer records used to query them.
corpus_path = '../data/eval_data/evidence_corpus.pkl'
qa_path = '../data/eval_data/qa_records.pkl'

evidence_corpus = load_pkl_file(corpus_path)
qa_records = load_pkl_file(qa_path)

In [23]:
len(evidence_corpus), len(qa_records)

(40971, 87407)

In [21]:
evidence_corpus[3]

{'document_title': 'List_of_Premier_League_broadcasters',
 'document_url': 'https://en.wikipedia.org//w/index.php?title=List_of_Premier_League_broadcasters&amp;oldid=808611916',
 'document_text_clean': "List of Premier League broadcasters - wikipedia  List of Premier League broadcasters  Jump to : navigation , search  This is a list of television broadcasters which provide coverage of the Premier League , English football 's top level competition , which is the most watched league in the world . The main broadcasters in the United Kingdom are Sky Sports , who broadcast 126 of the 168 televised games in the UK , and BT Sport . The BBC shows weekly highlights of the Premier League on its Match of the Day and Match of the Day 2 programmes on Saturdays and Sundays .   The 168 UK televised games are also broadcast across the world ; the remaining 212 matches that are n't broadcast live in the UK are all broadcast elsewhere around the world . English - speaking countries ( excluding the UK )

In [22]:
qa_records[3]

{'example_id': -1706790511507651062,
 'document_title': 'List_of_Premier_League_broadcasters',
 'document_url': 'https://en.wikipedia.org//w/index.php?title=List_of_Premier_League_broadcasters&amp;oldid=808611916',
 'question_text': 'what channel is the premier league on in france',
 'short_answer': 'SFR Sport'}

#### Create the index

In [43]:
# Create the baseline index: every document field is mapped as analyzed text,
# and "dynamic": "strict" is set so only the fields listed here are declared.
index_name = 'baseline'

text_fields = ("document_title", "document_url", "document_text_clean")

settings = {
    "mappings": {
        "dynamic": "strict",
        "properties": {field: {"type": "text"} for field in text_fields},
    }
}

create_es_index(es_obj=es, settings=settings, index_name=index_name)

In [46]:
es.indices.get_alias('*')

{'baseline': {'aliases': {}},
 'nq_wiki_data_test': {'aliases': {}},
 'nq_wiki_data': {'aliases': {}},
 'cases': {'aliases': {}}}

#### Populate index with baseline records

In [49]:
# Push every evidence document into the index created above.
# Slow step: the recorded run took roughly 12 minutes for ~41k documents.
load_es_index(
    es_obj=es,
    index_name=index_name,
    evidence_corpus=evidence_corpus,
)

100%|██████████| 40971/40971 [12:06<00:00, 56.40it/s]  


In [50]:
es.count(index=index_name)['count']

40971

#### Evaluate Recall

In [66]:
def analyze_retriever_recall(es_obj, index_name, qa_data, top_k=5):
    """Measure how often the retriever returns a document containing the answer.

    Each question is sent to Elasticsearch as a ``query_string`` query against
    the ``document_text_clean`` field. An example counts as a hit when its
    short answer occurs (case-insensitively) in the ``document_text_clean`` of
    at least one of the ``top_k`` returned documents.

    Args:
        es_obj: Client object exposing ``search(index=..., body=..., size=...)``.
        index_name: Name of the index to query.
        qa_data: Iterable of dicts providing the keys ``example_id``,
            ``question_text`` and ``short_answer``.
        top_k: Number of documents requested per question. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        pandas.DataFrame with one row per QA record and the columns
        ``example_id``, ``question``, ``answer``, ``number_records_returned``,
        ``max_score``, ``query_duration`` and ``answer_present`` (1 or 0).
    """
    results = []

    for qa in tqdm(qa_data):
        question = qa['question_text']
        answer = qa['short_answer']
        ex_id = qa['example_id']

        # Collapse every run of non-alphanumeric characters into a space so
        # punctuation in the question cannot be parsed as query_string syntax.
        # NOTE(review): upper-case AND / OR / NOT tokens survive this cleanup
        # and would presumably still be read as operators -- confirm whether
        # the question set can contain them.
        query = {
            'query': {
                'query_string': {
                    'query': re.sub('[^A-Za-z0-9]+', ' ', question),
                    'default_field': 'document_text_clean'
                    }
                }
            }

        res = es_obj.search(index=index_name, body=query, size=top_k)

        # Pull the response-level statistics reported by the search call.
        hits = res['hits']
        n_hits = hits['total']['value']
        max_score = hits['max_score']
        duration = res['took']

        # An empty or missing answer must never count as found: '' is a
        # substring of every document and would silently inflate recall.
        needle = answer.lower() if answer else ''
        ans_in_res = int(
            bool(needle)
            and any(
                needle in doc['_source']['document_text_clean'].lower()
                for doc in hits['hits']
            )
        )

        results.append((ex_id, question, answer, n_hits, max_score, duration, ans_in_res))

    # One row per QA record, in input order.
    cols = ['example_id', 'question', 'answer', 'number_records_returned', 'max_score', 'query_duration', 'answer_present']
    test_results_df = pd.DataFrame(results, columns=cols)

    return test_results_df



In [67]:
# Run every QA record against the baseline index and keep the per-question stats.
baseline_results_df = analyze_retriever_recall(
    es_obj=es,
    index_name=index_name,
    qa_data=qa_records,
)

HBox(children=(FloatProgress(value=0.0, max=87407.0), HTML(value='')))




In [68]:
baseline_results_df.head()

Unnamed: 0,example_id,question,answer,number_records_returned,max_score,query_duration,answer_present
0,5328212470870865242,how i.met your mother who is the mother,Tracy McConnell,10000,14.74769,15,1
1,5289242154789678439,who had the most wins in the nfl,Tom Brady,10000,12.293189,12,1
2,-2500044561429484630,who played mantis guardians of the galaxy 2,Pom Klementieff,10000,28.207392,9,1
3,-1706790511507651062,what channel is the premier league on in france,SFR Sport,10000,17.24178,8,1
4,-7491001389340565191,god's not dead a light in the darkness release...,"March 30 , 2018",10000,19.870369,9,1


In [69]:
baseline_results_df.answer_present.value_counts(normalize=True)

1    0.844623
0    0.155377
Name: answer_present, dtype: float64

**84.5% Recall is pretty good (BERTserini got 85.8 at paragraph level). Is this driven by:**
- Only 40k documents to search over
- Full article is being used (not paragraphs or passages)
- These Wikipedia articles contain the full text, tables included... I don't think BERTserini included all the extra HTML data, which will be necessary for NQ

In [77]:
print(f'hello/my/name/{""}is/andrew')

hello/my/name/is/andrew


In [83]:
retriever_eval_only = False

In [84]:
# The "_fullsys" suffix is added only when retriever_eval_only is truthy.
# NOTE(review): the flag name and the suffix look inverted relative to each
# other -- confirm which runs are meant to write the "_fullsys" file.
ext = "_fullsys" if retriever_eval_only else ""

outfile = f'../data/stage_data/extracted_clean_data{ext}.pkl'  # TODO: make this implicit

In [85]:
outfile

'../data/stage_data/extracted_clean_data.pkl'

In [87]:
x = 5 if 3>2 else 9

In [95]:
!pwd

/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/notebooks


In [99]:
os.path.exists('/Users/areed/Documents/FFL Research/FF14/qa_retriever_evaluation/')

True

In [90]:
 a = "x" if not retriever_eval_only else "_fullsys"

In [93]:
bool(0)

False

## Next Steps - where to focus?

**Planned Experiments: to evaluate recall**
1. Chunking articles into passages of different lengths/approaches (expected drop in recall)
    - 100 tokens
    - 366 BERT tokens
    - Consideration: these articles include table/list data from wikipedia... chunks will be nonsensical??
2. Experiment with different forms of query expansion/enrichment (expected to improve recall)
    - NER to extract things/people/organizations and pass to query as an entity rather than multiple words
    - abbreviation expansion 
    - non-entity synonym enrichment
3. Maybe - getting full wikipedia extract to work with since we only have 40k unique articles...

**Concerns / Considerations / Next Steps:**
- only 40k unique articles
- There's nothing to baseline against, other than myself... need to look further for BM-25 performance on NQ evaluated by recall
- Qualitative analysis on caselaw data: train n-gram language model?
- Put NQ together with BERT to understand end-to-end system performance?



**What next: How does this fit in to the next blog post + the overall effort (paper/webinar)?**
- I think we ultimately want to try to show that small enhancements to a simple Retriever approach can [hopefully] have significant impact on end-to-end QA system on NQ (but this requires us to get all of Wikipedia + train BERT on NQ). I think this *might* be do-able considering ORQA only used short answers from BERT. So the heavy lifts would be:
    - Setting up Wikipedia search (which I think is doable)
    - Training BERT on NQ (short answers only) - not sure LOE here
    - Evaluating the end-to-end system - not sure the LOE here
    
- Do we focus away from open-domain QA and more towards case-law?
    - Qualitative analysis showing improvements in end to end system based solely on query enrichment


**Similar work**
- https://sigir.org/wp-content/uploads/2019/01/p040.pdf
