# Natural Questions - Elastic Search Setup for Retriever Evaluation

In [1]:
import re
import json
import pickle
import logging
import pandas as pd
from tqdm.notebook import tqdm
from elasticsearch import Elasticsearch

**Resources:**
- https://towardsdatascience.com/getting-started-with-elasticsearch-in-python-c3598e718380

## Connect to ElasticSearch

In [17]:
local_config = [{'host': 'localhost', 'port': 9200}]
es = Elasticsearch(local_config)

es.ping()

True

In [3]:
es.indices.get_alias('*')

{'nq_wiki_data': {'aliases': {}}, 'cases': {'aliases': {}}}

In [114]:
# Delete an index
#es.indices.delete(index='nq_wiki_data')

{'acknowledged': True}

## Create and Load and Index

In [8]:
es_settings = {
    "mappings": {
        "dynamic": "strict",        
        "properties": {
            "document_title": {"type": "text"},
            "document_url": {"type": "text"},
            "document_text_clean": {"type": "text"}
            }
        }
    }

index_name = 'nq_wiki_data_test'

es.indices.create(index=index_name, body=es_settings, ignore=400)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'nq_wiki_data_test'}

In [11]:
file_path = '../data/eval_data/'
file_name = 'evidence_corpus.pkl'

with open(file_path+file_name, 'rb') as f:
    
    evidence_corpus = pickle.load(f)

In [12]:
evidence_corpus[0]

{'document_title': 'The_Mother_(How_I_Met_Your_Mother)',
 'document_url': 'https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471',
 'document_text_clean': "The Mother ( How I Met Your Mother ) - wikipedia  The Mother ( How I Met Your Mother )  Jump to : navigation , search    Tracy McConnell     How I Met Your Mother character     The Mother appearing in `` The Locket ''     First appearance   `` Lucky Penny ( unseen ) '' `` Something New '' ( seen )     Last appearance   `` Last Forever ''     Created by   Carter Bays Craig Thomas     Portrayed by   Cristin Milioti     Information     Aliases   The Mother     Gender   Female     Spouse ( s )   Ted Mosby     Significant other ( s )   Max ( deceased former boyfriend ) Louis ( ex-boyfriend )     Children   Penny Mosby ( daughter , born in 2015 , played by Lyndsy Fonseca ) Luke Mosby ( son , born in 2017 , played by David Henrie )     Nationality   American     Tracy McConnell , better known as

In [13]:
# load into test index

for i, rec in enumerate(tqdm(evidence_corpus)):
    
    try:
        index_status = es.index(index='nq_wiki_data_test', id=i, body=rec)

    except Exception as ex:
        print(str(ex))

HBox(children=(FloatProgress(value=0.0, max=40971.0), HTML(value='')))




KeyboardInterrupt: 

In [14]:
es.count(index='nq_wiki_data_test')['count']

3763

In [116]:
file_path = '../data/eval_data/'
file_name = 'ES_wiki_docs_full.jsonl'


with open(file_path+file_name, 'rb') as f:
    for i, line in enumerate(tqdm(f)):
        
        doc = json.loads(str(line, 'utf8'))
        
        try:
            index_status = es.index(index=index_name, id=i, body=doc)

        except Exception as ex:
            print(str(ex))


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=10))
ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=10))
ConnectionTimeout caused by - ReadTimeoutError(HTTPConnectionPool(host='localhost', port=9200): Read timed out. (read timeout=10))



In [None]:
pickle.dump()

In [125]:
assert es.count(index=index_name)['count'] == 87108

In [4]:
query = {
    'query': {
        'query_string': {
            'query': 'why did argentina attack the falklands malvinas islands',
            'default_field': 'document_text_clean'
            }
        }
    }

In [187]:
re.sub('[^A-Za-z0-9]+', ' ', 'why did argentina attack the falklands/malvinas islands')

'why did argentina attack the falklands malvinas islands'

In [6]:
index_name = 'nq_wiki_data'
res = es.search(index=index_name, body=query, size=5)

In [7]:
res

{'took': 1129,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 38.610443,
  'hits': [{'_index': 'nq_wiki_data',
    '_type': '_doc',
    '_id': '23',
    '_score': 38.610443,
    '_source': {'document_url': 'https://en.wikipedia.org//w/index.php?title=Falklands_War&amp;oldid=865589436',
     'document_title': 'Falklands_War',
   {'_index': 'nq_wiki_data',
    '_type': '_doc',
    '_id': '40295',
    '_score': 34.177273,
    '_source': {'document_url': 'https://en.wikipedia.org//w/index.php?title=Falkland_Islands&amp;oldid=817794632',
     'document_title': 'Falkland_Islands',
     'document_text_clean': "Falkland Islands - wikipedia  Falkland Islands  Jump to : navigation , search `` Falklands '' and `` Malvinas '' redirect here . For other uses , see Falklands ( disambiguation ) and Malvinas ( disambiguation ) .    Falkland Islands     Flag Coat of arms     Motto : `` Desi

In [140]:
for hit in res['hits']['hits']:
    print(hit['_score'])

8.69899
8.698753
8.678847
8.66811
8.602217


In [111]:
a = es.search(index=index_name, body=query)

In [54]:
es.indices.stats(index_name)

{'_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_all': {'primaries': {'docs': {'count': 0, 'deleted': 0},
   'store': {'size_in_bytes': 41164},
   'indexing': {'index_total': 4,
    'index_time_in_millis': 129,
    'index_current': 0,
    'index_failed': 0,
    'delete_total': 0,
    'delete_time_in_millis': 0,
    'delete_current': 0,
    'noop_update_total': 0,
    'is_throttled': False,
    'throttle_time_in_millis': 0},
   'get': {'total': 0,
    'time_in_millis': 0,
    'exists_total': 0,
    'exists_time_in_millis': 0,
    'missing_total': 0,
    'missing_time_in_millis': 0,
    'current': 0},
   'search': {'open_contexts': 0,
    'query_total': 0,
    'query_time_in_millis': 0,
    'query_current': 0,
    'fetch_total': 0,
    'fetch_time_in_millis': 0,
    'fetch_current': 0,
    'scroll_total': 0,
    'scroll_time_in_millis': 0,
    'scroll_current': 0,
    'suggest_total': 0,
    'suggest_time_in_millis': 0,
    'suggest_current': 0},
   'merges': {'current': 0,
   

## Analyze ES Retreival Performance

#### Load QA data

In [143]:
file_path = '../data/eval_data/'
file_name = 'slim_extracted_data_records.pkl'

with open(file_path+file_name, 'rb') as f:
    
    slim_data = pickle.load(f)

In [145]:
slim_data[0]

{'example_id': 5655493461695504401,
 'document_url': 'https://en.wikipedia.org//w/index.php?title=Email_marketing&amp;oldid=814071202',
 'question_text': 'which is the most common use of opt-in e-mail marketing',
 'short_answer': "a newsletter sent to an advertising firm 's customers"}

#### Analyze QA's

In [337]:
def analyze_retriever_recall(es_object, index_name, qa_data):
    
    results = []
    
    for i, qa in enumerate(tqdm(qa_data)):
        
        question = qa['question_text']
        answer = qa['short_answer']
        ex_id = qa['example_id']
        truth_url = qa['document_url']
        
        # construct query
        query = {
            'query': {
                'query_string': {
                    'query': re.sub('[^A-Za-z0-9]+', ' ', question),
                    'default_field': 'document_text_clean'
                    }
                }
            }
        
        # execute query
        res = es_object.search(index=index_name, body=query, size=5)
        
        # extract response info
        n_hits = res['hits']['total']['value']
        max_score = res['hits']['max_score']
        duration = res['took']
        
        # check if answer is present in results
        
        ans_in_res = int(any([answer.lower() in doc['_source']['document_text_clean'].lower() for doc in res['hits']['hits']]))
        
#         ans_in_res = int(any([truth_url == doc['_source']['document_url'] for doc in res['hits']['hits']]))
        
        
        rec = (ex_id, question, answer, n_hits, max_score, duration, ans_in_res)
        results.append(rec)
        
    return results

In [338]:
# match of lowercase answer string in answer
test_results = analyze_retriever_recall(es, index_name, slim_data)

HBox(children=(FloatProgress(value=0.0, max=106848.0), HTML(value='')))




In [339]:
cols = ['Example_ID', 'Question', 'Answer', 'Number_Records_Returned', 'Max_Score', 'Query_Duration', 'Answer_Present']
test_results_df = pd.DataFrame(test_results, columns=cols)



In [340]:
test_results_df.Answer_Present.value_counts()

1    80624
0    26224
Name: Answer_Present, dtype: int64

In [341]:
test_results_df.Answer_Present.value_counts(normalize=True)

1    0.754567
0    0.245433
Name: Answer_Present, dtype: float64

In [342]:
test_results_df.shape

(106848, 7)

In [343]:
test_results_df.head()

Unnamed: 0,Example_ID,Question,Answer,Number_Records_Returned,Max_Score,Query_Duration,Answer_Present
0,5655493461695504401,which is the most common use of opt-in e-mail ...,a newsletter sent to an advertising firm 's cu...,10000,21.880262,22,1
1,5328212470870865242,how i.met your mother who is the mother,Tracy McConnell,10000,13.217962,31,1
2,5289242154789678439,who had the most wins in the nfl,Tom Brady,10000,11.294436,22,1
3,-2500044561429484630,who played mantis guardians of the galaxy 2,Pom Klementieff,10000,26.414034,14,1
4,212419696952141239,the nashville sound brought a polished and cos...,the use of lush string arrangements with a rea...,10000,21.313108,33,0


In [325]:
test_results_df.Answer_Present.value_counts()

1    3776
0    1225
Name: Answer_Present, dtype: int64

In [193]:
test_results[0]

(5655493461695504401,
 'which is the most common use of opt-in e-mail marketing',
 "a newsletter sent to an advertising firm 's customers",
 10000,
 21.880262,
 94,
 0)

In [181]:
slim_data[23]

{'example_id': -6432014180752712230,
 'document_url': 'https://en.wikipedia.org//w/index.php?title=Falklands_War&amp;oldid=865589436',
 'question_text': 'why did argentina attack the falklands/malvinas islands',
 'short_answer': 'an attempt to establish the sovereignty it had claimed over them'}

In [178]:
slim_data

[{'example_id': 5655493461695504401,
  'document_url': 'https://en.wikipedia.org//w/index.php?title=Email_marketing&amp;oldid=814071202',
  'question_text': 'which is the most common use of opt-in e-mail marketing',
  'short_answer': "a newsletter sent to an advertising firm 's customers"},
 {'example_id': 5328212470870865242,
  'document_url': 'https://en.wikipedia.org//w/index.php?title=The_Mother_(How_I_Met_Your_Mother)&amp;oldid=802354471',
  'question_text': 'how i.met your mother who is the mother',
  'short_answer': 'Tracy McConnell'},
 {'example_id': 5289242154789678439,
  'document_url': 'https://en.wikipedia.org//w/index.php?title=List_of_National_Football_League_career_quarterback_wins_leaders&amp;oldid=818143757',
  'question_text': 'who had the most wins in the nfl',
  'short_answer': 'Tom Brady'},
 {'example_id': -2500044561429484630,
  'document_url': 'https://en.wikipedia.org//w/index.php?title=Pom_Klementieff&amp;oldid=829523540',
  'question_text': 'who played mantis 

In [157]:
test = ['blah blah blah', 'bleh yikes', 'loud car', 'angry dog']

In [160]:
test

['blah blah blah', 'bleh yikes', 'loud car', 'angry dog']

In [174]:
int('yikes' in test)

0

In [175]:
int(True)

1

In [165]:
any(['yikes' in substring for substring in test])

True

In [169]:
['Alice may not have a MUA on her computer' in doc['_source']['document_text_clean'] for doc in res['hits']['hits']]

[True, True, False, False, False]

In [173]:
# ms
res

{'took': 15,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1627, 'relation': 'eq'},
  'max_score': 8.69899,
  'hits': [{'_index': 'nq_wiki_data',
    '_type': '_doc',
    '_id': '21875',
    '_score': 8.69899,
    '_source': {'document_url': 'https://en.wikipedia.org//w/index.php?title=Email&amp;oldid=843387842',
     'document_title': 'Email',
     'document_text_clean': "Email - wikipedia  Email  Jump to : navigation , search This article is about the communications medium . For the former manufacturing conglomerate , see Email Limited . `` Inbox '' redirects here . For the Google product , see Inbox by Gmail . This screenshot shows the `` Inbox '' page of an email client , where users can see new emails and take actions , such as reading , deleting , saving , or responding to these messages The at sign , a part of every SMTP email address  Electronic mail ( email or e-mail ) is a method of exchanging messages 

In [170]:
res['hits']['hits'][0]

{'_index': 'nq_wiki_data',
 '_type': '_doc',
 '_id': '21875',
 '_score': 8.69899,
 '_source': {'document_url': 'https://en.wikipedia.org//w/index.php?title=Email&amp;oldid=843387842',
  'document_title': 'Email',
  'document_text_clean': "Email - wikipedia  Email  Jump to : navigation , search This article is about the communications medium . For the former manufacturing conglomerate , see Email Limited . `` Inbox '' redirects here . For the Google product , see Inbox by Gmail . This screenshot shows the `` Inbox '' page of an email client , where users can see new emails and take actions , such as reading , deleting , saving , or responding to these messages The at sign , a part of every SMTP email address  Electronic mail ( email or e-mail ) is a method of exchanging messages ( `` mail '' ) between people using electronic devices . Email first entered limited use in the 1960s and by the mid-1970s had taken the form now recognized as email . Email operates across computer networks , w

In [171]:
res['hits']['hits'][1]

{'_index': 'nq_wiki_data',
 '_type': '_doc',
 '_id': '76097',
 '_score': 8.698753,
 '_source': {'document_url': 'https://en.wikipedia.org//w/index.php?title=Email&amp;oldid=849062629',
  'document_title': 'Email',
  'document_text_clean': "Email - wikipedia  Email  This article is about the communications medium . For the former manufacturing conglomerate , see Email Limited . `` Inbox '' redirects here . For the Google product , see Inbox by Gmail . This screenshot shows the `` Inbox '' page of an email client , where users can see new emails and take actions , such as reading , deleting , saving , or responding to these messages The at sign , a part of every SMTP email address  Electronic mail ( email or e-mail ) is a method of exchanging messages ( `` mail '' ) between people using electronic devices . Invented by Ray Tomlinson , email first entered limited use in the 1960s and by the mid-1970s had taken the form now recognized as email . Email operates across computer networks , wh

In [16]:
type(es)

elasticsearch.client.Elasticsearch

In [None]:
try:
    jump
except:
    

# How to handle reserved characters in input string??