In [1]:
from haystack import document_stores
from haystack.nodes import BM25Retriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline

In [2]:
from datasets import load_dataset
from tqdm.notebook import tqdm

In [3]:
# `split` is passed as a one-element list, so the result is a list and the
# train split is reached as wiki_data[0] in the cells below.
wiki_data = load_dataset(
    "olm/wikipedia",
    language='simple',
    date="20221201",
    split=['train'],
)



  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
from haystack.utils import launch_es
# NOTE(review): launch_es() presumably brings up a local Elasticsearch
# instance for the store created below — confirm against the Haystack docs.
launch_es()
# Document store created with no arguments, i.e. whatever connection defaults
# ElasticsearchDocumentStore uses.
document_store = document_stores.ElasticsearchDocumentStore()

In [5]:
import numpy as np
import pandas as pd
# Seeded generator so the article sample (and later draws from `rng`) repeat.
rng = np.random.default_rng(12345)
# Draw 50000 row positions in [0, len(train split)). The uniform floats are
# truncated to integers explicitly so the dataset is indexed with real integer
# positions instead of relying on implicit float coercion.
# NOTE(review): positions are drawn independently, so the same article can be
# picked more than once — confirm that sampling with replacement is intended.
rand_idx = (rng.random(50000)*len(wiki_data[0])).astype('int')
wiki_small = pd.DataFrame(data=wiki_data[0][rand_idx])

In [6]:
# Peek at the text of the first sampled article.
wiki_small['text'].iloc[0]

"Adenoids (also called pharyngeal tonsils, or nasopharyngeal tonsils) are tissues at the very back of the nose.  They are in the part of the nose where is joins the mouth.\n\nIn most children they make a soft bump on the top and back section of the nose's air passage.\n\nTaking away adenoids with surgery is called adenoidectomy.\n\nHead (body part)"

In [7]:
# Write the sampled articles to the document store in batches of `batch_size`.
# Each document carries the article text as 'content' and id/url/title as 'meta'.
batch_size = 1000
docs = []
for i in tqdm(range(len(wiki_small)), desc='getting wiki articles'):
    row = wiki_small.iloc[i, :]
    docs.append({
        'meta': {'id': row['id'], 'url': row['url'], 'title': row['title']},
        'content': row['text'],
    })
    # Flush on batch fullness rather than on the loop index: an index test
    # such as `i % batch_size == 0` fires at i == 0 with a single document.
    if len(docs) >= batch_size:
        document_store.write_documents(docs)
        docs = []

# Flush whatever is left after the loop; without this the final partial batch
# would never reach the document store.
if docs:
    document_store.write_documents(docs)
    docs = []

getting wiki articles:   0%|          | 0/50000 [00:00<?, ?it/s]

In [8]:
# Reader: extractive QA model loaded by name, with GPU use requested.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
)
# Retriever: BM25 over the document store populated above.
retriever = BM25Retriever(document_store=document_store)

# Retriever + reader combined into a single extractive QA pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

In [23]:
# Single hand-written query to sanity-check the pipeline: the retriever
# returns 5 candidates and the reader keeps only its best answer.
jeopardy = "Whose theory did Galileo espouse and was under house arrest as a result for the last 8 years of his life?"
prediction = pipe.run(
    query=jeopardy,
    params={
        "Retriever": {"top_k": 5},
        "Reader": {"top_k": 1},
    },
)

Inferencing Samples: 100%|█| 1/1 [00:10<00:00, 10.83s/ Batches


KeyError: 'answer'

In [49]:
# Answer text of the first (top-ranked) entry in the current `prediction`.
prediction['answers'][0].to_dict()['answer']

'Galileo Galilei'

In [35]:
import json
# Load the full question dump from disk.
with open('JEOPARDY_QUESTIONS1.json','r') as file:
    questions = json.load(file)

# Draw 1000 positions in [0, 39495) from the shared `rng` and pick those questions.
# NOTE(review): 39495 is a hard-coded upper bound, and positions are drawn
# independently so a question may repeat — confirm both are intended.
rand_q_idx = rng.random(1000)*39495
sample_questions = [questions[i] for i in rand_q_idx.astype('int')]

In [37]:
# Persist the sampled questions so later sessions can reload the same sample.
with open('sample_questions.json', mode='w') as file:
    json.dump(sample_questions, fp=file)

In [38]:
# Show the first sampled question record to see which fields are available.
sample_questions[0]

{'category': 'COMMON BONDS',
 'air_date': '1998-09-07',
 'question': "'A game of footsie,<br />a bribe,<br />a drunk person'",
 'value': '$500',
 'answer': 'Things that are done under the table',
 'round': 'Jeopardy!',
 'show_number': '3216'}

In [76]:
import threading
def get_pred_ans(sample_questions, id, score, ans):
    """Answer each question with the global `pipe` and collect the results.

    For every record the query is "<category>: <question>"; the retriever
    keeps 5 documents and the reader 1 answer. The top answer's text is
    appended to `ans` and its score to `score` (both mutated in place).
    `id` is accepted but not used. Returns None.
    """
    run_params = {"Retriever": {"top_k": 5}, "Reader": {"top_k": 1}}
    for record in sample_questions:
        query_text = record['category'] + ": " + record['question']
        result = pipe.run(query=query_text, params=run_params)
        top = result['answers'][0].to_dict()
        ans.append(top['answer'])
        score.append(top['score'])
# Spread the questions over `threads` workers, one contiguous slice each.
# NOTE(review): this assumes pipe.run may be called from several threads at
# once — confirm before relying on it.
threads = 4
chunk_size = -(-len(sample_questions) // threads)  # ceiling division
jobs = []
partials = []  # one (score, ans) pair of lists per worker, in slice order
for i in range(0, threads):
    worker_score = list()
    worker_ans = list()
    partials.append((worker_score, worker_ans))
    chunk = sample_questions[i * chunk_size:(i + 1) * chunk_size]
    # Give Thread the callable and its arguments separately. Writing
    # target=get_pred_ans(...) would run the whole job right here in the main
    # thread and hand Thread a None target.
    thread = threading.Thread(
        target=get_pred_ans,
        args=(chunk, i, worker_score, worker_ans),
    )
    jobs.append(thread)
for j in jobs:
    j.start()
for j in jobs:
    j.join()
# Concatenate per-worker results in slice order so score[k] / ans[k] line up
# with sample_questions[k].
score = [s for worker_score, _ in partials for s in worker_score]
ans = [a for _, worker_ans in partials for a in worker_ans]
print(len(score))
print(len(ans))

Inferencing Samples: 100%|█| 1/1 [00:01<00:00,  1.82s/ Batches
Inferencing Samples: 100%|█| 1/1 [00:05<00:00,  5.16s/ Batches
Inferencing Samples: 100%|█| 1/1 [00:03<00:00,  3.72s/ Batches
Inferencing Samples: 100%|█| 1/1 [00:07<00:00,  7.77s/ Batches
Inferencing Samples:   0%|        | 0/1 [00:01<?, ? Batches/s]


KeyboardInterrupt: 

In [10]:
import json
# Reload the sampled questions, this time from the file that also carries a
# 'predicted_question' field per record.
with open('sample_questions_with_predicted.json') as file:
    sample_questions = json.load(fp=file)

In [12]:
# Inspect the first record and its reformulated question.
q = sample_questions[0]
print(q)
# [10:] drops the first 10 characters, which is the 'question: ' prefix
# visible in the printed record.
print(q['predicted_question'][0][0][10:])
# NOTE(review): the pipeline call below is commented out, so `prediction` on
# the last line is whatever an earlier run left in the kernel — confirm it
# belongs to this question before trusting the displayed answer.
# prediction = pipe.run(query = q['predicted_question'][0][0][10:],params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 1}})
prediction['answers'][0]

{'category': 'COMMON BONDS', 'air_date': '1998-09-07', 'question': "'A game of footsie,<br />a bribe,<br />a drunk person'", 'value': '$500', 'answer': 'Things that are done under the table', 'round': 'Jeopardy!', 'show_number': '3216', 'predicted_question': [['question: What is the difference between a game of footsie,br />a bribe and a drunk person?']]}
What is the difference between a game of footsie,br />a bribe and a drunk person?


<Answer {'answer': 'Obi rejects', 'type': 'extractive', 'score': 0.07148900628089905, 'context': 'hips to students. A man offers a bribe on behalf of his sister, which Obi rejects. The girl herself visits Obi and tries to bribe him with sex for a s', 'offsets_in_document': [{'start': 1182, 'end': 1193}], 'offsets_in_context': [{'start': 70, 'end': 81}], 'document_id': 'a4614d69f5bebbed2c708de34bbc77df', 'meta': {'id': '775815', 'url': 'https://simple.wikipedia.org/wiki/No%20Longer%20at%20Ease', 'title': 'No Longer at Ease'}}>

In [15]:
import time
# Answer every reformulated question, collecting answer text and score and
# accumulating wall-clock time spent up to picking the top answer.
ans, score = [], []
timer = 0
for i in tqdm(range(len(sample_questions))):
    start = time.time()
    q = sample_questions[i]
    # [10:] strips the leading 10 characters of the stored reformulation.
    prediction = pipe.run(
        query=q['predicted_question'][0][0][10:],
        params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 1}},
    )
    pred_ans = prediction['answers'][0]
    timer += time.time()-start
    top = pred_ans.to_dict()
    ans.append(top['answer'])
    score.append(top['score'])

  0%|          | 0/1000 [00:00<?, ?it/s]


Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:05<00:00,  5.12s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:08<00:00,  8.73s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:03<00:00,  3.60s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:03<00:00,  3.24s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:09<00:00,  9.03s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:05<00:00,  5.38s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:08<00:00,  8.37s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
In

In [16]:
# Accumulated pipeline time divided by the number of sample questions.
# NOTE(review): this is a true per-question mean only if the timing loop ran
# over every question — confirm.
timer/len(sample_questions)

7.134882670879364

In [29]:
# Reload previously saved answers and scores (JSON content despite the .txt names).
with open('haystack_ans.txt') as file:
    og_ans = json.load(fp=file)
with open('haystack_score.txt') as file:
    og_score = json.load(fp=file)

In [17]:
# First collected answer string.
ans[0]

'Obi rejects'

In [18]:
def normalize_text(s):
    """Canonicalise an answer string before comparison.

    Steps, in order: lowercase, drop every character in string.punctuation,
    replace the standalone words a/an/the with a space, then collapse all
    whitespace runs to single spaces and trim the ends.
    """
    import string, re

    lowered = s.lower()

    punctuation = set(string.punctuation)
    without_punc = "".join(ch for ch in lowered if ch not in punctuation)

    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    without_articles = re.sub(article_pattern, " ", without_punc)

    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 if the two strings are equal after normalize_text, else 0."""
    is_match = normalize_text(truth) == normalize_text(prediction)
    return 1 if is_match else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted answer and the reference answer.

    Both strings go through normalize_text and are split on whitespace.
    Shared tokens are counted as a multiset intersection, so a token that
    occurs several times in both strings is credited once per matched
    occurrence (a plain set intersection would under-count repeats).

    Returns:
        int 1 or 0 when either side has no tokens (1 only if both are empty),
        int 0 when no token is shared, otherwise the harmonic mean of
        precision and recall as a float.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())

    # if there are no common tokens then f1 = 0
    if num_same == 0:
        return 0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)
#https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html#F1

In [19]:
# Mean token-F1 and exact-match of the collected answers against the gold answers.
f1s = 0
ems = 0
# Pair answers with questions by position. zip stops at the shorter sequence,
# so a partially filled `ans` is scored without an IndexError.
n_scored = min(len(ans), len(sample_questions))
for pred, q in zip(ans, sample_questions):
    f1s += compute_f1(pred, q['answer'])
    ems += compute_exact_match(pred, q['answer'])
# Average over the pairs that were actually scored.
print(f1s/n_scored)
print(ems/n_scored)

0.051408561441512575
0.034


In [20]:
# Same metrics as the unweighted cell, but each per-question value is divided
# by the reader score of that answer before averaging.
# NOTE(review): dividing by the score inflates low-confidence answers and
# fails on a score of 0 — confirm this weighting is what was intended.
f1s = 0
ems = 0
for idx, q in enumerate(sample_questions[:len(ans)]):
    gold = q['answer']
    f1s += compute_f1(ans[idx], gold) / score[idx]
    ems += compute_exact_match(ans[idx], gold) / score[idx]
print(f1s/len(ans))
print(ems/len(ans))

0.4840575724220811
0.24649207638425363


In [21]:
# Save the reader scores for the reformulated-question run as JSON.
with open('haystack_score_reform_q.txt', mode='w') as file:
    json.dump(score, fp=file)

In [22]:
# Save the answer strings for the reformulated-question run as JSON.
with open('haystack_ans_reform_q.txt', mode='w') as file:
    json.dump(ans, fp=file)

In [23]:
import time
# Answer the original "<category>: <question>" queries from position 719 on.
# NOTE(review): 719 is presumably the number of answers already stored in
# og_ans from an earlier, interrupted run — confirm.
rest_ans, rest_score = [], []
timer = 0
for i in tqdm(range(719,len(sample_questions))):
    start = time.time()
    q = sample_questions[i]
    prediction = pipe.run(
        query=q['category'] +": "+ q['question'],
        params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 1}},
    )
    pred_ans = prediction['answers'][0]
    timer += time.time()-start
    top = pred_ans.to_dict()
    rest_ans.append(top['answer'])
    rest_score.append(top['score'])

  0%|          | 0/281 [00:00<?, ?it/s]


Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:06<00:00,  6.16s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:02<00:00,  2.98s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:04<00:00,  4.13s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:04<00:00,  4.61s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:09<00:00,  9.31s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:01<00:00,  1.87s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|█| 1/1 [00:10<00:00, 10.20s/ Batches[A

Inferencing Samples:   0%|        | 0/1 [00:00<?, ? Batches/s][A
In

In [34]:
# Previously saved answers followed by the newly computed remainder, as one flat array.
og_full_ans = np.append(arr=og_ans, values=rest_ans)


In [36]:
# Previously saved scores followed by the newly computed remainder, as one flat array.
og_full_score = np.append(arr=og_score, values=rest_score)

In [37]:
# Mean token-F1 and exact-match of the original-question answers (og_full_ans)
# against the gold answers.
f1s = 0
ems = 0
for idx, q in enumerate(sample_questions):
    f1 = compute_f1(og_full_ans[idx], q['answer'])
    em = compute_exact_match(og_full_ans[idx], q['answer'])
    f1s += f1
    ems += em
# Average over the answers scored in this cell. `ans` belongs to a different
# run and only gives the right denominator if its length happens to match.
print(f1s/len(og_full_ans))
print(ems/len(og_full_ans))

0.045350003582828806
0.03


In [38]:
# Score-weighted variant for the original-question answers: each per-question
# metric is divided by the reader score of that answer before averaging.
# NOTE(review): dividing by the score inflates low-confidence answers and
# yields inf/nan on a score of 0 — confirm this weighting is intended.
f1s = 0
ems = 0
for idx, q in enumerate(sample_questions):
    f1 = compute_f1(og_full_ans[idx], q['answer']) / og_full_score[idx]
    em = compute_exact_match(og_full_ans[idx], q['answer']) / og_full_score[idx]
    f1s += f1
    ems += em
# Average over the answers scored in this cell. `ans` belongs to a different
# run and only gives the right denominator if its length happens to match.
print(f1s/len(og_full_ans))
print(ems/len(og_full_ans))

0.7675834374679207
0.534341242236703
