## Imports

In [3]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch


sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Home path : /opt/bioasq/col-un-bioasq11
Eval path : /opt/bioasq/Evaluation-Measures


## Params

In [4]:
ELASTIC_SERVER = "http://localhost:9200"
QUESTIONS_PATH = "../../data/raw/training11b.json"
N_ANSWERS = 100
SEARCH_FIELDS = ['title', 'abstract', 'mesh_terms']
SEARCH_INDEX = 'pubmed2023-old'

SAVING_PATH_TEST = '/opt/bioasq/tmp/test_docs_11b.json'

## Constants

In [5]:
es = Elasticsearch(ELASTIC_SERVER)

In [6]:
questions = elastic_utils.load_questions(QUESTIONS_PATH)

## Making answers for test

### Single answer example

In [7]:
# search_doc_by_query -is an alias of method- ask_single_question 
sample_answers = elastic_utils.search_doc_by_query(
    question=questions[0],
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
    es_client=es,
    index=SEARCH_INDEX
)

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [pubmed2023-old]', pubmed2023-old, index_or_alias)

In [8]:
print(elastic_utils.extract_title_abstract_mesh_terms(sample_answers))

NameError: name 'sample_answers' is not defined

In [7]:
sample_answers.keys()

dict_keys(['took', 'timed_out', '_shards', 'hits'])

In [8]:
pprint(sample_answers['hits']['hits'][0])

{'_id': '15858239',
 '_ignored': ['abstract.keyword'],
 '_index': 'pubmed2023-old',
 '_score': 34.125973,
 '_source': {'abstract': 'Hirschsprung disease is a congenital disorder with '
                         'the incidence of 1 per 5000 live births, '
                         'characterized by the absence of intestinal ganglion '
                         'cells. In the etiology of Hirschsprung disease '
                         'various genes play a role; these are: RET, EDNRB, '
                         'GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these '
                         'genes may result in dominant, recessive or '
                         'multifactorial patterns of inheritance. Diverse '
                         'models of inheritance, co-existence of numerous '
                         'genetic disorders and detection of numerous '
                         'chromosomal aberrations together with involvement of '
                         'various genes confirm the gene

In [13]:
single_metrics = elastic_utils.answers_to_id_metric(
    sample_answers
)

In [14]:
single_metrics

{'d15858239': 0.34125973000000004,
 'd11106284': 0.3304699,
 'd15829955': 0.32477596,
 'd3578280': 0.32177456,
 'd10893883': 0.31766972,
 'd23283078': 0.31185501,
 'd6650562': 0.31127289999999996,
 'd17965226': 0.30465504,
 'd16965318': 0.29992252,
 'd30954460': 0.29147903,
 'd23465774': 0.28968151,
 'd28532160': 0.2871134,
 'd22891492': 0.28372358,
 'd22475248': 0.28174885,
 'd9600737': 0.27842356,
 'd11484199': 0.27693056,
 'd12239580': 0.27302563,
 'd11694544': 0.27269922,
 'd25323865': 0.27267227,
 'd10370906': 0.27205265,
 'd27639057': 0.27019136,
 'd8877363': 0.27014254,
 'd16986122': 0.26979127999999997,
 'd2213441': 0.26783308,
 'd9465697': 0.26710867,
 'd27370713': 0.26708973,
 'd9174404': 0.26617102,
 'd655638': 0.26497941999999997,
 'd30924549': 0.2646232,
 'd2335126': 0.26378609999999997,
 'd33768880': 0.26362247,
 'd29677472': 0.26322865,
 'd18472352': 0.26273043,
 'd35080777': 0.26265852,
 'd7634536': 0.26157421,
 'd23842858': 0.26141833999999997,
 'd20813343': 0.26130392

## Multiple question answering

In [11]:
%%time
# search_docs_by_query_set -is an alias of method- ask_several_questions 
multiple_ans = elastic_utils.search_docs_by_query_set(
    questions=questions[0:30],
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
    es_client=es,
    index=SEARCH_INDEX
)

Extracting docs from elastic search: 100%|██████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.11it/s]

CPU times: user 139 ms, sys: 47.3 ms, total: 186 ms
Wall time: 14.2 s





## Saving dicts

In [20]:
elastic_utils.save_json(multiple_ans, SAVING_PATH_TEST)

## Compare read with saved json

In [21]:
multiple_ans_read = elastic_utils.load_json(SAVING_PATH_TEST)

In [22]:
lens_docs = []
weird_questions = []
for result in multiple_ans:
    result_id = list(result.keys())[0]
    len_docs = len(result[result_id]['documents'].keys())
    if len_docs < N_ANSWERS:
        print(result_id, len_docs)
        weird_questions.append(result_id)
        lens_docs.append(len_docs)

In [23]:
cleaned_weird_questions = [
    weird_question.replace('q','') for weird_question in weird_questions
]

weird_og_questions = [
    question for question in questions
    if question['id'] in cleaned_weird_questions
]
len(weird_og_questions), len(cleaned_weird_questions)

(0, 0)

## Flattening documents

In [24]:
multiple_ans_dict = {}
for ans in multiple_ans:
    raw_key = list(ans.keys())[0].replace('q','')
    multiple_ans_dict[raw_key] = list(ans.values())[0]

In [25]:
questions_answers = elastic_utils.load_json(QUESTIONS_PATH)

In [27]:
for question in questions_answers['questions']:
    question_id = question['id']
    question_documents = multiple_ans_dict[question_id]['documents']
    doc_responses = [
        {
            'id': document_id.replace('d', ''),
            'abstract': document_info['abstract'],
            'title': document_info['title'],
            'score': document_info['score'],
            'mesh_terms': document_info['mesh_terms']
        }
        for document_id, document_info in question_documents.items()
    ]
    sorted_doc_responses = sorted(doc_responses, key=lambda x:x['score'], reverse=True)
    question['documents'] = sorted_doc_responses
    break

In [28]:
with open('test_answers_10b_testset2.json', 'w') as ans_file:
    json.dump(questions_answers, ans_file)

In [29]:
for q in questions_answers['questions']:
    print(q.keys())
    print(len(q['documents']))
    print(q['documents'][0].keys())
    print('\n')

dict_keys(['body', 'documents', 'ideal_answer', 'concepts', 'type', 'id', 'snippets'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['body', 'documents', 'triples', 'ideal_answer', 'exact_answer', 'concepts', 'type', 'id', 'snippets'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['body', 'documents', 'ideal_answer', 'exact_answer', 'type', 'id', 'snippets'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['body', 'documents', 'ideal_answer', 'exact_answer', 'concepts', 'type', 'id', 'snippets'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['body', 'documents', 'ideal_answer', 'exact_answer', 'concepts', 'type', 'id', 'snippets'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['body', 'documents', 'ideal_answer', 'exact_answer', 'concepts', 'type', 'id', 'snippets'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_te

AttributeError: 'str' object has no attribute 'keys'