## Imports

In [3]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch


sys.path.append('../../')
import globals
from elastic_search_utils import elastic_utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Home path : /opt/bioasq/col-un-bioasq11
Eval path : /opt/bioasq/Evaluation-Measures


## Params

In [2]:
# URL handed to the Elasticsearch client constructor.
ELASTIC_SERVER = "http://localhost:9200"
# Question set location; passed to both elastic_utils.load_questions and elastic_utils.load_json.
QUESTIONS_PATH = "BioASQ-task10bPhaseA-testset2"
# Passed as `size` to the search helpers; also the threshold used to flag short result lists.
N_ANSWERS = 100
# Passed as `fields` to the search helpers.
SEARCH_FIELDS = ['title', 'abstract', 'mesh_terms']
# Passed as `index` to the search helpers.
SEARCH_INDEX = 'pubmed2022'

# Target of save_json / load_json for the raw multi-question results.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
SAVING_PATH_TEST = '/datasets/johan_tests/test_docs_10b.json'

## Elasticsearch client and questions

In [3]:
# Client for the server configured in ELASTIC_SERVER; passed as `es_client` to the search helpers.
es = Elasticsearch(ELASTIC_SERVER)

In [4]:
# Load the questions from QUESTIONS_PATH; later cells index the result (questions[0])
# and read question['id'] from its elements.
questions = elastic_utils.load_questions(QUESTIONS_PATH)

## Making answers for test

### Single answer example

In [5]:
# Single-question example: search with the first loaded question only.
single_query_args = {
    'question': questions[0],
    'fields': SEARCH_FIELDS,
    'size': N_ANSWERS,
    'es_client': es,
    'index': SEARCH_INDEX,
}
sample_answers = elastic_utils.ask_single_question(**single_query_args)

In [6]:
# Top-level keys of the raw search response.
sample_answers.keys()

dict_keys(['took', 'timed_out', '_shards', 'hits'])

In [7]:
# Pretty-print the first entry of the response's hit list to inspect its fields.
pprint(sample_answers['hits']['hits'][0])

{'_id': '34210339',
 '_ignored': ['affiliations.keyword', 'abstract.keyword', 'references.keyword'],
 '_index': 'pubmed2022',
 '_score': 54.14117,
 '_source': {'abstract': 'BACKGROUND\n'
                         'Primary ciliary dyskinesia (PCD) is a rare, highly '
                         'heterogeneous genetic disorder involving the '
                         'impairment of motile cilia. With no single gold '
                         'standard for PCD diagnosis and complicated '
                         'multiorgan dysfunction, the diagnosis of PCD can be '
                         'difficult in clinical settings. Some methods for '
                         'diagnosis, such as nasal nitric oxide measurement '
                         'and digital high-speed video microscopy with ciliary '
                         'beat pattern analysis, can be expensive or '
                         'unavailable. To confirm PCD diagnosis, we used a '
                         'strategy combining asses

In [8]:
# Convert the raw response into the id -> metric mapping shown in the next cell.
single_metrics = elastic_utils.answers_to_id_metric(sample_answers)

In [9]:
# Display the full mapping returned by answers_to_id_metric.
single_metrics

{'d34210339': 0.5414117,
 'd34556108': 0.5262727,
 'd26139845': 0.52551598,
 'd34851034': 0.52397076,
 'd28922056': 0.52181503,
 'd24203976': 0.5124834,
 'd31549486': 0.51169853,
 'd33966302': 0.50465458,
 'd25351953': 0.50065945,
 'd28801648': 0.4973589,
 'd27081490': 0.49151122999999997,
 'd31960620': 0.48774082,
 'd19953662': 0.48435817999999997,
 'd28925796': 0.481479,
 'd27349973': 0.4811725,
 'd23292041': 0.48020603,
 'd21907944': 0.47771637,
 'd18975248': 0.47727207,
 'd24963453': 0.4742592,
 'd24586956': 0.4725209,
 'd34445527': 0.47139034,
 'd30209139': 0.46869022,
 'd33447612': 0.46768172999999996,
 'd9387968': 0.46213696,
 'd18201450': 0.46016809999999997,
 'd20480745': 0.4601219,
 'd32367404': 0.45990643,
 'd10738019': 0.45840084,
 'd30004251': 0.45840084,
 'd17601719': 0.45791200000000004,
 'd33233428': 0.45681649999999996,
 'd22903970': 0.4559838,
 'd23261302': 0.4555873,
 'd31636325': 0.45477764,
 'd23798057': 0.45443405,
 'd31271036': 0.45434067,
 'd21926394': 0.4532202

## Multiple question answering

In [10]:
%%time
multiple_ans = elastic_utils.ask_several_questions(
    questions=questions,
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
    es_client=es,
    index=SEARCH_INDEX
)

Extracting docs from elastic search: 100%|██████| 90/90 [00:32<00:00,  2.73it/s]

CPU times: user 1.92 s, sys: 74.6 ms, total: 1.99 s
Wall time: 32.9 s





## Saving dicts

In [11]:
# Hand the multi-question results to save_json with SAVING_PATH_TEST as the target path.
elastic_utils.save_json(multiple_ans, SAVING_PATH_TEST)

## Reload the saved JSON and check document counts

In [12]:
# Read back the file at SAVING_PATH_TEST.
# NOTE(review): multiple_ans_read is not referenced by the cells visible after this one,
# which iterate multiple_ans instead — confirm whether the check below should use the reloaded data.
multiple_ans_read = elastic_utils.load_json(SAVING_PATH_TEST)

In [13]:
# Flag every result whose 'documents' mapping holds fewer than N_ANSWERS entries.
# Each element of multiple_ans is a dict; its first key is used as the result id.
lens_docs = []        # document counts of the flagged results only
weird_questions = []  # result ids of the flagged results
for result in multiple_ans:
    result_id, payload = list(result.items())[0]
    len_docs = len(payload['documents'])
    if len_docs >= N_ANSWERS:
        continue
    print(result_id, len_docs)
    weird_questions.append(result_id)
    lens_docs.append(len_docs)

In [14]:
# Map the flagged result ids back to the original question records.
# Only a leading 'q' is stripped (presumably the prefix added by elastic_utils —
# TODO confirm): str.replace('q', '') would also delete any 'q' inside the id itself.
cleaned_weird_questions = [
    weird_question[1:] if weird_question.startswith('q') else weird_question
    for weird_question in weird_questions
]

# Set copy of the ids so each membership test is constant-time.
weird_ids = set(cleaned_weird_questions)
weird_og_questions = [
    question for question in questions
    if question['id'] in weird_ids
]
# Displayed as the cell result: (matched original questions, flagged ids).
len(weird_og_questions), len(cleaned_weird_questions)

(0, 0)

## Flattening documents

In [32]:
# Re-key the per-question results by bare question id so they can be looked up
# with the 'id' field of the question records.
# Only a leading 'q' is stripped (presumably the prefix added by elastic_utils —
# TODO confirm): str.replace('q', '') would also delete any 'q' inside the id itself.
multiple_ans_dict = {}
for ans in multiple_ans:
    # Each element is a dict; its first (key, value) pair carries the result.
    prefixed_key, ans_payload = next(iter(ans.items()))
    raw_key = prefixed_key[1:] if prefixed_key.startswith('q') else prefixed_key
    multiple_ans_dict[raw_key] = ans_payload

In [33]:
# Load QUESTIONS_PATH as JSON; the following cells read its 'questions' list
# and attach a 'documents' entry to each question.
questions_answers = elastic_utils.load_json(QUESTIONS_PATH)

In [34]:
# Attach to every question its retrieved documents as a flat list of records,
# ordered by descending score. Overwrites question['documents'] in place, so
# re-running the cell yields the same result.
for question in questions_answers['questions']:
    question_id = question['id']
    question_documents = multiple_ans_dict[question_id]['documents']
    doc_responses = [
        {
            # Strip only a leading 'd'; replace('d', '') would remove every 'd' in the id.
            'id': document_id[1:] if document_id.startswith('d') else document_id,
            'abstract': document_info['abstract'],
            'title': document_info['title'],
            'score': document_info['score'],
            'mesh_terms': document_info['mesh_terms']
        }
        for document_id, document_info in question_documents.items()
    ]
    # sorted() is stable, so documents with equal scores keep their original relative order.
    sorted_doc_responses = sorted(doc_responses, key=lambda x: x['score'], reverse=True)
    question['documents'] = sorted_doc_responses

In [35]:
# Serialise the enriched question set to a JSON file in the working directory.
with open('test_answers_10b_testset2.json', 'w') as ans_file:
    ans_file.write(json.dumps(questions_answers))

In [19]:
# Sanity check: for each question print its keys, its number of attached
# documents, and the keys of its top-ranked document.
for q in questions_answers['questions']:
    print(q.keys())
    ranked_docs = q['documents']
    print(len(ranked_docs))
    # end='\n\n\n' emits the same trailing blank lines as a separate print('\n').
    print(ranked_docs[0].keys(), end='\n\n\n')

dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 'title', 'score', 'mesh_terms'])


dict_keys(['id', 'type', 'body', 'documents'])
100
dict_keys(['id', 'abstract', 