## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch

from src.elastic_search_utils import elastic_utils

## Params

In [6]:
# --- Elasticsearch connection -------------------------------------------
# URL handed to the Elasticsearch client constructor.
ELASTIC_SERVER = "http://localhost:9200"
# Index name passed as `index` to the query helpers.
SEARCH_INDEX = "pubmed2022"
# Document fields passed as `fields` to the query helpers.
SEARCH_FIELDS = [
    "title",
    "abstract",
    "mesh_terms",
]
# Passed as `size` to the query helpers; also the threshold used later to
# flag questions that came back with fewer documents.
N_ANSWERS = 100

# --- Input / output locations -------------------------------------------
# Path given to elastic_utils.load_json to read the question set.
QUESTIONS_PATH = "BioASQ-task10bPhaseA-testset2"
# Destination given to elastic_utils.save_json for the retrieved results.
SAVING_PATH_TEST = "/datasets/johan_tests_original_format/test_docs_10b.json"

## Elasticsearch client and questions

In [7]:
# Client for the server at ELASTIC_SERVER; it is passed as `es_client` to the
# elastic_utils query helpers in the cells below.
es = Elasticsearch(ELASTIC_SERVER)

In [9]:
# Load the question set from QUESTIONS_PATH. Later cells read the list of
# question records stored under the 'questions' key.
questions = elastic_utils.load_json(QUESTIONS_PATH)

## Retrieving answers for the test set

### Single answer example

In [11]:
# Query Elasticsearch for the first question only, as a quick sanity check
# before running the whole question set.
sample_answers = elastic_utils.ask_single_question(
    es_client=es,
    index=SEARCH_INDEX,
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
    question=questions['questions'][0],
)

In [12]:
# Inspect the top-level keys of the response.
sample_answers.keys()

dict_keys(['took', 'timed_out', '_shards', 'hits'])

In [13]:
# Pretty-print the first hit to see which fields each returned document carries.
pprint(sample_answers['hits']['hits'][0])

{'_id': '34210339',
 '_ignored': ['affiliations.keyword', 'abstract.keyword', 'references.keyword'],
 '_index': 'pubmed2022',
 '_score': 54.144127,
 '_source': {'abstract': 'BACKGROUND\n'
                         'Primary ciliary dyskinesia (PCD) is a rare, highly '
                         'heterogeneous genetic disorder involving the '
                         'impairment of motile cilia. With no single gold '
                         'standard for PCD diagnosis and complicated '
                         'multiorgan dysfunction, the diagnosis of PCD can be '
                         'difficult in clinical settings. Some methods for '
                         'diagnosis, such as nasal nitric oxide measurement '
                         'and digital high-speed video microscopy with ciliary '
                         'beat pattern analysis, can be expensive or '
                         'unavailable. To confirm PCD diagnosis, we used a '
                         'strategy combining asse

In [14]:
# Reduce the raw response to a flat mapping of document key -> score.
# NOTE(review): judging by the displayed output, keys look like 'd' + _id and
# values like _score / 100 -- confirm against answers_to_id_metric.
single_metrics = elastic_utils.answers_to_id_metric(sample_answers)

In [15]:
# Display the per-document scores computed for the sample question.
single_metrics

{'d34210339': 0.54144127,
 'd34556108': 0.52635494,
 'd26139845': 0.5255413999999999,
 'd34851034': 0.52400833,
 'd28922056': 0.52183327,
 'd24203976': 0.5125139999999999,
 'd31549486': 0.5117185599999999,
 'd33966302': 0.50468178,
 'd25351953': 0.5006802,
 'd28801648': 0.49738144,
 'd27081490': 0.49153587000000004,
 'd31960620': 0.4877663,
 'd19953662': 0.4843753,
 'd28925796': 0.48149549999999997,
 'd27349973': 0.48120193,
 'd23292041': 0.48022038,
 'd21907944': 0.47773613,
 'd18975248': 0.47729736,
 'd24963453': 0.47428127000000003,
 'd24586956': 0.47253956,
 'd34445527': 0.47140823,
 'd30209139': 0.46870766,
 'd33447612': 0.4676977,
 'd9387968': 0.46215575999999997,
 'd18201450': 0.46018692,
 'd20480745': 0.4601353,
 'd32367404': 0.45992935,
 'd10738019': 0.45864075,
 'd30004251': 0.45864075,
 'd17601719': 0.45793255,
 'd33233428': 0.45682727999999995,
 'd22903970': 0.4560006,
 'd23261302': 0.4556147,
 'd31636325': 0.45479748000000003,
 'd23798057': 0.45444443,
 'd31271036': 0.4543

## Multiple question answering

In [16]:
%%time
# Run the same retrieval for every question in the set, using the same
# search settings as the single-question example.
multiple_ans = elastic_utils.ask_several_questions(
    es_client=es,
    index=SEARCH_INDEX,
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
    questions=questions['questions'],
)

Extracting docs from elastic search: 100%|██████| 90/90 [00:29<00:00,  3.03it/s]

CPU times: user 1.68 s, sys: 73.1 ms, total: 1.75 s
Wall time: 29.7 s





## Saving dicts

In [17]:
# Persist the results for all questions to SAVING_PATH_TEST using the
# project's save_json helper.
elastic_utils.save_json(multiple_ans, SAVING_PATH_TEST)

## Compare read with saved json

In [18]:
# Read the file that was just written back from disk.
# NOTE(review): multiple_ans_read is not used by the cells visible below --
# confirm whether the checks were meant to run on it instead of multiple_ans.
multiple_ans_read = elastic_utils.load_json(SAVING_PATH_TEST)

In [19]:
# Flag every question for which fewer than the requested N_ANSWERS documents
# were returned.
#   weird_questions -- ids of the flagged questions
#   lens_docs       -- number of documents returned for each flagged question
lens_docs = []
weird_questions = []
for result in multiple_ans['questions']:
    len_docs = len(result['documents'])
    if len_docs < N_ANSWERS:
        # Fix: `result_id` was never assigned, so this branch raised NameError
        # the first time a short result appeared.
        # Assumes each result carries the question id under 'id', the key the
        # next cell matches against -- TODO confirm against
        # ask_several_questions.
        result_id = result['id']
        print(result_id, len_docs)
        weird_questions.append(result_id)
        lens_docs.append(len_docs)

In [23]:
# Shallow copy of the flagged ids; no further cleaning is applied here.
cleaned_weird_questions = list(weird_questions)

# Original question records whose id is among the flagged ones.
weird_og_questions = [
    og_question
    for og_question in questions['questions']
    if og_question['id'] in cleaned_weird_questions
]

# Show both counts side by side; they are expected to match when every
# flagged id is found among the original questions.
(len(weird_og_questions), len(cleaned_weird_questions))

(0, 0)