## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch

from src.elastic_search_utils import elastic_utils

## Params

In [2]:
# Elasticsearch endpoint and the index that the queries below target.
ELASTIC_SERVER = 'http://localhost:9200'
SEARCH_INDEX = 'pubmed2022'

# Query settings: passed as `fields=` and `size=` to the elastic_utils helpers.
SEARCH_FIELDS = [
    'title',
    'abstract',
    'mesh_terms',
]
N_ANSWERS = 100

# Input question file and output location for the retrieved documents.
QUESTIONS_PATH = 'BioASQ-task10bPhaseA-testset5'
SAVING_PATH_TEST = '/datasets/johan_tests_original_format/test_docs_10b-testset5.json'

## Elasticsearch client and questions

In [3]:
# Client for the Elasticsearch server at ELASTIC_SERVER; passed as `es_client` to the query helpers below.
es = Elasticsearch(ELASTIC_SERVER)

In [4]:
# Load the question set from QUESTIONS_PATH; later cells read its 'questions' list.
questions = elastic_utils.load_json(QUESTIONS_PATH)

## Making answers for test

### Single answer example

In [5]:
# Query the index with only the first question, to inspect the raw response.
sample_answers = elastic_utils.ask_single_question(
    es_client=es,
    index=SEARCH_INDEX,
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
    question=questions['questions'][0],
)

In [6]:
# Show the top-level keys of the sample response.
sample_answers.keys()

dict_keys(['took', 'timed_out', '_shards', 'hits'])

In [7]:
# Pretty-print the first hit of the sample response.
pprint(sample_answers['hits']['hits'][0])

{'_id': '34638130',
 '_ignored': ['affiliations.keyword', 'authors.keyword', 'abstract.keyword'],
 '_index': 'pubmed2022',
 '_score': 43.2665,
 '_source': {'abstract': 'Myelodysplastic syndromes (MDS) represent a '
                         'heterogeneous group of clonal hematopoietic '
                         'stem-cell disorders characterized by ineffective '
                         'hematopoiesis leading to peripheral cytopenias and '
                         'in a substantial proportion of cases to acute '
                         'myeloid leukemia. The deletion of the long arm of '
                         'chromosome 11, del(11q), is a rare but recurrent '
                         'clonal event in MDS. Here, we detail the largest '
                         'series of 113 cases of MDS and myelodysplastic '
                         'syndromes/myeloproliferative neoplasms (MDS/MPN) '
                         'harboring a del(11q) analyzed at clinical, '
                         'cy

In [8]:
# Convert the sample response into the id -> metric mapping displayed in the next cell.
single_metrics = elastic_utils.answers_to_id_metric(sample_answers)

In [9]:
# Display the mapping built from the sample response.
single_metrics

{'d34638130': 0.432665,
 'd32300432': 0.37429108,
 'd30045276': 0.33879723,
 'd26117058': 0.3328591,
 'd28321349': 0.32828,
 'd25765789': 0.32749413,
 'd26080752': 0.3271573,
 'd15475079': 0.32644047,
 'd23314345': 0.3231128,
 'd32513619': 0.3213364,
 'd20359629': 0.32110245,
 'd34129017': 0.3200952,
 'd29903757': 0.31980633,
 'd28119848': 0.31952208,
 'd33080073': 0.31862005,
 'd20670271': 0.31782923,
 'd32134844': 0.31765875,
 'd20924036': 0.31706507,
 'd17017876': 0.31636517999999997,
 'd30858933': 0.31614256,
 'd22928125': 0.31469234,
 'd21380700': 0.31316994000000004,
 'd31712933': 0.31248992999999997,
 'd14717237': 0.31242393,
 'd10489166': 0.312147,
 'd23453286': 0.31202106,
 'd31808894': 0.31196066,
 'd33221503': 0.31168543,
 'd9723026': 0.31167646,
 'd23029178': 0.31124617,
 'd25229970': 0.31112831,
 'd21867648': 0.31059265,
 'd9118602': 0.310531,
 'd31559387': 0.31046972,
 'd25159121': 0.31043491,
 'd29776423': 0.31039362,
 'd12592323': 0.3103933,
 'd30854142': 0.310258119999

## Multiple question answering

In [10]:
%%time
# Run every question in the set against the index with the same search settings.
multiple_ans = elastic_utils.ask_several_questions(
    es_client=es,
    index=SEARCH_INDEX,
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
    questions=questions['questions'],
)

Extracting docs from elastic search: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:06<00:00,  1.36it/s]

CPU times: user 1.73 s, sys: 91.6 ms, total: 1.82 s
Wall time: 1min 6s





## Saving dicts

In [12]:
# Persist the results for all questions to SAVING_PATH_TEST.
elastic_utils.save_json(multiple_ans, SAVING_PATH_TEST)

## Reload saved JSON and check document counts

In [13]:
# Read the file written to SAVING_PATH_TEST back into memory.
multiple_ans_read = elastic_utils.load_json(SAVING_PATH_TEST)

In [14]:
# Collect the questions whose result holds fewer than N_ANSWERS documents.
# `weird_questions` receives their ids and `lens_docs` the matching document counts.
lens_docs = []
weird_questions = []
for result in multiple_ans['questions']:
    len_docs = len(result['documents'])
    if len_docs < N_ANSWERS:
        # `result_id` has to be bound before it is printed/stored; without this
        # line the first short result raises NameError.
        # NOTE(review): assumes each result carries its question id under 'id',
        # like the entries of questions['questions'] — confirm against the output format.
        result_id = result['id']
        print(result_id, len_docs)
        weird_questions.append(result_id)
        lens_docs.append(len_docs)

In [15]:
# Ids flagged by the document-count check, copied unchanged into a new list.
cleaned_weird_questions = list(weird_questions)

# Entries of the original question set whose id was flagged.
weird_og_questions = list(
    filter(
        lambda entry: entry['id'] in cleaned_weird_questions,
        questions['questions'],
    )
)
# Both counts should match if every flagged id exists in the question set.
len(weird_og_questions), len(cleaned_weird_questions)

(0, 0)