## Imports

In [1]:
%load_ext autoreload
%autoreload 2
import os
import re
import json
import copy
import sys
from tqdm import tqdm
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import pytrec_eval
import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch

from src.elastic_search_utils import elastic_utils

## Params

In [11]:
# --- Elasticsearch connection and query settings ---
ELASTIC_SERVER = 'http://localhost:9200'
SEARCH_INDEX = 'pubmed2022'
SEARCH_FIELDS = ['title', 'abstract', 'mesh_terms']
N_ANSWERS = 100  # passed as `size` to the query helpers below

# --- Input / output locations ---
QUESTIONS_PATH = 'BioASQ-task10bPhaseA-testset4'
SAVING_PATH_TEST = '/datasets/johan_tests_original_format/test_docs_10b-testset4.json'

## Elasticsearch client and questions

In [3]:
# Client for the server at ELASTIC_SERVER; handed to the query helpers below as `es_client`.
es = Elasticsearch(ELASTIC_SERVER)

In [4]:
# Load the question set from QUESTIONS_PATH; later cells read the list stored under its 'questions' key.
questions = elastic_utils.load_json(QUESTIONS_PATH)

## Making answers for test

### Single answer example

In [5]:
# Query the index with only the first question so the raw response can be inspected.
sample_answers = elastic_utils.ask_single_question(
    es_client=es,
    index=SEARCH_INDEX,
    question=questions['questions'][0],
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
)

In [6]:
# Top-level keys of the raw search response.
sample_answers.keys()

dict_keys(['took', 'timed_out', '_shards', 'hits'])

In [7]:
# Pretty-print the first entry of the response's hit list to show the per-document fields.
pprint(sample_answers['hits']['hits'][0])

{'_id': '33594368',
 '_ignored': ['authors.keyword', 'abstract.keyword', 'references.keyword'],
 '_index': 'pubmed2022',
 '_score': 47.579323,
 '_source': {'abstract': 'Olfaction relies on a coordinated partnership '
                         'between odorant flow and neuronal communication. '
                         'Disruption in our ability to detect odors, or '
                         'anosmia, has emerged as a hallmark symptom of '
                         'infection with SARS-CoV-2, yet the mechanism behind '
                         'this abrupt sensory deficit remains elusive. Here, '
                         'using molecular evaluation of human olfactory '
                         'epithelium (OE) from subjects succumbing to COVID-19 '
                         'and a hamster model of SARS-CoV-2 infection, we '
                         'discovered widespread downregulation of olfactory '
                         'receptors (ORs) as well as key components of their '
           

In [8]:
# Reduce the raw response to a {document id: score} mapping.
single_metrics = elastic_utils.answers_to_id_metric(sample_answers)

In [9]:
# Display the mapping. Judging by the output, keys are 'd'-prefixed document ids;
# presumably the values are the Elasticsearch _score divided by 100 — TODO confirm in elastic_utils.
single_metrics

{'d33594368': 0.47579323,
 'd34587819': 0.40223915,
 'd33965353': 0.3832602,
 'd32563019': 0.36767302999999996,
 'd34099977': 0.36702877,
 'd34660038': 0.36552288,
 'd33074449': 0.36334187,
 'd32802540': 0.35423897,
 'd34302637': 0.35409904,
 'd32523929': 0.354047,
 'd32837939': 0.35323790000000005,
 'd33295221': 0.34813552999999997,
 'd32776905': 0.34550262,
 'd34853850': 0.34461124,
 'd34609841': 0.34454437,
 'd32756107': 0.34424312999999995,
 'd32652405': 0.34412975,
 'd34075562': 0.34321228,
 'd34548231': 0.3414032,
 'd32277751': 0.34126506999999995,
 'd34166652': 0.34111870000000005,
 'd34884216': 0.34085476,
 'd33824716': 0.34032623,
 'd32556089': 0.3354148,
 'd33399169': 0.33458687,
 'd34178610': 0.334538,
 'd32909060': 0.33405994,
 'd32305563': 0.33319466,
 'd33868972': 0.3329877,
 'd32447496': 0.33261401999999995,
 'd32466862': 0.33250698,
 'd32587994': 0.33249873999999996,
 'd32930820': 0.33155926,
 'd34103823': 0.33132874,
 'd32587902': 0.33115375999999996,
 'd34298605': 0.3

## Multiple question answering

In [10]:
%%time
# Run the same search for every question in the set.
multiple_ans = elastic_utils.ask_several_questions(
    es_client=es,
    index=SEARCH_INDEX,
    questions=questions['questions'],
    fields=SEARCH_FIELDS,
    size=N_ANSWERS,
)

Extracting docs from elastic search: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [01:16<00:00,  1.18it/s]

CPU times: user 991 ms, sys: 51.6 ms, total: 1.04 s
Wall time: 1min 16s





## Saving dicts

In [12]:
# Write the full result set for all questions to SAVING_PATH_TEST.
elastic_utils.save_json(multiple_ans, SAVING_PATH_TEST)

## Reload saved json and check document counts

In [13]:
# Read the saved file back from SAVING_PATH_TEST.
# NOTE(review): multiple_ans_read is not used by the cells visible below, which
# inspect the in-memory multiple_ans instead — confirm which one was intended.
multiple_ans_read = elastic_utils.load_json(SAVING_PATH_TEST)

In [14]:
# Flag every question that came back with fewer than N_ANSWERS documents.
# weird_questions collects the ids of those questions and lens_docs the
# matching document counts (the two lists are index-aligned).
# NOTE(review): this inspects the in-memory multiple_ans, not the reloaded
# multiple_ans_read — confirm which one this section is meant to check.
lens_docs = []
weird_questions = []
for result in multiple_ans['questions']:
    len_docs = len(result['documents'])
    if len_docs < N_ANSWERS:
        # Read the id from the result entry itself; a bare `result_id` is not
        # defined anywhere and would raise NameError on the first short result.
        # Assumes each result keeps the question's 'id' key, as the lookup
        # against questions['questions'] in the following cell implies.
        result_id = result['id']
        print(result_id, len_docs)
        weird_questions.append(result_id)
        lens_docs.append(len_docs)

In [15]:
# Shallow copy of the flagged ids (no actual cleaning is applied here).
cleaned_weird_questions = list(weird_questions)

# Original question entries whose id is among the flagged ones.
weird_og_questions = list(
    filter(
        lambda entry: entry['id'] in cleaned_weird_questions,
        questions['questions'],
    )
)
len(weird_og_questions), len(cleaned_weird_questions)

(0, 0)