## Imports

In [1]:
import json
import itertools
from elasticsearch import Elasticsearch
from tqdm import tqdm

from src.elastic_search_utils import elastic_utils

## Es client

In [2]:
es = Elasticsearch("http://localhost:9200")

## Result load

In [3]:
#results = elastic_utils.load_json('/datasets/johan_tests/queried_docs.json')
results = elastic_utils.load_json('/datasets/johan_tests/training_docs.json')

## Debug functions

In [4]:
def find_all_single_docs(document_test):
    documents = list(document_test.values())[0]['documents']
    return [doc_id for doc_id, doc_data in documents.items()]

In [5]:
def find_failed_single_docs(document_test):
    documents = list(document_test.values())[0]['documents']
    return [doc_id for doc_id, doc_data in documents.items() if doc_data == 'failed']

In [6]:
def find_no_abstract_single_docs(document_test):
    documents = list(document_test.values())[0]['documents']
    empty_abstract_docs = []
    for doc_id, doc_data in documents.items():
        if isinstance(doc_data, str):
            continue
        elif (doc_data['abstract'] == '') or (doc_data['abstract'] is None):
            empty_abstract_docs.append(doc_id)
            
    return empty_abstract_docs

## Listing failed document ids

In [7]:
all_docs =  set(list(itertools.chain.from_iterable(
    find_all_single_docs(result) for result in results
)))

In [8]:
failed_docs =  set(list(itertools.chain.from_iterable(
    find_failed_single_docs(result) for result in results
)))

In [9]:
no_abs_docs = set(list(itertools.chain.from_iterable(
    find_no_abstract_single_docs(result) for result in results
)))

## Review over failed id stats

In [10]:
failed_docs - all_docs

set()

In [11]:
len(failed_docs), len(all_docs), len(failed_docs)/len(all_docs)

(12926, 33330, 0.3878187818781878)

In [12]:
no_abs_docs - all_docs

set()

In [13]:
len(no_abs_docs), len(all_docs), len(no_abs_docs)/len(all_docs)

(29, 33330, 0.0008700870087008701)

## Checking if non found files in index were by elastic utils fault or indexing

In [14]:
my_fails = []
index_fails = []
for doc in tqdm(list(failed_docs), desc='Check fail'):
    doc_id = doc.replace('d', '')
    body={"query": {"match": {"pmid" : doc_id}}}
    resp = dict(
        es.search(index = 'pubmed2022_index_full', body=body)
    )
    if len(resp['hits']['hits']) == 0:
        index_fails.append(doc)
    else:
        my_fails.append(doc)


  es.search(index = 'pubmed2022_index_full', body=body)
Check fail: 100%|████████████████████████| 12926/12926 [00:37<00:00, 344.22it/s]


In [15]:
len(my_fails)

0

In [16]:
len(index_fails)

12926