## Imports

In [1]:
import json
import itertools
from elasticsearch import Elasticsearch
from tqdm import tqdm

from src.elastic_search_utils import elastic_utils

## Es client

In [2]:
es = Elasticsearch("http://localhost:9200")

## Result load

In [3]:
results = elastic_utils.load_json('/datasets/johan_tests/training_docs_10b.json')
#results = elastic_utils.load_json('/datasets/johan_tests/queried_docs_10b.json')

In [4]:
results[0]

{'q55031181e9bde69634000014': {'question': 'Is Hirschsprung disease a mendelian or a multifactorial disorder?',
  'documents': {'d15858239': {'score': 1,
    'title': '[The role of ret gene in the pathogenesis of Hirschsprung disease].',
    'abstract': 'Hirschsprung disease is a congenital disorder with the incidence of 1 per 5000 live births, characterized by the absence of intestinal ganglion cells. In the etiology of Hirschsprung disease various genes play a role; these are: RET, EDNRB, GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these genes may result in dominant, recessive or multifactorial patterns of inheritance. Diverse models of inheritance, co-existence of numerous genetic disorders and detection of numerous chromosomal aberrations together with involvement of various genes confirm the genetic heterogeneity of Hirschsprung disease. Hirschsprung disease might well serve as a model for many complex disorders in which the search for responsible genes has only just been initi

## Debug functions

In [5]:
def find_all_single_docs(document_test):
    documents = list(document_test.values())[0]['documents']
    return [doc_id for doc_id, doc_data in documents.items()]

In [6]:
def find_failed_single_docs(document_test):
    documents = list(document_test.values())[0]['documents']
    return [doc_id for doc_id, doc_data in documents.items() if doc_data == 'failed']

In [7]:
def find_no_abstract_single_docs(document_test):
    documents = list(document_test.values())[0]['documents']
    empty_abstract_docs = []
    for doc_id, doc_data in documents.items():
        if isinstance(doc_data, str):
            continue
        elif (doc_data['abstract'] == '') or (doc_data['abstract'] is None):
            empty_abstract_docs.append(doc_id)
            
    return empty_abstract_docs

## Listing failed document ids

In [8]:
all_docs =  set(list(itertools.chain.from_iterable(
    find_all_single_docs(result) for result in results
)))

In [9]:
failed_docs =  set(list(itertools.chain.from_iterable(
    find_failed_single_docs(result) for result in results
)))

In [10]:
no_abs_docs = set(list(itertools.chain.from_iterable(
    find_no_abstract_single_docs(result) for result in results
)))

## Review over failed id stats

In [11]:
failed_docs - all_docs

set()

In [12]:
len(failed_docs), len(all_docs), len(failed_docs)/len(all_docs)

(54, 36844, 0.0014656389099989143)

In [13]:
no_abs_docs - all_docs

set()

In [14]:
len(no_abs_docs), len(all_docs), len(no_abs_docs)/len(all_docs)

(47, 36844, 0.001275648680924981)

## Checking if non found files in index were by elastic utils fault or indexing

In [15]:
my_fails = []
index_fails = []
for doc in tqdm(list(failed_docs), desc='Check fail'):
    doc_id = doc.replace('d', '')
    body={"query": {"match": {"pmid" : doc_id}}}
    resp = dict(
        es.search(index = 'pubmed2023-old', body=body)
    )
    if len(resp['hits']['hits']) == 0:
        index_fails.append(doc)
    else:
        my_fails.append(doc)

  es.search(index = 'pubmed2022', body=body)
Check fail: 100%|██████████████████████████████| 54/54 [00:00<00:00, 169.03it/s]


In [16]:
len(my_fails)

0

In [17]:
len(index_fails)

54

In [18]:
index_fails

['d24305403',
 'd21473027',
 'd20301293',
 'd26989023',
 'd20301416',
 'd20301331',
 'd24305278',
 'd23658991',
 'd21952424',
 'd20301308',
 'd23104528',
 'd29934319',
 'd27924029',
 'd29676625',
 'd27940438',
 'd23304742',
 'd22787616',
 'd20641567',
 'd20301510',
 'd21473029',
 'd25383801',
 'd22550943',
 'd22787626',
 'd21250223',
 'd20301494',
 'd22129433',
 'd20301454',
 'd20301427',
 'd23833797',
 'd33097476',
 'd29653952',
 'd20301779',
 'd20007090',
 'd24212220',
 'd27399455',
 'd23986914',
 'd20301585',
 'd20301577',
 'd25064957',
 'd20301628',
 'd22855961',
 'd27399411',
 'd24144986',
 'd20301466',
 'd20301588',
 'd20301462',
 'd20639591',
 'd20301420',
 'd30242068',
 'd27742610',
 'd25401082',
 'd21249951',
 'd21413253',
 'd23890950']

In [19]:
no_abs_docs

{'d10525005',
 'd10590441',
 'd11169342',
 'd11339660',
 'd11428324',
 'd11864366',
 'd14663844',
 'd15332726',
 'd15719064',
 'd15783264',
 'd16451554',
 'd16477543',
 'd16570042',
 'd16804530',
 'd16971728',
 'd17315395',
 'd17364293',
 'd17462970',
 'd17703631',
 'd18422033',
 'd18577682',
 'd18941554',
 'd19119983',
 'd19809570',
 'd20431548',
 'd20671410',
 'd21284896',
 'd21445329',
 'd21772710',
 'd21897718',
 'd21915623',
 'd22375228',
 'd22527824',
 'd23255116',
 'd23376948',
 'd23455231',
 'd23599675',
 'd23741179',
 'd23776883',
 'd24132372',
 'd24163818',
 'd24401652',
 'd2819350',
 'd33201862',
 'd4936649',
 'd7474061',
 'd9664617'}