In [4]:
from lxml import etree
from elasticsearch import Elasticsearch, helpers
from collections import deque
from typing import Dict
import collections

In [5]:
def genmapping():
    """Build the Elasticsearch index mapping for PubMed article documents.

    Returns:
        dict: ``{"mappings": {"properties": {...}}}`` with one entry per
        document field. ``PublishDate`` is a ``date`` using the
        ``yyyy-MM-dd`` format; ``PMID`` and ``Title`` are ``text``; every
        other field is ``keyword``.
    """
    # NOTE(review): Abstract is mapped as keyword (exact match) while PMID is
    # mapped as text (analyzed) -- confirm this is the intended way round.
    field_types = (
        ("Abstract", "keyword"),
        ("Authors", "keyword"),
        ("Journal", "keyword"),
        ("Keywords", "keyword"),
        ("MeshIDs", "keyword"),
        ("PMID", "text"),
        ("PublishDate", "date"),
        ("Title", "text"),
    )

    properties = {}
    for field_name, es_type in field_types:
        spec = {"type": es_type}
        if es_type == "date":
            # Only the date field carries an explicit format.
            spec["format"] = "yyyy-MM-dd"
        properties[field_name] = spec

    return {"mappings": {"properties": properties}}

In [12]:
# File parsed when the caller does not supply ``docs``.
_DEFAULT_PUBMED_DOCS = ['/opt/bioasq/resources/pubmed_baseline_2023/pubmed23n0502.xml.gz']

# Elements that wrap exactly one citation in a PubMed article set.
_PUBMED_RECORD_TAGS = ('PubmedArticle', 'PubmedBookArticle')

# Fields that are filled with a placeholder when a record does not contain them.
_PUBMED_OPTIONAL_FIELDS = ('MeshIDs', 'Title', 'PublishDate', 'Authors', 'Keywords',
                           'Abstract', 'Journal')


def _pubmed_text(element):
    """Return all text inside ``element``, including text nested in inline markup."""
    return ''.join(element.itertext())


def _parse_pubmed_record(record):
    """Convert one citation element into a fresh ``defaultdict(list)`` entry."""
    entry = collections.defaultdict(list)
    for element in record.iter():
        tag = element.tag
        if tag == 'PMID':
            # Only the first PMID identifies the record; later ones (e.g. inside
            # CommentsCorrections) refer to *other* articles and must not start
            # a new record or overwrite this one.
            if 'PMID' not in entry:
                entry['PMID'].append(element.text)
        elif tag == 'DateCompleted':
            parts = [element.findtext(name) for name in ('Year', 'Month', 'Day')]
            # A partial date would not match the index's yyyy-MM-dd format, so
            # treat it as missing instead of emitting e.g. "2004-11".
            entry['PublishDate'].append('-'.join(parts) if all(parts) else None)
        elif tag == 'MeshHeadingList':
            entry['MeshIDs'].append([
                node.get('UI')
                for heading in element
                for node in heading
                if node.tag == 'DescriptorName' and node.get('UI') is not None
            ])
        elif tag == 'ArticleTitle':
            entry['Title'].append(_pubmed_text(element))
        elif tag == 'AuthorList':
            names = []
            for author in element:
                fore = author.findtext('ForeName')
                last = author.findtext('LastName')
                name = ' '.join(part for part in (fore, last) if part)
                # Group authors carry a CollectiveName instead of personal names.
                names.append(name or author.findtext('CollectiveName') or '')
            entry['Authors'].append(names)
        elif tag == 'Abstract':
            entry['Abstract'].append(
                [_pubmed_text(child) for child in element if child.tag == 'AbstractText'])
        elif tag == 'KeywordList':
            entry['Keywords'].append(
                [_pubmed_text(child) for child in element if child.tag == 'Keyword'])
        elif tag == 'Title':
            # A bare <Title> is the journal title; the article title is <ArticleTitle>.
            entry['Journal'].append(element.text)

    # Same placeholders for every record (the last one included): None for the
    # date field, empty string for everything else.
    for field in _PUBMED_OPTIONAL_FIELDS:
        if field not in entry:
            entry[field].append(None if field == 'PublishDate' else '')
    return entry


def index_pubmed(es, docs=None):
    """Yield one document per PubMed citation found in the given XML files.

    Each yielded value is a *new* ``defaultdict(list)`` mapping the field names
    ``PMID``, ``PublishDate``, ``Journal``, ``Title``, ``Abstract``,
    ``Authors``, ``Keywords`` and ``MeshIDs`` to lists, so callers may safely
    collect the records (e.g. ``list(index_pubmed(es))``).

    Args:
        es: Unused; kept so existing calls ``index_pubmed(es)`` keep working.
        docs: Optional path or iterable of paths to PubMed XML files. Defaults
            to the single baseline file this notebook was written against.

    Yields:
        collections.defaultdict: One entry per citation that has a PMID.
    """
    if docs is None:
        docs = _DEFAULT_PUBMED_DOCS
    elif isinstance(docs, str):
        docs = [docs]

    for path in docs:
        file_contents = etree.parse(path)
        for node in file_contents.iter():
            if node.tag not in _PUBMED_RECORD_TAGS:
                continue
            entry = _parse_pubmed_record(node)
            if 'PMID' in entry:
                yield entry

In [13]:
def create_index(self, index_name: str, mapping: Dict) -> None:
    """
    Create an ES index.

    Logs the mapping being used, then asks ``self.es_client`` to create the
    index. HTTP 400 responses are ignored via ``ignore=400``.

    :param index_name: Name of the index.
    :param mapping: Mapping of the index
    """
    # Imported here because no earlier cell brings these names into scope:
    # `logging` is never imported and `json` is only imported in a later cell.
    import json
    import logging

    logging.info(f"Creating index {index_name} with the following schema: {json.dumps(mapping, indent=2)}")
    self.es_client.indices.create(index=index_name, ignore=400, body=mapping)

In [14]:
# Rebuild the index from scratch: drop any existing copy, create it with the
# mapping from genmapping(), then bulk-load the records from index_pubmed().
# index_name, mapping and es stay in the notebook namespace for later cells.
index_name = 'pubmed2023'
mapping = genmapping()
es = Elasticsearch(hosts=['localhost:9200'], timeout=100)
# ignore=[400, 404]: do not raise on these HTTP statuses (e.g. the index does not exist yet).
es.indices.delete(index=index_name, ignore=[400, 404])
es.indices.create(index=index_name, mappings=mapping['mappings'])
# Draining the bulk helper into a zero-length deque consumes every result
# without keeping any of them in memory.
deque(helpers.parallel_bulk(es, index_pubmed(es), index=index_name), maxlen=0)
es.indices.refresh()

{'_shards': {'total': 6, 'successful': 3, 'failed': 0}}

In [15]:
# Sanity check: show the first record the parser yields.
next(index_pubmed(es))

defaultdict(list,
            {'PMID': ['15365562'],
             'PublishDate': ['2004-11-26'],
             'Journal': ['British journal of cancer'],
             'Title': ['Sustained antiproliferative mechanisms by RB24, a targeted precursor of multiple inhibitors of epidermal growth factor receptor and a DNA alkylating agent in the A431 epidermal carcinoma of the vulva cell line.'],
             'Abstract': [["Recently, with the purpose of enhancing the potency of epidermal growth factor receptor (EGFR)-based therapies, we designed a novel strategy termed 'Cascade-release targeting' that seeks to develop molecules capable of degrading to multiple tyrosine kinase (TK) inhibitors and highly reactive electrophiles, in a stepwise fashion. Here we report on the first prototype of this model, RB24, a masked methyltriazene, that in addition to being an inhibitor on its own was designed to degrade to RB14, ZR08, RB10+a DNA alkylating methyldiazonium species. The cascade degradation of RB24

In [16]:
# Query the freshly built index with match_all to confirm documents were indexed.
result = es.search(index=index_name, query={"match_all":{}})

In [17]:
# Show the 'hits' section of the response (total, max_score and the returned documents).
print(result['hits'])

{'total': {'value': 10000, 'relation': 'gte'}, 'max_score': 1.0, 'hits': [{'_index': 'pubmed2023', '_type': '_doc', '_id': 'BTZ6V4YBLon1h0PVHLaG', '_score': 1.0, '_source': {'PMID': ['15365562'], 'PublishDate': ['2004-11-26'], 'Journal': ['British journal of cancer'], 'Title': ['Sustained antiproliferative mechanisms by RB24, a targeted precursor of multiple inhibitors of epidermal growth factor receptor and a DNA alkylating agent in the A431 epidermal carcinoma of the vulva cell line.'], 'Abstract': [["Recently, with the purpose of enhancing the potency of epidermal growth factor receptor (EGFR)-based therapies, we designed a novel strategy termed 'Cascade-release targeting' that seeks to develop molecules capable of degrading to multiple tyrosine kinase (TK) inhibitors and highly reactive electrophiles, in a stepwise fashion. Here we report on the first prototype of this model, RB24, a masked methyltriazene, that in addition to being an inhibitor on its own was designed to degrade to

In [18]:
def SearchQuery(index=index_name, all_docs=False):
    """Run a search against a local Elasticsearch index and return the raw response.

    Args:
        index: Name of the index to search. Defaults to the value the
            notebook-level ``index_name`` had when this cell was executed.
        all_docs: When true, use a ``match_all`` query. Otherwise use a bool
            query combining a ``match_phrase`` on ``Title`` with a
            ``PublishDate`` range filter.

    Returns:
        The response of ``es.search`` for at most three hits, restricted to
        the selected ``_source`` fields.
    """
    es = Elasticsearch(hosts=['localhost:9200'])
    if all_docs:
        query = {"match_all":{}}
    else:
        query = {
                "bool": {
                    "must": [
                        {"match_phrase": { "Title": "" }}
                    ],
                    "filter": [
                        {"range": {"PublishDate": {"gte": 2020}}}
                    ]
                }
            }
    # Search the index the caller asked for; the `index` argument used to be
    # ignored in favour of the global `index_name`.
    response = es.search(
                index=index,
                body={
                    "_source": [ "PMID", "Title", "Authors", "PublishDate", "MeshIDs", "Keywords", "Abstract", "_score" ],
                    "size": 3,
                    "query": query
                })

    return response

In [19]:
def PrintResults(result):
    """Pretty-print an Elasticsearch search response in a JSON-like layout.

    Prints the timing and shard summary, the hit totals, and then each hit's
    ``_index``, ``_type``, ``_id`` and ``_score`` followed by the ``Title``,
    ``PMID`` and ``Uploader`` source fields that are present. The ``_id`` line
    shows the document's PMID when the source has one, otherwise the real
    Elasticsearch id.

    Args:
        result: Search response mapping with ``took``, ``timed_out``,
            ``_shards`` and ``hits`` entries.
    """
    def _flatten(value):
        # Render a field value (typically a one-element list) without list
        # brackets; periods are dropped as well, matching the established output.
        return str(value).replace('[', '').replace('.', '').replace(']', '')

    shards = result['_shards']
    hits = result['hits']

    print("{")
    print("   \"took\" : " + str(result['took']))
    print("   \"timed_out\" : " + str(result['timed_out']))
    print("   \"_shards\" : {")
    print("     \"total\" : " + str(shards['total']) + ",")
    print("     \"successful\" : " + str(shards['successful']) + ",")
    print("     \"skipped\" : " + str(shards['skipped']) + ",")
    print("     \"failed\" : " + str(shards['failed']))
    print("   },")
    print("   \"hits\" : {")
    print("     \"total\" : {")
    print("       \"value\" : " + str(hits['total']['value']) + ",")
    print("       \"relation\" : \"" + str(hits['total']['relation']) + "\"")
    print("     },")
    print("     \"max_score\" : " + str(hits['max_score']) + ",")
    print("     \"hits\" : [")

    documents = hits['hits']
    last_position = len(documents) - 1
    # Walk the hits actually returned instead of assuming there are exactly three.
    for position, hit in enumerate(documents):
        print("        {")
        for key, value in hit.items():
            if key == "_index":
                print("          \"_index\" : \"" + str(value) + "\",")
            elif key == "_type":
                print("          \"_type\" : \"" + str(value) + "\",")
            elif key == "_id":
                shown_id = hit.get('_source', {}).get('PMID', value)
                print("          \"_id\" : \"" + str(shown_id).strip('[').strip(']').strip('\"').strip('\'') + "\",")
            elif key == "_score":
                print("          \"_score\" : " + str(value) + ",")
            elif key == "_source":
                print("          \"_source\" : {")
                for field, field_value in value.items():
                    if field == "Title":
                        print("            \"Title\" : \"" + _flatten(field_value).strip('\'').strip('\"') + "\",")
                    elif field == "PMID":
                        print("            \"PMID\" : " + _flatten(field_value).replace('\'', '\"') + ",")
                    elif field == "Uploader":
                        print("            \"Uploader\" : " + _flatten(field_value).replace('\'', '\"'))
                print("          }")
        # Every hit is closed, with a separating comma on all but the last one.
        if position < last_position:
            print("        },")
        else:
            print("        }")

    print("      ]")
    print("    }")
    print("  }")

In [20]:
# Fetch documents with the match_all variant of SearchQuery and pretty-print the response.
results = SearchQuery(all_docs=True)
PrintResults(results)

{
   "took" : 7
   "timed_out" : False
   "_shards" : {
     "total" : 1,
     "successful" : 1,
     "skipped" : 0,
     "failed" : 0
   },
   "hits" : {
     "total" : {
       "value" : 10000,
       "relation" : "gte"
     },
     "max_score" : 1.0,
     "hits" : [
        {
          "_index" : "pubmed2023",
          "_type" : "_doc",
          "_id" : "15365562",
          "_score" : 1.0,
          "_source" : {
            "Title" : "Sustained antiproliferative mechanisms by RB24, a targeted precursor of multiple inhibitors of epidermal growth factor receptor and a DNA alkylating agent in the A431 epidermal carcinoma of the vulva cell line",
            "PMID" : "15365562",
          }
        },
        {
          "_index" : "pubmed2023",
          "_type" : "_doc",
          "_id" : "15365563",
          "_score" : 1.0,
          "_source" : {
            "Title" : "Overexpression of the Ets-1 transcription factor in human breast cancer",
            "PMID" : "15365563",
   

  query = es.search(


In [21]:
# Dump the complete raw response as key-sorted, indented JSON for comparison
# with the hand-formatted PrintResults output.
import json
print(json.dumps(results, sort_keys=True, indent=4))

{
    "_shards": {
        "failed": 0,
        "skipped": 0,
        "successful": 1,
        "total": 1
    },
    "hits": {
        "hits": [
            {
                "_id": "BTZ6V4YBLon1h0PVHLaG",
                "_index": "pubmed2023",
                "_score": 1.0,
                "_source": {
                    "Abstract": [
                        [
                            "Recently, with the purpose of enhancing the potency of epidermal growth factor receptor (EGFR)-based therapies, we designed a novel strategy termed 'Cascade-release targeting' that seeks to develop molecules capable of degrading to multiple tyrosine kinase (TK) inhibitors and highly reactive electrophiles, in a stepwise fashion. Here we report on the first prototype of this model, RB24, a masked methyltriazene, that in addition to being an inhibitor on its own was designed to degrade to RB14, ZR08, RB10+a DNA alkylating methyldiazonium species. The cascade degradation of RB24 requires the generatio

In [50]:
if __name__ == '__main__':
    # Authenticated rebuild of the index: delete, recreate with the mapping,
    # then bulk-load every parsed PubMed record.
    import getpass
    import os

    # Credentials come from the environment (or an interactive prompt) rather
    # than being stored in the notebook. NOTE: the password that used to be
    # hard-coded here should be considered leaked and rotated.
    user = os.environ.get('ES_USER', 'elastic')
    password = os.environ.get('ES_PASSWORD') or getpass.getpass(f'Elasticsearch password for {user}: ')
    index_name = 'pubmed2023'
    mapping = genmapping()
    es = Elasticsearch(hosts=['localhost:9200'], timeout=100, http_auth=(user, password))
    # ignore=[400, 404]: do not raise on these HTTP statuses (e.g. the index does not exist yet).
    es.indices.delete(index=index_name, ignore=[400, 404])
    es.indices.create(index=index_name, mappings=mapping['mappings'])
    # index_pubmed is the record generator defined in this notebook; the name
    # `gendata` previously used here is not defined anywhere in it.
    deque(helpers.parallel_bulk(es, index_pubmed(es), index=index_name), maxlen=0)
    es.indices.refresh()

KeyboardInterrupt: 