 * @ Author: Yohei Ohto
 * @ Create Time: 2025-11-27 15:14:31
 * @ Modified time: 2025-11-27 15:15:00
 * @ Description: search PubMed abstracts using Elasticsearch and find articles containing a drugbank compound name.

# parse drugbank xml

In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import tarfile
from tqdm import tqdm
from elasticsearch import Elasticsearch

In [3]:
# raw_db_path = "/workspace/1-project/2-tox_pred/2-patent_date_split/workspace/data/drugbank/raw/full database.xml.tar"
defreezed_db_path = "/workspace/0-utils/1-data/drugbank/full_database.xml"

# import tarfile
# with tarfile.open(raw_db_path, 'w:gz') as tar:
#   tar.add(defreezed_db_path)

In [4]:
tree = ET.parse(defreezed_db_path)
root = tree.getroot()
del tree

drug_ids = []
drug_names = []

ns = "{http://www.drugbank.ca}"

for drug in root:
    # groups 取得
    groups = drug.find(f"{ns}groups")
    if groups is None:
        continue

    group_values = [g.text for g in groups.findall(f"{ns}group")]

    # ID
    drug_id_ele = drug.find(f"{ns}drugbank-id")
    drug_id = drug_id_ele.text if drug_id_ele is not None else ""

    # name
    drug_name_ele = drug.find(f"{ns}name")
    drug_name = drug_name_ele.text if drug_name_ele is not None else ""

    drug_ids.append(drug_id)
    drug_names.append(drug_name)

df = pd.DataFrame({
    "id": drug_ids,
    "name": drug_names
})

In [5]:
df.head()

Unnamed: 0,id,name
0,DB00001,Lepirudin
1,DB00002,Cetuximab
2,DB00003,Dornase alfa
3,DB00004,Denileukin diftitox
4,DB00005,Etanercept


In [15]:
print(f"Total drugs in DrugBank: {df.shape[0]}")

Total drugs in DrugBank: 16581


# search drugbank compound names in pubmed abstracts

In [17]:
ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

query_result = []
too_many_hits = []
for i in tqdm(range(len(df))):
    query_word = df.loc[i, "name"]
    search_body = { "query": { "multi_match": { "query": query_word, "fields": ["SENTENCE"], "type": "phrase" } }, "sort": [ {"PMID": {"order": "asc"}} ], "_source": ["PMID", "SENTID", "SENTENCE"], "size": 3 }
    res = es.search(index="pubmed_sentence", body=search_body)
    total_hits = res['hits']['total']['value'] 

    if total_hits >= 1000:
        too_many_hits.append((query_word, total_hits))
        continue
    else:
        for hit in res['hits']['hits']:
            source = hit['_source']
            query_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })

100%|██████████| 16581/16581 [01:02<00:00, 264.77it/s]


In [18]:
print(len(too_many_hits), "drugs have too many hits (>=1000) and were skipped.")

2402 drugs have too many hits (>=1000) and were skipped.


In [20]:
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

too_many_hits_result = []
for query_word, _ in tqdm(too_many_hits):
    search_body = {
        "query": {
            "multi_match": {
                "query": query_word,
                "fields": ["SENTENCE"],
                "type": "phrase"
            }
        },
        "_source": ["PMID", "SENTID", "SENTENCE"],
        "sort": [{"PMID": {"order": "asc"}}]
    }

    page = es.search(
        index="pubmed_sentence",
        body=search_body,
        scroll="2m",
        size=1000
    )

    sid = page['_scroll_id']
    hits = page['hits']['hits']

    while len(hits):
        for hit in hits:
            source = hit['_source']
            too_many_hits_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })
        page = es.scroll(scroll_id=sid, scroll="2m")
        sid = page['_scroll_id']
        hits = page['hits']['hits']

  page = es.search(
100%|██████████| 2402/2402 [29:04<00:00,  1.38it/s]  


In [21]:
print(len(query_result), "results from initial queries.")
print(len(too_many_hits_result), "results from too-many-hits queries.")

23981 results from initial queries.
36746297 results from too-many-hits queries.


# stemming search

In [6]:
name = df['name']

In [7]:
ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

query_result = []
too_many_hits = []
for i in tqdm(range(len(name))):
    query_word = name[i]
    search_body = { "query": { "multi_match": { "query": query_word, "fields": ["SENTENCE"], "type": "phrase" } }, "sort": [ {"PMID": {"order": "asc"}} ], "_source": ["PMID", "SENTID", "SENTENCE"]}
    res = es.search(index="pubmed_sentence_v2", body=search_body)
    total_hits = res['hits']['total']['value'] 

    if total_hits >= 1000:
        too_many_hits.append((query_word, total_hits))
        continue
    else:
        for hit in res['hits']['hits']:
            source = hit['_source']
            query_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })

100%|██████████| 16581/16581 [01:18<00:00, 212.12it/s]


In [8]:
print(f"Drugs with too many hits (>=1000): {len(too_many_hits)}")

Drugs with too many hits (>=1000): 2439


In [9]:
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

too_many_hits_result = []
for query_word, _ in tqdm(too_many_hits):
    if query_word == "cell":
        continue
    search_body = {
        "query": {
            "multi_match": {
                "query": query_word,
                "fields": ["SENTENCE"],
                "type": "phrase"
            }
        },
        "_source": ["PMID", "SENTID", "SENTENCE"],
        "sort": [{"PMID": {"order": "asc"}}]
    }

    page = es.search(
        index="pubmed_sentence_v2",
        body=search_body,
        scroll="2m",
        size=1000
    )

    sid = page['_scroll_id']
    hits = page['hits']['hits']

    while len(hits):
        for hit in hits:
            source = hit['_source']
            too_many_hits_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })
        page = es.scroll(scroll_id=sid, scroll="2m")
        sid = page['_scroll_id']
        hits = page['hits']['hits']

  page = es.search(
100%|██████████| 2439/2439 [40:56<00:00,  1.01s/it]  


In [10]:
print(len(query_result), "results from initial queries.")
print(len(too_many_hits_result), "results from too-many-hits queries.")

71353 results from initial queries.
40185284 results from too-many-hits queries.


In [11]:
df = pd.DataFrame(query_result + too_many_hits_result)
df.to_csv("/workspace/0-utils/1-data/pubmed/pubmed_drugbank_sentence/drugbank_sentences_v2.csv", index=False)