In [6]:
import json

In [7]:
path = "/workspace/99-NAS_data/rxnorm/processed/rxnorm_drugname_clusters.json"
with open(path, "r") as f:
    rxnorm_drugname_clusters = json.load(f)
print(f"Loaded {len(rxnorm_drugname_clusters)} drug name clusters from {path}")

Loaded 98163 drug name clusters from /workspace/99-NAS_data/rxnorm/processed/rxnorm_drugname_clusters.json


In [9]:
ENGLISH_STOPWORDS = {
    "in", "on", "as", "at", "by", "for", "to", "with",
    "and", "or", "not", "all", "can", "may"
}

def is_safe_rxnorm_surface(name: str) -> bool:
    n = name.strip()

    # too short (IN, IV, etc.)
    if len(n) < 4:
        return False

    # all-uppercase abbreviation (PABA, ACE, etc.)
    if n.isupper():
        return False

    # common English token
    if n.lower() in ENGLISH_STOPWORDS:
        return False

    return True

In [13]:
rxnorm_ids

{'canonical': 'Penmenvy',
 'names_in': [],
 'names_pt': [],
 'names_sy': [],
 'names_bn': ['Penmenvy'],
 'relations': {'has_tradename': ['1593128.0',
   '901505.0',
   '901507.0',
   '901509.0',
   '901511.0']}}

In [16]:
whole_drug_name = set()
for rxnorm_ids, drug_name in rxnorm_drugname_clusters.items():
    for stage in drug_name:
        for name in drug_name[stage]:
            if is_safe_rxnorm_surface(name):
                whole_drug_name.add(name)
print(f"Extracted {len(whole_drug_name)} safe RxNorm drug names.")

Extracted 151295 safe RxNorm drug names.


In [17]:
list(whole_drug_name)[0:10]

['Bivatuzumab',
 'Norlyda',
 'Vi-Daylin/F Plus Iron',
 'Posterior pituitary hormone-containing product',
 'sweetleaf preparation',
 'Sodium pump',
 'Hb 5(A2), Pro-arg',
 'Ponstel',
 'Amphotericin B-containing product in parenteral dose form',
 'Simethicone-containing product']

In [18]:
from elasticsearch import Elasticsearch

ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

In [19]:
import os
import csv
from tqdm import tqdm
from elasticsearch import Elasticsearch

# 設定
ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

output_dir = "/workspace/99-NAS_data/pubmed/rxnorm/"
base_filename = "drugname_rxnorm_sentences" 

fieldnames = ["TERM", "PMID", "SENTID", "SENTENCE"]
too_many_hits = []


file_handles = {}
csv_writers = {}

filename = f"{base_filename}.csv"
filepath = os.path.join(output_dir, filename)
f = open(filepath, mode='w', newline='', encoding='utf-8')
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()

whole_drug_name = list(whole_drug_name)

for i in tqdm(range(len(whole_drug_name))):
    query_word = whole_drug_name[i]

    search_body = { 
        "query": { "multi_match": { "query": query_word, "fields": ["SENTENCE"], "type": "phrase" } }, 
        "sort": [ {"PMID": {"order": "asc"}} ], 
        "_source": ["PMID", "SENTID", "SENTENCE"],
    }

    res = es.search(index="pubmed_sentence", body=search_body)
    total_hits = res['hits']['total']['value'] 

    if total_hits >= 1000:
        too_many_hits.append(whole_drug_name[i])
        continue
    else:
        row_list = []
        for hit in res['hits']['hits']:
            source = hit['_source']
            row = {
                "TERM": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            }
            row_list.append(row)

        writer.writerows(row_list)

  0%|          | 0/151295 [00:00<?, ?it/s]

100%|██████████| 151295/151295 [07:22<00:00, 341.89it/s]


In [11]:
print(es.indices.get_alias(index="*").keys())

dict_keys(['pubmed_sentence'])


In [20]:


for item in tqdm(too_many_hits):
    query_word = item
    search_body = {
        "query": {
            "multi_match": {
                "query": query_word,
                "fields": ["SENTENCE"],
                "type": "phrase"
            }
        },
        "_source": ["PMID", "SENTID", "SENTENCE"],
        "sort": [{"_doc": "asc"}]
    }
    page = es.search(
        index="pubmed_sentence",
        body=search_body,
        scroll="5m",
        size=1000
    )

    sid = page['_scroll_id']
    hits = page['hits']['hits']
    
    try:
        while len(hits) > 0:
            row_list = []
            for hit in hits:
                source = hit['_source']
                row_list.append({
                    "TERM": query_word,
                    "PMID": source['PMID'],
                    "SENTID": source['SENTID'],
                    "SENTENCE": source['SENTENCE']
                })
            
            writer.writerows(row_list)
            
            page = es.scroll(scroll_id=sid, scroll="5m")
            sid = page['_scroll_id']
            hits = page['hits']['hits']
    
    finally:
        es.clear_scroll(scroll_id=sid)

  page = es.search(
100%|██████████| 9368/9368 [2:24:52<00:00,  1.08it/s]   


In [1]:
# 毒性名 - 対象とするファイル
hlgt = "/workspace/99-NAS_data/pubmed/pubmed_meddra_sentence/meddra_sentences_hlgt.csv"
hlt = "/workspace/99-NAS_data/pubmed/pubmed_meddra_sentence/meddra_sentences_hlt.csv"
llt = "/workspace/99-NAS_data/pubmed/pubmed_meddra_sentence/meddra_sentences_llt.csv"
pt = "/workspace/99-NAS_data/pubmed/pubmed_meddra_sentence/meddra_sentences_pt.csv"
soc = "/workspace/99-NAS_data/pubmed/pubmed_meddra_sentence/meddra_sentences_soc.csv"

In [2]:
import pandas
import sqlite3
import pandas as pd
from tqdm import tqdm
import os

In [3]:
chunksize = 1000000
for i, chunk in enumerate(pd.read_csv(
    "/workspace/99-NAS_data/pubmed/rxnorm/drugname_rxnorm_sentences.csv",
    chunksize=chunksize
)):
    df = chunk
    if i == 0:
        break

In [4]:
df

Unnamed: 0,TERM,PMID,SENTID,SENTENCE
0,Bivatuzumab,14506195,52666635,"Aiming for a less immunogenic anti-CD44v6 MAb,..."
1,Bivatuzumab,14530488,52818394,"From December 1999 until July 2001, a phase I ..."
2,Bivatuzumab,14530488,52818395,The aim of the trial was to assess the safety ...
3,Bivatuzumab,14530488,52818410,"Given the acceptable tumor doses, (186)Re-labe..."
4,Bivatuzumab,14627130,53309660,"Therefore, humanized monoclonal antibody BIWA ..."
...,...,...,...,...
999995,alanine,19799019,82925906,"To evaluate the hepatic injury, the serum alan..."
999996,alanine,19799266,82927642,"Hemoglobin concentration, erythrocytes, leucoc..."
999997,alanine,19799980,82932926,"Moreover, concentrations of aspartate aminotra..."
999998,alanine,19800000,82933129,"Furthermore, alanine substitution of the argin..."


In [7]:
conn = sqlite3.connect("merge_work.db")

rxnorm = "/workspace/99-NAS_data/pubmed/rxnorm/drugname_rxnorm_sentences.csv"
for chunk in tqdm(pd.read_csv(rxnorm, chunksize=100000)):
    chunk.to_sql("drug_table", conn, if_exists="append", index=False)

for chunk in tqdm(pd.read_csv(hlgt, chunksize=100000)):
    chunk.to_sql("hlgt_table", conn, if_exists="append", index=False)

conn.execute("CREATE INDEX idx_drug ON drug_table(PMID, SENTID)")
conn.execute("CREATE INDEX idx_hlgt ON hlgt_table(PMID, SENTID)")

query = """
SELECT
    d.PMID,
    d.SENTID,
    d.SENTENCE,
    d.TERM,
    m.term,
    m.level
FROM
    drug_table d
INNER JOIN
    hlgt_table m
ON
    d.PMID = m.PMID AND d.SENTID = m.SENTID
"""

for chunk in tqdm(pd.read_sql_query(query, conn, chunksize=100000)):
    chunk.to_csv("co_occurrence_sentences_rxnorm_hlgt.csv", mode='a', index=False, header=not os.path.exists("co_occurrence_sentences_rxnorm_hlgt.csv"))

conn.close()

2837it [31:29,  1.50it/s]
22it [00:13,  1.68it/s]
20it [03:50, 11.53s/it]


In [8]:
conn = sqlite3.connect("merge_work.db")

for chunk in tqdm(pd.read_csv(hlt, chunksize=100000)):
    chunk.to_sql("hlt_table", conn, if_exists="append", index=False)

conn.execute("CREATE INDEX idx_hlt ON hlt_table(PMID, SENTID)")

query = """
SELECT
    d.PMID,
    d.SENTID,
    d.SENTENCE,
    d.TERM,
    m.term,
    m.level
FROM
    drug_table d
INNER JOIN
    hlt_table m
ON
    d.PMID = m.PMID AND d.SENTID = m.SENTID
"""

for chunk in tqdm(pd.read_sql_query(query, conn, chunksize=100000)):
    chunk.to_csv("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_hlt.csv", mode='a', index=False, header=not os.path.exists("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_hlt.csv"))

conn.close()
print("完了しました。")

99it [01:08,  1.44it/s]
159it [07:51,  2.96s/it]

完了しました。





In [9]:
conn = sqlite3.connect("merge_work.db")
for chunk in tqdm(pd.read_csv(llt, chunksize=100000)):
    chunk.to_sql("llt_table", conn, if_exists="append", index=False)

conn.execute("CREATE INDEX idx_llt ON llt_table(PMID, SENTID)")

query = """
SELECT
    d.PMID,
    d.SENTID,
    d.SENTENCE,
    d.TERM,
    m.term,
    m.level
FROM
    drug_table d
INNER JOIN
    llt_table m
ON
    d.PMID = m.PMID AND d.SENTID = m.SENTID
"""

for chunk in tqdm(pd.read_sql_query(query, conn, chunksize=100000)):
    chunk.to_csv("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_llt.csv", mode='a', index=False, header=not os.path.exists("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_llt.csv"))

conn.close()
print("完了しました。")

3418it [40:07,  1.42it/s]
5745it [6:15:32,  3.92s/it]

完了しました。





In [10]:
conn = sqlite3.connect("merge_work.db")

for chunk in tqdm(pd.read_csv(pt, chunksize=100000)):
    chunk.to_sql("pt_table", conn, if_exists="append", index=False)

conn.execute("CREATE INDEX idx_pt ON pt_table(PMID, SENTID)")

query = """
SELECT
    d.PMID,
    d.SENTID,
    d.SENTENCE,
    d.TERM,
    m.term,
    m.level
FROM
    drug_table d
INNER JOIN
    pt_table m
ON
    d.PMID = m.PMID AND d.SENTID = m.SENTID
"""

for chunk in tqdm(pd.read_sql_query(query, conn, chunksize=100000)):
    chunk.to_csv("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_pt.csv", mode='a', index=False, header=not os.path.exists("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_pt.csv"))

conn.close()
print("完了しました。")

1368it [16:18,  1.40it/s]
2094it [1:20:42,  2.31s/it]

完了しました。





In [11]:

conn = sqlite3.connect("merge_work.db")
for chunk in tqdm(pd.read_csv(soc, chunksize=100000)):
    chunk.to_sql("soc_table", conn, if_exists="append", index=False)

conn.execute("CREATE INDEX idx_soc ON soc_table(PMID, SENTID)")

query = """
SELECT
    d.PMID,
    d.SENTID,
    d.SENTENCE,
    d.TERM,
    m.term,
    m.level
FROM
    drug_table d
INNER JOIN
    soc_table m
ON
    d.PMID = m.PMID AND d.SENTID = m.SENTID
"""

for chunk in tqdm(pd.read_sql_query(query, conn, chunksize=100000)):
    chunk.to_csv("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_soc.csv", mode='a', index=False, header=not os.path.exists("/workspace/99-NAS_data/pubmed/merged_database/co_occurrence_sentences_rxnorm_soc.csv"))

conn.close()
print("完了しました。")

55it [00:41,  1.34it/s]
87it [05:44,  3.96s/it]

完了しました。



