 * @ Author: Yohei Ohto
 * @ Create Time: 2025-11-28 20:27:05
 * @ Modified time: 2025-11-28 20:28:31
 * @ Description: MedDRA term search

1. SOC  (System Organ Class)       : 器官別大分類 (最上位)  
    │  
    └─ 2. HLGT (High Level Group Term) : 高位グループ語  
         │  
         └─ 3. HLT  (High Level Term)       : 高位語  
              │  
              └─ 4. PT   (Preferred Term)        : 基本語 ★分析・集計の標準単位  
                   │  
                   └─ 5. LLT  (Lowest Level Term)     : 下層語 (入力用語・シノニム)  

In [1]:
import pandas as pd
from tqdm import tqdm
from elasticsearch import Elasticsearch
import csv

#  化合物名をすでに含んでいる文でのみ検索する

# pubmed × drugbank × meddra

## drugbank data

In [2]:
# Load the CSV of PubMed sentences tagged with DrugBank drug names.
drugbank = pd.read_csv(
    filepath_or_buffer="/workspace/99-NAS_data/pubmed/pubmed_drugbank_sentence/drugbank_sentences_v2.csv",
)

In [3]:
# Preview the first five rows to check the column layout.
drugbank.head(n=5)

Unnamed: 0,drug_name,PMID,SENTID,SENTENCE
0,Lepirudin,10073268,36206825,The central role of thrombin generation in thi...
1,Lepirudin,10209835,36692541,HIT type II was presumed and recanalization wa...
2,Lepirudin,10209835,36692547,This case report illustrates the complicated d...
3,Lepirudin,10218429,36748792,The patient was treated with lepirudin at body...
4,Lepirudin,10229643,36825405,"Lepirudin has a short half-life, and only 50-6..."


In [4]:
# Collect the distinct sentence IDs present in the DrugBank sentence table.
# (The variable name is misspelled in the original; it is kept unchanged so
# that later cells that reference it keep working.)
sentid_column = drugbank['SENTID']
tatget_sentid = list(set(sentid_column.tolist()))

# Sanity check: total rows, unique SENTIDs according to pandas, and the size
# of the de-duplicated list (the last two are expected to agree).
print(
    len(drugbank),
    len(sentid_column.unique()),
    len(tatget_sentid),
)

40256637 30324930 30324930


In [5]:
# Sorted copy of the distinct sentence IDs (the unsorted list is left as is).
tatget_sent_id = list(tatget_sentid)
tatget_sent_id.sort()

## meddra data

In [6]:
# Load one tab-separated table per MedDRA hierarchy level.
# pd.read_table uses a tab as its default separator.
soc = pd.read_table("/workspace/99-NAS_data/meddra/240828_processed_yoshikawa/soc.tsv")
hlgt = pd.read_table("/workspace/99-NAS_data/meddra/240828_processed_yoshikawa/hlgt.tsv")
hlt = pd.read_table("/workspace/99-NAS_data/meddra/240828_processed_yoshikawa/hlt.tsv")
pt = pd.read_table("/workspace/99-NAS_data/meddra/240828_processed_yoshikawa/pt.tsv")
llt = pd.read_table("/workspace/99-NAS_data/meddra/240828_processed_yoshikawa/llt.tsv")

In [7]:
# Report the number of rows per MedDRA level (SOC, HLGT, HLT, PT, LLT),
# followed by the grand total.
level_sizes = [len(table) for table in (soc, hlgt, hlt, pt, llt)]
print(*level_sizes)
print(sum(level_sizes))

27 337 1738 26409 88345
116856


In [8]:
# Flatten the five MedDRA level tables into one list of search entries.
# Each entry is {"level": <level name>, "term": <value of the 'name' column>},
# appended in hierarchy order: soc, hlgt, hlt, pt, llt.
#
# The 'name' column is iterated directly instead of using
# `range(len(df))` + `df.loc[n, 'name']`: this avoids a label lookup per row
# and does not depend on the frame having a default 0..n-1 index.
data = []
for level_name, level_table in (
    ("soc", soc),
    ("hlgt", hlgt),
    ("hlt", hlt),
    ("pt", pt),
    ("llt", llt),
):
    data.extend(
        {"level": level_name, "term": term}
        for term in level_table["name"]
    )

In [9]:
import os
import csv
from tqdm import tqdm

# First pass: run a phrase search for every MedDRA term and write the matching
# sentences to one CSV per level. Terms with too many hits are not written
# here; they are collected in `too_many_hits` for a separate pass.
#
# NOTE(review): the query below is not restricted to the DrugBank sentence IDs
# collected earlier in the notebook; it searches the whole index. Confirm
# whether that restriction is still intended.

# Settings
ES_HOST = "http://elasticsearch:9200"
# NOTE(review): the password can be supplied through the ES_PASSWORD
# environment variable. The literal fallback only keeps the notebook runnable
# as before; it should be removed from source control and rotated.
ES_PASSWORD = os.environ.get("ES_PASSWORD", "micgm1Gemini")
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

# Output directory (the per-level CSV files are written here)
output_dir = "/workspace/99-NAS_data/pubmed/pubmed_meddra_sentence/"
base_filename = "meddra_sentences"
os.makedirs(output_dir, exist_ok=True)

fieldnames = ["level", "term", "PMID", "SENTID", "SENTENCE"]
too_many_hits = []

# Terms with this many hits or more are deferred to `too_many_hits`.
# The same value is used as the page size so that every term below the
# threshold has all of its hits returned by the single request (the previous
# hard-coded size of 3 silently truncated terms with 4-999 hits).
MAX_DIRECT_HITS = 1000

file_handles = {}
csv_writers = {}

try:
    for entry in tqdm(data):
        query_word = entry["term"]
        current_level = entry["level"]

        # --- File management ---
        # Lazily open the CSV for this level the first time the level is seen
        # (e.g. meddra_sentences_pt.csv) and write its header row.
        if current_level not in file_handles:
            filename = f"{base_filename}_{current_level}.csv"
            filepath = os.path.join(output_dir, filename)

            f = open(filepath, mode='w', newline='', encoding='utf-8')
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

            file_handles[current_level] = f
            csv_writers[current_level] = writer

        writer = csv_writers[current_level]
        # -----------------------

        # Phrase search on the SENTENCE field, ordered by PMID.
        search_body = {
            "query": {"multi_match": {"query": query_word, "fields": ["SENTENCE"], "type": "phrase"}},
            "sort": [{"PMID": {"order": "asc"}}],
            "_source": ["PMID", "SENTID", "SENTENCE"],
            "size": MAX_DIRECT_HITS,
        }

        res = es.search(index="pubmed_sentence_v2", body=search_body)
        total_hits = res['hits']['total']['value']

        # Too many hits for a single request: defer this term.
        if total_hits >= MAX_DIRECT_HITS:
            too_many_hits.append(entry)
            continue

        # Write every returned sentence to the CSV of the term's level.
        writer.writerows(
            {
                "level": current_level,
                "term": query_word,
                "PMID": hit['_source']['PMID'],
                "SENTID": hit['_source']['SENTID'],
                "SENTENCE": hit['_source']['SENTENCE'],
            }
            for hit in res['hits']['hits']
        )

finally:
    # Always close every opened file, whether or not an error occurred.
    print("Closing files...")
    for level, f in file_handles.items():
        f.close()
        print(f"Closed: {level}")

print("All done.")

100%|██████████| 116856/116856 [13:28<00:00, 144.57it/s]

Closing files...
Closed: soc
Closed: hlgt
Closed: hlt
Closed: pt
Closed: llt
All done.





In [10]:
# `too_many_hits` holds the MedDRA term entries that were deferred by the
# first search pass, so the message reports terms (not drugs).
print(f"Terms with too many hits (>=1000): {len(too_many_hits)}")

Drugs with too many hits (>=1000): 12983


In [11]:
import os
import csv
from tqdm import tqdm

# Second pass: for every term deferred in `too_many_hits`, page through all
# matching sentences with the scroll API and append them to the per-level CSV
# files produced by the first pass.

# Settings (path and file name must match the ones used by the first pass)
output_dir = "/workspace/99-NAS_data/pubmed/pubmed_meddra_sentence/"
base_filename = "meddra_sentences"
fieldnames = ["level", "term", "PMID", "SENTID", "SENTENCE"]

# Number of hits fetched per scroll page.
SCROLL_PAGE_SIZE = 1000

# Open file handles and their CSV writers, keyed by level
file_handles = {}
csv_writers = {}

try:
    for item in tqdm(too_many_hits):
        query_word = item["term"]
        current_level = item["level"]

        # --- File management (append mode) ---
        if current_level not in file_handles:
            filename = f"{base_filename}_{current_level}.csv"
            filepath = os.path.join(output_dir, filename)

            # Remember whether the file already exists: the header is only
            # written when this pass is the one creating the file.
            file_exists = os.path.exists(filepath)

            # Important: open with mode='a' so first-pass rows are preserved.
            f = open(filepath, mode='a', newline='', encoding='utf-8')
            writer = csv.DictWriter(f, fieldnames=fieldnames)

            if not file_exists:
                writer.writeheader()

            file_handles[current_level] = f
            csv_writers[current_level] = writer

        writer = csv_writers[current_level]
        # -------------------------------------

        search_body = {
            "query": {
                "multi_match": {
                    "query": query_word,
                    "fields": ["SENTENCE"],
                    "type": "phrase"
                }
            },
            "_source": ["PMID", "SENTID", "SENTENCE"],
            # _doc order is the cheapest sort when scrolling
            "sort": [{"_doc": "asc"}],
            # The page size lives in the body: passing `size=` as a separate
            # argument next to `body=` makes newer elasticsearch-py clients
            # emit a DeprecationWarning.
            "size": SCROLL_PAGE_SIZE,
        }

        # Start the scroll; the context is kept alive for 5 minutes per page.
        page = es.search(
            index="pubmed_sentence_v2",
            body=search_body,
            scroll="5m",
        )

        sid = page['_scroll_id']
        hits = page['hits']['hits']

        try:
            # An empty page means the scroll is exhausted.
            while hits:
                writer.writerows(
                    {
                        "level": current_level,
                        "term": query_word,
                        "PMID": hit['_source']['PMID'],
                        "SENTID": hit['_source']['SENTID'],
                        "SENTENCE": hit['_source']['SENTENCE'],
                    }
                    for hit in hits
                )

                # Fetch the next page
                page = es.scroll(scroll_id=sid, scroll="5m")
                sid = page['_scroll_id']
                hits = page['hits']['hits']

        finally:
            # Release the scroll context as soon as this term is finished
            # (or has failed) instead of waiting for it to time out.
            es.clear_scroll(scroll_id=sid)

finally:
    # Always close every opened file at the end.
    print("Closing files...")
    for level, f in file_handles.items():
        f.close()
        print(f"Closed: {level}")

print("All done.")

  page = es.search(
100%|██████████| 12983/12983 [4:01:48<00:00,  1.12s/it]    

Closing files...
Closed: soc
Closed: hlgt
Closed: hlt
Closed: pt
Closed: llt
All done.



