In [None]:
# Load the tab-separated PubChem synonym table and preview the first rows.
pubchem_synonym_tsv = "/workspace/99-NAS_data/pubchem/synonym/CID-Synonym-cleaned-common-cid-no-four-digits.tsv"
df_pubchem = pd.read_csv(pubchem_synonym_tsv, sep="\t")
df_pubchem.head()

In [None]:
# Collapse the "Synonym" column to its distinct values; the list is what the
# search loops iterate over, and its length is printed as a sanity check.
unique_synonyms = set(df_pubchem["Synonym"].to_list())
synonym = list(unique_synonyms)
print(len(synonym))

In [None]:
# Build a synonym -> CID lookup from the PubChem table.
# The previous loop sized itself with `len(df)`, a frame that is not the one
# being indexed (`df_pubchem`); iterating the two columns directly removes that
# mismatch and avoids a per-row `.iloc` lookup.
# When a synonym appears on several rows, the last row wins (same as assigning
# into the dict row by row).
synonym_cid = dict(zip(df_pubchem["Synonym"], df_pubchem["CID"]))

In [None]:
import csv
import getpass
import os

from tqdm import tqdm

# Settings
ES_HOST = "http://elasticsearch:9200"
# The password is read from the ES_PASSWORD environment variable (with an
# interactive prompt as fallback) so the credential is not stored in the notebook.
ES_PASSWORD = os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password for user 'elastic': ")
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

# Output directory (results are written here)
output_dir = "/workspace/99-NAS_data/pubmed/pubmed_pubchem_sentence/"
base_filename = "pubchem_sentences" 

fieldnames = ["TERM", "PMID", "SENTID", "SENTENCE"]
# Terms whose hit count reaches the limit below; they are collected here
# instead of being written by this cell.
too_many_hits = []

# Terms with fewer hits than this are written from a single search response.
# The same value is passed as `size`, because without it Elasticsearch returns
# only its default page of 10 hits and the remaining matches would be dropped.
DIRECT_FETCH_LIMIT = 1000

# NOTE(review): these two dicts are not used anywhere in the visible cells.
file_handles = {}
csv_writers = {}

filename = f"{base_filename}.csv"
filepath = os.path.join(output_dir, filename)
# Make sure the destination exists before opening the file for writing.
os.makedirs(output_dir, exist_ok=True)
# The handle is deliberately left open: the scroll cell that follows keeps
# appending rows through the same `writer`.
f = open(filepath, mode='w', newline='', encoding='utf-8')
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()

for query_word in tqdm(synonym):
    # Exact-phrase match of the term against the SENTENCE field.
    search_body = { 
        "query": { "multi_match": { "query": query_word, "fields": ["SENTENCE"], "type": "phrase" } }, 
        "sort": [ {"PMID": {"order": "asc"}} ], 
        "_source": ["PMID", "SENTID", "SENTENCE"],
    }

    res = es.search(index="pubmed_sentence_v2", body=search_body, size=DIRECT_FETCH_LIMIT)
    total_hits = res['hits']['total']['value'] 

    if total_hits >= DIRECT_FETCH_LIMIT:
        # Too many matches for one response; defer this term.
        too_many_hits.append(query_word)
        continue

    writer.writerows([
        {
            "TERM": query_word,
            "PMID": hit['_source']['PMID'],
            "SENTID": hit['_source']['SENTID'],
            "SENTENCE": hit['_source']['SENTENCE'],
        }
        for hit in res['hits']['hits']
    ])

# Push buffered rows to disk so the file is complete up to this point even
# though the handle stays open.
f.flush()

In [None]:


# Second pass: every term in `too_many_hits` is paged through with the scroll
# API (1000 hits per page) and its rows are appended through the same `writer`.
for query_word in tqdm(too_many_hits):
    search_body = {
        "query": {
            "multi_match": {
                "query": query_word,
                "fields": ["SENTENCE"],
                "type": "phrase"
            }
        },
        "_source": ["PMID", "SENTID", "SENTENCE"],
        # Index order; no relevance or field ordering is requested here.
        "sort": [{"_doc": "asc"}]
    }
    page = es.search(
        index="pubmed_sentence_v2",
        body=search_body,
        scroll="5m",
        size=1000
    )

    sid = page['_scroll_id']
    hits = page['hits']['hits']

    try:
        # An empty page marks the end of the scroll.
        while hits:
            writer.writerows([
                {
                    "TERM": query_word,
                    "PMID": hit['_source']['PMID'],
                    "SENTID": hit['_source']['SENTID'],
                    "SENTENCE": hit['_source']['SENTENCE'],
                }
                for hit in hits
            ])

            page = es.scroll(scroll_id=sid, scroll="5m")
            sid = page['_scroll_id']
            hits = page['hits']['hits']

    finally:
        # Flush first so rows already fetched reach disk even if a later page
        # (or the scroll cleanup) fails; the handle `f` is otherwise never
        # flushed in this cell.
        # NOTE(review): `f` is left open in case later cells still write to it;
        # call f.close() once the export is finished.
        f.flush()
        # Release the server-side scroll context for this term.
        es.clear_scroll(scroll_id=sid)