 * @ Author: Yohei Ohto
 * @ Create Time: 2025-11-28 14:13:45
 * @ Modified time: 2025-11-28 14:14:46
 * @ Description: search cell name in pubmed

In [2]:
import pandas as pd
from tqdm import tqdm
from elasticsearch import Elasticsearch
import time

In [14]:
co_path = "/workspace/0-utils/1-data/cell_ontology/cells_is_a_single_path.csv"
df_co = pd.read_csv(co_path)

In [15]:
df_co.head()

Unnamed: 0,id,name,synonyms,synonyms_no_abbr,parent,children,layer
0,CL:0000000,cell,,,,CL:0000039;CL:0000064;CL:0000066;CL:0000080;CL...,0
1,CL:0000725,nitrogen fixing cell,,,CL:0000000,CL:0000724,1
2,CL:0002559,hair follicle cell,,,CL:0000000,CL:0002483;CL:0002560;CL:0002561;CL:0002562;CL...,1
3,CL:0000183,contractile cell,,,CL:0000000,CL:0000187;CL:0000185,1
4,CL:0000188,cell of skeletal muscle,skeletal muscle cell,skeletal muscle cell,CL:0000000,CL:0008017;CL:0000594;CL:0000355;CL:0011027;CL...,1


In [16]:
print(len(df_co), len(df_co["synonyms"].unique()), len(df_co["synonyms_no_abbr"].unique()))

2747 1057 1030


In [17]:
cell_name = df_co["synonyms_no_abbr"].dropna().tolist() + df_co["synonyms"].dropna().tolist() + df_co["name"].dropna().tolist()
cell_name = list(set(cell_name))
print("Total unique cell names to search:", len(cell_name))

Total unique cell names to search: 3850


In [56]:
ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

query_result = []
too_many_hits = []
for i in tqdm(range(len(cell_name))):
    query_word = cell_name[i]
    search_body = { "query": { "multi_match": { "query": query_word, "fields": ["SENTENCE"], "type": "phrase" } }, "sort": [ {"PMID": {"order": "asc"}} ], "_source": ["PMID", "SENTID", "SENTENCE"]}
    res = es.search(index="pubmed_sentence", body=search_body)
    total_hits = res['hits']['total']['value'] 

    if total_hits >= 1000:
        too_many_hits.append((query_word, total_hits))
        continue
    else:
        for hit in res['hits']['hits']:
            source = hit['_source']
            query_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })

100%|██████████| 3850/3850 [00:21<00:00, 175.53it/s]


In [57]:
print(f"Drugs with too many hits (>=1000): {len(too_many_hits)}")

Drugs with too many hits (>=1000): 210


In [58]:
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

too_many_hits_result = []
for query_word, _ in tqdm(too_many_hits):
    if query_word == "cell":
        continue
    search_body = {
        "query": {
            "multi_match": {
                "query": query_word,
                "fields": ["SENTENCE"],
                "type": "phrase"
            }
        },
        "_source": ["PMID", "SENTID", "SENTENCE"],
        "sort": [{"PMID": {"order": "asc"}}]
    }

    page = es.search(
        index="pubmed_sentence",
        body=search_body,
        scroll="2m",
        size=1000
    )

    sid = page['_scroll_id']
    hits = page['hits']['hits']

    while len(hits):
        for hit in hits:
            source = hit['_source']
            too_many_hits_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })
        page = es.scroll(scroll_id=sid, scroll="2m")
        sid = page['_scroll_id']
        hits = page['hits']['hits']

  page = es.search(
100%|██████████| 210/210 [03:21<00:00,  1.04it/s]


In [43]:
print(len(query_result), "results from initial queries.")
print(len(too_many_hits_result), "results from too-many-hits queries.")

3117 results from initial queries.
5151453 results from too-many-hits queries.


In [44]:
df = pd.DataFrame(query_result + too_many_hits_result)
df.to_csv("/workspace/0-utils/1-data/pubmed/pubmed_cell_ontology_sentence/pubmed_cell_ontology_sentences.csv", index=False)

In [45]:
print(len(df["SENTID"].unique()))
print(len(df["PMID"].unique()))

4517567
2423164


In [46]:
df_multi_cells = df[df.duplicated(subset=["SENTID"], keep=False)]
df_multi_cells

Unnamed: 0,drug_name,PMID,SENTID,SENTENCE
0,asymmetric bistratified amacrine cell,7533018,21600457,The first cell type was classified as an asymm...
1,magnocellular neuron,1148858,2801665,mum of perikaryal surface of a magnocellular n...
2,magnocellular neuron,1148858,2801666,Seventy-four per cent of axosomatic terminals ...
3,magnocellular neuron,11796498,45897749,Colocalization might explain how V(1) autorece...
7,amniotic epithelial stem cell,36265471,202496766,"Furthermore, the epithelial thickness in the a..."
...,...,...,...,...
5154565,mature B cell,9862360,35624430,"Transcripts from the VH10 and VHS107 families,..."
5154566,mature B cell,9874569,35697312,"However, these Ras-RAG pre-B cells also upregu..."
5154567,mature B cell,9920868,35874232,"Previously, we have reported that these enhanc..."
5154568,mature B cell,9920868,35874240,"Moreover, the PU.1 motif appears to act as a n..."


In [47]:
print(len(df_multi_cells["SENTID"].unique()))

546224


In [48]:
import collections
c = collections.Counter(df_multi_cells["SENTID"])
c.most_common(10)

[(200168873, 13),
 (184880183, 11),
 (156037899, 10),
 (193530597, 10),
 (198971376, 10),
 (175708137, 10),
 (36517202, 9),
 (149840779, 9),
 (199225924, 9),
 (214418448, 9)]

In [None]:
def nC2(n):
    return n * (n - 1) // 2

k = 0
for count in c.values():
    k += nC2(count)
print(k, "total pairs of sentences associated with multiple cell types.")

749557 total pairs of sentences associated with multiple cell types.


# 複数形の定義がないから、非常に少なくなっている

In [3]:
NEW_MAPPING_BODY = {
  "settings": {
    "analysis": {
      "analyzer": {
        "my_english_stemmed": { 
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "porter_stem"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "PMID": { "type": "keyword" },
      "SENTID": { "type": "keyword" },
      "SENTENCE": { 
          "type": "text", 
          "analyzer": "my_english_stemmed"
      }
    }
  }
}

ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

es.indices.create(index="pubmed_sentence_v2", body=NEW_MAPPING_BODY)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'pubmed_sentence_v2'})

In [4]:
import time
print("DATE: ", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

reindex_body = {
  "source": {
    "index": "pubmed_sentence"
  },
  "dest": {
    "index": "pubmed_sentence_v2"
  }
}

task = es.reindex(body=reindex_body, wait_for_completion=False)
print(f"Reindex started. Task ID: {task['task']}")

DATE:  2025-12-06 19:12:34
Reindex started. Task ID: tUTvEWmUTWOKti67aqXkGQ:9172


In [7]:
task_id = "tUTvEWmUTWOKti67aqXkGQ:9172"
ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))
print("Checking task status")
print("タスクID:", task_id)
print("DATE: ", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))

try:
    task = es.tasks.get(task_id=task_id)
    status = task['task']['status']
    
    total = status['total']
    created = status['created']
    updated = status['updated']
    processed = created + updated
    
    progress = (processed / total) * 100
    
    print(f"進捗: {progress:.2f}% ({processed} / {total})")
    print(f"詳細: {status}")

except Exception as e:
    print("タスクが見つかりません。すでに完了しているか、IDが間違っています。")
    print(f"エラー内容: {e}")

Checking task status
タスクID: tUTvEWmUTWOKti67aqXkGQ:9172
DATE:  2025-12-06 19:15:10
進捗: 100.00% (4559157 / 4559157)
詳細: {'total': 4559157, 'updated': 0, 'created': 4559157, 'deleted': 0, 'batches': 4560, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0}


In [18]:
ES_HOST = "http://elasticsearch:9200"
ES_PASSWORD = "micgm1Gemini"
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

query_result = []
too_many_hits = []
for i in tqdm(range(len(cell_name))):
    query_word = cell_name[i]
    search_body = { "query": { "multi_match": { "query": query_word, "fields": ["SENTENCE"], "type": "phrase" } }, "sort": [ {"PMID": {"order": "asc"}} ], "_source": ["PMID", "SENTID", "SENTENCE"]}
    res = es.search(index="pubmed_sentence_v2", body=search_body)
    total_hits = res['hits']['total']['value'] 

    if total_hits >= 1000:
        too_many_hits.append((query_word, total_hits))
        continue
    else:
        for hit in res['hits']['hits']:
            source = hit['_source']
            query_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })

100%|██████████| 3850/3850 [00:31<00:00, 120.45it/s]


In [19]:
print(f"Drugs with too many hits (>=1000): {len(too_many_hits)}")

Drugs with too many hits (>=1000): 426


In [25]:
es = Elasticsearch(ES_HOST, basic_auth=("elastic", ES_PASSWORD))

too_many_hits_result = []
for query_word, _ in tqdm(too_many_hits):
    if query_word == "cell":
        continue
    search_body = {
        "query": {
            "multi_match": {
                "query": query_word,
                "fields": ["SENTENCE"],
                "type": "phrase"
            }
        },
        "_source": ["PMID", "SENTID", "SENTENCE"],
        "sort": [{"PMID": {"order": "asc"}}]
    }

    page = es.search(
        index="pubmed_sentence_v2",
        body=search_body,
        scroll="2m",
        size=1000
    )

    sid = page['_scroll_id']
    hits = page['hits']['hits']

    while len(hits):
        for hit in hits:
            source = hit['_source']
            too_many_hits_result.append({
                "drug_name": query_word,
                "PMID": source['PMID'],
                "SENTID": source['SENTID'],
                "SENTENCE": source['SENTENCE']
            })
        page = es.scroll(scroll_id=sid, scroll="2m")
        sid = page['_scroll_id']
        hits = page['hits']['hits']

  page = es.search(
100%|██████████| 426/426 [15:11<00:00,  2.14s/it]  


In [26]:
print(len(query_result), "results from initial queries.")
print(len(too_many_hits_result), "results from too-many-hits queries.")

10356 results from initial queries.
14227230 results from too-many-hits queries.


In [31]:
df = pd.DataFrame(query_result + too_many_hits_result)
df_multi_cells = df[df.duplicated(subset=["SENTID"], keep=False)]

c = collections.Counter(df_multi_cells["SENTID"])

k = 0
for count in c.values():
    k += nC2(count)
    
print(k, "total pairs of sentences associated with multiple cell types.")

4181928 total pairs of sentences associated with multiple cell types.


In [32]:
print(len(df["SENTID"].unique()))

11058903


In [None]:
df_multi_cells = df_multi_cells.sort_values(by=["SENTID"])
df_multi_cells.head()

Unnamed: 0,drug_name,PMID,SENTID,SENTENCE
14219088,blood cell,76,470,"The effect of 2,3-diphosphoglycerate is intere..."
1644573,red blood cell,76,470,"The effect of 2,3-diphosphoglycerate is intere..."
1614695,red blood cell,325,1208,Effects on PFC responses to thymus-dependent (...
14162620,blood cell,325,1208,Effects on PFC responses to thymus-dependent (...
12420399,B cell,325,1212,The delayed transfer of GVH cells to irradiate...


In [33]:
df.to_csv("/workspace/0-utils/1-data/pubmed/pubmed_cell_ontology_sentence/pubmed_cell_ontology_sentences_v2.csv", index=False)