In [12]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm

# Path configuration: every path below is resolved relative to the current
# working directory, so the notebook must be started from the project root.
PROGRAM_PATH = os.path.abspath(os.getcwd())
ASSETS_PATH = os.path.join(PROGRAM_PATH, "assets")

# NOTE(review): the file name "dataflattened_terminology.csv" looks like it may be
# missing a separator (e.g. "data/..." or "data_...") — confirm against the assets folder.
TERMINOLOGY_PATH = os.path.join(ASSETS_PATH, "dataflattened_terminology.csv") 
# NOTE(review): FAISS_INDEX_PATH is not referenced by the cells visible here — confirm it is still needed.
FAISS_INDEX_PATH = os.path.join(ASSETS_PATH, "faiss_index_snomed")

# Load the sentence-embedding model used to encode the concept names.
model_name = "sentence-transformers/all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

In [13]:
# Load the terminology table from the configured CSV file.
df_snomed_ct = pd.read_csv(TERMINOLOGY_PATH)

# Whole-table ID / name lists; names are coerced to str before being listed.
concept_ids = df_snomed_ct["concept_id"].to_list()
concept_names = df_snomed_ct["concept_name"].astype(str).to_list()

# Hierarchy labels for which a separate index is built. Each entry must match a
# value of the "hierarchy" column exactly: a misspelled label selects zero rows.
concept_type_subset = [
    "procedure",                # top-level category
    "body structure",           # top-level category
    "finding",                  # top-level category
    "disorder",                 # child of finding
    "morphologic abnormality",  # child of body structure
    "regime/therapy",           # child of procedure
    "cell structure",           # child of body structure
]

In [14]:
# Row count per hierarchy label; use it to check the spelling of the labels selected for indexing.
df_snomed_ct["hierarchy"].value_counts()

hierarchy
disorder                   88804
procedure                  56329
finding                    36362
body structure             36251
morphologic abnormality     5143
regime/therapy              3165
cell structure               519
Name: count, dtype: int64

In [18]:
# Build one FAISS index plus a row-aligned ID mapping file per hierarchy label.
for concept_type in concept_type_subset:
    # Restrict the terminology table to the rows of this hierarchy.
    df_concept_type = df_snomed_ct[df_snomed_ct["hierarchy"] == concept_type]

    # A label that matches no rows (e.g. a misspelled hierarchy name) would make
    # the embedding array empty and crash the dimension lookup below, aborting the
    # remaining labels. Report it and move on instead.
    if df_concept_type.empty:
        print(f"Skipping '{concept_type}': no rows with this hierarchy label.")
        continue

    # NOTE: these rebind the notebook-level `concept_names` / `concept_ids`.
    concept_names = df_concept_type["concept_name"].astype(str).tolist()
    concept_ids = df_concept_type["concept_id"].tolist()

    print(f"Embedding {len(concept_names)} SNOMED terms for '{concept_type}'...")
    embeddings = model.encode(concept_names, show_progress_bar=True, batch_size=64)

    # Create the FAISS index; the vector dimension is the second axis of the
    # (n_terms, dim) embedding matrix.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Row i of the mapping file must describe vector i of the index.
    assert index.ntotal == len(concept_ids), "FAISS index and ID mapping are out of sync"

    # Output paths: make the label file-name safe ("/" and " " become "_").
    safe_concept_type = concept_type.strip().lower().replace("/", "_").replace(" ", "_")
    id_map_path = os.path.join(ASSETS_PATH, f"snomed_id_mapping_{safe_concept_type}.tsv")
    faiss_index_path = os.path.join(ASSETS_PATH, f"faiss_index_{safe_concept_type}.index")

    # Save the ID mapping, in the same row order as the vectors added to the index.
    pd.DataFrame({
        "concept_id": concept_ids,
        "concept_name": concept_names
    }).to_csv(id_map_path, sep="\t", index=False)

    # Save the FAISS index.
    faiss.write_index(index, faiss_index_path)
    print(f"Saved: {safe_concept_type}")

Embedding 56329 SNOMED terms for 'procedure'...


Batches: 100%|██████████| 881/881 [06:25<00:00,  2.29it/s]


Saved: procedure
Embedding 36251 SNOMED terms for 'body structure'...


Batches: 100%|██████████| 567/567 [03:44<00:00,  2.52it/s]


Saved: body structure
Embedding 36362 SNOMED terms for 'finding'...


Batches: 100%|██████████| 569/569 [03:27<00:00,  2.74it/s]


Saved: finding
Embedding 88804 SNOMED terms for 'disorder'...


Batches: 100%|██████████| 1388/1388 [09:46<00:00,  2.37it/s]


Saved: disorder
Embedding 5143 SNOMED terms for 'morphologic abnormality'...


Batches: 100%|██████████| 81/81 [00:38<00:00,  2.09it/s]


Saved: morphologic abnormality
Embedding 3165 SNOMED terms for 'regime/therapy'...


Batches: 100%|██████████| 50/50 [00:18<00:00,  2.71it/s]


Saved: regime_therapy
Embedding 519 SNOMED terms for 'cell structure'...


Batches: 100%|██████████| 9/9 [00:03<00:00,  2.77it/s]

Saved: cell structure



