In [1]:
# Mount Google Drive at /content/drive; the input text file and the saved
# index used by the cells below are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# UMLS 전처리 데이터로 RAG 만들기
입력 파일: `/content/drive/MyDrive/DILAB/MARS/UMLS/mapped_disease_drug_with_defs_clean_unstructured.txt`

## 0) 필요한 패키지 설치

In [2]:
!pip -q install faiss-cpu sentence-transformers transformers accelerate pandas tqdm

## 1) 경로 설정

In [4]:
import os, re, json
import numpy as np
import pandas as pd
from tqdm import tqdm

# Source text file; it is read in full and split into blank-line-separated blocks below.
IN_TXT    = "/content/drive/MyDrive/DILAB/MARS/UMLS/mapped_disease_drug_with_defs_clean_unstructured.txt"
# Output directory, created up front if it does not exist yet.
# NOTE(review): the embedding cell further down rebinds SAVE_DIR to ".../faiss_e5_base",
# and no cell visible here writes into this "vector_db_qwen2" directory — confirm it is still needed.
SAVE_DIR  = "/content/drive/MyDrive/DILAB/MARS/UMLS/vector_db_qwen2"
os.makedirs(SAVE_DIR, exist_ok=True)

# NOTE(review): presumably the embedding batch size and a half-precision toggle; neither
# name is referenced by the cells visible here (the embedding cell passes batch_size 64
# directly) — confirm against later cells before relying on them.
BATCH_SIZE  = 128
USE_FP16    = True

## 2) 블록 분리 (줄 바꿈 2번 기준)

In [5]:
# Read the whole corpus into memory in one go.
with open(IN_TXT, "r", encoding="utf-8") as f:
    raw = f.read()

# A run of two or more newlines separates records: trim every piece and
# discard the ones that end up empty.
blocks = list(filter(None, map(str.strip, re.split(r"\n{2,}", raw))))

print("총 블록 수:", len(blocks))
# Show the first 200 characters of the first block flattened onto one line.
print("샘플 미리보기:", " ".join(blocks[0][:200].split("\n")), "...")

총 블록 수: 27682
샘플 미리보기: disease name: Arthritis (disorder) disease description: If you feel pain and stiffness in your body or have trouble moving around, you might have arthritis. Most kinds of arthritis cause pain and swel ...


## 3) 임베딩 모델 로드 및 FAISS 벡터 DB 구축·저장

In [6]:
# 설치
!pip -q install langchain langchain-community faiss-cpu sentence-transformers transformers accelerate

import re, os, json
from tqdm import tqdm
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document  # langchain>=0.2 기준

# 1) 경로
IN_TXT   = "/content/drive/MyDrive/DILAB/MARS/UMLS/mapped_disease_drug_with_defs_clean_unstructured.txt"
SAVE_DIR = "/content/drive/MyDrive/DILAB/MARS/UMLS/faiss_e5_base"  # 저장 위치
os.makedirs(SAVE_DIR, exist_ok=True)

# 2) 텍스트 → 블록(줄바꿈 2번 기준)
with open(IN_TXT, "r", encoding="utf-8") as f:
    raw = f.read()
blocks = [b.strip() for b in re.split(r"\n{2,}", raw) if b.strip()]
print("블록 수:", len(blocks))

# 3) E5 임베딩 (권장: v2)
#    - 영어 전용: "intfloat/e5-base-v2" (또는 large-v2)
#    - 한글 질의 섞이면: "intfloat/multilingual-e5-base"
MODEL_NAME = "intfloat/e5-base-v2"

emb_lc = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs={"device": "cuda"},                # CPU면 "cpu"
    encode_kwargs={"normalize_embeddings": True, "batch_size": 64},
)

# 4) LangChain Document 생성 (★ E5는 문서에 'passage: ' prefix 권장)
docs = [Document(page_content="passage: " + b, metadata={"id": i}) for i, b in enumerate(blocks)]

# 5) 벡터 DB 구축 (FAISS)
#    메모리 아끼려면 from_texts 대신 from_documents + 내부 배치 인코딩 사용 권장
vectorstore = FAISS.from_documents(docs, emb_lc)

# 6) 저장
vectorstore.save_local(SAVE_DIR)
print("✅ 저장 완료:", SAVE_DIR)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m블록 

  emb_lc = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

✅ 저장 완료: /content/drive/MyDrive/DILAB/MARS/UMLS/faiss_e5_base


## 4) 벡터 DB 점검해보기

In [9]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

import torch  # used only to pick the device below

SAVE_DIR = "/content/drive/MyDrive/DILAB/MARS/UMLS/faiss_e5_base"
MODEL_NAME = "intfloat/e5-base-v2"

# Use the GPU when available, otherwise CPU, so the check also runs on a CPU-only runtime.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Must match the model and normalization used when the index was built.
emb = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs={"device": DEVICE},
    encode_kwargs={"normalize_embeddings": True, "batch_size": 64},
)

# SECURITY: load_local deserializes a pickle from SAVE_DIR, which can execute arbitrary
# code. Only keep allow_dangerous_deserialization=True for an index you wrote yourself.
vs = FAISS.load_local(SAVE_DIR, emb, allow_dangerous_deserialization=True)

# E5 expects a 'query: ' prefix on search queries.
q = "query: ascites due to portal hypertension diuretic regimen"
docs = vs.similarity_search(q, k=3)
for i, d in enumerate(docs, 1):
    # Flatten newlines outside the f-string: a backslash inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    preview = d.page_content[:1000].replace("\n", " ")
    print(f"\n[{i}] {preview} ...")



[1] passage: disease name: esophageal gastric varices disease description: Dilated blood vessels in the ESOPHAGUS or GASTRIC FUNDUS that shunt blood from the portal circulation (PORTAL SYSTEM) to the systemic venous circulation. Often they are observed in individuals with portal hypertension (HYPERTENSION, PORTAL). drug name: ADH preparation relationship attribute: may_be_treated_by source abbreviation: MED-RT ...

[2] passage: disease name: esophageal gastric varices disease description: Dilated blood vessels in the ESOPHAGUS or GASTRIC FUNDUS that shunt blood from the portal circulation (PORTAL SYSTEM) to the systemic venous circulation. Often they are observed in individuals with portal hypertension (HYPERTENSION, PORTAL). drug name: ADH preparation relationship attribute: may_treat source abbreviation: MED-RT ...

[3] passage: disease name: esophageal gastric varices disease description: Dilated blood vessels in the ESOPHAGUS or GASTRIC FUNDUS that shunt blood from the portal circ