In [1]:
#!pip install -q sentence-transformers chromadb

In [None]:
#pip install ipywidgets
#jupyter nbextension enable --py widgetsnbextension
#아래 오류 없애는 법

In [2]:
from sentence_transformers import SentenceTransformer
import chromadb
import os, json

# ✅ KB 파일 경로 (네가 만든 결과물)
KB_PATHS = [
    "data/processed/knowledge_base.jsonl",   # 보통 여기
]
KB_PATH = next((p for p in KB_PATHS if os.path.exists(p)), None)
assert KB_PATH is not None, "knowledge_base.jsonl 경로를 확인하세요."

# ✅ 임베딩 모델 (가볍고 성능 준수)
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# ✅ ChromaDB 초기화 (메모리 모드; 노트북 실험에 적합)
chroma_client = chromadb.Client()
# 이미 있으면 재사용하려면 get_or_create_collection 이용 가능
collection = chroma_client.get_or_create_collection("regulations")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 중복 추가 방지: 이미 채워진 경우 skip할지 여부
SKIP_IF_EXISTS = True

# 현재 collection에 몇 개 들어있는지 확인
existing_count = collection.count()
print("Before:", existing_count)

# JSONL 로딩 & 벡터 삽입
added = 0
with open(KB_PATH, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)              # {'id','source','article','text'}
        _id = item["id"]

        if SKIP_IF_EXISTS and existing_count > 0:
            # 간단히 존재 체크(엄밀히는 get 통해 확인 가능하지만 비용↑)
            # 여기서는 빠르게 넘어감. 필요 시 정확 체크 코드로 바꿔도 됨.
            pass

        emb = model.encode(item["text"]).tolist()

        collection.add(
            ids=[_id],
            embeddings=[emb],
            metadatas=[{"article": item["article"], "source": item["source"]}],
            documents=[item["text"]]
        )
        added += 1

print("Added:", added)
print("Now:", collection.count())


Before: 0
Added: 270
Now: 269


In [4]:
def search(query: str, top_k: int = 5, with_scores: bool = False):
    """
    query 문장에 대해 상위 top_k 규정 조각을 반환/출력.
    with_scores=True면 유사도 점수도 함께 보여줌.
    """
    q_emb = model.encode(query).tolist()
    res = collection.query(
        query_embeddings=[q_emb],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]  # distances = (작을수록 유사)
    )

    docs = res["documents"][0]
    metas = res["metadatas"][0]
    dists = res.get("distances", [[]])[0] if with_scores else [None] * len(docs)

    # 보기 좋게 출력
    for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), start=1):
        header = f"[{i}] {meta['source']} Article {meta['article']}"
        if with_scores and dist is not None:
            header += f"  |  distance: {dist:.4f}"
        print(header)
        print(doc.strip()[:800])   # 너무 길면 앞부분만
        print("-" * 80)

    return res  # 필요하면 호출부에서 raw result 사용


In [5]:
# 예시 1: GDPR 핵심 원칙(데이터 최소화)
_ = search("GDPR data minimization principle and personal data processing", top_k=3)

# 예시 2: 익명화/가명처리 관련
_ = search("anonymization and pseudonymization requirements under EU law", top_k=3)

# 예시 3: EU AI Act의 데이터 거버넌스/고위험 시스템
_ = search("data governance obligations for high-risk AI systems", top_k=3)


[1] GDPR Article 5
Article 5 
Principles relating to processing of personal data
1. Personal data shall be: 
(a) processed lawfully, fairly and in a transparent manner in relation to the data subject ('lawfulness, fairness and transparency'); 
(b) collected for specified, explicit and legitimate purposes and not further processed in a manner that is incompatible with those purposes; further processing for archiving purposes in the public interest, scientific or historical research purposes or statistical purposes shall, in accordance with Article 89(1), not be considered to be incompatible with the initial purposes ('purpose limitation'); 
(c) adequate, relevant and limited to what is necessary in relation to the purposes for which they are processed ('data minimisation'); 
(d) accurate and, where necessary,
--------------------------------------------------------------------------------
[2] GDPR Article 4
Article 4 Definitions
For the purposes of this Regulation: 
(1) 'personal data' 