In [1]:
import os
import re
import json
from typing import List, Dict, Iterable, Optional

In [2]:
# Optional PDF support
def extract_text(path: str) -> str:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    elif ext == ".pdf":
        try:
            import PyPDF2  # type: ignore
            text = []
            with open(path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    text.append(page.extract_text() or "")
            return "\n".join(text)
        except Exception as e:
            raise RuntimeError("PDF parsing failed. Install PyPDF2 or provide a .txt file.") from e
    else:
        raise ValueError("Unsupported file type. Provide .txt or .pdf")


In [3]:
def normalize_whitespace(text: str) -> str:
    return re.sub(r"[ \t]+", " ", re.sub(r"\r\n|\r", "\n", text)).strip()


In [4]:
def chunk_text(text: str, max_chars: int = 900, overlap: int = 120) -> List[str]:
    """Naive chunker by sentences, with overlap to preserve context."""
    # Split by paragraph/sentences
    sentences = re.split(r"(?<=[.!?])\s+|\n{2,}", text)
    chunks = []
    cur = ""
    for s in sentences:
        if not s.strip():
            continue
        if len(cur) + len(s) + 1 <= max_chars:
            cur = (cur + " " + s).strip()
        else:
            if cur:
                chunks.append(cur)
            # overlap: carry the tail
            tail = cur[-overlap:] if overlap > 0 else ""
            cur = (tail + " " + s).strip()
    if cur:
        chunks.append(cur)
    return chunks


In [5]:
# Very light-weight keyword tagger for privacy topics
TOPIC_RULES = {
    "data_minimization": [r"\bdata minimization\b", r"\bminimi[sz]e data\b"],
    "purpose_limitation": [r"\bpurpose limitation\b", r"\bspecified, explicit and legitimate purposes\b"],
    "lawfulness": [r"\blawful(ness)?\b", r"\blegal basis\b"],
    "accuracy": [r"\baccuracy\b", r"\binaccurate data\b"],
    "storage_limitation": [r"\bstorage limitation\b", r"\bretention\b"],
    "integrity_confidentiality": [r"\bintegrity\b", r"\bconfidentiality\b", r"\bsecurity\b"],
    "data_subject_rights": [r"\bdata subject\b", r"\bright of access\b", r"\berasure\b", r"\brectification\b"],
    "pii_identifiers": [r"\bname\b", r"\bemail\b", r"\bphone\b", r"\baddress\b", r"\bid(entifier)?\b"],
    "high_risk_ai": [r"\bhigh[- ]risk\b", r"\brisk management\b"],
    "transparency": [r"\btransparen(t|cy)\b", r"\bexplainability\b"],
    "human_oversight": [r"\bhuman oversight\b"],
    "data_governance": [r"\bdata governance\b", r"\btraining, validation and testing\b"],
}


In [6]:
def tag_chunk(text: str) -> List[str]:
    tags = []
    for topic, patterns in TOPIC_RULES.items():
        if any(re.search(p, text, flags=re.IGNORECASE) for p in patterns):
            tags.append(topic)
    return tags or ["misc"]


In [7]:
def heuristic_summary(text: str, max_len: int = 240) -> str:
    """Simple extractive 'summary': take the first sentence up to max_len."""
    sent = re.split(r"(?<=[.!?])\s+", text.strip())[0]
    return (sent[:max_len] + ("…" if len(sent) > max_len else ""))


In [8]:
def build_kb(doc_paths: Iterable[str], source_labels: Optional[Iterable[str]] = None,
             out_jsonl: str = "regulatory_kb.jsonl",
             max_chars: int = 900, overlap: int = 120) -> str:
    if source_labels is None:
        source_labels = [os.path.basename(p) for p in doc_paths]
    records = []
    for path, label in zip(doc_paths, source_labels):
        raw = extract_text(path)
        raw = normalize_whitespace(raw)
        for idx, chunk in enumerate(chunk_text(raw, max_chars=max_chars, overlap=overlap)):
            record = {
                "source": label,
                "chunk_id": f"{label}:{idx:04d}",
                "text": chunk,
                "summary": heuristic_summary(chunk),
                "tags": tag_chunk(chunk)
            }
            records.append(record)
    with open(out_jsonl, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    return os.path.abspath(out_jsonl)

In [12]:
from typing import Iterable
from tools.kb_builder import build_kb

# ✅ Jupyter에서는 직접 build_kb() 호출!
build_kb(
    doc_paths=[
        "data/raw/EU_AI_act.txt",
        "data/raw/GDPR.txt"
    ],
    out_jsonl="data/kb/regulatory_kb.jsonl",
    max_chars=900,
    overlap=120
)

print("✅ KB 생성 완료!")


ModuleNotFoundError: No module named 'tools.kb_builder'