In [1]:
import os, re, json
import pandas as pd

# ✅ Directory settings
# Raw inputs are read from <BASE_DIR>/raw; generated files go to <BASE_DIR>/processed.
BASE_DIR = "data"
RAW_DIR = os.path.join(BASE_DIR, "raw")
OUT_DIR = os.path.join(BASE_DIR, "processed")

# Create the output directory up front (no error if it already exists).
os.makedirs(OUT_DIR, exist_ok=True)

# Maps each source namespace to the raw text file it is read from.
INPUT_FILES = {
    source_name: os.path.join(RAW_DIR, f"{source_name}.txt")
    for source_name in ("EU_AI_act", "GDPR")
}


In [2]:
def read_text(path) -> str:
    """Read a text file, trying a short list of encodings in order.

    Args:
        path: Location of the file to read (anything ``open`` accepts).

    Returns:
        The decoded file contents.

    Raises:
        RuntimeError: If the file cannot be opened or read (the underlying
            ``OSError`` is chained as ``__cause__``), or if no encoding in the
            list could decode it.
    """
    # "utf-8-sig" decodes UTF-8 with or without a BOM, so a separate "utf-8"
    # attempt could never succeed after it has failed. "latin-1" maps every
    # byte to a character and therefore acts as the last-resort fallback.
    for enc in ("utf-8-sig", "latin-1"):
        try:
            with open(path, "r", encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            # Wrong encoding guess: move on to the next candidate.
            continue
        except OSError as exc:
            # A missing or unreadable file will not be fixed by another
            # encoding; report the real cause instead of retrying.
            raise RuntimeError(f"Failed to read {path}: {exc}") from exc
    raise RuntimeError(f"Failed to read {path} with common encodings")


In [3]:
def normalize_whitespace(s: str) -> str:
    """Normalize line endings and collapse redundant whitespace.

    Steps, in order:
      1. Convert CRLF and bare CR line endings to LF.
      2. Collapse each run of spaces/tabs into a single space.
      3. Drop spaces left at the end of a line, so that lines containing only
         whitespace become truly empty and count as blank lines.
      4. Collapse three or more consecutive newlines into exactly two.
      5. Strip leading and trailing whitespace from the whole string.

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)
    # Without this step "\n \n" would hide a blank line from the newline
    # collapse below and from any later split on "\n\n".
    s = re.sub(r" +\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()


In [4]:
def strip_leading_headers(s: str, key_markers):
    """Drop everything that precedes the earliest occurrence of any marker.

    Matching is case-insensitive and is performed on ``s`` itself, so the
    offset used for slicing is always valid for ``s`` (lower-casing a copy can
    change its length for some Unicode characters and shift the offsets).

    Args:
        s: Text to trim.
        key_markers: Iterable of literal marker strings to look for.

    Returns:
        ``s`` starting at the earliest marker occurrence, or ``s`` unchanged
        when no marker is found or the earliest one is already at position 0.
    """
    # Markers are escaped so they are matched literally; the regex engine
    # returns the leftmost position at which any alternative matches.
    # An empty marker list (or an empty marker) matches at position 0, which
    # leaves the text untouched.
    pattern = "|".join(re.escape(marker) for marker in key_markers)
    hit = re.search(pattern, s, flags=re.IGNORECASE)
    if hit is None or hit.start() == 0:
        return s
    return s[hit.start():]

In [5]:
def split_by_articles(text: str):
    """Split a document into (label, block) pairs, one per "Article N" heading.

    A heading is any line that starts with ``Article <number>``. Because the
    heading pattern allows whitespace (including newlines) after the number,
    the captured heading can extend onto the following text line.

    Args:
        text: Full document text.

    Returns:
        A list of ``(label, block)`` tuples in document order. ``label`` is the
        article number as a string (e.g. ``"5"`` or ``"2a"``) and ``block`` is
        the heading, a newline, then the stripped body. Non-empty text before
        the first heading is returned first under the label ``"Recitals"``.
        An empty list is returned when no heading is found.
    """
    heading_re = re.compile(r"(?m)^(Article\s+\d+[A-Za-z\-]*\.?\s*[^\n]*)")
    headings = list(heading_re.finditer(text))
    if not headings:
        return []

    sections = []

    # Anything ahead of the first heading is kept as a single leading block.
    intro = text[:headings[0].start()].strip()
    if intro:
        sections.append(("Recitals", intro))

    # Each body runs from the end of its heading to the start of the next one
    # (or to the end of the text for the last heading).
    body_ends = [nxt.start() for nxt in headings[1:]] + [len(text)]
    for position, (heading, body_end) in enumerate(zip(headings, body_ends)):
        title = heading.group(1).strip()
        content = text[heading.end():body_end].strip()
        number = re.search(r"Article\s+(\d+[A-Za-z\-]*)", title)
        label = number.group(1) if number else f"Unknown_{position}"
        sections.append((label, title + "\n" + content))
    return sections

In [6]:
def paragraph_chunks(text: str, max_chars=1200):
    """Greedily pack blank-line-separated paragraphs into chunks.

    Paragraphs are obtained by splitting on ``"\\n\\n"``; each is stripped and
    empty ones are skipped. Consecutive paragraphs are joined with a blank
    line for as long as the joined length stays within ``max_chars``.

    A paragraph is never split: one that is longer than ``max_chars`` on its
    own is emitted as a single oversized chunk.

    Args:
        text: Text to chunk.
        max_chars: Upper bound on the length of a chunk built from several
            paragraphs.

    Returns:
        List of chunk strings in document order (empty for blank input).
    """
    chunks = []
    current = ""
    for block in text.split("\n\n"):
        para = block.strip()
        if not para:
            continue
        # The "+ 2" accounts for the "\n\n" separator inserted on merge.
        if current and len(current) + 2 + len(para) <= max_chars:
            current = f"{current}\n\n{para}"
            continue
        if current:
            chunks.append(current)
        current = para
    if current:
        chunks.append(current)
    return chunks


In [7]:
def chunk_article(article_id: str, text: str, namespace: str, max_chars=1200):
    """Chunk one article block and wrap each chunk in a record dict.

    Args:
        article_id: Article label; the special value ``"Recitals"`` selects
            the ``<namespace>_Recitals_p<n>`` id scheme instead of
            ``<namespace>_Article_<article_id>_p<n>``.
        text: Text of the block to chunk.
        namespace: Source name, used as the id prefix and stored as ``source``.
        max_chars: Forwarded to ``paragraph_chunks``.

    Returns:
        List of dicts with keys ``id``, ``source``, ``article`` and ``text``;
        chunk numbering in the id starts at 1.
    """
    if article_id == "Recitals":
        id_prefix = f"{namespace}_Recitals"
    else:
        id_prefix = f"{namespace}_Article_{article_id}"

    pieces = paragraph_chunks(text, max_chars=max_chars)
    return [
        {
            "id": f"{id_prefix}_p{number}",
            "source": namespace,
            "article": article_id,
            "text": piece,
        }
        for number, piece in enumerate(pieces, start=1)
    ]


In [8]:
def preprocess_file(ns_name: str, raw_text: str):
    """Turn one raw document into a list of chunk records.

    The text is whitespace-normalized, trimmed so that it starts at the first
    known title marker, and split into articles. Each article is chunked via
    ``chunk_article``. When no article heading is found, the whole text is
    chunked instead and every record gets ``article`` set to ``"Unknown"``
    and an id of the form ``<ns_name>_p<n>``.

    Args:
        ns_name: Source name used as id prefix and ``source`` value.
        raw_text: Unprocessed document text.

    Returns:
        List of dicts with keys ``id``, ``source``, ``article`` and ``text``.
    """
    title_markers = [
        "REGULATION (EU)",
        "Artificial Intelligence Act",
        "General Data Protection Regulation",
        "REGULATION (EU) 2016/679",
        "REGULATION (EU) 2024/1689",
    ]
    cleaned = strip_leading_headers(normalize_whitespace(raw_text), title_markers)
    articles = split_by_articles(cleaned)

    # Fallback: no "Article N" heading was recognised anywhere in the text.
    if not articles:
        return [
            {"id": f"{ns_name}_p{number}", "source": ns_name, "article": "Unknown", "text": piece}
            for number, piece in enumerate(paragraph_chunks(cleaned), start=1)
        ]

    records = []
    for label, block in articles:
        records.extend(chunk_article(label, block, ns_name))
    return records

In [9]:
# Read and chunk every configured source, collecting all records in one flat list.
all_rows = []
for ns, path in INPUT_FILES.items():
    # ns is the INPUT_FILES key; it is passed on as the namespace of the records.
    txt = read_text(path)
    rows = preprocess_file(ns, txt)
    all_rows.extend(rows)

In [10]:
# Write the chunk records to OUT_DIR in two formats: JSON Lines and CSV.
jsonl_path = os.path.join(OUT_DIR, "knowledge_base.jsonl")
csv_path = os.path.join(OUT_DIR, "knowledge_base.csv")

# One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
with open(jsonl_path, "w", encoding="utf-8") as f:
    for r in all_rows:
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# Naming the columns explicitly keeps the header row in the CSV even when
# all_rows is empty; for non-empty input the columns match the record keys.
pd.DataFrame(all_rows, columns=["id", "source", "article", "text"]).to_csv(
    csv_path, index=False, encoding="utf-8"
)

print(f"✅ JSONL saved to: {jsonl_path}")
print(f"✅ CSV saved to:   {csv_path}")
print(f"Total chunks: {len(all_rows)}")

✅ JSONL saved to: data/processed/knowledge_base.jsonl
✅ CSV saved to:   data/processed/knowledge_base.csv
Total chunks: 270
