## 1️⃣ 환경설정 & 라이브러리 로드

In [None]:
import os, json
from dotenv import load_dotenv
from openai import OpenAI

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast when the key is missing or empty, before any client is built.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise EnvironmentError("❌ OPENAI_API_KEY not found in .env")

client = OpenAI(api_key=api_key)
print("✅ OpenAI API Key loaded successfully.")

✅ OpenAI API Key loaded successfully.


## 2️⃣ 전체 법률 데이터 로드

In [10]:
law_path = "../../../../dataset/PIPA/law/law.json"
decree_path = "../../../../dataset/PIPA/law/decree.json"

# Read the Act first, then the Enforcement Decree.
with open(law_path, encoding="utf-8") as law_file:
    law_data = json.load(law_file)

with open(decree_path, encoding="utf-8") as decree_file:
    decree_data = json.load(decree_file)

# Index every record by its "id" for direct lookup.
law_dict = {record["id"]: record for record in law_data}
decree_dict = {record["id"]: record for record in decree_data}

print(f"📘 개인정보보호법 조항 수: {len(law_dict)}개")
print(f"📗 시행령 조항 수: {len(decree_dict)}개")

📘 개인정보보호법 조항 수: 979개
📗 시행령 조항 수: 1084개


## 3️⃣ aggregate / get / reference 함수 정의

In [14]:
def get_item(law_type: str, id_: str):
    """Look up one provision record by id in the table chosen by ``law_type``.

    ``"법"`` selects ``law_dict`` and ``"시행령"`` selects ``decree_dict``.

    Returns:
        The stored record, or ``None`` when the id is not in the table.

    Raises:
        ValueError: if ``law_type`` is neither ``"법"`` nor ``"시행령"``.
    """
    # Reject unknown types up front so the lookup below has only two cases.
    if law_type not in ("법", "시행령"):
        raise ValueError("law_type은 '법' 또는 '시행령'이어야 합니다.")
    table = law_dict if law_type == "법" else decree_dict
    return table.get(id_)

def aggregate_desc(law_type: str, id_: str) -> str:
    """Render a provision and all of its descendants as text, one per line.

    Starting at ``id_``, the parent/child hierarchy is walked depth-first
    (a record before its children, children in table order) and every visited
    record is formatted as ``"<law_type>", <id> (<var_name>) <content>``.

    Args:
        law_type: ``"법"`` reads from ``law_dict``; any other value reads from
            ``decree_dict``.
        id_: Id of the record at the root of the subtree to render.

    Returns:
        The formatted lines joined with newlines, or ``""`` when ``id_`` is
        not present in the selected table.
    """
    target = law_dict if law_type == "법" else decree_dict
    if id_ not in target:
        return ""

    # Index children by parent once, instead of rescanning the whole table
    # for every visited node.
    children = {}
    for entry in target.values():
        parent = entry.get("parent")
        if parent is not None:
            children.setdefault(parent, []).append(entry["id"])

    results = []
    visited = set()  # guards against records whose parent chain loops back
    stack = [id_]
    while stack:
        current = stack.pop()
        if current in visited or current not in target:
            continue
        visited.add(current)
        item = target[current]
        results.append(f'"{law_type}", {item["id"]} ({item["var_name"]}) {item["content"]}')
        # Push in reverse so the first child is popped (and rendered) first.
        stack.extend(reversed(children.get(current, ())))
    return "\n".join(results)

def get_single(law_type: str, id_: str) -> str:
    """Format exactly one provision as a single line of text.

    Returns ``"<law_type>", <id> (<var_name>) <content>`` for the record that
    ``get_item`` finds, or a warning message when it finds nothing.
    """
    record = get_item(law_type, id_)
    if record:
        return f'"{law_type}", {record["id"]} ({record["var_name"]}) {record["content"]}'
    return f"⚠️ {law_type}의 {id_}를 찾을 수 없습니다."

def get_references_text(entry):
    """Collect the full text of every Act/Decree provision that ``entry`` cites.

    Each element of ``entry["reference"]`` is read for a ``law`` name and an
    ``id``. References whose law name identifies the Enforcement Decree or the
    Personal Information Protection Act are expanded with ``aggregate_desc``;
    all other references are ignored.

    Args:
        entry: Clause record; a missing or ``None`` ``reference`` value is
            treated as "no references".

    Returns:
        The expanded texts joined with newlines and stripped of surrounding
        whitespace; ``""`` when nothing matched.
    """
    sections = []
    for ref in entry.get("reference") or []:
        # Drop spaces so "개인정보 보호법" and "개인정보보호법" match alike.
        law_name = (ref.get("law") or "").replace(" ", "")
        ref_id = ref.get("id", "")
        # Test for the decree first: its name also contains the Act's name,
        # so the reverse order would look decree ids up in the Act's table.
        if "시행령" in law_name:
            sections.append(aggregate_desc("시행령", ref_id))
        elif "개인정보보호법" in law_name:
            sections.append(aggregate_desc("법", ref_id))
    return "\n".join(sections).strip()

## 4️⃣ 기본 비즈니스 변수

In [None]:
# Seed list of business-level variables. Each entry pairs a variable name with
# the yes/no style question (in Korean) that determines its value.
base_variables = [
    {
        "variable": "transfers_overseas",
        "question": "귀사는 고객 개인정보를 해외(예: 해외 서버, 해외 본사, 해외 협력업체 등)로 전송하거나 저장하는 과정이 있습니까?"
    },
    {
        "variable": "received_consent",
        "question": "귀사는 고객의 개인정보를 수집하거나 이용하기 전에, 수집 목적·항목·보유기간 등을 명확히 안내하고 명시적 동의를 받았습니까?"
    }
]

## 5️⃣ Few-shot 예시 구성

In [15]:
# Few-shot examples keyed by clause label. Each example holds the clause text
# ("content"), the three pseudocode strings, and the variables that the
# example introduces ("added_variables").
fewshot_examples = {
"제16조 제1항": {
  "content": "개인정보처리자는 제15조 제1항 각 호의 어느 하나에 해당하여 개인정보를 수집하는 경우에는 그 목적에 필요한 최소한의 개인정보를 수집하여야 한다.",
  "pseudocode": {
    "applicability_pseudocode": "received_consent and (LAW_A15_P1_S1['applicability'] == True or LAW_A15_P1_S2['applicability'] == True or LAW_A15_P1_S3['applicability'] == True or LAW_A15_P1_S4['applicability'] == True or LAW_A15_P1_S5['applicability'] == True or LAW_A15_P1_S6['applicability'] == True or LAW_A15_P1_S7['applicability'] == True)",
    "legal_pseudocode": "not BUSINESS_COLLECTS_MINIMUM_ONLY",
    "action_pseudocode": ""
  },
  "added_variables": [
    {
      "variable": "BUSINESS_COLLECTS_MINIMUM_ONLY",
      "question": "귀사는 고객의 개인정보를 수집할 때 서비스 제공에 반드시 필요한 최소한의 항목만을 수집합니까? 예를 들어 불필요한 생년월일, 주소, 직업, 가족정보 등을 요구하지 않습니까?"
    }
  ]
},
"제9조 제2항 제1호": {
  "content": "기본계획에는 개인정보 보호의 기본목표와 추진방향이 포함되어야 한다.",
  "pseudocode": {
    "applicability_pseudocode": "BUSINESS_IS_GOV_AGENCY",
    "legal_pseudocode": "not BUSINESS_HAS_PRIVACY_POLICY_GOAL",
    "action_pseudocode": ""
  },
  "added_variables": [
    {
      "variable": "BUSINESS_IS_GOV_AGENCY",
      "question": "귀사의 조직은 공공기관 또는 정부 산하기관입니까?"
    },
    {
      "variable": "BUSINESS_HAS_PRIVACY_POLICY_GOAL",
      "question": "귀사는 개인정보 보호를 위한 목표 및 추진방향을 내부 정책 또는 계획 문서로 수립했습니까?"
    }
  ]
},
"제9조 제2항": {
  "content": "기본계획에는 다음 각 호의 사항이 포함되어야 한다.",
  "pseudocode": {
    "applicability_pseudocode": "BUSINESS_IS_GOV_AGENCY",
    "legal_pseudocode": "not (LAW_A9_P2_S1['legal'] and LAW_A9_P2_S2['legal'] and LAW_A9_P2_S3['legal'] and LAW_A9_P2_S4['legal'] and LAW_A9_P2_S5['legal'] and LAW_A9_P2_S6['legal'] and LAW_A9_P2_S7['legal'])",
    "action_pseudocode": ""
  },
  "added_variables": [
    {
      "variable": "BUSINESS_IS_GOV_AGENCY",
      "question": "귀사의 조직은 공공기관 또는 정부 산하기관입니까?"
    }
  ]
},
"제15조 제3항": {
  "content": "개인정보처리자는 당초 수집 목적과 관련된 범위에서 정보주체에게 불이익이 발생하지 않도록 필요한 조치를 하였을 때 대통령령으로 정하는 바에 따라 정보주체의 동의 없이 개인정보를 이용할 수 있다.",
  "pseudocode": {
    "applicability_pseudocode": "BUSINESS_ENCRYPTS_AND_SECURES_DATA",
    "legal_pseudocode": "data_breach_occurred",
    "action_pseudocode": "if LAW_A15_P3['applicability']: LAW_A15_P1['legal'] = True"
  },
  "added_variables": [
    {
      "variable": "BUSINESS_ENCRYPTS_AND_SECURES_DATA",
      "question": "귀사는 개인정보를 처리할 때 암호화, 접근통제, 안전성 확보 조치를 수행합니까?"
    },
    {
      "variable": "data_breach_occurred",
      "question": "최근 1년 내 고객 개인정보 유출이나 무단 접근 등의 보안사고가 있었습니까?"
    }
  ]
},
# ✅ Example of a definition clause (newly added few-shot)
# NOTE(review): BUSINESS_USES_PERSONAL_INFORMATION is used below but is neither
# listed in base_variables nor declared in this example's added_variables —
# confirm this is intended.
"제2조 제1호": {
  "content": "‘개인정보’란 살아 있는 개인에 관한 정보로서 성명, 주민등록번호 및 영상 등을 통하여 개인을 식별할 수 있는 정보를 말한다.",
  "pseudocode": {
    "applicability_pseudocode": "BUSINESS_USES_PERSONAL_INFORMATION",
    "legal_pseudocode": "True",
    "action_pseudocode": ""
  },
  "added_variables": []
}
}

## 6️⃣ System Prompt

In [24]:
from tqdm import tqdm

# Template for the system message. It is filled with str.format() using the
# keys `combined_variables_json` and `fewshot_examples_json`, which is why the
# literal braces of the JSON output format are doubled ({{ and }}).
system_prompt = """
You are a legal reasoning assistant specialized in generating Python-style pseudocode for articles of the Personal Information Protection Act (PIPA) and its Enforcement Decree.

Your task is to interpret each legal clause and express its logical meaning as three pseudocode elements:

1️⃣ applicability_pseudocode – A Python condition that defines *when* this clause applies.
   - For definition articles, this represents whether the definition applies to the business context.
   - For regulatory articles, this represents whether the business falls under the scope of the rule.

2️⃣ legal_pseudocode – A Python condition that defines *when* the clause becomes illegal (False).
   - For definition-type clauses, this must always be `True`.

3️⃣ action_pseudocode – Python logic that defines *actions or causal effects* that occur when this clause applies 
   (e.g., modifying another clause’s `legal` value).

📘 Output Format (strict JSON):
{{
  "pseudocode": {{
    "applicability_pseudocode": "<Python condition>",
    "legal_pseudocode": "<Python condition>",
    "action_pseudocode": "<Python code>"
  }},
  "added_variables": [
    {{"variable": "<name>", "question": "<very concrete business-level question>"}}
  ]
}}

⚙️ Variable Naming Rules:
- Clauses from the Personal Information Protection Act (PIPA) are represented as **LAW_...**
  Example: `LAW_A16_P1_S1` → Article 16, Paragraph 1, Subparagraph 1 of the PIPA.
- Clauses from the Enforcement Decree are represented as **DECREE_...**
  Example: `DECREE_A16_P1_S2` → Article 16, Paragraph 1, Subparagraph 2 of the Enforcement Decree.
- Use these variable names when referencing other articles’ `applicability` or `legal` states.

💡 Logical Referencing Guidelines:
- Actively reference **applicability** and **legal** properties of other clauses when forming logical conditions.
  For example:
    - `LAW_A15_P1_S1['applicability'] == True`
    - `DECREE_A16_P1['legal'] == False`
- Leverage relationships between clauses rather than creating unnecessary new variables.

⚠️ Rules for added_variables:
- Only add a new variable if absolutely necessary.
- Always prefer reusing existing variables listed below.
- Each question must be an *extremely specific business question*, never referencing any law.
- Do not add abstract or vague variables.

You must analyze the entire article context but clearly focus on the [target] clause.

[BUSINESS VARIABLES (INCLUDING PREVIOUSLY ADDED ONES)]
{combined_variables_json}

[FEW-SHOT EXAMPLES]
{fewshot_examples_json}
"""



## 7️⃣ Read data to process

In [17]:
filtered_decree_path = "./decree_filtered.json"
filtered_law_path = "./law_filtered.json"

# Decree clauses are read first, then the Act's clauses.
with open(filtered_decree_path, encoding="utf-8") as decree_file:
    decree_filtered = json.load(decree_file)

with open(filtered_law_path, encoding="utf-8") as law_file:
    law_filtered = json.load(law_file)

## 8️⃣ Main Processing Loop (with tqdm + result merging)

In [None]:
from tqdm import tqdm
import json

# ===================================================
# Initialize result containers and variable pools
# ===================================================
# One result list per source, so decree and Act outputs stay separate.
results_decree, results_law = [], []

# Each variable pool starts as its own shallow copy of the seed list, so
# appending to one pool does not grow the other or base_variables itself.
accumulated_variables_decree = list(base_variables)
accumulated_variables_law = list(base_variables)

# ===================================================
# Main processing function
# ===================================================
def process_dataset(dataset, law_type, accumulated_variables, results_container,
                    model="gpt-4o"):
    """Generate pseudocode for every clause in ``dataset`` via the chat API.

    For each item, a user prompt is built from the enclosing article
    (``aggregate_desc``), the target clause (``get_single``) and any referenced
    provisions (``get_references_text``). The model is asked for a JSON object;
    its ``pseudocode`` part is appended to ``results_container`` and any newly
    declared variables are appended to ``accumulated_variables`` so that the
    prompts for later items list them.

    Errors raised while calling the API or handling its response are printed
    and the item is skipped, so one bad item does not stop the run.

    Args:
        dataset: Iterable of clause records; each is read for its ``id``,
            ``var_name``, ``content``, ``parent`` and ``class`` keys.
        law_type: ``"법"`` or ``"시행령"``; selects the lookup table and labels
            the progress bar and error messages.
        accumulated_variables: List of ``{"variable", "question"}`` dicts.
            Mutated in place.
        results_container: List receiving one result dict per successfully
            processed item. Mutated in place.
        model: Name of the chat model to call.

    Returns:
        The same ``accumulated_variables`` list that was passed in.
    """
    # The few-shot block is identical for every item, so serialize it once.
    fewshot_examples_json = json.dumps(fewshot_examples, ensure_ascii=False, indent=2)

    # Names already in the pool. Deduplicating on the name (not the whole
    # dict) keeps a variable from being added again when the model merely
    # rewords its question.
    known_names = {
        var.get("variable") for var in accumulated_variables if isinstance(var, dict)
    }

    for item in tqdm(dataset, desc=f"Processing {law_type}"):
        # The first whitespace-separated token of "parent" is used as the id
        # of the enclosing article.
        parent_id = item["parent"].split()[0] if " " in item["parent"] else item["parent"]
        full_article = aggregate_desc(law_type, parent_id)
        target_clause = get_single(law_type, item["id"])
        references_text = get_references_text(item)

        # Build user prompt
        prompt = f"""
[full article]
{full_article}

[target]
{target_clause}

[references]
{references_text if references_text else 'None'}
"""

        # Rebuilt per item because the variable pool grows as items are processed.
        dynamic_system_prompt = system_prompt.format(
            combined_variables_json=json.dumps(accumulated_variables, ensure_ascii=False, indent=2),
            fewshot_examples_json=fewshot_examples_json,
        )

        try:
            # Chat Completions API with JSON-object output enforced
            completion = client.chat.completions.create(
                model=model,
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": dynamic_system_prompt},
                    {"role": "user", "content": prompt},
                ],
            )

            # Parse response
            result_text = completion.choices[0].message.content
            result = json.loads(result_text)

            # Save pseudocode output
            results_container.append({
                "id": item["id"],
                "var_name": item["var_name"],
                "content": item["content"],
                "parent": item["parent"],
                "class": item["class"],
                "pseudocode": result.get("pseudocode", {})
            })

            # Update variable pool, skipping malformed entries and names
            # that are already known.
            for new_var in result.get("added_variables") or []:
                name = new_var.get("variable") if isinstance(new_var, dict) else None
                if not isinstance(name, str) or not name or name in known_names:
                    continue
                known_names.add(name)
                accumulated_variables.append(new_var)

        except Exception as e:
            # Best-effort batch run: report the failure and move on.
            print(f"⚠️ Error on {law_type} {item['id']}: {e}")

    return accumulated_variables


# ===================================================
# Load filtered datasets
# ===================================================
filtered_decree_path = "./decree_filtered.json"
filtered_law_path = "./law_filtered.json"

# Decree clauses are read first, then the Act's clauses.
with open(filtered_decree_path, encoding="utf-8") as decree_file:
    decree_filtered = json.load(decree_file)

with open(filtered_law_path, encoding="utf-8") as law_file:
    law_filtered = json.load(law_file)


# ===================================================
# Process each dataset separately
# ===================================================
# The decree is processed first, then the Act; each run uses its own variable
# pool and its own result list.
accumulated_variables_decree = process_dataset(
    dataset=decree_filtered,
    law_type="시행령",
    accumulated_variables=accumulated_variables_decree,
    results_container=results_decree,
)
accumulated_variables_law = process_dataset(
    dataset=law_filtered,
    law_type="법",
    accumulated_variables=accumulated_variables_law,
    results_container=results_law,
)


# ===================================================
# Save all outputs
# ===================================================
# Each output path is paired with the object serialized into it; the files are
# written in the order listed.
output_files = {
    "./pseudocode_decree.json": results_decree,
    "./pseudocode_law.json": results_law,
    "./accumulated_variables_decree.json": accumulated_variables_decree,
    "./accumulated_variables_law.json": accumulated_variables_law,
}
for output_path, payload in output_files.items():
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

print("✅ Pseudocode generation complete.")
print("📂 Saved files:")
print(" - pseudocode_decree.json")
print(" - pseudocode_law.json")
print(" - accumulated_variables_decree.json")
print(" - accumulated_variables_law.json")


Processing 시행령: 100%|██████████| 173/173 [09:56<00:00,  3.45s/it]
Processing 법: 100%|██████████| 396/396 [23:31<00:00,  3.56s/it]

✅ Pseudocode generation complete.
📂 Saved files:
 - pseudocode_decree.json
 - pseudocode_law.json
 - accumulated_variables_decree.json
 - accumulated_variables_law.json



