In [1]:
!pip install -q transformers peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [5]:
# Authenticate with the Hugging Face Hub. login() is called without arguments,
# so no token is hard-coded in the notebook; the saved output of this cell is
# an interactive widget.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
!git clone https://github.com/aivle-agent/complaint_system_AI.git
%cd complaint_system_AI

!ls

!unzip complain_data.zip -d data

!ls data


Cloning into 'complaint_system_AI'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 21 (delta 5), reused 8 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (21/21), 12.13 MiB | 46.52 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/complaint_system_AI
 bandit_prompt_system.py   complain_quality_shap.py   README.md
 complain_data.zip	  'Complaint data.zip'	      system_test.ipynb
Archive:  complain_data.zip
  inflating: data/중앙행정기관.csv  
  inflating: data/국립아시아문화전당.csv  
  inflating: data/지방행정기관.csv  
  inflating: data/국민신문고.csv  
국립아시아문화전당.csv	지방행정기관.csv  중앙행정기관.csv  국민신문고.csv


In [7]:
# =========================== Common utilities ===========================
import os
import re
from typing import Any, Tuple, List, Dict

import numpy as np
import pandas as pd


def clean_text(x: Any) -> str:
    """Mask phone numbers, resident registration numbers and name-like tokens.

    Args:
        x: Value to sanitize. Anything that is not a ``str`` yields ``""``.

    Returns:
        The text with each pattern replaced by its placeholder (``[TEL]``,
        ``[RRN]``, ``[NAME]``), stripped of leading/trailing whitespace.
    """
    if not isinstance(x, str):
        return ""

    # Applied in this order: phone number, resident registration number, and
    # finally 2-3 Hangul syllables followed by the honorific "씨".
    masking_rules = (
        (r"\d{2,3}-\d{3,4}-\d{4}", "[TEL]"),
        (r"\d{6}-\d{7}", "[RRN]"),
        (r"[가-힣]{2,3}씨", "[NAME]"),
    )

    masked = x
    for pattern, placeholder in masking_rules:
        masked = re.sub(pattern, placeholder, masked)
    return masked.strip()


def parse_question_answer(full_text: str) -> Tuple[str, str]:
    """Split one ``consulting_content`` blob into (question, answer).

    The expected layout is roughly::

        제목 : ...

        Q : ...
        ...

        A : ...

    Args:
        full_text: Raw consultation text. Non-string input yields ``("", "")``.

    Returns:
        ``(question, answer)``, both passed through ``clean_text``. The answer
        is ``""`` when no ``A :`` marker is present.
    """
    if not isinstance(full_text, str):
        return "", ""

    body = full_text.strip()

    # Prefer an answer marker that follows a newline; otherwise accept the
    # first "A :" anywhere. (A marker after a blank line is already matched by
    # the single-newline search, so no separate lookup is needed.)
    answer_pos = body.find("\nA :")
    if answer_pos < 0:
        answer_pos = body.find("A :")

    if answer_pos >= 0:
        question = body[:answer_pos].strip()
        # MULTILINE: the "A :" prefix is removed from every line that starts with it.
        answer = re.sub(
            r"^A\s*:\s*", "", body[answer_pos:].strip(), flags=re.MULTILINE
        ).strip()
    else:
        # No answer marker: keep the text from "Q :" onwards, or all of it.
        answer = ""
        question_pos = body.find("Q :")
        question = body[question_pos:].strip() if question_pos != -1 else body

    # Drop the title ("제목 :") and "Q :" labels at the start of any line.
    for label_pattern in (r"^제목\s*:\s*", r"^Q\s*:\s*"):
        question = re.sub(label_pattern, "", question, flags=re.MULTILINE)

    return clean_text(question), clean_text(answer)


def load_wide_from_csv(csv_path: str) -> pd.DataFrame:
    """Read a long-format complaint CSV and pivot it to one row per consultation.

    The CSV must provide the columns ``source``, ``consulting_date``,
    ``consulting_category``, ``consulting_content``,
    ``classification_category`` and ``classification``. Each distinct
    ``classification_category`` becomes its own column; when a consultation
    has several values for one category they are de-duplicated, sorted and
    joined with ``" / "``.

    Missing ``classification`` values are skipped and non-string values are
    stringified before joining, so a blank cell no longer makes the
    aggregation raise ``TypeError``.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        The wide frame, with ``consulting_content`` renamed to ``full_text``
        and the Korean category columns renamed to ``topic``, ``reason``,
        ``outcome``, ``requirement`` and ``summary`` where present.
    """
    df = pd.read_csv(csv_path)

    group_keys = ["source", "consulting_date", "consulting_category", "consulting_content"]

    def _join_labels(values) -> str:
        # Unique, sorted labels; NaN/None entries are ignored.
        labels = {str(value) for value in values if pd.notna(value)}
        return " / ".join(sorted(labels))

    wide = (
        df.pivot_table(
            index=group_keys,
            columns="classification_category",
            values="classification",
            aggfunc=_join_labels,
        )
        .reset_index()
    )

    wide = wide.rename(columns={
        "consulting_content": "full_text",
        "상담 주제": "topic",
        "상담 사유": "reason",
        "상담 결과": "outcome",
        "상담 요건": "requirement",
        "상담 내용": "summary",
    })
    return wide


def build_qa_dataset_from_wide(wide: pd.DataFrame) -> List[Dict[str, str]]:
    """Turn the wide frame into a list of question/answer records.

    Each row's ``full_text`` is split with ``parse_question_answer``. Rows
    whose question or answer is shorter than 5 characters are skipped.

    Args:
        wide: Frame with a ``full_text`` column and, optionally, the metadata
            columns ``topic``, ``reason``, ``outcome``, ``requirement`` and
            ``summary``. A missing column falls back to ``""``.

    Returns:
        One dict per kept row with keys ``question``, ``answer`` and the five
        metadata fields, in row order.
    """
    meta_fields = ("topic", "reason", "outcome", "requirement", "summary")

    records = []
    for _, record in wide.iterrows():
        question, answer = parse_question_answer(record.get("full_text", ""))

        # Too-short questions or answers are treated as unusable pairs.
        if min(len(question), len(answer)) < 5:
            continue

        entry = {"question": question, "answer": answer}
        entry.update({field: record.get(field, "") for field in meta_fields})
        records.append(entry)
    return records


def train_test_split_qa(
    all_data: List[Dict[str, str]],
    test_ratio: float = 0.1,
    shuffle: bool = False,
    seed: Any = None,
) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
    """Split the records into a leading train part and a trailing test part.

    Args:
        all_data: Records to split. The input list is never modified.
        test_ratio: Fraction of records placed in the test part. The test
            size is ``int(len(all_data) * test_ratio)`` but never below 1.
        shuffle: When True, a shuffled copy is split instead of the original
            order. Defaults to False, which keeps the original behaviour of
            taking the last records as the test part.
        seed: Seed for the shuffle (only used when ``shuffle`` is True); the
            same seed gives the same split.

    Returns:
        ``(train_data, test_data)``. For an empty input both are empty.
    """
    import random  # local import: only the optional shuffle needs it

    items = all_data
    if shuffle:
        # Shuffle a copy with a private RNG so neither the caller's list nor
        # the global random state is touched.
        items = list(all_data)
        random.Random(seed).shuffle(items)

    total = len(items)
    test_size = max(1, int(total * test_ratio))
    return items[:-test_size], items[-test_size:]


In [None]:
# ===================== Generator LoRA training script =====================


import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from tqdm.auto import tqdm

# ---- 1) Hyperparameters & model settings ----
# Hub id of the base causal LM that the LoRA adapter is trained on.
GEN_BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Used both as the Trainer output_dir and as the final adapter/tokenizer save location.
GEN_LORA_OUT_DIR = "./gen_lora"
# System prompt (Korean) placed in the <|system|> section of every training example.
BASE_SYS_PROMPT = (
    "당신은 한국 공공기관의 민원 답변을 작성하는 공무원 보조 AI입니다. "
    "법령과 정책에 기반하여 답변하고, 실제 처리 여부는 담당 부서의 최종 판단에 따름을 명시해야 합니다."
)

# Token budget per example: sequences are truncated and padded to this length.
MAX_LEN_GEN = 512
# Per-device training batch size.
GEN_BATCH_SIZE = 2
# Number of passes over the training set.
GEN_NUM_EPOCHS = 1    
# Learning rate handed to TrainingArguments.
GEN_LR = 5e-5


# ---- 2) Dataset 정의 ----
class GeneratorSFTDataset(Dataset):
    """Supervised fine-tuning dataset for the answer generator.

    Every example is rendered as one string
    ``<|system|> ... <|user|> question ... <|assistant|> answer`` and
    tokenized to a fixed length (truncated and padded to ``max_length``).

    ``labels`` is a copy of ``input_ids`` (prompt tokens included) in which
    padded positions are replaced by -100, the index that PyTorch's
    cross-entropy loss ignores by default. Without that mask, a trainer that
    consumes these labels directly would also be optimized to predict padding.

    NOTE(review): no EOS token is appended to the answer here; confirm whether
    the tokenizer adds one, otherwise the model never sees an end-of-answer
    target.

    Args:
        data: Sequence of dicts with ``"question"`` and ``"answer"`` strings.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension.
        max_length: Fixed sequence length after truncation/padding.
        base_sys_prompt: Text placed in the system section of the prompt.
    """

    def __init__(self, data, tokenizer, max_length=512, base_sys_prompt=""):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.base_sys_prompt = base_sys_prompt

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        item = self.data[idx]
        q = item["question"]
        a = item["answer"]

        prompt = (
            f"<|system|>\n{self.base_sys_prompt}\n\n"
            f"<|user|>\n{q}\n\n"
            f"<|assistant|>\n"
        )
        full_text = prompt + a

        enc = self.tokenizer(
            full_text,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop the batch dimension.
        input_ids = enc["input_ids"][0]
        attention_mask = enc["attention_mask"][0]

        labels = input_ids.clone()
        # Mask by attention mask rather than by token id, so a real token that
        # happens to share the pad id (e.g. pad == eos) stays a valid target.
        labels[attention_mask == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


# ---- 3) Load data & build the Dataset ----
# Long-format CSV -> wide frame -> list of {"question", "answer", ...} records.
csv_path = 'data/중앙행정기관.csv'
wide = load_wide_from_csv(csv_path)
qa_data = build_qa_dataset_from_wide(wide)
# The last 10% of the records become test_data; test_data is not used again in this cell.
train_data, test_data = train_test_split_qa(qa_data, test_ratio=0.1)

print(f"총 Q/A: {len(qa_data)}, Train: {len(train_data)}, Test: {len(test_data)}")

tokenizer_gen = AutoTokenizer.from_pretrained(GEN_BASE_MODEL)
# padding="max_length" in the dataset needs a pad token; fall back to EOS if none is defined.
if tokenizer_gen.pad_token is None:
    tokenizer_gen.pad_token = tokenizer_gen.eos_token

train_dataset_gen = GeneratorSFTDataset(
    train_data,
    tokenizer=tokenizer_gen,
    max_length=MAX_LEN_GEN,
    base_sys_prompt=BASE_SYS_PROMPT,
)

# ---- 4) Base model + LoRA wrapping ----
# NOTE(review): the saved output of this cell shows "`torch_dtype` is deprecated!
# Use `dtype` instead!" — switch once the minimum transformers version allows it.
base_model_gen = AutoModelForCausalLM.from_pretrained(
    GEN_BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# No target_modules is given, so PEFT chooses the adapted layers itself; the
# saved output of this cell reports 2252800 trainable parameters.
peft_config_gen = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model_gen = get_peft_model(base_model_gen, peft_config_gen)
# Counts only parameters with requires_grad=True.
print("[Generator] LoRA 적용 파라미터 개수:", sum(p.numel() for p in model_gen.parameters() if p.requires_grad))

# mlm=False selects causal (next-token) language modeling instead of masked LM.
# NOTE(review): this collator presumably rebuilds `labels` from `input_ids`
# itself — confirm whether the labels produced by the dataset are actually used.
data_collator_gen = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_gen,
    mlm=False,
)

# ---- 5) Trainer setup & training ----
# A checkpoint is written at the end of each epoch; external experiment
# reporting is disabled via report_to="none".
training_args_gen = TrainingArguments(
    output_dir=GEN_LORA_OUT_DIR,
    per_device_train_batch_size=GEN_BATCH_SIZE,
    num_train_epochs=GEN_NUM_EPOCHS,
    learning_rate=GEN_LR,
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    report_to="none",
)

# No eval_dataset is passed, so this run only trains.
trainer_gen = Trainer(
    model=model_gen,
    args=training_args_gen,
    train_dataset=train_dataset_gen,
    data_collator=data_collator_gen,
)

trainer_gen.train()

# ---- 6) Save the LoRA adapter & tokenizer ----
# Both go to the same directory that was used as the Trainer output_dir.
model_gen.save_pretrained(GEN_LORA_OUT_DIR)
tokenizer_gen.save_pretrained(GEN_LORA_OUT_DIR)
print(f"[Generator LoRA] Saved to {GEN_LORA_OUT_DIR}")


총 Q/A: 4447, Train: 4003, Test: 444


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


[Generator] LoRA 적용 파라미터 개수: 2252800


Step,Training Loss
10,1.5218
20,1.3884
30,1.3551
40,1.2869
50,1.2729
60,1.1664
70,1.169
80,1.0816
90,1.0659
100,1.0402


[Generator LoRA] Saved to ./gen_lora


In [None]:
# ===================== Verifier LoRA training script =====================


import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model

# Hub id of the encoder that the verifier (regression head) is built on.
VER_BASE_MODEL = "distilbert-base-multilingual-cased"
# Used both as the Trainer output_dir and as the final adapter/tokenizer save location.
VER_LORA_OUT_DIR = "./verifier_lora"
# Token budget per (complaint, answer) text; inputs are truncated and padded to this length.
MAX_LEN_VER = 256
# Per-device training batch size.
VER_BATCH_SIZE = 8
# Number of passes over the training set.
VER_NUM_EPOCHS = 1
# Learning rate handed to TrainingArguments.
VER_LR = 2e-5


class VerifierDataset(Dataset):
    """Regression dataset mapping a (question, answer) pair to a score.

    The score is currently a pseudo label derived only from the answer
    length: ``len(answer) / 1500`` clamped to the range [0.3, 1.0].

    Args:
        data: Iterable of dicts with ``"question"`` and ``"answer"`` strings.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension.
        max_length: Fixed sequence length after truncation/padding.
    """

    def __init__(self, data, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.texts = []
        self.labels = []

        # Single pass over `data`, so one-shot iterables work as well.
        for record in data:
            question, answer = record["question"], record["answer"]
            self.texts.append(f"[COMPLAINT]\n{question}\n\n[ANSWER]\n{answer}")

            # Pseudo label: answer length relative to 1500 characters,
            # capped at 1.0 and floored at 0.3.
            capped = min(1.0, len(answer) / 1500.0)
            self.labels.append(max(0.3, capped))

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return it with its float label."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; index 0 drops the batch dimension.
        return {
            "input_ids": encoded["input_ids"][0],
            "attention_mask": encoded["attention_mask"][0],
            "labels": torch.tensor(self.labels[idx], dtype=torch.float),
        }


# ---- 1) Load data ----
# Same pipeline as in the generator cell: CSV -> wide frame -> Q/A records -> tail split.
csv_path = 'data/중앙행정기관.csv'
wide = load_wide_from_csv(csv_path)
qa_data = build_qa_dataset_from_wide(wide)
# test_data is not used again in this cell.
train_data, test_data = train_test_split_qa(qa_data, test_ratio=0.1)

print(f"[Verifier] 총 Q/A: {len(qa_data)}, Train: {len(train_data)}, Test: {len(test_data)}")

# ---- 2) Tokenizer & Dataset ----
tokenizer_ver = AutoTokenizer.from_pretrained(VER_BASE_MODEL)
train_dataset_ver = VerifierDataset(train_data, tokenizer=tokenizer_ver, max_length=MAX_LEN_VER)

# ---- 3) Base model + LoRA wrapping ----
# num_labels=1 with problem_type="regression": the head outputs a single score.
base_model_ver = AutoModelForSequenceClassification.from_pretrained(
    VER_BASE_MODEL,
    num_labels=1,
    problem_type="regression",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# target_modules names the layers that receive LoRA adapters.
# NOTE(review): the saved output of this cell ends with a ValueError asking for
# `target_modules` even though it is set here — the output looks stale; re-run
# the cell to confirm that training now starts.
peft_config_ver = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],  
)

model_ver = get_peft_model(base_model_ver, peft_config_ver)

# Counts only parameters with requires_grad=True.
print("[Verifier] LoRA 적용 파라미터 개수:", sum(p.numel() for p in model_ver.parameters() if p.requires_grad))


# ---- 4) Trainer setup & training ----
def compute_metrics_ver(eval_pred):
    """Placeholder metrics hook: ignores ``eval_pred`` and reports no metrics."""
    no_metrics = dict()
    return no_metrics


# A checkpoint is written at the end of each epoch; external experiment
# reporting is disabled via report_to="none".
training_args_ver = TrainingArguments(
    output_dir=VER_LORA_OUT_DIR,
    per_device_train_batch_size=VER_BATCH_SIZE,
    num_train_epochs=VER_NUM_EPOCHS,
    learning_rate=VER_LR,
    logging_steps=10,
    save_strategy="epoch",
    bf16=True,
    report_to="none",
)

# No eval_dataset is passed and compute_metrics_ver returns an empty dict, so
# this run only trains.
# NOTE(review): newer transformers releases may expect `processing_class`
# instead of `tokenizer` here — confirm against the installed version.
trainer_ver = Trainer(
    model=model_ver,
    args=training_args_ver,
    train_dataset=train_dataset_ver,
    tokenizer=tokenizer_ver,
    compute_metrics=compute_metrics_ver,
)

trainer_ver.train()

# ---- 5) Save the LoRA adapter & tokenizer ----
# Both go to the same directory that was used as the Trainer output_dir.
model_ver.save_pretrained(VER_LORA_OUT_DIR)
tokenizer_ver.save_pretrained(VER_LORA_OUT_DIR)
print(f"[Verifier LoRA] Saved to {VER_LORA_OUT_DIR}")


[Verifier] 총 Q/A: 4447, Train: 4003, Test: 444


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Please specify `target_modules` or `target_parameters`in `peft_config`