# KoBART 기반 한국어 요약 자동화 및 평가 노트북

이 노트북은 원본 데이터와 요약 데이터를 활용하여 KoBART 모델로 생성 요약을 수행하고, 업계 기준에 부합하는 종합 평가 점수(ROUGE 등)를 달성하기 위한 전처리, 파인튜닝, 평가까지의 전체 파이프라인을 자동화합니다.

In [None]:
# 1. 환경 준비 및 필수 패키지 설치
import sys


def install(package):
    import IPython

    IPython.get_ipython().run_line_magic("pip", f"install {package}")


for p in [
    "transformers==4.21.0",
    "torch>=1.13.0",
    "datasets",
    "evaluate",
    "rouge-score",
    "nltk",
    "sentencepiece",
    "scikit-learn",
    "pandas",
    "tqdm",
]:
    try:
        __import__(p.split("==")[0].split(">=")[0])
    except ImportError:
        install(p)
import pandas as pd, numpy as np, re, torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import evaluate, nltk
from sklearn.model_selection import train_test_split

nltk.download("punkt", quiet=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Using cached sentencepiece-0.2.0.tar.gz (2.6 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Using cached sentencepiece-0.2.0.tar.gz (2.6 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [48 lines of output]
      Traceback (most recent call last):
        File [35m"<string>"[0m, line [35m2[0m, in [35m<module>[0m
          [31mexec[0m[1;31m(compile('''[0m
          [31m~~~~[0m[1;31m^^^^^^^^^^^^[0m
          [1;31m# This is <pip-setuptools-caller> -- a caller that pip uses to run setup.py[0m
          [1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
          ...<32 lines>...
          [1;31mexec(compile(setup_py_code, filename, "exec"))[0m
          [1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
          [1;31m''' % ('C:\\Users\\ritar\\AppData\\Local\\Temp\\pip-install-_phqgk87\\sentencepiece_e86a131d3e5d4cbb93ebe39cdf1b8ec5\\setup.py',), "<pip-setuptools-caller>", "exec"))[0m
          [1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Device: cpu


In [4]:
# 2. 데이터 로드 및 전처리
def safe_read_csv(path):
    for enc in ["utf-8-sig", "utf-8", "cp949", "euc-kr"]:
        try:
            return pd.read_csv(path, encoding=enc)
        except:
            continue
    raise ValueError(f"CSV 파일 읽기 실패: {path}")


def preprocess(text):
    if pd.isna(text) or not text:
        return ""
    text = str(text).strip()
    text = re.sub(r"+", " ", text)
    text = re.sub(r"[^가-힣]", "", text)
    return text


orig_df = safe_read_csv("data/crawling_origin.csv")
summ_df = safe_read_csv("data/crawling_origin_with_summary.csv")
orig_col = next(
    (c for c in orig_df.columns if "본문" in c or "content" in c), orig_df.columns[0]
)
summ_col = next(
    (c for c in summ_df.columns if "요약" in c or "summary" in c), summ_df.columns[0]
)
min_len = min(len(orig_df), len(summ_df))
originals, summaries = [], []
for i in range(min_len):
    o, s = preprocess(orig_df.iloc[i][orig_col]), preprocess(summ_df.iloc[i][summ_col])
    if len(o) >= 20 and len(s) >= 3:
        ow, sw = len(o.split()), len(s.split())
        if ow > 0 and 0.01 <= sw / ow <= 0.8:
            originals.append(o)
            summaries.append(s)
print(f"유효 데이터: {len(originals)}개")

PatternError: nothing to repeat at position 0

In [None]:
# 3. 데이터 분할 및 Dataset 생성
X_train, X_val, y_train, y_val = train_test_split(
    originals, summaries, test_size=0.2, random_state=42
)


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input_text"], max_length=512, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        examples["target_text"], max_length=128, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenizer = AutoTokenizer.from_pretrained("gogamza/kobart-base-v2")
train_dataset = Dataset.from_dict({"input_text": X_train, "target_text": y_train}).map(
    preprocess_function, batched=True
)
val_dataset = Dataset.from_dict({"input_text": X_val, "target_text": y_val}).map(
    preprocess_function, batched=True
)

In [None]:
# 4. KoBART 파인튜닝 (간단 Trainer)
from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

model = AutoModelForSeq2SeqLM.from_pretrained("gogamza/kobart-base-v2").to(device)
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    preds, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    return {k: v * 100 for k, v in result.items()}


training_args = Seq2SeqTrainingArguments(
    output_dir="./kobart_finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=50,
    predict_with_generate=True,
    generation_max_length=128,
    fp16=torch.cuda.is_available(),
    report_to=None,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# 5. 평가 및 업계 기준 비교
eval_result = trainer.evaluate()
print("평가 결과:", eval_result)
print(
    "ROUGE-1:",
    eval_result.get("eval_rouge1", 0),
    "ROUGE-2:",
    eval_result.get("eval_rouge2", 0),
    "ROUGE-L:",
    eval_result.get("eval_rougeL", 0),
)
# 업계 기준 예시: ROUGE-1 30~35% 이상이면 실무 적용 가능 수준
if eval_result.get("eval_rouge1", 0) >= 30:
    print("✅ 업계 기준 도달!")
else:
    print("⚠️ 업계 기준 미달. 데이터/전처리/파인튜닝 추가 필요")

In [None]:
# 6. 실제 샘플 요약 및 비교
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(
        device
    )
    with torch.no_grad():
        summary_ids = model.generate(
            **inputs, max_length=128, num_beams=3, early_stopping=True
        )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


for i in range(3):
    print(f"원본: {X_val[i][:100]}...")
    print(f"실제요약: {y_val[i]}")
    print(f"KoBART요약: {generate_summary(X_val[i])}")
    print("-" * 60)