In [2]:
# 공통 import
import os, json, math, numpy as np, pandas as pd, torch
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

# Output directory for every artifact of the 384-token run (created if absent).
OUT_DIR = Path("/data") / "kobert_outputs_384"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Compute device: use CUDA when torch reports it as available, else the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

# ====== Input spreadsheets ======
TRAIN_XLSX = "D:/학교/대외활동/학술제/train_dataset.xlsx"
VALID_XLSX = "D:/학교/대외활동/학술제/validation_dataset.xlsx"

# ====== Load data ======
# Train is read first, then validation; each becomes a DataFrame.
train, valid = (pd.read_excel(xlsx_path) for xlsx_path in (TRAIN_XLSX, VALID_XLSX))


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader

# ====== Column-name mapping (only headers that are present get renamed) ======
# Both the English and the Korean source headers are folded onto one
# canonical lowercase schema.
rename = {
    "ID":"id","Datastamp":"datastamp","title":"title","abstract":"abstract",
    "publisher":"publisher","issn":"issn","creator":"creator","label":"label",
    "게재일자":"datastamp","제목":"title","요약":"abstract","학회지":"publisher","저자":"creator","주제분류":"label",
}
train = train.rename(columns=rename); valid = valid.rename(columns=rename)


# ====== Basic cleanup & model input text ======
def _clean_text_columns(df):
    """Return a copy of *df* whose known text columns are stripped strings.

    Missing cells become "" *before* the string cast, so they never turn into
    the literal text "nan" and can be detected reliably afterwards.
    """
    df = df.copy()
    for c in ("title", "abstract", "publisher", "issn", "creator", "label"):
        if c in df.columns:
            col = df[c].astype(object)
            df[c] = col.where(col.notna(), "").astype(str).str.strip()
    return df


def _build_text(df):
    """Join title and abstract with a literal " [SEP] " marker.

    A column that is absent from *df* is treated as all-empty instead of
    raising on the string accessor.
    """
    def _col(name):
        if name in df.columns:
            return df[name]
        return pd.Series("", index=df.index, dtype=object)
    return (_col("title") + " [SEP] " + _col("abstract")).str.strip()


train = _clean_text_columns(train)
valid = _clean_text_columns(valid)

# Rows without a label cannot be used for supervised training; without this
# filter the empty string would silently become a class of its own.
train = train[train["label"] != ""].reset_index(drop=True)

train["text"] = _build_text(train)
valid["text"] = _build_text(valid)

# ====== Label encoding (fitted on train only) ======
labels: List[str] = sorted(train["label"].unique().tolist())
lab2id: Dict[str,int] = {l:i for i,l in enumerate(labels)}
id2lab: Dict[int,str] = {i:l for l,i in lab2id.items()}

train["y"] = train["label"].map(lab2id).astype(int)
valid["y"] = valid["label"].map(lab2id)

# A validation label that is empty or never occurs in train has no class id
# (NaN after the map); such rows would crash the integer cast in the dataset.
_unusable = valid["y"].isna()
if _unusable.any():
    print(f"[warn] dropping {int(_unusable.sum())} validation rows with a missing label or a label unseen in train")
    valid = valid[~_unusable].reset_index(drop=True)
valid["y"] = valid["y"].astype(int)

# Persist the mapping (for serving / restoring the label space)
(OUT_DIR/"label2id.json").write_text(json.dumps(lab2id, ensure_ascii=False, indent=2), encoding="utf-8")

# ====== Hyperparameters ======
MODEL_NAME = "skt/kobert-base-v1"  # HF KoBERT
MAX_LEN    = 384
BATCH      = 32
EPOCHS     = 50
LR         = 2e-5
PATIENCE   = 8  # early stopping on validation macro-F1

# ====== 데이터셋 ======
class PaperDS(Dataset):
    """Map-style dataset that tokenizes one row's text on each access.

    The frame must provide a "text" column (model input) and a "y" column
    (value convertible with ``int``). The index is reset on construction so
    that positions 0..len-1 can be used as row keys.
    """

    def __init__(self, df: pd.DataFrame, tok, max_len: int):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i: int):
        # Read both cells first so a bad label fails before tokenization.
        raw_text = self.df.at[i, "text"]
        target = int(self.df.at[i, "y"])

        # Every sample is truncated/padded to exactly max_len by the tokenizer.
        encoded = self.tok(
            raw_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
        )

        sample = {}
        for field, values in encoded.items():
            sample[field] = torch.tensor(values)
        sample["labels"] = torch.tensor(target)
        return sample

# ====== Tokenizer / model ======
# NOTE(review): the skt/kobert-base-v1 checkpoint is usually paired with its
# own KoBERTTokenizer; confirm that AutoTokenizer yields the intended
# special-token layout for this model — TODO verify.
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(lab2id),
)
model = model.to(device)

# ====== DataLoaders ======
# Only the training loader shuffles; validation keeps the frame's row order.
train_ds, valid_ds = PaperDS(train, tok, MAX_LEN), PaperDS(valid, tok, MAX_LEN)
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=BATCH, shuffle=False)

# ====== Imbalance correction: per-class loss weights ======
# sklearn's "balanced" weights computed from the training labels, moved to
# the training device as float32 and handed to the cross-entropy loss.
cw = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.arange(len(lab2id)),
        y=train["y"].values,
    ),
    dtype=torch.float32,
    device=device,
)
criterion = torch.nn.CrossEntropyLoss(weight=cw)

# ====== Optimizer / scheduler ======
# The schedule is sized for the full EPOCHS budget, with the first 10% of
# the optimizer steps used for warmup.
optim = torch.optim.AdamW(model.parameters(), lr=LR)
total_steps = EPOCHS * len(train_dl)
sched = get_linear_schedule_with_warmup(
    optim,
    num_warmup_steps=int(0.1*total_steps),
    num_training_steps=total_steps,
)

# ====== Training-loop state ======
best_f1, wait = 0.0, 0
def evaluate(model):
    """Score *model* on the validation loader.

    Reads the module-level ``valid_dl``, ``device`` and ``id2lab``. The model
    is switched to eval mode and left there; callers re-enable training mode.

    Returns:
        A tuple ``(macro_f1, report, preds, trues)``: the macro-averaged F1,
        the text classification report, and the predicted / gold class ids in
        loader order.
    """
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in valid_dl:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
            trues.extend(batch["labels"].cpu().tolist())
    macro_f1 = f1_score(trues, preds, average="macro")

    # Pin the label set explicitly: without `labels=` the report infers the
    # classes from the data and raises when their count differs from the
    # number of target names (a class absent from both gold and predictions).
    class_ids = list(range(len(id2lab)))
    rep = classification_report(
        trues,
        preds,
        labels=class_ids,
        target_names=[id2lab[i] for i in class_ids],
        digits=4,
        zero_division=0,
    )
    return macro_f1, rep, preds, trues

# Fine-tune for at most EPOCHS epochs; after each epoch score the validation
# set, dump that epoch's predictions, and checkpoint on macro-F1 improvement.
for epoch in range(1, EPOCHS+1):
    model.train()
    for batch in train_dl:
        batch = {k:v.to(device) for k,v in batch.items()}
        logits = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits
        loss = criterion(logits, batch["labels"])
        optim.zero_grad(); loss.backward()
        # Clip the global gradient norm to 1.0 before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step(); sched.step()

    f1, rep, pv, yv = evaluate(model)
    print(f"[384] Epoch {epoch} | Val Macro-F1: {f1:.4f}")
    # Save this epoch's predictions
    pd.DataFrame({
        "id": valid["id"] if "id" in valid.columns else range(len(valid)),
        "gold": [id2lab[i] for i in yv],
        "pred": [id2lab[i] for i in pv],
    }).to_csv(OUT_DIR / f"preds_epoch{epoch}.csv", index=False, encoding="utf-8")

    if f1 > best_f1:
        best_f1, wait = f1, 0
        model.save_pretrained(OUT_DIR/"best_model")
        tok.save_pretrained(OUT_DIR/"best_model")
    else:
        # Stops once more than PATIENCE consecutive epochs fail to improve.
        wait += 1
        if wait > PATIENCE:
            break

print("[384] Best Val Macro-F1:", round(best_f1, 4))

# ====== Final report / confusion matrix ======
# Use the predictions of the last epoch that actually ran. The path is built
# from the epoch number because sorting the file names is lexicographic
# ("epoch9" sorts after "epoch50") and a glob could also pick up stale files
# left by an earlier run.
# NOTE(review): this is the last epoch, not necessarily the epoch whose
# weights were saved as best_model.
last_pred_path = OUT_DIR / f"preds_epoch{epoch}.csv"
# Read labels as plain strings; default NA parsing would turn label text such
# as "" or "NA" into NaN and break the sort below.
pred_df = pd.read_csv(last_pred_path, dtype={"gold": str, "pred": str}, keep_default_na=False)
labels_sorted = sorted(set(pred_df["gold"]) | set(pred_df["pred"]))
lab2tmp = {l:i for i,l in enumerate(labels_sorted)}
tmp_ids = list(range(len(labels_sorted)))
cm = confusion_matrix(pred_df["gold"].map(lab2tmp), pred_df["pred"].map(lab2tmp), labels=tmp_ids)

# Draw the confusion matrix (simple version)
plt.figure(figsize=(7,6))
plt.imshow(cm, aspect='auto')
plt.xticks(range(len(labels_sorted)), labels_sorted, rotation=45, ha='right')
plt.yticks(range(len(labels_sorted)), labels_sorted)
plt.xlabel("Predicted"); plt.ylabel("True"); plt.title("KoBERT Confusion Matrix (384)")
plt.colorbar()
plt.tight_layout()
plt.savefig(OUT_DIR/"confusion_matrix.png", dpi=220)
plt.show()

# Save the detailed per-label report
rep_json = classification_report(
    pred_df["gold"].map(lab2tmp),
    pred_df["pred"].map(lab2tmp),
    labels=tmp_ids,
    target_names=labels_sorted,
    output_dict=True,
    digits=4,
    zero_division=0,
)
pd.DataFrame(rep_json).transpose().to_csv(OUT_DIR/"label_report.csv", encoding="utf-8")
print("Saved to:", str(OUT_DIR))


KeyboardInterrupt: 

KeyboardInterrupt: 

In [None]:
# 선택 (정확도 향상)
from torch.nn import functional as F

# Output directory for the 512-token run (created if absent).
# NOTE(review): this run writes under /mnt/data while the 384 run uses
# /data — confirm that the different roots are intentional.
OUT_DIR_512 = Path("/mnt/data/kobert_outputs_512")
OUT_DIR_512.mkdir(parents=True, exist_ok=True)
MAX_LEN_512 = 512
BATCH_512 = 8

# 512-token data loaders
tok2 = tok  # the 384 run's tokenizer object is reused as-is
class PaperDS512(PaperDS):
    """Subclass of PaperDS for the 512-token run; adds no behavior."""

train_dl2 = DataLoader(
    PaperDS512(train, tok2, MAX_LEN_512),
    batch_size=BATCH_512,
    shuffle=True,
)
valid_dl2 = DataLoader(
    PaperDS512(valid, tok2, MAX_LEN_512),
    batch_size=BATCH_512,
    shuffle=False,
)

# 512 model and training objects: a fresh model from the same checkpoint,
# the same class weights and learning rate, and a schedule sized for this
# loader's step count.
model2 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(lab2id),
)
model2 = model2.to(device)
criterion2 = torch.nn.CrossEntropyLoss(weight=cw)
optim2 = torch.optim.AdamW(model2.parameters(), lr=LR)
total_steps2 = EPOCHS * len(train_dl2)
sched2 = get_linear_schedule_with_warmup(
    optim2,
    num_warmup_steps=int(0.1*total_steps2),
    num_training_steps=total_steps2,
)

best_f1_512, wait = 0.0, 0
for epoch in range(1, EPOCHS+1):
    # ---- train one epoch ----
    model2.train()
    for batch in train_dl2:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model2(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        ).logits
        loss = criterion2(logits, batch["labels"])
        optim2.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model2.parameters(), 1.0)
        optim2.step()
        sched2.step()

    # ---- validate ----
    model2.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in valid_dl2:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model2(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            ).logits
            preds += logits.argmax(dim=-1).cpu().tolist()
            trues += batch["labels"].cpu().tolist()
    f1 = f1_score(trues, preds, average="macro")
    print(f"[512] Epoch {epoch} | Val Macro-F1: {f1:.4f}")

    # ---- dump this epoch's predictions ----
    epoch_preds = pd.DataFrame({
        "id": valid["id"] if "id" in valid.columns else range(len(valid)),
        "gold": [id2lab[i] for i in trues],
        "pred": [id2lab[i] for i in preds],
    })
    epoch_preds.to_csv(OUT_DIR_512 / f"preds_epoch{epoch}.csv", index=False, encoding="utf-8")

    # ---- checkpoint on improvement, otherwise count towards early stop ----
    if f1 > best_f1_512:
        best_f1_512, wait = f1, 0
        model2.save_pretrained(OUT_DIR_512/"best_model")
        tok2.save_pretrained(OUT_DIR_512/"best_model")
        continue
    wait += 1
    if wait > PATIENCE:
        break

print("[512] Best Val Macro-F1:", round(best_f1_512, 4))

# ====== soft-vote 앙상블 (384/512) ======
def probs_from(model_dir, df, max_len, bs=32):
    """Run a saved classifier over *df* and collect class probabilities.

    The tokenizer and the model are both loaded from *model_dir*; rows are
    processed in frame order in batches of *bs*, tokenized to *max_len*.

    Returns:
        ``(probs, ys)``: the row-stacked softmax outputs as a NumPy array and
        the gold class ids read from the frame's "y" column as a NumPy array.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    loader = DataLoader(PaperDS(df, tokenizer, max_len), batch_size=bs, shuffle=False)
    net = AutoModelForSequenceClassification.from_pretrained(model_dir).to(device).eval()

    prob_chunks, gold = [], []
    with torch.no_grad():
        for batch in loader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            output = net(
                input_ids=on_device["input_ids"],
                attention_mask=on_device["attention_mask"],
            )
            # Softmax over the class axis, then back to host memory.
            prob_chunks.append(F.softmax(output.logits, dim=-1).cpu().numpy())
            gold += on_device["labels"].cpu().tolist()
    return np.vstack(prob_chunks), np.array(gold)

# Score the validation set with both saved checkpoints, each at the sequence
# length it was trained with (the constants, not repeated literals).
p384, yv = probs_from(str(OUT_DIR/"best_model"), valid, MAX_LEN, bs=32)
p512, _  = probs_from(str(OUT_DIR_512/"best_model"), valid, MAX_LEN_512, bs=32)

# Averaging is only meaningful when both matrices are (rows, classes) over
# the same rows and label space; fail loudly instead of letting NumPy
# broadcast mismatched shapes.
if p384.shape != p512.shape:
    raise ValueError(f"probability shapes differ: {p384.shape} vs {p512.shape}")

# Soft vote: unweighted mean of the two probability matrices, then argmax.
p_ens = (p384 + p512) / 2.0
pred_ens = p_ens.argmax(axis=1)

print("Ensemble Macro-F1:", f1_score(yv, pred_ens, average="macro"))
# Pin the label set so the report cannot raise when some class is absent
# from both the gold labels and the ensemble predictions.
_class_ids = list(range(len(id2lab)))
print(classification_report(
    yv,
    pred_ens,
    labels=_class_ids,
    target_names=[id2lab[i] for i in _class_ids],
    zero_division=0,
))
