In [12]:
import json
import random
import os

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup,
)
from torch.cuda.amp import autocast, GradScaler
from tqdm.auto import tqdm

# =========================
# 0. 기본 설정 (경로 / 시드)
# =========================

DATA_FILE = "/content/감성대화말뭉치(최종데이터)_Training.json"

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# =========================
# 1. 감정 매핑 + HS01만 사용
# =========================

def map_emotion_id_to_merged_label(eid: str):
    """
    AI Hub emotion.type (예: 'E18')을 5개 감정으로 매핑

    E10~19 → 분노
    E20~29 → 슬픔
    E30~39 → 불안
    E40~59 → 당황   (상처+당황 통합)
    E60~69 → 기쁨
    """
    if not isinstance(eid, str):
        return None
    if not eid.startswith("E"):
        return None
    try:
        num = int(eid[1:])
    except ValueError:
        return None

    if 10 <= num <= 19:
        return "분노"
    elif 20 <= num <= 29:
        return "슬픔"
    elif 30 <= num <= 39:
        return "불안"
    elif 40 <= num <= 59:
        return "당황"  # 상처(E40~49) + 당황(E50~59) 통합
    elif 60 <= num <= 69:
        return "기쁨"
    else:
        return None


def load_hs01_dataset(path: str):
    """
    JSON에서 HS01만 뽑아서 (text, label) 리스트 생성
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts = []
    labels = []

    for item in data:
        profile = item.get("profile", {}) or {}
        emo = profile.get("emotion", {}) or {}
        emo_id = emo.get("type") or emo.get("emotion-id") or emo.get("감정_대분류")
        label = map_emotion_id_to_merged_label(emo_id)
        if label is None:
            continue

        talk = item.get("talk", {}) or {}
        content = talk.get("content", {}) or {}
        hs01 = content.get("HS01")
        if isinstance(hs01, str):
            hs01 = hs01.strip()
        if not hs01:
            continue

        texts.append(hs01)
        labels.append(label)

    return texts, labels


print("[LOAD] HS01 + 병합 라벨로 데이터 로드 중...")
texts, labels = load_hs01_dataset(DATA_FILE)
print("[LOAD] 샘플 개수:", len(texts))

from collections import Counter
cnt = Counter(labels)
print("[LABEL COUNTS]", cnt)

# =========================
# 2. 라벨 인덱싱
# =========================

label_list = ["기쁨", "당황", "분노", "불안", "슬픔"]
label2id = {lab: i for i, lab in enumerate(label_list)}
id2label = {i: lab for lab, i in label2id.items()}

print("[LABEL2ID]", label2id)

# =========================
# 3. Train / Validation 분할
# =========================

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=SEED,
    stratify=labels,
)

print("[SPLIT] train:", len(train_texts), "val:", len(val_texts))

# =========================
# 4. Dataset / DataLoader
# =========================

tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

class EmotionDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = [label2id[l] for l in labels]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_ds = EmotionDataset(train_texts, train_labels)
val_ds   = EmotionDataset(val_texts,   val_labels)

batch_size = 32
accum_steps = 2  # 효과적으로 batch 64

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=64, shuffle=False, num_workers=2)

# =========================
# 5. 모델 / 옵티마이저 / 스케줄러
# =========================

model = AutoModelForSequenceClassification.from_pretrained(
    "klue/roberta-base",
    num_labels=len(label_list),
    label2id=label2id,
    id2label=id2label,
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

EPOCHS = 2
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(total_steps * 0.1)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

scaler = GradScaler()

# =========================
# 6. 평가 함수
# =========================

def evaluate():
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            for k in batch:
                batch[k] = batch[k].to(device)

            with autocast():
                outputs = model(**batch)
                logits = outputs.logits
                loss = F.cross_entropy(
                    logits,
                    batch["labels"],
                    label_smoothing=0.1,  # 라벨 스무딩
                )

            total_loss += loss.item()
            preds = logits.argmax(dim=-1)
            correct += (preds == batch["labels"]).sum().item()
            total += preds.size(0)

    avg_loss = total_loss / len(val_loader)
    acc = correct / total if total > 0 else 0.0
    return avg_loss, acc

# =========================
# 7. 학습 루프
# =========================

for epoch in range(1, EPOCHS + 1):
    print(f"\n===== Epoch {epoch}/{EPOCHS} =====")
    model.train()
    running_loss = 0.0

    optimizer.zero_grad(set_to_none=True)

    for step, batch in enumerate(tqdm(train_loader)):
        for k in batch:
            batch[k] = batch[k].to(device)

        with autocast():
            outputs = model(**batch)
            logits = outputs.logits
            loss = F.cross_entropy(
                logits,
                batch["labels"],
                label_smoothing=0.1,
            )
            loss = loss / accum_steps

        scaler.scale(loss).backward()

        if (step + 1) % accum_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)
    print(f"Train Loss: {avg_train_loss:.4f}")

    val_loss, val_acc = evaluate()
    print(f"Val Loss : {val_loss:.4f}")
    print(f"Val Acc  : {val_acc:.4f}")

print("\n[DONE] 학습 완료")


device: cuda
[LOAD] HS01 + 병합 라벨로 데이터 로드 중...
[LOAD] 샘플 개수: 51628
[LABEL COUNTS] Counter({'당황': 17898, '불안': 9319, '분노': 9160, '슬픔': 9125, '기쁨': 6126})
[LABEL2ID] {'기쁨': 0, '당황': 1, '분노': 2, '불안': 3, '슬픔': 4}
[SPLIT] train: 46465 val: 5163


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Epoch 1/2 =====


  scaler = GradScaler()


  0%|          | 0/1453 [00:00<?, ?it/s]

  with autocast():


Train Loss: 0.6022


  with autocast():


Val Loss : 1.0924
Val Acc  : 0.6295

===== Epoch 2/2 =====


  0%|          | 0/1453 [00:00<?, ?it/s]

Train Loss: 0.5226
Val Loss : 1.0781
Val Acc  : 0.6390

[DONE] 학습 완료


In [14]:
save_path = "/content/moodtown_emotion_model"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("저장 완료:", save_path)

저장 완료: /content/moodtown_emotion_model


In [17]:
from google.colab import files
!zip -r moodtown_emotion_model.zip /content/moodtown_emotion_model
files.download("moodtown_emotion_model.zip")

  adding: content/moodtown_emotion_model/ (stored 0%)
  adding: content/moodtown_emotion_model/vocab.txt (deflated 49%)
  adding: content/moodtown_emotion_model/config.json (deflated 54%)
  adding: content/moodtown_emotion_model/special_tokens_map.json (deflated 85%)
  adding: content/moodtown_emotion_model/tokenizer.json (deflated 69%)
  adding: content/moodtown_emotion_model/model.safetensors (deflated 11%)
  adding: content/moodtown_emotion_model/tokenizer_config.json (deflated 75%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>