# 📘 KLUE RoBERTa(klue/roberta-large) 기반 질문 분류기 (Question Classifier)
[구글 코랩 전용]

In [None]:
"""
!sudo apt update -y
!sudo apt install python3.10.12 -y

!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10.12 2
!sudo update-alternatives --config python3

!sudo apt install python3-pip -y
"""

In [None]:
# Print the version reported by the `python` executable in the runtime shell.
!python --version


In [None]:
!pip install -q numpy<2.0
!pip install -q datasets pandas
!pip install -q transformers==4.40.0
!pip install -q accelerate==0.21.0
!pip install -q torch==2.5.1
!pip install -q gluonnlp==0.10.0
!pip install -q kobert-transformers

/bin/bash: line 1: 2.0: No such file or directory
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m102.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.40.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import json
from tqdm import tqdm
from sklearn.metrics import f1_score

from transformers import AutoTokenizer, AutoModel
# Checkpoint name shared by the tokenizer below and the encoder in QClassifier.
model_name = "klue/roberta-large"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class QClassifier(nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    The encoder is loaded from the checkpoint named by the module-level
    ``model_name``; the linear layer maps the encoder's hidden size to
    ``num_classes`` outputs.
    """

    def __init__(self, num_classes=5):
        super().__init__()
        self.basemodel = AutoModel.from_pretrained(model_name)
        hidden_size = self.basemodel.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Apply the linear classifier to the encoder's ``pooler_output``."""
        encoder_out = self.basemodel(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        logits = self.classifier(encoder_out.pooler_output)
        return logits

# Build the classifier and move it to the selected device in one step.
model = QClassifier().to(device)

In [None]:
class QuestionDataset(Dataset):
    """Dataset of tokenized questions loaded from a JSON file.

    The file is read once at construction time and is expected to hold a list
    of objects, each with a ``"question"`` entry (passed to the tokenizer) and
    a ``"label"`` entry (converted to an integer tensor).

    Args:
        json_path: Path of the UTF-8 encoded JSON file.
        tokenizer: Callable invoked with the question text plus the
            ``padding``, ``truncation``, ``max_length``, ``return_tensors``
            and ``return_token_type_ids`` keyword arguments; it must return a
            mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` tensors that carry a leading batch dimension.
        max_len: Length every question is padded or truncated to.
    """

    def __init__(self, json_path, tokenizer, max_len=64):
        with open(json_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of items loaded from the JSON file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized question and label tensor for item ``idx``."""
        item = self.data[idx]
        encoded = self.tokenizer(
            item["question"],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
            # Request segment ids explicitly so the lookup below does not
            # depend on the tokenizer's default outputs.
            return_token_type_ids=True,
        )
        # squeeze(0) removes only the batch dimension added by
        # return_tensors='pt'; a bare squeeze() would also drop the sequence
        # dimension when max_len == 1.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "token_type_ids": encoded["token_type_ids"].squeeze(0),
            # Class-index targets for nn.CrossEntropyLoss must be int64.
            "label": torch.tensor(item["label"], dtype=torch.long),
        }

In [None]:
from sklearn.model_selection import train_test_split

# Load the complete labelled dataset.
with open("/content/train.json", "r", encoding="utf-8") as f:
    full_data = json.load(f)

# Stratified 80% / 20% train/validation split with a fixed seed.
stratify_labels = [x["label"] for x in full_data]
train_data, val_data = train_test_split(
    full_data,
    test_size=0.2,
    stratify=stratify_labels,
    random_state=42,
)

# Write both splits to disk so they can be loaded through QuestionDataset.
for split_path, split_rows in (
    ("/content/train_split.json", train_data),
    ("/content/val_split.json", val_data),
):
    with open(split_path, "w", encoding="utf-8") as f:
        json.dump(split_rows, f, ensure_ascii=False, indent=2)

# Datasets backed by the files written above.
train_dataset = QuestionDataset("/content/train_split.json", tokenizer)
val_dataset = QuestionDataset("/content/val_split.json", tokenizer)

# Only the training loader shuffles its samples.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

In [None]:
from sklearn.metrics import f1_score

# Fine-tune for six epochs, running a validation pass after each one.
# Losses are reported as per-sample means: a raw sum over batches grows with
# the number of batches, which makes the train and validation figures
# incomparable because the two loaders have different lengths.
for epoch in range(6):
    # Training phase
    model.train()
    total_loss = 0.0
    train_seen = 0
    for batch in tqdm(train_loader, desc=f"[Epoch {epoch+1}] Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size
        # to keep a smaller final batch from being over-represented.
        total_loss += loss.item() * labels.size(0)
        train_seen += labels.size(0)

    train_loss = total_loss / max(train_seen, 1)
    print(f"[Epoch {epoch+1}] Train Loss: {train_loss:.4f}")

    # Validation phase
    model.eval()
    val_loss_sum = 0.0
    val_seen = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"[Epoch {epoch+1}] Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            val_loss_sum += loss.item() * labels.size(0)
            val_seen += labels.size(0)

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_loss = val_loss_sum / max(val_seen, 1)
    f1 = f1_score(all_labels, all_preds, average="macro")
    print(f"[Epoch {epoch+1}] Val Loss: {val_loss:.4f}, F1 Score: {f1:.4f}")

In [None]:

from sklearn.metrics import accuracy_score, classification_report

def evaluate_accuracy(model, tokenizer, test_path, label_key="label", device="cuda"):
    """Evaluate ``model`` on a JSON test file and print a report.

    Each item is tokenized and classified on its own. Accuracy, macro F1, a
    classification report and every misclassified question are printed.
    Items that raise while being processed are reported and skipped.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)``; ``argmax`` over dim 1 of its output is taken
            as the predicted label.
        tokenizer: Tokenizer called on each item's ``"question"`` text.
        test_path: Path of a UTF-8 JSON file holding a list of items.
        label_key: Key under which each item stores its true label.
        device: Device used for inference. A CUDA device is replaced by
            ``"cpu"`` when CUDA is not available.

    Returns:
        The accuracy as a fraction, or ``0.0`` when the file cannot be read
        or no item could be evaluated.
    """
    # 1) Load the test data.
    try:
        with open(test_path, "r", encoding="utf-8") as f:
            test_data = json.load(f)
    except Exception as e:
        print(f"❌ 파일 읽기 오류: {e}")
        return 0.0

    # The default device is "cuda"; fall back instead of crashing in
    # model.to() on a CPU-only runtime.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    model.eval()
    model.to(device)
    preds, trues = [], []
    wrong_samples = []  # misclassified items

    for item in test_data:
        try:
            encoded = tokenizer(
                item["question"],
                padding='max_length',
                truncation=True,
                max_length=64,
                return_tensors='pt',
                return_token_type_ids=True
            )
            # The model's forward() requires token_type_ids; they are
            # overwritten with zeros so it always receives an all-zero tensor.
            encoded["token_type_ids"] = torch.zeros_like(encoded["input_ids"])

            input_ids = encoded["input_ids"].to(device)
            attention_mask = encoded["attention_mask"].to(device)
            token_type_ids = encoded["token_type_ids"].to(device)
            label = item[label_key]

            with torch.no_grad():
                output = model(input_ids, attention_mask, token_type_ids)
                pred_label = torch.argmax(output, dim=1).item()

            preds.append(pred_label)
            trues.append(label)

            if pred_label != label:
                wrong_samples.append({
                    "question": item["question"],
                    "true": label,
                    "pred": pred_label
                })

        except Exception as e:
            # Best effort: report the failing item and keep evaluating.
            print(f"⚠️ 샘플 처리 오류 (무시됨): {e}")
            continue

    # The metric functions cannot handle empty inputs, so stop early when the
    # file was empty or every item failed.
    if not trues:
        print("⚠️ 평가할 수 있는 샘플이 없습니다.")
        return 0.0

    # Accuracy and report.
    acc = accuracy_score(trues, preds)
    print(f"\n✅ Accuracy: {acc*100:.2f}%")
    print(f"🎯 F1 Score (macro): {f1_score(trues, preds, average='macro'):.4f}")

    print("\n📊 Classification Report:")
    print(classification_report(trues, preds, digits=3))

    # Print every misclassified item (no cap is applied).
    print(f"\n❌ 잘못 분류된 문항 수: {len(wrong_samples)} / {len(test_data)}")
    for sample in wrong_samples:
        print(f"Q: {sample['question']}\n→ 예측: {sample['pred']} / 실제: {sample['true']}\n")

    return acc

# Run the evaluation on /content/test1.json with the default label key and device.
evaluate_accuracy(model, tokenizer, test_path="/content/test1.json")

In [None]:
import torch

# Label id -> category name. Built once, outside the loop, because it does
# not change between questions.
label_map = {
    0: "졸업 요건",
    1: "공지사항",
    2: "학사 일정",
    3: "식단 안내",
    4: "셔틀버스/통학"
}

model.eval()  # inference mode

# Interactive loop: read a question, classify it, print the predicted label.
while True:
    user_input = input("질문을 입력하세요 (종료하려면 'exit' 또는 '종료'): ")
    query = user_input.strip()

    if query.lower() in ["exit", "종료"]:
        print("테스트를 종료합니다.")
        break

    # A blank line carries no question, so ask again instead of classifying it.
    if not query:
        continue

    encoded = tokenizer(
        user_input,
        padding='max_length',
        truncation=True,
        max_length=64,
        return_tensors='pt'
    )

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    # Use all-zero segment ids when the tokenizer does not return any.
    token_type_ids = encoded.get("token_type_ids", torch.zeros_like(input_ids)).to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Accept either a raw tensor or an output object exposing `.logits`.
        logits = outputs.logits if hasattr(outputs, "logits") else outputs
        predicted_label = torch.argmax(logits, dim=1).item()

    # .get() keeps the loop alive if the model emits an id missing from label_map.
    label_name = label_map.get(predicted_label, "알 수 없음")
    print(f"🔍 예측된 라벨: {predicted_label} ({label_name})\n")