# 질문 분류기 (Question Classifier)

In [1]:
%cd ..
!pip install -q numpy<2.0
!pip install -q datasets pandas
!pip install -q transformers==4.40.0
!pip install -q accelerate==0.21.0
!pip install -q torch==2.5.1

/bin/bash: line 1: 2.0: No such file or directory
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m118.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m101.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 4.1.0 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.40.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import json
import os
from tqdm import tqdm
from sklearn.metrics import f1_score

from transformers import AutoTokenizer, AutoModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "klue/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [7]:
class QClassifier(nn.Module):
    def __init__(self, num_classes=5):
        super(QClassifier, self).__init__()
        self.basemodel = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.basemodel.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.basemodel(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

model = QClassifier()
model = model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class QuestionDataset(Dataset):
    def __init__(self, json_path, tokenizer, max_len=64):
        with open(json_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoded = self.tokenizer(
            item["question"],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(),
            "attention_mask": encoded["attention_mask"].squeeze(),
            "token_type_ids": encoded["token_type_ids"].squeeze(),
            "label": torch.tensor(item["label"])
        }

In [9]:
from sklearn.model_selection import train_test_split

# 원본 전체 데이터 로드
with open("./data/train.json", "r", encoding="utf-8") as f:
    full_data = json.load(f)

# 80% train, 20% validation split
train_data, val_data = train_test_split(full_data, test_size=0.2, stratify=[x["label"] for x in full_data], random_state=42)

# 파일로 저장
os.makedirs("./data", exist_ok=True)

with open("./data/train_split.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)

with open("./data/val_split.json", "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False, indent=2)

# 기존처럼 dataset 생성
train_dataset = QuestionDataset("./data/train_split.json", tokenizer)
val_dataset = QuestionDataset("./data/val_split.json", tokenizer)

# 각각의 DataLoader 구성
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

In [10]:
from sklearn.metrics import f1_score

for epoch in range(6):
    # 학습 단계
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"[Epoch {epoch+1}] Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"[Epoch {epoch+1}] Train Loss: {total_loss:.4f}")

    # 검증 단계
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"[Epoch {epoch+1}] Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average="macro")
    print(f"[Epoch {epoch+1}] Val Loss: {val_loss:.4f}, F1 Score: {f1:.4f}")

[Epoch 1] Training: 100%|██████████| 26/26 [00:15<00:00,  1.67it/s]


[Epoch 1] Train Loss: 23.6828


[Epoch 1] Validation: 100%|██████████| 7/7 [00:01<00:00,  6.43it/s]


[Epoch 1] Val Loss: 0.7658, F1 Score: 0.9805


[Epoch 2] Training: 100%|██████████| 26/26 [00:14<00:00,  1.74it/s]


[Epoch 2] Train Loss: 2.5481


[Epoch 2] Validation: 100%|██████████| 7/7 [00:01<00:00,  6.50it/s]


[Epoch 2] Val Loss: 0.1872, F1 Score: 0.9900


[Epoch 3] Training: 100%|██████████| 26/26 [00:15<00:00,  1.72it/s]


[Epoch 3] Train Loss: 1.8548


[Epoch 3] Validation: 100%|██████████| 7/7 [00:01<00:00,  6.42it/s]


[Epoch 3] Val Loss: 0.4538, F1 Score: 0.9798


[Epoch 4] Training: 100%|██████████| 26/26 [00:15<00:00,  1.70it/s]


[Epoch 4] Train Loss: 1.6704


[Epoch 4] Validation: 100%|██████████| 7/7 [00:01<00:00,  6.29it/s]


[Epoch 4] Val Loss: 1.4463, F1 Score: 0.9311


[Epoch 5] Training: 100%|██████████| 26/26 [00:15<00:00,  1.68it/s]


[Epoch 5] Train Loss: 1.4594


[Epoch 5] Validation: 100%|██████████| 7/7 [00:01<00:00,  6.13it/s]


[Epoch 5] Val Loss: 0.7705, F1 Score: 0.9615


[Epoch 6] Training: 100%|██████████| 26/26 [00:15<00:00,  1.63it/s]


[Epoch 6] Train Loss: 0.6587


[Epoch 6] Validation: 100%|██████████| 7/7 [00:01<00:00,  5.76it/s]

[Epoch 6] Val Loss: 0.4727, F1 Score: 0.9902





In [12]:

# test_cls.json 예측 후 cls_output.json 저장
import os
os.makedirs("outputs", exist_ok=True)

# test 데이터 로드
with open("./data/test_cls.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

test_questions = [x["question"] for x in test_data]

# 토큰화
tokenized = tokenizer(
    test_questions,
    padding=True,
    truncation=True,
    return_tensors="pt"
).to(device)

# 추론
model.eval()
with torch.no_grad():
    outputs = model(**tokenized)
    preds = torch.argmax(outputs, dim=-1).tolist()

# 결과 저장
cls_output = [{"question": q, "label": l} for q, l in zip(test_questions, preds)]
with open("outputs/cls_output.json", "w", encoding="utf-8") as f:
    json.dump(cls_output, f, ensure_ascii=False, indent=2)

print("cls_output.json 저장 완료!")


✅ cls_output.json 저장 완료!
