<a href="https://colab.research.google.com/github/auberr/sparta_hp_ai/blob/main/sparta_AI_4_Text_classification_prac.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[4주차] 기본과제: HuggingFace로 두 문장의 논리적 모순 분류하기

In [None]:
!pip install datasets transformers evaluate --quiet

import os
import random
import numpy as np
import evaluate
from datasets import load_dataset, DatasetDict
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

# 1) Load the MNLI task of GLUE (repository "nyu-mll/glue", config "mnli").
#    The dataset ships several splits (train, validation_matched,
#    validation_mismatched, ...), but the assignment only allows the 'train'
#    split for training, so the validation set must be carved out of 'train' too.
raw_datasets = load_dataset(path="nyu-mll/glue", name="mnli")

# 2) MNLI is a 3-way task (0: entailment, 1: neutral, 2: contradiction).
#    Printing the DatasetDict shows the splits and columns that are available.
print(raw_datasets)

# 3) Choose the checkpoint (DistilBERT as an example), then build the matching
#    tokenizer and a sequence-classification model with three output labels.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
)

# 3) Use only the first 10,000 examples of the train split.
train_split_full = raw_datasets["train"]
# Size of the complete train split, for reference.
print("Original train dataset length:", len(train_split_full))

# Clamp to the dataset size so select() never receives out-of-range indices.
max_train = min(10000, len(train_split_full))
train_split_10k = train_split_full.select(range(max_train))

print("Reduced train dataset length:", len(train_split_10k))

# 4) Split the subset 90% (training) / 10% (validation).
#    The shuffle uses its own seeded RNG so that the train/validation
#    membership is identical on every run (the reported metrics are otherwise
#    not reproducible) and the global `random` state is left untouched.
split_seed = 42
train_size = int(len(train_split_10k) * 0.9)
indices = list(range(len(train_split_10k)))
random.Random(split_seed).shuffle(indices)
train_indices = indices[:train_size]
valid_indices = indices[train_size:]

train_dataset = train_split_10k.select(train_indices)
valid_from_train_dataset = train_split_10k.select(valid_indices)

print(f"Train dataset size: {len(train_dataset)}, Valid from train size: {len(valid_from_train_dataset)}")


# 5) Tokenize function: encode premise and hypothesis together as one pair.
def tokenize_function(examples):
    """Tokenize the premise/hypothesis columns of ``examples`` as sentence pairs.

    Args:
        examples: Mapping with ``"premise"`` and ``"hypothesis"`` entries
            (used with ``Dataset.map(..., batched=True)`` in this file).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, with
        truncation enabled.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    return tokenizer(premises, hypotheses, truncation=True)

# 6) Tokenize the two splits carved out of 'train' (training first, then
#    validation), mapping in batches.
train_dataset, valid_from_train_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_from_train_dataset)
)

# validation_matched is reserved for the final evaluation only.
val_matched_dataset = raw_datasets["validation_matched"]
val_matched_dataset = val_matched_dataset.map(tokenize_function, batched=True)

# 7) Collator that pads the tokenized examples when a batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer)

# 8) Accuracy is the evaluation metric.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max predictions
        (taken over the last axis of the logits) against ``labels``.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = accuracy_metric.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )
    return result

# 9) Configure TrainingArguments.
#    NOTE: `eval_strategy` replaces the deprecated `evaluation_strategy`
#    keyword, which newer transformers releases no longer accept.
training_args = TrainingArguments(
    output_dir="mnli-distilbert-output",
    eval_strategy="epoch",          # run validation at the end of every epoch
    save_strategy="epoch",          # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,            # train for 10 epochs
    weight_decay=0.01,
    logging_steps=50,
    logging_dir="logs",
    # Reload the best checkpoint once training finishes. No
    # `metric_for_best_model` is given, so the library default applies
    # (documented as the validation loss -- see the TrainingArguments docs).
    load_best_model_at_end=True,
    # The next two options keep logging inside Transformers instead of W&B.
    report_to=["none"],             # do not log to wandb
    run_name="mnli-distilbert-run"  # explicit run name (wandb is not used)
)

# 10) Build the Trainer.
#     The tokenizer is passed as `processing_class`: the older `tokenizer=`
#     keyword of Trainer is deprecated and triggers a FutureWarning when the
#     Trainer is constructed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_from_train_dataset,  # 10% held out from the train split
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11) Train the model (progress logs stay in the Colab cell output), then save it.
train_result = trainer.train()
trainer.save_model()

# 12) Final evaluation on "validation_matched". This split is used neither
#     for training nor for per-epoch validation.
val_result = trainer.evaluate(val_matched_dataset)
print("Validation matched result:", val_result)
print(f"Accuracy on validation_matched = {val_result['eval_accuracy']:.4f}")


DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Original train dataset length: 392702
Reduced train dataset length: 10000
Train dataset size: 9000, Valid from train size: 1000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8709,0.748942,0.677
2,0.6381,0.710463,0.695
3,0.4641,0.771421,0.711


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8709,0.748942,0.677
2,0.6381,0.710463,0.695
3,0.4641,0.771421,0.711
4,0.273,0.96759,0.701
5,0.196,1.174536,0.705
6,0.1769,1.424419,0.699
7,0.0717,1.654369,0.707
8,0.0696,1.778924,0.702
9,0.0218,1.889823,0.692
10,0.0117,1.90369,0.7


Validation matched result: {'eval_loss': 0.7893429398536682, 'eval_accuracy': 0.6649006622516557, 'eval_runtime': 22.7024, 'eval_samples_per_second': 432.333, 'eval_steps_per_second': 27.046, 'epoch': 10.0}
Accuracy on validation_matched = 0.6649
축하합니다! 50% 이상 달성하였습니다.
