# [4주차] 기본과제: HuggingFace로 두 문장의 논리적 모순 분류하기

In [133]:
!pip install transformers datasets evaluate accelerate scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [134]:
import numpy as np

# [MY CODE] MNLI 데이터셋 로드

In [135]:
from datasets import load_dataset

# load_dataset comes from the Hugging Face `datasets` library; here it fetches
# the "mnli" configuration of the "nyu-mll/glue" dataset.
mnli_dataset = load_dataset(path="nyu-mll/glue", name="mnli")
mnli_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

# [MY CODE] MNLI 데이터 확인

In [136]:
# Look at the first training example.
#   premise    : the given statement
#   hypothesis : the statement to judge against the premise
#   label      : 0 = entailment, 1 = neutral, 2 = contradiction
mnli_dataset["train"][0]

{'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'label': 1,
 'idx': 0}

# [MY CODE] DistilBERT 모델 & 토크나이저 로드

In [137]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the DistilBERT tokenizer and a sequence-classification model through
# the Auto* factory classes.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,  # MNLI has three classes
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [138]:
# Display the model's repr to inspect its layer structure.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# [MY CODE] 데이터 전처리

In [139]:
# Data preprocessing
def preprocess_function(data):
    """Tokenize a batch of premise/hypothesis pairs.

    Args:
        data: A batch as passed by ``Dataset.map(..., batched=True)``; its
            'premise' and 'hypothesis' columns are read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum model length and left unpadded.
    """
    # Only truncate here. Padding at map time would pad every example to the
    # longest sequence in its map batch and store those pad tokens in the
    # dataset. The Trainer is given the tokenizer, so it is expected to pad
    # each training batch at collation time instead (dynamic padding).
    return tokenizer(
        data['premise'],
        data['hypothesis'],
        truncation=True,
        max_length=tokenizer.model_max_length,
    )


mnli_tokenized = mnli_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [140]:
# List the fields of the first tokenized training example.
mnli_tokenized['train'][0].keys()

dict_keys(['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'])

In [141]:
# Show the splits and columns of the tokenized dataset.
mnli_tokenized

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 9847
    })
})

# [MY CODE] 데이터 Split

In [142]:
# Hold out 20% of the tokenized training split for validation. The fixed seed
# makes the shuffle, and therefore the split, identical on every run.
mnli_split = mnli_tokenized['train'].train_test_split(test_size=0.2, seed=42)

In [143]:
# Show the resulting train/test partitions and their sizes.
mnli_split

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 314161
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 78541
    })
})

In [144]:
# Training and validation sets come from the split of the original train data;
# the official 'validation_matched' split serves as the final test set.
mnli_train = mnli_split['train']
mnli_val = mnli_split['test']
mnli_test = mnli_tokenized['validation_matched']

# Sizes of the three sets, in (train, val, test) order.
len(mnli_train), len(mnli_val), len(mnli_test)

(314161, 78541, 9815)

# [MY CODE] 훈련 파라미터 설정

In [145]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',            # directory for checkpoints, logs, etc.
    per_device_train_batch_size=128,   # batch size for the training data
    per_device_eval_batch_size =128,   # batch size for the validation data
    num_train_epochs=10,               # number of training epochs
    logging_strategy="epoch",          # log training loss etc. at the end of every epoch
    do_train=True,                     # run training
    do_eval=True,                      # evaluate on the validation data during training
    eval_strategy="epoch",             # evaluate at the end of every epoch
    save_strategy="epoch",             # save a checkpoint at the end of every epoch
    learning_rate=5e-5,                # small LR for fine-tuning all pretrained weights;
                                       # values as large as 1e-3 tend to make training diverge
    load_best_model_at_end=True,       # after training, keep the checkpoint that did best on validation
    weight_decay=0.01,                 # weight decay
)

In [146]:
# evaluate를 이용한 metrics 계산 함수 정의

In [147]:
import evaluate

# Load the accuracy metric from the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: A pair that unpacks into (model outputs, reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = pred
    # Pick the highest-scoring class along axis 1 as the predicted label.
    predicted_labels = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_labels, references=references)

# [MY CODE] Trainer 객체 생성

In [148]:
from transformers import Trainer
trainer = Trainer(
    model=model,                      # model to fine-tune
    args=training_args,               # training configuration
    train_dataset=mnli_train,         # training data (the 'train' part of the split)
    eval_dataset=mnli_val,            # data evaluated during training (the 'test' part of the split)
    compute_metrics=compute_metrics,  # metric function applied at evaluation time
    processing_class=tokenizer,       # replaces the deprecated `tokenizer=` argument of Trainer
)

  trainer = Trainer(


# 모델 훈련(학습)

In [None]:
# Run the fine-tuning loop configured on the trainer, then save the resulting model.
trainer.train()
trainer.save_model()

# 모델 평가

In [None]:
# Evaluate the trained model on the held-out 'validation_matched' set
# (mnli_test) and print the returned metrics.
eval_results = trainer.evaluate(mnli_test)
print(eval_results)

In [None]:
# Read the accuracy from the evaluation results (expected under 'eval_accuracy');
# a missing key falls back to 0.
accuracy_value = eval_results.get("eval_accuracy", 0)
print(f"Validation Accuracy on 'validation_matched': {accuracy_value * 100:.2f}%")

# Report whether the 50% accuracy target was met.
print(
    "성공: 정확도가 50%를 넘었습니다."
    if accuracy_value >= 0.5
    else "실패: 정확도가 50% 미만입니다."
)