# MNLI 데이터셋 분석하기

In [22]:
import tensorflow
import numpy
import transformers
import argparse
import tensorflow_datasets as tfds
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import numpy as np

In [23]:
import tensorflow as tf
from transformers import AutoTokenizer, TFRobertaForSequenceClassification
from transformers.data.processors.utils import InputExample
from datasets import load_dataset

# STEP 1: MNLI 데이터셋 분석하기 및 데이터셋 로드
# Huggingface의 datasets 라이브러리를 사용하여 MNLI 데이터셋을 로드합니다.
dataset = load_dataset('glue', 'mnli')




  0%|          | 0/5 [00:00<?, ?it/s]

# MNLIProcessor클래스 구현하기

In [24]:
class MNLIProcessor:
    """Processor for the MNLI dataset using Huggingface datasets."""

    def get_train_examples(self, dataset):
        """Returns training examples."""
        return self._create_examples(dataset['train'], "train")

    def get_dev_examples(self, dataset):
        """Returns dev examples."""
        return self._create_examples(dataset['validation_matched'], "dev")

    def get_labels(self):
        """Returns possible labels."""
        return ["entailment", "contradiction", "neutral"]

    def _create_examples(self, dataset_split, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for i, example in enumerate(dataset_split):
            guid = f"{set_type}-{i}"
            text_a = example['premise']
            text_b = example['hypothesis']
            label = str(example['label'])
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


# 데이터셋 생성

In [25]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
processor = MNLIProcessor()

In [26]:
# 데이터셋 예제 가져오기
train_examples = processor.get_train_examples(dataset)
label_list = processor.get_labels()

In [27]:
# 단위 테스트: Processor 클래스가 올바르게 동작하는지 확인
def test_processor(processor, dataset):
    examples = processor.get_train_examples(dataset)
    assert len(examples) > 0, "Processor returned an empty list of examples"

    example = examples[0]
    print("------ 원본 데이터 ------")
    print(example)
    
    processed_example = processor._create_examples(dataset['train'], "train")[0]
    print("------ Processor 가공 데이터 ------")
    print(processed_example)

# Processor 단위 테스트 실행
test_processor(processor, dataset)

------ 원본 데이터 ------
InputExample(guid='train-0', text_a='Conceptually cream skimming has two basic dimensions - product and geography.', text_b='Product and geography are what make cream skimming work. ', label='1')
------ Processor 가공 데이터 ------
InputExample(guid='train-0', text_a='Conceptually cream skimming has two basic dimensions - product and geography.', text_b='Product and geography are what make cream skimming work. ', label='1')


In [28]:
def convert_example_to_features(example, label_list, max_length, tokenizer):
    """하나의 InputExample을 BERT가 이해할 수 있는 features로 변환합니다."""
    inputs = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    
    label = int(example.label)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "label": label
    }


In [29]:
def create_dataset(examples, label_list, max_length, tokenizer, batch_size):
    """features를 tf.data.Dataset 형태로 변환합니다."""
    features = [convert_example_to_features(example, label_list, max_length, tokenizer) for example in examples]

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f["input_ids"],
                    "attention_mask": f["attention_mask"],
                },
                f["label"],
            )

    dataset = tf.data.Dataset.from_generator(
        gen,
        ({
            "input_ids": tf.int32,
            "attention_mask": tf.int32,
        }, tf.int64),
        ({
            "input_ids": tf.TensorShape([None]),
            "attention_mask": tf.TensorShape([None]),
        }, tf.TensorShape([])),
    )

    dataset = dataset.padded_batch(batch_size, padded_shapes=({
        "input_ids": [max_length],
        "attention_mask": [max_length],
    }, []))

    return dataset


# 모델 생성 및 학습

In [30]:
train_examples = processor.get_train_examples(dataset)[:len(dataset['train']) // 10]
num_train_examples = len(train_examples)

# steps_per_epoch를 계산
batch_size = 32
steps_per_epoch = num_train_examples // batch_size



In [31]:
# 로버타 모델 로드 및 컴파일
model = TFRobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_list))


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
#데이터셋 생성 및 캐싱, prefetch 적용
train_dataset = create_dataset(train_examples, label_list, max_length=128, tokenizer=tokenizer, batch_size=batch_size)
train_dataset = train_dataset.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [33]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])


In [34]:
# 모델 학습
model.fit(train_dataset, epochs=3, steps_per_epoch=steps_per_epoch)

Epoch 1/3
Epoch 2/3






<keras.callbacks.History at 0x7c9c24a38b80>

# 모델 평가 

In [35]:
# 검증 데이터셋 생성
dev_examples = processor.get_dev_examples(dataset)
dev_dataset = create_dataset(dev_examples, label_list, max_length=128, tokenizer=tokenizer, batch_size=32)

# 모델 평가
eval_results = model.evaluate(dev_dataset)
print(f"Evaluation Loss: {eval_results[0]}")
print(f"Evaluation Accuracy: {eval_results[1]}")

Evaluation Loss: 0.4563239812850952
Evaluation Accuracy: 0.8204788565635681
