In [1]:
import os, time, random, numpy as np
import torch

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

from transformers import set_seed
set_seed(random_seed)

In [2]:
# 셀 2 — Datasets (베이스라인 변수명/흐름 유지, GitHub e9t/nsmc에서 직접 로드)
import datasets
from datasets import load_dataset

# GitHub 원본 TSV 파일(탭 구분)
data_files = {
    "train": "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt",
    "test":  "https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt",
}

# TSV를 csv 빌더로 로드 (delimiter='\t')
huggingface_mrpc_dataset = load_dataset(
    "csv",
    data_files=data_files,
    delimiter="\t",
)

# 결측/빈 문장 필터링 (document가 비어있는 샘플 제거)
def _valid(example):
    txt = example.get("document")
    return txt is not None and isinstance(txt, str) and txt.strip() != ""

huggingface_mrpc_dataset = huggingface_mrpc_dataset.filter(_valid)

print(huggingface_mrpc_dataset)
train = huggingface_mrpc_dataset['train']
cols = train.column_names
print("Columns:", cols)

# 베이스라인 형식으로 샘플 5개 출력
for i in range(5):
    for col in cols:
        print(col, ":", train[col][i])
    print('\n')


Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 149995
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 49997
    })
})
Columns: ['id', 'document', 'label']
id : 9976970
document : 아 더빙.. 진짜 짜증나네요 목소리
label : 0


id : 3819312
document : 흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나
label : 1


id : 10265843
document : 너무재밓었다그래서보는것을추천한다
label : 0


id : 9045019
document : 교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
label : 0


id : 6483659
document : 사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
label : 1




In [3]:
'''
import pandas as pd
from datasets import Dataset, DatasetDict

def parse_mrpc_file(file_path):
    data = {'Quality': [], '#1 ID': [], '#2 ID': [], '#1 String': [], '#2 String': []}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            next(f)
            for line_num, line in enumerate(f, 1):
                try:
                    parts = line.strip().split('\t')
                    if len(parts) >= 5:
                        quality = int(parts[0])
                        id1 = int(parts[1])
                        id2 = int(parts[2])
                        string1 = parts[3]
                        string2 = '\t'.join(parts[4:])
                        data['Quality'].append(quality)
                        data['#1 ID'].append(id1)
                        data['#2 ID'].append(id2)
                        data['#1 String'].append(string1)
                        data['#2 String'].append(string2)
                except Exception:
                    pass
    except FileNotFoundError:
        pass
    return pd.DataFrame(data)
'''

# NSMC: train을 train/validation으로 나누기, test는 그대로
from datasets import DatasetDict
from datasets import ClassLabel
huggingface_mrpc_dataset = huggingface_mrpc_dataset.cast_column("label", ClassLabel(num_classes=2))

nsmc_dataset = huggingface_mrpc_dataset['train'].train_test_split(
    test_size=0.1, seed=random_seed, stratify_by_column='label'
)
train_hf_dataset_raw = nsmc_dataset['train']
validation_hf_dataset_raw = nsmc_dataset['test']
test_hf_dataset_raw = huggingface_mrpc_dataset['test']

customized_mrpc_dataset = DatasetDict({
    'train': train_hf_dataset_raw,
    'validation': validation_hf_dataset_raw,
    'test': test_hf_dataset_raw
})

print("DatasetDict({")
for split_name, split_data in customized_mrpc_dataset.items():
    print(f"    {split_name}: Dataset({{")
    print(f"        features: {list(split_data.features.keys())},")
    print(f"        num_rows: {split_data.num_rows}")
    print("    })")
print("})")

print("\n데이터셋 정보 확인!")
print(f"Train dataset shape: ({customized_mrpc_dataset['train'].num_rows}, {len(customized_mrpc_dataset['train'].features)})")
print(f"Validation dataset shape: ({customized_mrpc_dataset['validation'].num_rows}, {len(customized_mrpc_dataset['validation'].features)})")
print(f"Test dataset shape: ({customized_mrpc_dataset['test'].num_rows}, {len(customized_mrpc_dataset['test'].features)})")
print(f"\n첫 번째 train 샘플:\n{customized_mrpc_dataset['train'][0]}")

Casting the dataset:   0%|          | 0/149995 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/49997 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 134995
    })
    validation: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 49997
    })
})

데이터셋 정보 확인!
Train dataset shape: (134995, 3)
Validation dataset shape: (15000, 3)
Test dataset shape: (49997, 3)

첫 번째 train 샘플:
{'id': 2986730, 'document': '우선포스터실패!코미디가아니자나!!비뚤어진포커스가아쉽다', 'label': 0}


In [4]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# KLUE BERT
huggingface_tokenizer = AutoTokenizer.from_pretrained('klue/bert-base', use_fast=True)
huggingface_model = AutoModelForSequenceClassification.from_pretrained('klue/bert-base', num_labels=2)

# transform (고정 길이 패딩)
def transform(data):
    if 'sentence1' in data and 'sentence2' in data:
        s1, s2 = data['sentence1'], data['sentence2']
    elif 'document' in data:
        s1 = data['document']
        s2 = [""] * len(s1) if isinstance(s1, list) else ""
    else:
        s1 = data.get('#1 String', data.get('text', ""))
        s2 = data.get('#2 String', [""] * len(s1) if isinstance(s1, list) else "")
    return huggingface_tokenizer(
        s1,
        s2,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_token_type_ids=True,
    )

# 불필요한 원문 ID/텍스트 제거
def _remove_cols(ds):
    feats = list(ds.features.keys())
    rem = [c for c in ['id', 'document'] if c in feats]
    return rem

hf_dataset = customized_mrpc_dataset.map(
    transform,
    batched=True,
    remove_columns=_remove_cols(customized_mrpc_dataset['train'])
)

hf_train_dataset = hf_dataset['train']
hf_val_dataset   = hf_dataset['validation']
hf_test_dataset  = hf_dataset['test']

print("\n토크나이즈 완료!")
print(hf_train_dataset[0].keys())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/134995 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]


토크나이즈 완료!
dict_keys(['label', 'input_ids', 'token_type_ids', 'attention_mask'])


In [14]:
import numpy as np
from transformers import Trainer, TrainingArguments

output_dir = 'transformers'

training_arguments = TrainingArguments(
    output_dir,                                         # output이 저장될 경로
    eval_strategy="epoch",           #evaluation하는 빈도
    learning_rate = 3e-5,                         #learning_rate
    per_device_train_batch_size = 32,   # 각 device 당 batch size
    per_device_eval_batch_size = 64,    # evaluation 시에 batch size
    num_train_epochs = 3,                     # train 시킬 총 epochs
    weight_decay = 0.01,                        # weight decay
    report_to = 'none',                         # Disable Weights & Biases logging
)

In [15]:
from evaluate import load

metric = load('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [16]:
trainer = Trainer(
    model=huggingface_model,
    args=training_arguments,
    train_dataset=hf_train_dataset,
    eval_dataset=hf_val_dataset,
    compute_metrics=compute_metrics,
)

print("\n학습 시작(고정 패딩)...")
t0 = time.perf_counter()
trainer.train()
t1 = time.perf_counter()
time_step4 = t1 - t0

print("\nValidation 성능:")
val_metrics_step4 = trainer.evaluate(hf_val_dataset)
print(val_metrics_step4)

print("\nTest 성능:")
test_metrics_step4 = trainer.evaluate(hf_test_dataset)
print(test_metrics_step4)


학습 시작(고정 패딩)...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2472,0.227654,0.909533
2,0.1699,0.260606,0.909533
3,0.0941,0.315013,0.9132



Validation 성능:


{'eval_loss': 0.31501302123069763, 'eval_accuracy': 0.9132, 'eval_runtime': 110.085, 'eval_samples_per_second': 136.258, 'eval_steps_per_second': 2.135, 'epoch': 3.0}

Test 성능:
{'eval_loss': 0.3402453362941742, 'eval_accuracy': 0.9050943056583395, 'eval_runtime': 366.5156, 'eval_samples_per_second': 136.412, 'eval_steps_per_second': 2.134, 'epoch': 3.0}
