In [1]:
import tensorflow as tf
import numpy as np
import datasets

In [2]:
from transformers import AutoModel, TFBertForPreTraining

In [3]:
from transformers import AutoTokenizer, BertTokenizer

In [4]:
from transformers import AutoConfig

In [5]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

In [6]:
from datasets import load_dataset

In [7]:
# 1. NSMC 데이터셋 불러오기
dataset = load_dataset("nsmc")  # Hugging Face에서 NSMC 데이터셋 로드
train_dataset = dataset["train"]  # train split 사용

README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

nsmc.py:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

In [8]:
# 2. 토크나이저와 모델 로드 (예: BERT 기반 한국어 모델)
model_name = "klue/bert-base"  # 또는 "klue/bert-base" 등 한국어 모델 사용 가능
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [9]:
# 3. 토크나이즈 함수 정의
def tokenize_function(example):
    return tokenizer(
        example["document"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors=None  # Trainer가 텐서로 변환하므로 딕셔너리 형태로 반환
    )

In [10]:
# 4. 토크나이즈 적용
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["document", "id"]  # 불필요한 열 제거
)
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

In [11]:
# 5. train/val 분리
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=123)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

In [12]:
# 6. 데이터 확인 (디버깅용)
print("Train sample:", train_dataset[0])
print("Train columns:", train_dataset.column_names)
for i in range(min(5, len(train_dataset))):
    print(f"Sample {i} input_ids length:", len(train_dataset[i]["input_ids"]))

Train sample: {'labels': 0, 'input_ids': [2, 3790, 2170, 4027, 24304, 24, 2532, 5675, 18, 20608, 2119, 3760, 11531, 1819, 2075, 2088, 3758, 2079, 12488, 2119, 4239, 3788, 2283, 2097, 2223, 2088, 6641, 4177, 25175, 2118, 3926, 18395, 2573, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
# 7. 학습 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # 빠른 테스트를 위해 1로 설정
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    do_train=True,
    do_eval=True,
    eval_steps=1000,
    evaluation_strategy="steps",  # 평가 주기 설정
    save_strategy="steps",       # 체크포인트 저장 전략
    save_steps=1000,             # 체크포인트 저장 주기
    load_best_model_at_end=True, # 학습 끝난 후 최적 모델 로드
)

In [14]:
# 8. Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# 9. 학습 시작
trainer.train()

***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7500


Step,Training Loss,Validation Loss
1000,0.3553,0.318039
2000,0.3197,0.305719
3000,0.3024,0.283015
4000,0.283,0.289858
5000,0.2713,0.270443
6000,0.2785,0.253696
7000,0.2444,0.24883


***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-2000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/conf

TrainOutput(global_step=7500, training_loss=0.3001753224690755, metrics={'train_runtime': 4483.0838, 'train_samples_per_second': 26.767, 'train_steps_per_second': 1.673, 'total_flos': 7893331660800000.0, 'train_loss': 0.3001753224690755, 'epoch': 1.0})

In [None]:
# bucketing 적용을 위한 모델
model2 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [15]:
# 다이나믹패딩을 위한 토크나이즈 함수 정의
def nonpadding_tokenize_function(example):
    return tokenizer(
        example["document"],
        truncation=True,
        padding=False
    )

In [16]:
train_dataset = dataset["train"]

In [17]:
# 패딩 제거 토크나이즈 적용
nonpadding_tokenized_dataset = train_dataset.map(
    nonpadding_tokenize_function,
    batched=True,
    remove_columns=["document", "id"]  # 불필요한 열 제거
)
nonpadding_tokenized_dataset = nonpadding_tokenized_dataset.rename_column("label", "labels")

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

In [18]:
# 패딩 적용하지 않은 데이터셋 생성

nonpadding_split_dataset = nonpadding_tokenized_dataset.train_test_split(test_size=0.2, seed=123)
nonpadding_train_dataset = nonpadding_split_dataset["train"]
nonpadding_val_dataset = nonpadding_split_dataset["test"]

In [19]:
# bucketing을 위한 학습 설정

bucket_training_args = TrainingArguments(
    output_dir='./bucket_results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./bucket_logs',
    do_train=True,
    do_eval=True,
    eval_steps=1000,
    group_by_length=True,  # Bucketing 활성화: 비슷한 길이의 샘플을 그룹화
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
from transformers import DataCollatorWithPadding

In [21]:
# bucketing을 위한 data_collator생성

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,          # 배치 내 최대 길이에 맞춰 패딩
    pad_to_multiple_of=8   # GPU 효율성을 위해 8의 배수로 패딩 (선택적)
)

In [22]:
# data_collator 적용한 학습 수행
bucket_trainer = Trainer(
    model=model2,
    args=bucket_training_args,
    train_dataset=nonpadding_train_dataset,
    eval_dataset=nonpadding_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # Dynamic Padding 적용
)

# 학습 시작
bucket_trainer.train()

***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7500


Step,Training Loss
500,0.2368
1000,0.2521
1500,0.219
2000,0.2253
2500,0.2377
3000,0.2232
3500,0.2219
4000,0.2271
4500,0.2156
5000,0.2189


Saving model checkpoint to ./bucket_results/checkpoint-500
Configuration saved in ./bucket_results/checkpoint-500/config.json
Model weights saved in ./bucket_results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./bucket_results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./bucket_results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./bucket_results/checkpoint-1000
Configuration saved in ./bucket_results/checkpoint-1000/config.json
Model weights saved in ./bucket_results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./bucket_results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./bucket_results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./bucket_results/checkpoint-1500
Configuration saved in ./bucket_results/checkpoint-1500/config.json
Model weights saved in ./bucket_results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./bucket_results/checkpo

TrainOutput(global_step=7500, training_loss=0.23017427775065105, metrics={'train_runtime': 1097.1004, 'train_samples_per_second': 109.379, 'train_steps_per_second': 6.836, 'total_flos': 1656612982310400.0, 'train_loss': 0.23017427775065105, 'epoch': 1.0})

In [23]:
#pip list | grep evaluate

In [24]:
import evaluate

In [25]:
accuracy_metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [26]:
# bucketing 적용 전 예측

val_predictions = trainer.predict(val_dataset)
val_pred_labels = val_predictions.predictions.argmax(-1)
val_true_labels = val_predictions.label_ids

***** Running Prediction *****
  Num examples = 30000
  Batch size = 64


In [27]:
# bucketing 적용 전 검증 정확도

val_accuracy = accuracy_metric.compute(predictions=val_pred_labels, references=val_true_labels)
print(f"Validation Accuracy: {val_accuracy['accuracy']:.4f}")

Validation Accuracy: 0.9024


In [30]:
# bucketing 적용 후 예측

bucket_val_predictions = bucket_trainer.predict(nonpadding_val_dataset)
bucket_val_pred_labels = bucket_val_predictions.predictions.argmax(-1)
bucket_val_true_labels = bucket_val_predictions.label_ids

***** Running Prediction *****
  Num examples = 30000
  Batch size = 64


In [31]:
# bucketing 적용 후 검증 정확도

bucket_val_accuracy = accuracy_metric.compute(predictions=bucket_val_pred_labels, references=val_true_labels)
print(f"Validation Accuracy with bucket: {bucket_val_accuracy['accuracy']:.4f}")

Validation Accuracy with bucket: 0.9024


#### bucketing과 다이나믹 패딩 적용 전 후 비교
 - 학습 시간 : 적용 전 74분, 적용 후 18분
 - 추론 소요시간 : 적용 전 4분, 적용후 1분 내외 
 - 검증정확도 : 적용 전 0.9024, 적용 후 0.9024

##### 학습시간은 약 4배 빨라졌고 검증 정확도는 성능이 유지되었음
 - 같은 모델에서 학습 됐을 것으로 보임(bucket_trainer의 첫 에폭 손실값이 너무 낮음)