In [1]:
import tensorflow
import numpy as np
import datasets

In [2]:
from transformers import AutoModel, TFBertForPreTraining

In [3]:
from transformers import AutoTokenizer, BertTokenizer

In [4]:
from transformers import AutoConfig

In [5]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

In [6]:
from datasets import load_dataset

In [7]:
# 1. Load the NSMC dataset
dataset = load_dataset("nsmc")  # load the NSMC dataset from Hugging Face
train_dataset = dataset["train"]  # use the train split

README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

nsmc.py:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

In [15]:
# 2. Load the tokenizer and the model (e.g. a BERT-based Korean model)
model_name = "klue/bert-base"  # another Korean checkpoint can be substituted here
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2: two-class classification head on top of the pretrained encoder
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [16]:
# 3. Define the tokenization function
def tokenize_function(example):
    """Tokenize the ``document`` field with fixed-length padding.

    Every text is truncated and padded to exactly 128 tokens.

    Args:
        example: Mapping with a ``"document"`` entry holding the text(s) to encode.

    Returns:
        The output of the notebook-level ``tokenizer`` for these settings.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
        # The Trainer converts to tensors later, so return plain Python containers.
        "return_tensors": None,
    }
    return tokenizer(example["document"], **encode_options)

In [17]:
# 4. Apply tokenization
# Read the raw split straight from `dataset` rather than through `train_dataset`:
# step 5 rebinds `train_dataset` to the tokenized split (which no longer has the
# "document"/"id" columns), so re-running this cell via that name would fail.
tokenized_dataset = dataset["train"].map(
    tokenize_function,
    batched=True,
    remove_columns=["document", "id"],  # drop columns that are no longer needed
)
# The Trainer expects the target column to be called "labels".
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

In [18]:
# 5. Train/validation split: hold out 20% with a fixed seed
split_dataset = tokenized_dataset.train_test_split(seed=123, test_size=0.2)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

In [29]:
# 6. Inspect the data (debugging aid)
print("Train sample:", train_dataset[0])
print("Train columns:", train_dataset.column_names)
preview_count = min(5, len(train_dataset))
for i in range(preview_count):
    sample_ids = train_dataset[i]["input_ids"]
    print(f"Sample {i} input_ids length:", len(sample_ids))

Train sample: {'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 0, 'input_ids': [2, 3790, 2170, 4027, 24304, 24, 2532, 5675, 18, 20608, 2119, 3760, 11531, 1819, 2075, 2088, 3758, 2079, 12488, 2119, 4239, 3788, 2283, 2097, 2223, 208

In [19]:
# 7. Training configuration (baseline: fixed-length padding)
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule
    num_train_epochs=1,  # single epoch for a quick test
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluation and checkpointing share the same 1000-step cadence
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
)

In [None]:
# 8. Build the Trainer for the baseline (fixed-length padding) run
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# 9. Start training
trainer.train()

In [12]:
# Separate model instance for the bucketing run, loaded from the same
# pretrained checkpoint name as the baseline model
model2 = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized

In [20]:
# Tokenization function for dynamic padding
def nonpadding_tokenize_function(example, max_length=128):
    """Tokenize the ``document`` field without padding.

    Padding is left off here; it is applied per batch by the data collator
    configured later in the notebook.

    Args:
        example: Mapping with a ``"document"`` entry holding the text(s) to encode.
        max_length: Truncation limit in tokens. Defaults to 128 so both
            pipelines truncate at the same length as ``tokenize_function``;
            without it, ``truncation=True`` falls back to the model maximum.

    Returns:
        The output of the notebook-level ``tokenizer`` with unpadded sequences.
    """
    return tokenizer(
        example["document"],
        truncation=True,
        max_length=max_length,
        padding=False,
    )

In [21]:
# Rebind `train_dataset` to the raw (untokenized) train split for the no-padding pipeline.
# NOTE: this replaces the padded training split that step 5 stored under the same name.
train_dataset = dataset["train"]

In [22]:
# Tokenize without padding, then rename the target column to "labels"
nonpadding_tokenized_dataset = (
    train_dataset
    .map(
        nonpadding_tokenize_function,
        remove_columns=["document", "id"],  # drop columns that are no longer needed
        batched=True,
    )
    .rename_column("label", "labels")
)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

In [23]:
# Build the unpadded train/validation sets (same 20% hold-out and seed as the baseline)

nonpadding_split_dataset = nonpadding_tokenized_dataset.train_test_split(seed=123, test_size=0.2)
nonpadding_train_dataset, nonpadding_val_dataset = (
    nonpadding_split_dataset["train"],
    nonpadding_split_dataset["test"],
)

In [24]:
# Training configuration for the bucketing run

bucket_training_args = TrainingArguments(
    # Output locations
    output_dir="./bucket_results",
    logging_dir="./bucket_logs",
    # Optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluation and checkpointing share the same 1000-step cadence
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    # Bucketing: group samples of similar length into the same batch
    group_by_length=True,
)

In [25]:
from transformers import DataCollatorWithPadding

In [26]:
# Data collator used for the bucketing run (dynamic padding)

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,  # optional: round padded length up to a multiple of 8 for GPU efficiency
    padding=True,          # pad each batch to its own longest sequence
)

In [20]:
# Train with the dynamic-padding data collator
bucket_trainer = Trainer(
    args=bucket_training_args,
    model=model2,
    tokenizer=tokenizer,
    data_collator=data_collator,  # applies dynamic padding per batch
    eval_dataset=nonpadding_val_dataset,
    train_dataset=nonpadding_train_dataset,
)

# Start training
bucket_trainer.train()

***** Running training *****
  Num examples = 120000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7500


Step,Training Loss,Validation Loss
1000,0.3537,0.382512
2000,0.331,0.296016
3000,0.303,0.287602
4000,0.2885,0.289535
5000,0.2699,0.271328
6000,0.2771,0.251906
7000,0.2487,0.252512


***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./bucket_results/checkpoint-1000
Configuration saved in ./bucket_results/checkpoint-1000/config.json
Model weights saved in ./bucket_results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./bucket_results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./bucket_results/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./bucket_results/checkpoint-2000
Configuration saved in ./bucket_results/checkpoint-2000/config.json
Model weights saved in ./bucket_results/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./bucket_results/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ./bucket_results/checkpoint-2000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 64
Saving model checkpoint to ./bucket_

TrainOutput(global_step=7500, training_loss=0.30200464680989586, metrics={'train_runtime': 2087.0105, 'train_samples_per_second': 57.499, 'train_steps_per_second': 3.594, 'total_flos': 1656612982310400.0, 'train_loss': 0.30200464680989586, 'epoch': 1.0})

In [23]:
# %pip install evaluate  # run once if the `evaluate` package is not installed

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
     |████████████████████████████████| 84 kB 2.6 MB/s             
Collecting datasets>=2.0.0
  Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
     |████████████████████████████████| 487 kB 13.9 MB/s            
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
     |████████████████████████████████| 468 kB 62.2 MB/s            
Collecting tqdm>=4.62.1
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     |████████████████████████████████| 78 kB 10.2 MB/s            
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
     |███████████

In [27]:
# Load the trained baseline model (before bucketing)

checkpoint_path = "./results/checkpoint-7000"  # checkpoint with the lowest validation loss
loaded_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path, num_labels=2)
loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Build a fresh Trainer around the reloaded model.
# `train_dataset` was rebound to the raw, untokenized split for the no-padding
# pipeline, so take the padded training split from `split_dataset` instead.
trainer = Trainer(
    model=loaded_model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=val_dataset,
    tokenizer=loaded_tokenizer,
)

In [28]:
# Load the trained model from the bucketing run

checkpoint_path = "./bucket_results/checkpoint-6000"  # checkpoint with the lowest validation loss
bucket_tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
bucket_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path, num_labels=2)

# Wrap the reloaded model in a Trainer that uses the dynamic-padding collator
bucket_trainer = Trainer(
    args=bucket_training_args,
    model=bucket_model,
    tokenizer=bucket_tokenizer,
    data_collator=data_collator,  # applies dynamic padding per batch
    eval_dataset=nonpadding_val_dataset,
    train_dataset=nonpadding_train_dataset,
)

loading configuration file ./bucket_results/checkpoint-6000/config.json
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file ./bucket_results/checkpoint-6000/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were

In [29]:
import evaluate

In [30]:
# Load the accuracy metric used to score both runs below
accuracy_metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [31]:
# Predictions before bucketing (baseline model)

val_predictions = trainer.predict(val_dataset)
val_true_labels = val_predictions.label_ids
# Predicted class = index of the largest value along the last axis
val_pred_labels = np.argmax(val_predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 30000
  Batch size = 64


In [32]:
# Validation accuracy before bucketing (baseline model)

val_accuracy = accuracy_metric.compute(
    references=val_true_labels,
    predictions=val_pred_labels,
)
print(f"Validation Accuracy: {val_accuracy['accuracy']:.4f}")

Validation Accuracy: 0.9011


In [33]:
# Predictions after bucketing (dynamic-padding model)

bucket_val_predictions = bucket_trainer.predict(nonpadding_val_dataset)
bucket_val_true_labels = bucket_val_predictions.label_ids
# Predicted class = index of the largest value along the last axis
bucket_val_pred_labels = np.argmax(bucket_val_predictions.predictions, axis=-1)

***** Running Prediction *****
  Num examples = 30000
  Batch size = 64


In [34]:
# Validation accuracy after bucketing (dynamic-padding model)

# Score against the labels returned by the same predict() call that produced
# these predictions, not the labels from the baseline Trainer's prediction,
# so predictions and references are guaranteed to be aligned.
bucket_val_accuracy = accuracy_metric.compute(
    predictions=bucket_val_pred_labels,
    references=bucket_val_true_labels,
)
print(f"Validation Accuracy with bucket: {bucket_val_accuracy['accuracy']:.4f}")

Validation Accuracy with bucket: 0.8962


#### bucketing과 다이나믹 패딩 적용 전 후 비교
 - 학습 시간 : 적용 전 74분, 적용 후 34분
 - 추론 소요시간 : 적용 전 4분, 적용 후 2분 30초
 - 검증정확도 : 적용 전 0.9011, 적용 후 0.8962

##### 학습 시간은 약 2배(74분 → 34분) 빨라졌고, 검증 정확도는 비슷한 수준으로 유지되었음
##### bucketing과 다이나믹 패딩 적용 전후에 동일한 모델 변수(학습 설정)를 넣어서 학습했을 때, 검증 정확도는 향상되지 않았고 오히려 소폭 하락하였음 (0.9011 → 0.8962, 약 0.5%p)
##### 데이터를 판다스 데이터프레임으로 불러와서 허깅페이스 데이터셋으로 만들면 아래와 같은 에러가 발생함
 - ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.