In [6]:
# 필요한 라이브러리 임포트
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import os

# 토크나이저 병렬 처리 경고 제거
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 1. KoBART 토크나이저 및 모델 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained('hyunwoongko/kobart')
model = BartForConditionalGeneration.from_pretrained('hyunwoongko/kobart')

# 2. 이모지 토큰 추가 및 모델 임베딩 레이어 확장
dataset = load_dataset('omarkamali/emoji-map')
dataset = dataset['train']

new_emojis = list(set(dataset['emoji']))
tokenizer.add_tokens(new_emojis)
model.resize_token_embeddings(len(tokenizer))

# 모델의 모든 파라미터를 학습 가능하도록 설정
for param in model.parameters():
    param.requires_grad = True

# 3. 데이터셋 로드 및 전처리
dataset = dataset.select_columns(['emoji', 'description_kor_Hang'])
dataset = dataset.filter(lambda x: x['description_kor_Hang'] is not None)

def combine_emoji_description(example):
    example['text'] = example['description_kor_Hang'] + ' ' + example['emoji']
    return example

dataset = dataset.map(combine_emoji_description)

# 4. 토크나이징
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=64
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 데이터셋을 훈련용과 검증용으로 분할
split_dataset = tokenized_datasets.train_test_split(test_size=0.1, shuffle=True)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

# 5. 데이터 콜레이터 설정 (MLM 방식 적용)
class EmojiDataCollatorForBart:
    def __init__(self, tokenizer, new_emojis, mlm_probability=0.15):
        self.tokenizer = tokenizer
        self.new_emojis = new_emojis
        self.emoji_token_ids = set(tokenizer.convert_tokens_to_ids(new_emojis))
        self.mlm_probability = mlm_probability

    def __call__(self, features):
        # 입력 시퀀스 생성
        input_ids = [f['input_ids'] for f in features]
        attention_mask = [f['attention_mask'] for f in features]

        # 텐서로 변환
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)

        # 레이블 생성 (원본 시퀀스)
        labels = input_ids.clone()

        # 마스킹 적용
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)

        # 이모지 토큰에 대한 마스킹 확률을 높이기 위해 마스크 생성
        emoji_mask = torch.zeros_like(labels, dtype=torch.bool)
        for token_id in self.emoji_token_ids:
            emoji_mask |= (labels == token_id)
        probability_matrix.masked_fill_(emoji_mask, value=self.mlm_probability * 2)  # 이모지 토큰의 마스킹 확률을 2배로 설정

        masked_indices = torch.bernoulli(probability_matrix).bool()
        input_ids[masked_indices] = self.tokenizer.mask_token_id

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

data_collator = EmojiDataCollatorForBart(tokenizer, new_emojis)

# 6. 학습 인자 설정 (조기 종료 관련 설정 추가)
training_args = TrainingArguments(
    output_dir='./kobart-emoji',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',       # 에포크마다 평가
    save_strategy='epoch',             # 에포크마다 모델 저장
    save_total_limit=2,
    logging_strategy='epoch',          # 에포크마다 로그 출력
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# 7. Trainer 설정 및 모델 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

# 8. 모델 및 토크나이저 저장
trainer.save_model('./kobart-emoji')
tokenizer.save_pretrained('./kobart-emoji')


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Epoch,Training Loss,Validation Loss
1,2.0659,0.086642
2,0.0731,0.066804
3,0.0614,0.066345
4,0.0607,0.065569
5,0.059,0.061626
6,0.055,0.067695
7,0.0551,0.063914
8,0.0539,0.060496
9,0.0537,0.063939
10,0.0512,0.065808


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
Non-default generation parameters: {'forced_eos_token_id': 1}


('./kobart-emoji/tokenizer_config.json',
 './kobart-emoji/special_tokens_map.json',
 './kobart-emoji/tokenizer.json')