In [1]:
# 모델 학습 코드
import os
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainerCallback
from datasets import Dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

[2025-02-15 12:56:27,955] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
# Model and data configuration
base_model_id = "beomi/OPEN-SOLAR-KO-10.7B"  # checkpoint id passed to the tokenizer/model from_pretrained calls
# NOTE(review): the model-loading cell passes the literal "auto" as device_map
# instead of this variable — confirm which placement is intended.
device_map = "cuda"
torch_dtype = torch.bfloat16  # dtype preference for this run
output_dir = "output_0214"  # passed to SFTConfig(output_dir=...)
dataset_name = "data/augmented_train_output.csv"  # CSV path read by Dataset.from_csv

In [3]:
# Maximum sequence length; reused as the tokenizer max_length and as
# SFTConfig.max_seq_length in later cells.
seq_length = 3000

In [4]:
# Load the training CSV into a Dataset; report the failure and re-raise if it cannot be read
try:
    full_dataset = Dataset.from_csv(dataset_name)
except Exception as load_error:
    print(f"CSV 파일 로드 중 오류 발생: {load_error}")
    raise
else:
    print("CSV 파일이 성공적으로 로드되었습니다.")

CSV 파일이 성공적으로 로드되었습니다.


In [5]:
# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = "right"
# Truncation is a per-call argument (the tokenizer call in preprocess_function
# passes truncation=True); assigning `tokenizer.truncation` only created an
# attribute that nothing reads, so that assignment was dropped.
# Reuse EOS as the padding token.
# NOTE(review): pad and EOS then share one id, so a collator that masks padding
# in the labels would also mask EOS — confirm this is acceptable.
# NOTE(review): the run log reports that no chat template is set for this
# tokenizer; apply_chat_template later falls back to a class default — confirm
# the template matches the intended prompt format.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/87.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.59M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [6]:
# LoRA adapter configuration
lora_config = LoraConfig(
    r=8,  # adapter rank
    lora_alpha=16,  # LoRA scaling parameter
    lora_dropout=0.05,  # dropout used inside the adapter layers
    # Module names to wrap with adapters (going by their names: the attention
    # q/k/v/o projections and the MLP gate/up/down projections).
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "up_proj", "gate_proj"],
    bias="none",  # no bias terms are trained
    task_type="CAUSAL_LM",
)


In [9]:
# 4-bit quantization settings (QLoRA-style)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,  # store the base weights in 4 bits
    # NF4 is the 4-bit data type recommended for training 4-bit base models;
    # leaving this unset falls back to the library default quant type.
    bnb_4bit_quant_type="nf4",
    # Use the dtype declared in the configuration cell instead of a hard-coded
    # "float16", so matmul compute matches the rest of the run.
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,  # also quantize the quantization constants (saves memory)
)

In [10]:
# Load the quantized base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=quant_config,
    # Load the non-quantized modules in the dtype declared in the configuration
    # cell rather than whatever the loader picks when no dtype is given.
    torch_dtype=torch_dtype,
    # NOTE(review): the configuration cell defines device_map = "cuda" but this
    # call has always used "auto"; kept as-is — confirm which one is intended.
    device_map="auto",
)


Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:  45%|####5     | 1.36G/3.00G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [11]:
# Disable the generation cache flag on the model config for training
base_model.config.use_cache = False
# Overwrites the config's max_position_embeddings with seq_length.
# NOTE(review): sequence length is already bounded by max_length in the
# tokenizer call and by SFTConfig.max_seq_length; this line changes the model
# config's positional limit instead — confirm the override is intended.
base_model.config.max_position_embeddings = seq_length

In [12]:
# Prompt formatting: turns one example into a chat-template string
def function_prepare_sample_text(tokenizer, for_train=True):
    """Build a formatter that renders a single example as chat-formatted text.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        for_train: When true, the example's "output" is appended as the
            assistant turn and no generation prompt is requested. When false,
            only the system and user turns are rendered and a generation
            prompt is requested.

    Returns:
        A function taking a mapping with "input" (and, for training, "output")
        and returning the rendered string. Values that are not strings are
        replaced with an empty string.

    Raises:
        ValueError: Raised by the returned function if the chat template does
            not produce a string.
    """
    system_prompt = (
        "당신은 한국어 리뷰 복원 전문가입니다.\n"
        "당신의 임무는 난독화된 한글 리뷰를 분석하고, 이를 자연스럽고 명확한 원래 의미의 한글 리뷰로 복원하는 것입니다.\n"
        "난독화된 리뷰의 단어를 원본 단어로 복원하고, 띄어쓰기와 문장 구조도 원래대로 복원하세요.\n"
        "문맥을 분석하여 자연스럽고 의미 있는 복원을 수행하며, 출력은 오직 한국어로만 작성하십시오."
    )

    def _as_text(value):
        # Anything that is not a string (e.g. a missing CSV cell) becomes "".
        return value if isinstance(value, str) else ""

    def _prepare_sample_text(example):
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": _as_text(example.get("input", ""))},
        ]
        if for_train:
            conversation.append(
                {"role": "assistant", "content": _as_text(example.get("output", ""))}
            )

        # Render the conversation to plain text; tokenization happens later.
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=not for_train,
        )
        if isinstance(rendered, str):
            return rendered
        raise ValueError(f"apply_chat_template 결과가 문자열이 아닙니다. text={rendered}")

    return _prepare_sample_text

In [13]:
# Dataset conversion (ConstantLengthDataset removed)
def preprocess_function(examples):
    """Render and tokenize a batch of examples for training.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose "input" and "output" entries are
            parallel lists (one item per sample).

    Returns:
        The tokenizer output for the batch, truncated to ``seq_length`` tokens
        and left unpadded.
    """
    prepare_sample_text = function_prepare_sample_text(tokenizer)

    # In batched mode each column arrives as a list; pair them up per sample.
    texts = [
        prepare_sample_text({"input": inp, "output": out})
        for inp, out in zip(examples["input"], examples["output"])
    ]

    # Only truncate here. Padding every sample to seq_length made each stored
    # and trained sequence seq_length tokens long regardless of its real size;
    # leaving sequences unpadded lets the training collator pad each batch to
    # its own longest sample. return_tensors is dropped as well: the mapped
    # dataset stores plain lists, and ragged sequences cannot form one tensor.
    # NOTE(review): the chat template may already insert BOS/EOS; confirm
    # whether add_special_tokens=False is needed to avoid a duplicated BOS.
    return tokenizer(
        texts,
        max_length=seq_length,
        truncation=True,
    )

In [14]:
# Apply the tokenization to the whole dataset in batches.
# No remove_columns argument is passed, so the original CSV columns are
# presumably kept alongside the tokenized fields. An earlier variant of this
# call passed remove_columns=full_dataset.column_names to drop them.
train_dataset = full_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/22526 [00:00<?, ? examples/s]

No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.


In [15]:
# Evaluation metric function (ROUGE)
def compute_metrics(eval_pred):
    """Compute ROUGE scores between decoded predictions and decoded labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be token
            ids, logits with a trailing vocabulary axis, or a tuple whose first
            element is one of those. ``labels`` are token ids in which ignored
            positions are negative (e.g. -100).

    Returns:
        Dict mapping each ROUGE key to a float score.
    """
    import evaluate  # `datasets.load_metric` was replaced by `evaluate.load`
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs; the first entry holds the logits/ids.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits carry one more axis (vocabulary) than the labels: reduce to ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # Negative ids mark ignored positions and cannot be decoded; swap in the
    # pad id so skip_special_tokens drops them.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions >= 0, predictions, pad_id)
    labels = np.where(labels >= 0, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    metric = evaluate.load("rouge")
    # The default ROUGE tokenizer keeps only ASCII alphanumerics, which would
    # discard Hangul text, so split on whitespace instead. The English stemmer
    # option is dropped: it is not meaningful for Korean.
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.split(),
    )
    # With aggregation on (the default), `evaluate` already returns plain
    # numbers per key; the old `.mid.fmeasure` access fails on them.
    return {key: float(value) for key, value in result.items()}


In [16]:
class SaveBestModelCallback(TrainerCallback):
    """Save the model and tokenizer whenever the evaluation loss improves.

    Args:
        tokenizer: Tokenizer saved next to the model.
        best_model_path: Directory the best checkpoint is written to.
    """

    def __init__(self, tokenizer, best_model_path="./best_model_02015_v1"):
        self.best_loss = float('inf')
        self.best_model_path = best_model_path
        self.tokenizer = tokenizer

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Persist the model if ``metrics["eval_loss"]`` beats the best seen so far.

        Raises:
            KeyError: If an improved loss is reported but no "model" keyword
                argument was supplied.
        """
        eval_loss = (metrics or {}).get("eval_loss")
        # Compare against None explicitly: a loss of exactly 0.0 is valid and
        # must not be treated as "no loss reported".
        if eval_loss is None or not eval_loss < self.best_loss:
            return

        model = kwargs["model"]
        print(f"New best eval loss: {eval_loss}. Saving model to {self.best_model_path}")
        model.save_pretrained(self.best_model_path)
        self.tokenizer.save_pretrained(self.best_model_path)
        # Record the new best only after the save succeeded, so a failed save
        # does not block a later retry at the same loss.
        self.best_loss = eval_loss

In [20]:
# SFT training configuration
sft_config = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 samples per optimizer step per device
    gradient_checkpointing=True,
    learning_rate=2e-4,
    warmup_ratio=0.1,
    max_grad_norm=0.3,
    weight_decay=0.05,
    num_train_epochs=1,
    logging_steps=500,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,
    max_seq_length=seq_length,
    # NOTE(review): train_dataset is already tokenized by preprocess_function;
    # confirm whether the trainer reads this field at all, and if it does,
    # whether "input" (the obfuscated text alone) is the intended training text.
    dataset_text_field="input"  # column name edited here (e.g. "text" -> "input" or "output")
)

In [21]:
# Trainer setup (training itself is started in the next cell)
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    # NOTE(review): no evaluation dataset is supplied, so SaveBestModelCallback's
    # on_evaluate hook presumably never receives an eval_loss — confirm.
    # compute_metrics, defined in an earlier cell, is not passed here either.
    eval_dataset=None,
    peft_config=lora_config,  # apply LoRA
    tokenizer=tokenizer,
    args=sft_config,
    callbacks=[SaveBestModelCallback(tokenizer)]
)

In [22]:
# Start fine-tuning (the captured output below ends in a KeyboardInterrupt).
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss


KeyboardInterrupt: 