In [1]:
import os
import re
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from trl import SFTTrainer, SFTConfig
from trl.trainer import ConstantLengthDataset

# 필요한 라이브러리 임포트
from datasets import Dataset
from transformers import AutoTokenizer
from peft import LoraConfig
from transformers import BitsAndBytesConfig
import torch

[2025-02-24 06:13:47,136] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
# 1. Model, dataset and output settings used by the cells below.

# Data and output locations.
dataset_name = "data/train.csv"  # path to the training CSV file
output_dir = "output_0224"  # directory passed to SFTConfig(output_dir=...)

# Base checkpoint identifier plus placement / precision settings.
base_model_id = "beomi/OPEN-SOLAR-KO-10.7B"
device_map = "cuda"
torch_dtype = torch.bfloat16

# Sequence length given to the packed dataset and to SFTConfig(max_seq_length=...).
seq_length = 512

In [3]:
# 2. Load the dataset from the CSV file configured in `dataset_name`.
try:
    full_dataset = Dataset.from_csv(path_or_paths=dataset_name)
except Exception as load_error:
    # Surface the failure in the cell output, then re-raise so the notebook
    # stops here instead of continuing without `full_dataset`.
    print(f"CSV 파일 로드 중 오류 발생: {load_error}")
    raise
else:
    print("CSV 파일이 성공적으로 로드되었습니다.")

Generating train split: 0 examples [00:00, ? examples/s]

CSV 파일이 성공적으로 로드되었습니다.


In [4]:
# 3. Tokenizer: loaded from the same checkpoint id as the base model,
#    with padding applied on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = "right"

# 4. LoRA adapter configuration for causal language modelling.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Names of the submodules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "down_proj",
        "up_proj",
        "gate_proj",
    ],
)

In [5]:
# 5. bitsandbytes settings: 4-bit NF4 quantization with bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Marker printed so the cell output shows that configuration finished.
print("코드 실행 완료.")

코드 실행 완료.


In [6]:
# 6. Load the base model with the 4-bit quantization settings in `bnb_config`.
#    `torch_dtype` is declared in the configuration cell but was never handed to
#    `from_pretrained`; it is passed here so the declared precision is applied and
#    agrees with `bnb_4bit_compute_dtype` (both bfloat16).
#    NOTE(review): without it the library default dtype is used for the
#    non-quantized modules (presumably float16 for bitsandbytes loads) — confirm
#    against the installed transformers version.
#    NOTE(review): the configuration cell also declares device_map = "cuda";
#    "auto" is kept here to preserve the existing placement behaviour — confirm
#    which one is intended.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map="auto",
)

# 7. Disable the generation cache on the model config for training.
base_model.config.use_cache = False

# 8. The LoRA configuration is handed to the trainer under this name.
peft_config = lora_config

# 9. If the tokenizer has no padding token, reuse the end-of-sequence token.
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [7]:
# 10. 입력 및 출력 포맷 준비 함수
def function_prepare_sample_text(tokenizer, for_train=True):
    """Build a formatter that renders one dataset example as chat-template text.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.
        for_train: When True, the example's ``'input'`` field is appended as the
            assistant turn and no generation prompt is requested. When False,
            the assistant turn is omitted and ``add_generation_prompt=True`` is
            passed to the template.

    Returns:
        A one-argument function taking an example mapping (with ``'output'``
        and, for training, ``'input'`` keys) and returning whatever
        ``tokenizer.apply_chat_template(..., tokenize=False)`` produces.
    """
    # The system prompt is the same for every example, so it is built once here.
    system_prompt = (
        "당신은 한국어 리뷰 난독화 전문가입니다.\n당신의 임무는 정상적인 한글 숙소 리뷰를 난독화하여 외국인이 알아볼 수 없고, 한국인만 알아볼 수 있게 변환하는 것입니다."
    )

    def _prepare_sample_text(example):
        # NOTE(review): the user turn is taken from 'output' and the assistant
        # turn from 'input'. Given the system prompt asks for obfuscation, this
        # presumably trains clean -> obfuscated — confirm the column semantics.
        conversation = [
            {"role": "system", "content": system_prompt},
            # str() mirrors the string coercion the user content always received.
            {"role": "user", "content": str(example['output'])},
        ]
        if for_train:
            conversation.append({"role": "assistant", "content": example['input']})

        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=not for_train,
        )

    return _prepare_sample_text

In [8]:
# 11. Estimate the average number of characters per token on formatted samples.
def chars_token_ratio(dataset, tokenizer, prepare_sample_text, nb_examples=400):
    """Return total characters divided by total tokens over the first examples.

    Args:
        dataset: Iterable of examples; at most ``nb_examples`` are visited.
        tokenizer: Tokenizer used to count tokens. When it reports
            ``is_fast == False`` the count comes from ``tokenizer.tokenize``;
            otherwise from ``tokenizer(text).tokens()``.
        prepare_sample_text: Callable turning one example into the text to measure.
        nb_examples: Maximum number of examples to sample.

    Returns:
        ``total_characters / total_tokens`` as a float.

    Raises:
        ValueError: If no tokens were counted (for example an empty dataset),
            since the ratio would otherwise be a division by zero.
    """
    # Size the progress bar to the number of examples actually visited when the
    # dataset reports a length; otherwise fall back to the requested count.
    try:
        total = min(nb_examples, len(dataset))
    except TypeError:
        total = nb_examples

    # Tokenizers without an `is_fast` attribute keep the `.tokens()` code path.
    # NOTE(review): `.tokens()` is presumably only available on encodings from
    # fast tokenizers, hence the `tokenize` fallback — confirm for the installed
    # transformers version.
    use_encoding_tokens = getattr(tokenizer, "is_fast", True)

    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=total):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if use_encoding_tokens:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    if total_tokens == 0:
        raise ValueError(
            "chars_token_ratio: no tokens were counted; the dataset is empty or "
            "every sampled example tokenized to nothing."
        )
    return total_characters / total_tokens

In [9]:
# 12. Build the packed training dataset.
def create_datasets(tokenizer, dataset, seq_length):
    """Wrap ``dataset`` in a ``ConstantLengthDataset`` of chat-formatted text.

    Args:
        tokenizer: Tokenizer used for chat formatting, for the characters-per-token
            estimate, and passed on to ``ConstantLengthDataset``.
        dataset: Source examples; each is formatted with the training formatter
            returned by ``function_prepare_sample_text(tokenizer)``.
        seq_length: Value forwarded as ``seq_length`` to ``ConstantLengthDataset``.

    Returns:
        The constructed ``ConstantLengthDataset`` (created with ``infinite=True``).
    """
    prepare_sample_text = function_prepare_sample_text(tokenizer)

    # The value is characters divided by tokens, so it is reported as
    # "characters per token"; the previous label read as the inverse ratio.
    chars_per_token = chars_token_ratio(dataset, tokenizer, prepare_sample_text)
    print(f"토큰당 문자 수: {chars_per_token:.2f}")

    # NOTE(review): infinite=True presumably makes the dataset restart from the
    # beginning once exhausted — confirm how that interacts with epoch-based
    # training in the installed TRL version.
    cl_dataset = ConstantLengthDataset(
        tokenizer,
        dataset,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    return cl_dataset

# Prepare the training dataset from the loaded CSV rows.
ds = create_datasets(
    tokenizer=tokenizer,
    dataset=full_dataset,
    seq_length=seq_length,
)


  0%|                                                                                     | 0/400 [00:00<?, ?it/s]No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
100%|█████████████████████████████████████████████████████████████████████████| 400/400 [00:00<00:00, 1440.75it/s]

문자 대비 토큰 비율: 1.49





In [10]:
# 13. Supervised fine-tuning configuration.
sft_config = SFTConfig(
    # Run identification, output location and experiment tracking.
    run_name="0224_dacon",
    report_to="wandb",
    output_dir=output_dir,
    # Batching and sequence length.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    max_seq_length=seq_length,
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    weight_decay=0.05,
    max_grad_norm=0.3,
    # Logging, evaluation and checkpointing: log every 100 steps, no evaluation,
    # checkpoint every 1000 steps and keep at most 3 checkpoints.
    logging_steps=100,
    eval_strategy="no",
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,
)

In [11]:
# 14. Build the trainer from the quantized base model, the packed dataset,
#     the LoRA configuration and the SFT settings.
# NOTE(review): `ds` was already formatted inside create_datasets with the same
# formatter factory; whether SFTTrainer applies `formatting_func` again to a
# pre-packed dataset depends on the installed TRL version — confirm.
trainer = SFTTrainer(
    model=base_model,
    tokenizer=tokenizer,
    args=sft_config,
    peft_config=peft_config,
    train_dataset=ds,
    eval_dataset=None,
    formatting_func=function_prepare_sample_text(tokenizer, for_train=True),
)

# 15. Start training; the returned training summary is shown as the cell output.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33m20211367[0m ([33m20211367-sungshin-women-s-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,4.0
200,2.8475
300,2.6288
400,2.3874
500,2.1748
600,2.3388
700,2.2789
800,2.1864
900,2.1195
1000,2.0099


wandb: Network error (ReadTimeout), entering retry loop.
[34m[1mwandb[0m: Network error resolved after 0:01:12.652959, resuming normal operation.
[34m[1mwandb[0m: 429 encountered ({"error":"rate limit exceeded"}), retrying request
[34m[1mwandb[0m: Network error resolved after 0:00:08.833266, resuming normal operation.


TrainOutput(global_step=11263, training_loss=1.696176956530353, metrics={'train_runtime': 28736.743, 'train_samples_per_second': 0.392, 'train_steps_per_second': 0.392, 'total_flos': 3.699313817293947e+17, 'train_loss': 1.696176956530353, 'epoch': 1.0})