In [1]:
%%capture
!pip install -U accelerate peft bitsandbytes transformers trl datasets

In [2]:
import os
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# 설정 정의
model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"
dataset_name = "byung-jun/capstoneQADatasetwINST"
dataset_split = "train"
output_dir = "./lora-finetuned-model"
num_epochs = 5
batch_size = 8
learning_rate = 3e-4
max_seq_length = 512

# 데이터셋 로드
dataset = load_dataset(dataset_name, split=dataset_split)
dataset = dataset.rename_column("내용", "text")

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# 모델 로드 (8-bit 양자화)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,  # 8-bit 양자화 로드
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.resize_token_embeddings(len(tokenizer))

# LoRA 설정 추가
lora_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.1,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
)

# LoRA 적용
model = get_peft_model(model, lora_config)

# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=1,
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    fp16=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    load_best_model_at_end=False,
    metric_for_best_model="loss",
)

# SFTTrainer 초기화
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

trainer.train()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = SFTTrainer(


Step,Training Loss
10,2.3809
20,1.8438
30,1.6605
40,1.6095
50,1.396
60,1.3068
70,1.3241
80,1.2634
90,1.3819
100,1.296




TrainOutput(global_step=485, training_loss=0.835242697627274, metrics={'train_runtime': 278.1016, 'train_samples_per_second': 13.862, 'train_steps_per_second': 1.744, 'total_flos': 7598740691681280.0, 'train_loss': 0.835242697627274, 'epoch': 5.0})

In [4]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

new_model = "llama-3.2-3B-DUchatbot"

base_model2 = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
)

base_model2.resize_token_embeddings(len(tokenizer))

model2 = PeftModel.from_pretrained(base_model2, new_model, torch_dtype=torch.float16)
model2 = model2.merge_and_unload()

base_model2.save_pretrained("./DU_Chatbot_5ep")
tokenizer.save_pretrained("./DU_Chatbot_5ep")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('./DU_Chatbot_5ep/tokenizer_config.json',
 './DU_Chatbot_5ep/special_tokens_map.json',
 './DU_Chatbot_5ep/tokenizer.json')