# DeepSeek 모델 파인튜닝

이 노트북은 DeepSeek 모델을 파인튜닝하기 위한 코드를 포함하고 있습니다.

In [None]:
# 필요한 라이브러리 임포트
import os
import glob
import pandas as pd
import torch
import torch.backends.mps
import unicodedata
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainerCallback,
    TrainerState,
    TrainerControl
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from tqdm import tqdm
from transformers.integrations import TensorBoardCallback



In [None]:
# Configuration values
# Directory scanned for the Excel workbooks (made absolute against the
# current working directory at the time this cell runs).
DATA_DIR = os.path.abspath("./LiarHeart_dataset")
# Model identifier handed to the `from_pretrained` calls.
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Used as the Trainer output directory and as the final save location.
OUTPUT_DIR = "./deepseek-r1-finetuned"
# TensorBoard event files are written below the output directory.
TB_LOG_DIR = os.path.join(OUTPUT_DIR, "tensorboard_logs")

# LoRA settings (forwarded to LoraConfig)
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.1
# Names of the modules that receive LoRA adapters.
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]

# Training settings (forwarded to TrainingArguments unless noted otherwise)
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 16
LEARNING_RATE = 5e-4
NUM_EPOCHS = 1
# Token length every example is truncated/padded to by the tokenizing step.
MAX_LENGTH = 512
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.05

# Sheets to read from every workbook
SHEET_NAMES = ["알리바이_대화", "인터뷰_대화", "가쉽_대화"]

In [None]:
# Data loading and preparation
def load_and_prepare_data():
    """Load dialogue pairs from the Excel workbooks and split them 9:1.

    Every file in ``DATA_DIR`` whose name starts with ``페르소나 데이터_`` and
    ends with ``.xlsx`` is opened, and each sheet listed in ``SHEET_NAMES``
    is read. A row that has both a '사람 대사' and a '챗봇 대사' value becomes
    one training text tagged with the sheet's dialogue type.

    The collected examples are shuffled with a fixed seed before the split.
    Without the shuffle the evaluation set would consist solely of the tail
    of the last sheet that was read, and without sorting the file list the
    split would depend on the directory listing order.

    A sheet that cannot be read is reported and skipped (best effort); the
    remaining sheets are still processed.

    Returns:
        tuple[Dataset, Dataset]: ``(train, eval)`` datasets, each with a
        single ``"text"`` column. Both are empty when nothing was loaded.
    """
    import random  # function-scope import: only the shuffle below needs it

    # Sheet name -> label written into the <dialogue_type> tag.
    sheet_to_type = {
        "알리바이_대화": "알리바이",
        "인터뷰_대화": "인터뷰",
        "가쉽_대화": "가쉽",
    }
    human_col = '사람 대사'
    assistant_col = '챗봇 대사'

    # Compare names in NFC form: a file system may return Hangul names in
    # decomposed form (e.g. macOS), which would not match the literal prefix.
    normalized_prefix = unicodedata.normalize('NFC', "페르소나 데이터_")
    search_suffix = ".xlsx"

    print(f"Searching for files in {DATA_DIR}")
    excel_files = []
    # Sorted so that the example order (and therefore the split) is stable.
    for filename in sorted(os.listdir(DATA_DIR)):
        normalized_filename = unicodedata.normalize('NFC', filename)
        # Office lock files ("~$...") can never match the required prefix,
        # so no separate check is needed for them.
        if normalized_filename.startswith(normalized_prefix) and normalized_filename.endswith(search_suffix):
            excel_files.append(os.path.join(DATA_DIR, filename))
    print(f"Found Excel files: {excel_files}")

    all_texts = []
    for excel_file in tqdm(excel_files, desc="Loading Excel files"):
        print(f"Processing file: {excel_file}")

        for sheet_name in SHEET_NAMES:
            dialogue_type = sheet_to_type.get(sheet_name)
            if not dialogue_type:
                # No label is defined for this sheet, so none of its rows
                # could be used; skip it without opening the workbook.
                print(f"  Skipping sheet without a dialogue type: {sheet_name}")
                continue

            try:
                print(f"  Reading sheet: {sheet_name}")
                df = pd.read_excel(excel_file, sheet_name=sheet_name)

                if human_col not in df.columns or assistant_col not in df.columns:
                    print(f"  Skipping sheet {sheet_name}: required columns not found")
                    continue

                # Walk the two columns directly instead of building a
                # Series per row with iterrows().
                for question, answer in zip(df[human_col], df[assistant_col]):
                    if pd.isna(question) or pd.isna(answer):
                        continue

                    question = str(question)
                    answer = str(answer)
                    all_texts.append(
                        f"<dialogue_type>\n{dialogue_type}\n</dialogue_type>\n\n"
                        f"<human>\n{question}\n</human>\n\n"
                        f"<assistant>\n{answer}\n</assistant>"
                    )

            except Exception as e:
                # Deliberately broad: one unreadable sheet must not abort
                # the whole load.
                print(f"  Error processing {excel_file}, sheet {sheet_name}: {e}")

    print(f"Total examples loaded: {len(all_texts)}")
    if not all_texts:
        print("Warning: no examples were loaded; both datasets will be empty.")

    # Fixed seed keeps the train/eval membership reproducible between runs.
    random.Random(42).shuffle(all_texts)

    # Train/eval split (9:1)
    train_size = int(len(all_texts) * 0.9)
    train_texts = all_texts[:train_size]
    eval_texts = all_texts[train_size:]

    return Dataset.from_dict({"text": train_texts}), \
           Dataset.from_dict({"text": eval_texts})

In [None]:
# Model and tokenizer preparation
def prepare_model_and_tokenizer():
    """Load the tokenizer and base model for ``MODEL_ID`` and attach LoRA adapters.

    Both downloads are cached in the current directory. The model is loaded
    in float32 with no device map, wrapped with a LoRA configuration built
    from the ``LORA_*`` constants, and its trainable-parameter summary is
    printed.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model.
    """
    print("Loading model and tokenizer...")
    cache_location = "./"

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        cache_dir=cache_location,
        use_fast=False,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        cache_dir=cache_location,
        device_map=None,
        torch_dtype=torch.float32,
    )

    # Apply the LoRA configuration
    lora_settings = {
        "task_type": "CAUSAL_LM",
        "r": LORA_R,
        "lora_alpha": LORA_ALPHA,
        "lora_dropout": LORA_DROPOUT,
        "target_modules": LORA_TARGET_MODULES,
        "bias": "none",
        "inference_mode": False,
    }
    lora_model = get_peft_model(base_model, LoraConfig(**lora_settings))
    lora_model.print_trainable_parameters()

    return lora_model, tokenizer

In [None]:
# Tokenizing function
def tokenize_function(examples, tokenizer):
    """Tokenize ``examples["text"]`` and add a ``labels`` entry.

    Args:
        examples: Mapping with a ``"text"`` entry to tokenize.
        tokenizer: Callable tokenizer; invoked with truncation and
            ``max_length`` padding at ``MAX_LENGTH``.

    Returns:
        The tokenizer's result with ``labels`` set to a copy of ``input_ids``.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    encoded = tokenizer(examples["text"], **tokenizer_options)
    # Labels start out as a copy of the input ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [None]:
# Custom callback class
class LearningRateLoggerCallback(TrainerCallback):
    """Print the learning rate after each step and the metrics after each evaluation."""

    def on_step_end(self, args, state, control, **kwargs):
        """Print the current learning rate of every optimizer parameter group.

        Nothing is printed when no ``optimizer`` keyword argument is supplied.

        Returns:
            The unchanged ``control`` object.
        """
        optimizer = kwargs.get('optimizer')
        # Explicit None check: do not depend on the optimizer's truthiness.
        if optimizer is not None:
            for param_group in optimizer.param_groups:
                current_lr = param_group['lr']
                print(f"Step {state.global_step}: Learning rate = {current_lr:.6f}")
        return control

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Print every evaluation metric, four decimals where the value allows it.

        Values that do not accept the ``.4f`` format (e.g. strings or None)
        are printed as-is instead of raising in the middle of a run.

        Returns:
            The unchanged ``control`` object.
        """
        if metrics:
            print(f"\n===== Evaluation Results at Step {state.global_step} =====")
            for key, value in metrics.items():
                try:
                    line = f"{key}: {value:.4f}"
                except (TypeError, ValueError):
                    line = f"{key}: {value}"
                print(line)
            print("=" * 50)
        return control

In [None]:
# Prepare the data: build the train and eval datasets from the Excel workbooks
train_dataset, eval_dataset = load_and_prepare_data()

In [None]:
# Prepare the model and tokenizer (base model wrapped with LoRA adapters)
model, tokenizer = prepare_model_and_tokenizer()

In [None]:
# Tokenize the datasets
print("Tokenizing datasets...")


def _tokenize_batch(batch):
    """Tokenize one batch with the notebook's tokenizer."""
    return tokenize_function(batch, tokenizer)


# The raw "text" column is dropped once it has been tokenized.
tokenized_train = train_dataset.map(_tokenize_batch, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(_tokenize_batch, batched=True, remove_columns=["text"])

In [None]:
# Training configuration
# Make sure the TensorBoard log directory exists before anything writes to it.
os.makedirs(TB_LOG_DIR, exist_ok=True)

training_args = TrainingArguments(
    # Output and checkpointing
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Epochs and batch sizes
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Optimizer and learning-rate schedule
    optim="adamw_torch",
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type="cosine",
    # Evaluation cadence
    evaluation_strategy="steps",
    eval_steps=100,
    # Logging
    logging_dir=TB_LOG_DIR,
    logging_steps=10,
    report_to=["tensorboard"],
    # Precision and device
    fp16=False,
    bf16=False,
    no_cuda=True,
    gradient_checkpointing=False,
    ddp_find_unused_parameters=False,
    # Dataset columns are passed through untouched
    remove_unused_columns=False,
)

# Collator for causal language modeling (masked-LM mode switched off).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

lr_callback = LearningRateLoggerCallback()
# NOTE(review): report_to=["tensorboard"] is expected to make Trainer attach
# its own TensorBoardCallback - confirm before also passing this instance.
tensorboard_callback = TensorBoardCallback()

In [None]:
# Build the Trainer and run training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    # Only the custom callback is passed here. training_args already has
    # report_to=["tensorboard"], which makes Trainer register a
    # TensorBoardCallback itself; adding another instance would register a
    # second one and log every scalar twice into the same directory.
    callbacks=[lr_callback],
)

print("\n===== TensorBoard 실행 방법 =====")
print("터미널에서 다음 명령어를 실행하세요:")
print(f"tensorboard --logdir={TB_LOG_DIR}")
print("그런 다음 웹 브라우저에서 http://localhost:6006/ 으로 접속하세요.\n")

# Baseline evaluation before any training step
print("Initial evaluation...")
trainer.evaluate()

# Start training
print("Starting training...")
trainer.train()

In [None]:
# Save the model and tokenizer
print("Saving model...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Final evaluation
print("Final evaluation...")
final_metrics = trainer.evaluate()
print("\n===== Final Evaluation Results =====")
for key, value in final_metrics.items():
    # Print four decimals where the value allows it; fall back to the plain
    # value so a non-numeric metric cannot abort the summary.
    try:
        line = f"{key}: {value:.4f}"
    except (TypeError, ValueError):
        line = f"{key}: {value}"
    print(line)
print("=" * 50)

print(f"Training complete. Model saved to {OUTPUT_DIR}")
print(f"TensorBoard 로그는 {TB_LOG_DIR}에 저장되었습니다.")