<a href="https://colab.research.google.com/github/a-00-a/IdolFan-LLM-Chatbot-Korean-Entertainment-Domain/blob/main/03_FineTuning_Gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 02 dataset 생성 코드 추가 -> 03에서 단독 실행 가능
# 라이브러리 import
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model
import torch
import gradio as gr

# sample 데이터 (단일문장, 20개)
sample_data = [
    {"prompt": "오늘 기분 어때요?", "completion": "팬들 생각하면서 힘냈어요!"},
    {"prompt": "추천 노래 있어요?", "completion": "제 최애 노래는 'Shakira-Zoo'예요!"},
    {"prompt": "오늘 뭐했어요?", "completion": "새로운 앨범 춤 연습했어요!"},
    {"prompt": "최근 좋아하는 영화는?", "completion": "최근에는 'Inception'봤는데 재밌었어요."},
    {"prompt": "팬들에게 한마디?", "completion": "항상 사랑해요!"},
    {"prompt": "새로운 앨범 언제 나오나요?", "completion": "조금만 더 기다려주세요!"},
    {"prompt": "운동도 하나요?", "completion": "네, 건강하게 유지하려고 해요."},
    {"prompt": "오늘 날씨 어때요?", "completion": "오늘 많이 춥네요."},
    {"prompt": "좋아하는 음식은?", "completion": "초밥 좋아해요!"},
    {"prompt": "휴식 시간에는 뭐해요?", "completion": "책 읽거나 음악 들어요."},
    {"prompt": "팬들 질문 많이 받았나요?", "completion": "네, 항상 감사하게 받아요."},
    {"prompt": "최근 목표는?", "completion": "더 좋은 음악 만들기!"},
    {"prompt": "노래 연습 어떻게 하나요?", "completion": "매일매일 꾸준히 연습해요."},
    {"prompt": "팬들과 소통 방법?", "completion": "인스타랑 bubble로 소통해요!"},
    {"prompt": "좋아하는 운동?", "completion": "요가랑 가벼운 러닝 좋아해요."},
    {"prompt": "가장 기억에 남는 순간?", "completion": "저번 콘서트에서 팬들과 노래부른 순간이 감동이였어요!"},
    {"prompt": "추천하는 책?", "completion": "'Harry Potter' 시리즈 좋아해요."},
    {"prompt": "스트레스 해소 방법?", "completion": "엽떡먹기!"},
    {"prompt": "최근 관심 있는 것?", "completion": "젤리에 푹 빠져서 포도맛 젤리!"},
    {"prompt": "팬들에게 전하고 싶은 말?", "completion": "늘 함께 해줘서 고마워요!"},
]

# Tokenizer 로드
model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # padding token 정의

# tokenize 함수 (batched=False, 단일 example 방지)
def tokenize(example):
    text = example["prompt"] + " " + example["completion"]
    tokenized = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # labels 생성
    labels = tokenized["input_ids"].copy()

    # pad_token_id는 -100으로 바꾸기 (Loss 무시)
    labels = [i if i != tokenizer.pad_token_type_id else -100 for i in labels]

    tokenized["labels"] = labels
    return tokenized

# Dataset 생성
dataset = Dataset.from_list(sample_data)
tokenized_dataset = dataset.map(tokenize, batched=False)
tokenized_dataset = tokenized_dataset.remove_columns(["prompt", "completion"])
tokenized_dataset.set_format("torch")
print(tokenized_dataset.column_names)

#모델 로드 + LoRA 설정
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training 설정 (wandb off)
training_args = TrainingArguments(
    output_dir="./idolfan_lora",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_steps=1,
    save_steps=50,
    save_total_limit=3,
    learning_rate=5e-4,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset, # 02 에서 만든 dataset
)

# 학습 시작
trainer.train()

# Gradio 챗봇
def chatbot(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=50,
        do_sample=True,
        top_p=0.9,
        temperature=0.8
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

gr.Interface(
    fn=chatbot,
    inputs="text",
    outputs="text",
    title="Idol Fan Chatbot",
    description="LoRA fine-tuned idol-style chatbot"
).launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

['input_ids', 'attention_mask', 'labels']


pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


trainable params: 294,912 || all params: 125,459,712 || trainable%: 0.2351


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,14.1322
2,12.77
3,12.3895
4,12.5842
5,12.2486
6,11.9228
7,12.0174
8,11.8515
9,11.5175
10,11.6411




It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://41608ef0fef1b8d2e6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


