In [None]:
import json
# Location of the raw SFT corpus. Despite the .jsonl suffix, the code below
# parses the whole file as one JSON document.
data_path_1_SFT = 'KoChatGPT/data_kochatgpt/kochatgpt_1_SFT.jsonl'

# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
with open(data_path_1_SFT, mode="r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.loads(json_file.read())

# Sanity check: number of records, then a peek at the first three.
print(len(list_data_dict))
list_data_dict[:3]

In [None]:
# Character lengths of every prompt / completion string in the raw corpus.
lens_prompt = [len(record['prompt']) for record in list_data_dict]
lens_comp = [len(record['completion']) for record in list_data_dict]

import matplotlib.pyplot as plt


def _draw_length_hist(slot, lengths, caption):
    """Draw one histogram in panel ``slot`` of a 1x2 grid.

    Returns whatever ``plt.title`` returns so the final call below remains
    the value of the cell's last expression.
    """
    plt.subplot(1, 2, slot)
    plt.hist(lengths)
    return plt.title(caption)


# Left panel: prompts. Right panel: completions.
_draw_length_hist(1, lens_prompt, 'length of prompt')
_draw_length_hist(2, lens_comp, 'length of completion')

중복된 (prompt, completion) 쌍을 제거했습니다.

너무 짧거나(prompt 5자 미만, completion 3자 미만) 너무 긴(prompt 512자 초과, completion 1024자 초과) 데이터를 걸러냈습니다. 여기서 길이는 토큰 수가 아니라 앞뒤 공백을 제거한 문자열의 문자 수 기준입니다.

모든 completion 문장이 </s> 토큰으로 끝나도록 통일시켰습니다.

이 과정을 통해 품질이 낮거나 모델 학습에 방해가 될 수 있는 데이터를 제거하고, 학습 안정성과 성능 향상을 기대할 수 있습니다.

In [None]:
# Build a cleaned copy of the SFT corpus:
#   1. strip surrounding whitespace from prompt and completion,
#   2. drop exact duplicate (prompt, completion) pairs,
#   3. drop records whose character length falls outside the allowed range,
#   4. make every completion end with the "</s>" marker.
seen = set()
sft_clean = []
for x in list_data_dict:
    prompt = x['prompt'].strip()
    completion = x['completion'].strip()

    # De-duplicate on the stripped pair. The pair is recorded before the
    # length filters run, so a rejected pair is also remembered.
    key = (prompt, completion)
    if key in seen:
        continue
    seen.add(key)

    # Length limits are measured in characters (len of str), and the
    # completion is measured before "</s>" is appended.
    if not 5 <= len(prompt) <= 512:
        continue
    if not 3 <= len(completion) <= 1024:
        continue

    if not completion.endswith('</s>'):
        completion += "</s>"

    sft_clean.append({'prompt': prompt, 'completion': completion})

out_path = "KoChatGPT/data_kochatgpt/kochatgpt_1_SFT.clean.jsonl"
# ensure_ascii=False emits non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(sft_clean, f, ensure_ascii=False, indent=2)

print("SFT before:", len(list_data_dict), "after:", len(sft_clean))


In [None]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
from peft import LoraConfig, get_peft_model


데이터 처리 방식

처음에는 DataCollatorForSupervisedDataset을 사용해서 input_ids와 labels를 따로 패딩했음.

지금은 CausalLMDataCollator를 직접 정의해서 tokenizer.pad()로 한 번에 패딩하고, attention_mask==0인 부분을 -100으로 마스킹하도록 변경함.

학습 설정(TrainingArguments)

원래는 num_train_epochs=1, batch_size=8, warmup_steps=5, prediction_loss_only=True 등 간단한 설정.

지금은 num_train_epochs=2, per_device_train_batch_size=2, gradient_accumulation_steps=8(디바이스당 유효 배치 크기 2×8=16), warmup_steps=200, weight_decay=0.01 등 더 정교한 설정으로 바꿈.

또한 logging_steps, eval_steps, save_steps, save_total_limit 등을 추가해 학습 과정을 더 세밀하게 관리.

In [None]:
# Load the cleaned SFT records (a JSON list of {'prompt', 'completion'} dicts).
# The file holds raw non-ASCII text, so it is decoded explicitly as UTF-8.
with open('KoChatGPT/data_kochatgpt/kochatgpt_1_SFT.clean.jsonl', 'r', encoding='utf-8') as f:
    sft_data = json.load(f)

def format_row(r):
    """Render one record as a single instruction/response training string.

    Args:
        r: Mapping with ``'prompt'`` and ``'completion'`` entries.

    Returns:
        A dict with one key, ``"text"``, holding the prompt under an
        ``### Instruction:`` header followed by the completion under a
        ``### Response:`` header.
    """
    instruction, response = r['prompt'], r['completion']
    rendered = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return {"text": rendered}

# Render every record to its training string, then hold out 5% of the rows
# as a test split (fixed seed so the split is reproducible).
formatted_rows = list(map(format_row, sft_data))
dataset = Dataset.from_list(formatted_rows).train_test_split(test_size=0.05, seed=42)

base_model = "skt/kogpt2-base-v2"

# All four special-token roles are mapped to the same "</s>" string.
special_tokens = dict.fromkeys(
    ("bos_token", "eos_token", "unk_token", "pad_token"), "</s>"
)
tokenizer = AutoTokenizer.from_pretrained(base_model, **special_tokens)
tokenizer.padding_side = "right"
tokenizer.model_max_length = 512

def tokenize(batch):
    """Tokenize the ``'text'`` column of a batch with the module tokenizer.

    The tokenizer is called with ``truncation=True`` and ``max_length=512``;
    its result is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

# Tokenize both splits in batches, then drop the raw string column so only
# the tokenizer outputs remain.
dataset_tok = dataset.map(tokenize, batched=True).remove_columns('text')

model = AutoModelForCausalLM.from_pretrained(base_model)

# LoRA hyper-parameters, gathered in one table before building the config.
lora_settings = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
peft_config = LoraConfig(**lora_settings)

# Wrap the base model with the LoRA adapters described above.
model = get_peft_model(model, peft_config)

# Turn off the generation cache flag on the model config for training.
model.config.use_cache = False

class CausalLMDataCollator:
    """Collate tokenized features into a padded batch with causal-LM labels.

    Padding is delegated to the wrapped tokenizer's ``pad`` method. Labels
    are a copy of ``input_ids`` in which every position whose
    ``attention_mask`` is 0 is replaced by -100, so the mask (not the pad
    token id) decides which positions are excluded.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer whose ``pad`` method builds each batch."""
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Pad ``features`` to a common length and attach ``labels``.

        Args:
            features: Sequence of per-example tokenizer outputs.

        Returns:
            The padded batch, with an added ``"labels"`` entry.
        """
        padded = self.tokenizer.pad(features, padding=True, return_tensors="pt")

        # masked_fill returns a new tensor, leaving input_ids untouched.
        is_padding = padded["attention_mask"] == 0
        padded["labels"] = padded["input_ids"].masked_fill(is_padding, -100)
        return padded

data_collator = CausalLMDataCollator(tokenizer)

# Report which parameters will receive gradients and what share of the
# total parameter count they represent.
trainable = all_params = 0
for name, param in model.named_parameters():
    count = param.numel()
    all_params += count
    if not param.requires_grad:
        continue
    trainable += count
    print("✅ trainable:", name, param.shape)

percent = 100 * trainable/all_params
print(f"\nTrainable params: {trainable:,} / {all_params:,} " f"({percent:.2f}%)")

In [None]:
from transformers import TrainingArguments

import inspect

# `eval_steps` is only honoured when the evaluation strategy is "steps"; the
# library default is "no", which would leave the eval split unused during
# training. The keyword was renamed from `evaluation_strategy` to
# `eval_strategy` across transformers releases, so use whichever name the
# installed version accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="models/kogpt2-base-v2",
    # 2 examples per step x 8 accumulation steps = 16 examples per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    num_train_epochs=2,
    warmup_steps=200,
    weight_decay=0.01,
    logging_steps=50,
    # Evaluate and checkpoint on the same 200-step cadence.
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    group_by_length=True,
    dataloader_num_workers=2,
    # Keep every dataset column; the custom collator receives the features
    # as they are.
    remove_unused_columns=False,
    report_to="none",
    **{_eval_strategy_kwarg: "steps"},
)

from transformers import Trainer

In [None]:
# Wire the LoRA-wrapped model, the training arguments, both tokenized splits,
# and the custom collator into a Trainer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset_tok["train"],
    "eval_dataset": dataset_tok["test"],
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the save directory
# under /content/drive/MyDrive used later in this file is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Destination on the mounted drive for the trained model and tokenizer files.
# NOTE(review): the directory name mentions "trinity345M", but the base model
# used in this file is "skt/kogpt2-base-v2" — confirm the name is intended.
save_dir = "/content/drive/MyDrive/KoChatGPT/output_SFT_trinity345M_dynpad"

# Save the trainer's model and the tokenizer side by side so both can be
# reloaded from the same directory.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Display the tokenized held-out split.
dataset_tok['test']


do_sample=False, num_beams=5, repetition_penalty=1.1 등 더 보수적이고 결정적인 생성으로 설정.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Reload the tokenizer and the LoRA adapter from the save directory, attach
# the adapter to a fresh copy of the base model, and switch to eval mode.
save_dir = "/content/drive/MyDrive/KoChatGPT/output_SFT_trinity345M_dynpad"
base_model_id = "skt/kogpt2-base-v2"

tokenizer = AutoTokenizer.from_pretrained(save_dir)
base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
model = PeftModel.from_pretrained(base_model, save_dir).eval()

# These markers must match the template that format_row used to build the
# "text" column; they are used to split each row back into prompt and target.
INSTR = "### Instruction:\n"
RESP  = "\n\n### Response:\n"

n_samples = 10
# Bound the loop by the split size so a small held-out split cannot raise
# an IndexError.
for i in range(min(n_samples, len(dataset["test"]))):
    row = dataset["test"][i]
    full_text = row["text"]

    assert full_text.startswith(INSTR)
    body = full_text[len(INSTR):]
    prompt_part, target_part = body.split(RESP, 1)

    # Feed only the instruction plus the response header; the model is
    # expected to continue with the answer.
    prompt_for_model = f"{INSTR}{prompt_part}{RESP}"

    inputs = tokenizer(prompt_for_model, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        # Deterministic beam search; sampling knobs (top_p / temperature)
        # are deliberately not used.
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
            length_penalty=1.0,
            repetition_penalty=1.1,
            num_beams=5,
            no_repeat_ngram_size=2
        )
    generated_full = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Cut the prompt off at the token level. Removing it with str.replace on
    # the decoded text silently fails whenever decoding does not reproduce
    # the prompt string character for character.
    generated_resp = tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    ).strip()

    print(f"\n==== Test Example {i+1} ====")
    print("Prompt:", prompt_part)
    print("Target completion:", target_part.strip())
    print("Generated:", generated_resp)


In [None]:
# Display the untokenized held-out split (rows with the "text" column).
dataset['test']

이전 모델은 거의 모든 질문에 부정확하거나 엉뚱한 답변을 길게 생성했고, 사실과 맞지 않는 정보가 많았음.

현재 모델은 Instruction–Response 형식을 따르며 답변이 좀 더 간결해졌지만, 여전히 “저는 인공지능 어시스턴트이기 때문에 …”라는 회피형 응답이 많음.

일부 질문(예: 탈모 샴푸, 세렝게티 우기 등)에서는 어느 정도 관련 있는 내용을 포함했으나, 사실 정확도는 여전히 낮음.