In [None]:
# Install the notebook's runtime dependencies. %pip (rather than !pip) installs
# into the environment of the running kernel. peft is included because later
# cells import `PeftModel` from it.
%pip install -q datasets loralib trl accelerate transformers peft

In [None]:
# Fetch the KoChatGPT repository and copy its `chatgpt` package next to the notebook.
# Each step is skipped when its target directory already exists, so re-running the
# cell neither fails on the clone nor nests a second copy inside ./chatgpt
# (which would also leave already-patched files untouched).
!test -d KoChatGPT || git clone https://github.com/airobotlab/KoChatGPT
!test -d chatgpt || cp -r KoChatGPT/colossalai_ChatGPT_230319/chatgpt chatgpt

In [None]:
import os

# Line-level patches applied to the copied `chatgpt` package so that it no longer
# depends on ColossalAI. Each entry names a file and a list of changes; a change
# holds the 0-based line index, the expected current content (compared after
# stripping whitespace) and the full replacement line (indentation included,
# because the whole line is overwritten).
modifications = [
    {
        "file": "chatgpt/trainer/callbacks/save_checkpoint.py",
        "changes": [
            {"line": 3, "old": "from chatgpt.trainer.strategies import ColossalAIStrategy, Strategy",
             "new": "from chatgpt.trainer.strategies import Strategy"},
            # The original expression tested for ColossalAIStrategy. With that class
            # removed no strategy can match, so the result is always True. The previous
            # replacement `isinstance(self.strategy)` raised TypeError (isinstance
            # needs two arguments).
            {"line": 71, "old": "only_rank0 = not isinstance(self.strategy, ColossalAIStrategy)",
             "new": "            only_rank0 = True"},
        ],
    },
    {
        "file": "chatgpt/trainer/strategies/__init__.py",
        "changes": [
            {"line": 1, "old": "from .colossalai import ColossalAIStrategy", "new": ""},  # delete
            {"line": 5, "old": "__all__ = ['Strategy', 'NaiveStrategy', 'DDPStrategy', 'ColossalAIStrategy']",
             "new": "__all__ = ['Strategy', 'NaiveStrategy', 'DDPStrategy']"},
        ],
    },
    {
        "file": "chatgpt/dataset/reward_dataset.py",
        "changes": [
            {"line": 3, "old": "from tqdm import tqdm", "new": "from tqdm.notebook import tqdm"},
        ],
    },
    # NOTE(review): the two entries below repeat the tqdm patch at line index 8 for
    # files already listed above; they look like leftovers — confirm and remove.
    {
        "file": "chatgpt/trainer/strategies/__init__.py",
        "changes": [
            {"line": 8, "old": "from tqdm import tqdm", "new": "from tqdm.notebook import tqdm"},
        ]
    },
    {
        "file": "chatgpt/dataset/reward_dataset.py",
        "changes": [
            {"line": 8, "old": "from tqdm import tqdm", "new": "from tqdm.notebook import tqdm"},
        ]
    }
]


def modify_file(file_path, changes):
    """Replace specific lines of a text file after verifying their current content.

    Args:
        file_path: Path of the UTF-8 text file to patch.
        changes: Iterable of dicts with the keys ``"line"`` (0-based index into
            the file's lines), ``"old"`` (expected current content, compared
            after stripping surrounding whitespace) and ``"new"`` (replacement
            text, written verbatim followed by a newline).

    Returns:
        None. Progress and warnings are printed. The file is rewritten only
        when at least one line was actually replaced.
    """

    if not os.path.exists(file_path):
        print(f"⚠️ 파일이 존재하지 않습니다: {file_path}")
        return

    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    modified = False

    for change in changes:
        line_index = change["line"]

        # Report indices outside the file instead of skipping them silently.
        if not 0 <= line_index < len(lines):
            print(f"⚠️ {file_path} 파일에 {line_index}번째 줄이 없습니다 (총 {len(lines)}줄).")
            continue

        current = lines[line_index].strip()
        if current == change["old"]:
            lines[line_index] = change["new"] + "\n"
            modified = True
        elif current == change["new"].strip():
            # The patch was applied by an earlier run; re-running is not an error.
            print(f"ℹ️ {file_path} 파일의 {line_index}번째 줄은 이미 수정되어 있습니다.")
        else:
            print(f"⚠️ {file_path} 파일의 {change['line']}번째 줄이 예상과 다릅니다.")
            print(f"   예상: {change['old']}")
            print(f"   실제: {current}")

    if modified:
        with open(file_path, "w", encoding="utf-8") as file:
            file.writelines(lines)
        print(f"✅ 수정 완료: {file_path}")
    else:
        print(f"⚠️ {file_path} 수정할 내용이 없습니다.")

# Apply every patch set to its target file, in the order listed.
for patch_set in modifications:
    target_file = patch_set["file"]
    line_changes = patch_set["changes"]
    modify_file(target_file, line_changes)

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy


In [None]:
# Extract the uploaded archive into ./output_RM. -o overwrites existing files
# without asking, so re-running the cell cannot stall on unzip's "replace?" prompt.
!unzip -o '/content/output_2_RM.zip' -d output_RM


In [None]:
# Extract the uploaded archive into ./sft. -o overwrites existing files
# without asking, so re-running the cell cannot stall on unzip's "replace?" prompt.
!unzip -o '/content/KoChatGPT-20250919T020728Z-1-001.zip' -d sft

In [None]:
# === PPO step: actor-only (REINFORCE + KL to the initial policy), reward from a frozen RM ===
import os, json, random, torch
from copy import deepcopy
from tqdm.auto import tqdm
import torch.nn as nn

from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Model
from peft import PeftModel

# ===================== PATHS =====================
# Checkpoint locations, base model id and output directory used by the rest of this cell.
SFT_CHECKPOINT      = "/content/sft/KoChatGPT/output_SFT_trinity345M_dynpad"
RM_CHECKPOINT_DIR   = "/content/output_RM"
BASE_MODEL_ID       = "skt/kogpt2-base-v2"
PPO_OUTPUT_DIR      = "/content/drive/MyDrive/KoChatGPT/output_PPO_actor"

DATA_JSON           = "KoChatGPT/data_kochatgpt/kochatgpt_3_PPO.jsonl"  # a regular JSON list[{"prompt":...}]

# Create the output directory up front; exist_ok makes re-runs harmless.
# NOTE(review): the path is under /content/drive — presumably Google Drive must be
# mounted first; confirm.
os.makedirs(PPO_OUTPUT_DIR, exist_ok=True)

# ===================== SAFE INIT =====================
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is a debugging aid and is expected to slow
# training down — consider removing it once the loop is stable.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

def safe_set_seed(seed: int = 42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    CUDA generators are seeded only when CUDA is available; a failure there is
    reported as a warning instead of being raised.
    """
    import numpy as np

    # CPU-side generators, in the same order as before.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    try:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    except Exception as err:
        print("[warn] CUDA seeding failed:", repr(err))


safe_set_seed(42)

# Pick the compute device once; later code moves models and tensors to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# ===================== PROMPT FORMAT =====================
INSTR = "### Instruction:\n"
RESP  = "\n\n### Response:\n"
def format_prompt(p: str) -> str:
    """Wrap a raw prompt in the instruction/response template."""
    return "{}{}{}".format(INSTR, p, RESP)

# ===================== TOKENIZERS =====================
# Tokenizer for the actor, loaded from the SFT checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(SFT_CHECKPOINT)
# Left padding: the training loop slices generated tokens at a single shared
# column (the padded prompt length), which requires prompts to be right-aligned.
tokenizer.padding_side = "left"
# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ===================== ACTOR (LoRA PEFT) =====================
# Base causal LM (bfloat16 when CUDA is available, float32 otherwise) with the
# adapter stored in SFT_CHECKPOINT loaded on top of it.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)
actor = PeftModel.from_pretrained(base_model, SFT_CHECKPOINT)
# Switch the loaded adapter config out of inference mode so it can be trained.
if hasattr(actor, "peft_config") and "default" in actor.peft_config:
    actor.peft_config["default"].inference_mode = False
actor.to(device)

# train only the LoRA weights (and, optionally, lm_head)
# NOTE(review): parameters whose name contains "lm_head" are unfrozen here as well;
# presumably actor.save_pretrained stores adapter weights only, in which case
# lm_head updates would not be persisted — TODO confirm.
for n, p in actor.named_parameters():
    p.requires_grad = ("lora" in n.lower()) or ("lm_head" in n)

print("[actor] total params:", sum(p.numel() for p in actor.parameters()),
      "| trainable:", sum(p.numel() for p in actor.parameters() if p.requires_grad))

# anchor for the KL term: a frozen copy of the actor taken before any PPO update
initial_model = deepcopy(actor).to(device).eval()
for p in initial_model.parameters():
    p.requires_grad = False

# The optimizer only receives the parameters left trainable above.
optim_params = [p for p in actor.parameters() if p.requires_grad]
assert optim_params, "Нет trainable-параметров у актора"
actor_optim = torch.optim.AdamW(optim_params, lr=5e-5)

# ===================== REWARD MODEL =====================
# RM tokenizer: prefer the one stored with the RM checkpoint, otherwise fall back
# to the base model's tokenizer.
# NOTE(review): the broad `except Exception` also hides errors unrelated to a
# missing tokenizer — consider narrowing it.
try:
    rm_tokenizer = AutoTokenizer.from_pretrained(RM_CHECKPOINT_DIR)
    print("[RM] tokenizer loaded from RM dir")
except Exception:
    rm_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
    print("[RM] tokenizer fallback to BASE_MODEL_ID")
# Left padding keeps the final position of every row a real token, which is the
# position SimpleRewardModel reads.
rm_tokenizer.padding_side = "left"
if rm_tokenizer.pad_token is None:
    rm_tokenizer.pad_token = rm_tokenizer.eos_token

# RM backbone
rm_backbone = GPT2Model.from_pretrained(RM_CHECKPOINT_DIR)

# resize the embeddings whenever the tokenizer size differs from the model's
model_vocab = rm_backbone.get_input_embeddings().num_embeddings
tok_vocab = len(rm_tokenizer)
if tok_vocab != model_vocab:
    print(f"[RM] resize embeddings {model_vocab} -> {tok_vocab}")
    rm_backbone.resize_token_embeddings(tok_vocab)

class SimpleRewardModel(nn.Module):
    """Scalar reward head on top of a transformer backbone.

    The backbone must return an object exposing ``last_hidden_state`` of shape
    (batch, seq, hidden_size); a linear layer maps one pooled hidden vector per
    row to a single score.
    """

    def __init__(self, backbone: nn.Module, hidden_size: int):
        super().__init__()
        self.backbone = backbone
        self.value_head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        """Return one score per sequence, shape (batch,).

        The hidden state of the last attended token is pooled. With left padding
        (or without a mask) this is the final position, exactly as before; with
        right padding it is the last real token instead of a padding position.
        """
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = out.last_hidden_state
        if attention_mask is None:
            pooled = hidden_states[:, -1, :]
        else:
            mask = attention_mask.long().to(hidden_states.device)
            # Index of the last 1 in each row: argmax on the reversed mask finds
            # the first 1 from the right. An all-zero row falls back to the final
            # position, matching the unmasked behaviour.
            last_idx = mask.shape[1] - 1 - mask.flip(dims=[1]).argmax(dim=1)
            rows = torch.arange(hidden_states.shape[0], device=hidden_states.device)
            pooled = hidden_states[rows, last_idx]
        return self.value_head(pooled).squeeze(-1)

# Build the reward model around the loaded backbone and put it in eval mode.
hidden = rm_backbone.config.n_embd
reward_model = SimpleRewardModel(rm_backbone, hidden)
reward_model = reward_model.to(device).eval()

# Restore the trained value head when its weights were saved next to the backbone;
# otherwise the head keeps its random initialisation and a warning is printed.
vh_path = os.path.join(RM_CHECKPOINT_DIR, "value_head.bin")
if not os.path.exists(vh_path):
    print("⚠️ value_head.bin не найден — RM head случайная")
else:
    reward_model.value_head.load_state_dict(torch.load(vh_path, map_location="cpu"))

# The reward model is only used for scoring, so freeze every parameter.
for p in reward_model.parameters():
    p.requires_grad = False

# preflight: tokenize one sample, print its largest token id next to the embedding
# table size, and run a single forward pass through the reward model as a smoke test
with torch.no_grad():
    enc = rm_tokenizer("### Instruction:\n테스트\n\n### Response:\n좋아", return_tensors="pt")
    max_id = int(enc["input_ids"].max())
    vocab  = reward_model.backbone.get_input_embeddings().num_embeddings
    print(f"[RM] preflight: max_id={max_id}, vocab={vocab}")
    _ = reward_model(enc["input_ids"].to(device), enc["attention_mask"].to(device))
    print("[RM] smoke ok")

@torch.no_grad()
def rm_score_texts(text_batch, max_len=512):
    """Score a batch of texts with the frozen reward model.

    Texts are tokenized with the RM tokenizer (padded, truncated to `max_len`
    tokens), moved to the active device and passed through `reward_model`,
    whose output is returned unchanged.
    """
    batch = rm_tokenizer(
        text_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    )
    token_ids = batch["input_ids"].to(device)
    token_mask = batch["attention_mask"].to(device)
    return reward_model(token_ids, token_mask)

# ===================== DATA =====================
def load_prompts_from_json(path: str):
    """Load, clean, de-duplicate and shuffle prompts from a JSON or JSON Lines file.

    Accepted layouts:
      * a JSON list of objects, each with a ``"prompt"`` string;
      * a JSON object holding such a list under ``"data"`` or ``"items"``;
      * JSON Lines (one object per line), used when the file is not a single
        valid JSON document.

    Records that are not dicts, or whose prompt is missing, not a string, or
    empty after stripping whitespace, are skipped.

    Returns:
        List of unique, stripped prompt strings, shuffled with the global
        `random` generator.

    Raises:
        ValueError: if the top-level JSON value is neither a list nor a dict,
            or the content cannot be parsed at all (json.JSONDecodeError is a
            ValueError subclass).
    """
    with open(path, "r", encoding="utf-8-sig") as f:
        text = f.read()

    try:
        obj = json.loads(text)
    except json.JSONDecodeError:
        # Not a single JSON document: treat it as JSON Lines, ignoring blank lines.
        obj = [json.loads(line) for line in text.splitlines() if line.strip()]

    if isinstance(obj, list):
        records = obj
    elif isinstance(obj, dict):
        records = obj.get("data") or obj.get("items") or []
    else:
        raise ValueError("Bad JSON format")

    seen, uniq = set(), []
    for r in records:
        if not isinstance(r, dict):
            continue
        raw = r.get("prompt")
        if not isinstance(raw, str):
            continue
        p = raw.strip()
        # Drop prompts that are empty after stripping, and keep first occurrences only.
        if p and p not in seen:
            uniq.append(p)
            seen.add(p)
    random.shuffle(uniq)
    return uniq

# Load the PPO prompts once and report how many unique prompts were found.
list_prompt = load_prompts_from_json(DATA_JSON)
print("Загружено промптов:", len(list_prompt))

# ===================== HELPERS =====================
def tokenize_inputs(texts, max_len=96):
    """Tokenize `texts` with the actor tokenizer and move every tensor to `device`.

    Sequences are padded to a common length and truncated to `max_len` tokens.
    Returns a plain dict mapping each tokenizer output name to its tensor.
    """
    batch = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    )
    on_device = {}
    for name, tensor in batch.items():
        on_device[name] = tensor.to(device)
    return on_device

def logprobs_from_logits(logits, ids):
    """Return the log-probability assigned to each token id.

    Args:
        logits: Tensor of shape (..., vocab) with unnormalised scores.
        ids: Integer tensor of shape (...) selecting one vocabulary entry per position.

    Returns:
        Tensor of shape (...) holding log_softmax(logits)[..., ids]. Half-precision
        inputs (float16 / bfloat16) are upcast to float32 first, because a
        log-softmax over a large vocabulary loses noticeable precision in 16-bit
        floats; float32 and float64 inputs keep their dtype.
    """
    if logits.dtype in (torch.float16, torch.bfloat16):
        logits = logits.float()
    logp = torch.log_softmax(logits, dim=-1)
    return torch.gather(logp, -1, ids.unsqueeze(-1)).squeeze(-1)

# Sampling configuration used when the actor generates responses during training.
gen_kwargs = {
    "max_new_tokens": 128,
    "min_new_tokens": 8,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.7,
    "num_beams": 1,
    "no_repeat_ngram_size": 3,
    "repetition_penalty": 1.1,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}

# ===================== PPO LOOP =====================
# Actor-only policy-gradient loop: sample responses, score them with the frozen
# reward model, and update the actor with a REINFORCE loss plus a KL penalty
# towards `initial_model`.
actor.train()
# beta_kl: KL penalty weight; adaptive_kl: adjust beta_kl after every step;
# kl_target: KL value the controller steers towards; kl_lr: controller step size.
beta_kl, adaptive_kl, kl_target, kl_lr = 0.01, True, 0.1, 0.1
BATCH_SIZE, UPDATES = 8, 100
pbar = tqdm(range(UPDATES), desc="PPO")

for it in pbar:
    # Sliding window over the prompt list; wraps so a full batch is always available.
    s = (it * BATCH_SIZE) % max(1, len(list_prompt) - BATCH_SIZE + 1)
    batch_prompts = [format_prompt(p) for p in list_prompt[s:s+BATCH_SIZE]]
    if not batch_prompts: break

    # --- rollout: sample responses without dropout and without building a graph ---
    actor.eval()
    with torch.no_grad():
        inp = tokenize_inputs(batch_prompts, max_len=96)
        gen_ids = actor.generate(**inp, **gen_kwargs)
    actor.train()

    # Prompts are left-padded, so every response starts at column `prompt_len`.
    prompt_len = inp["input_ids"].shape[1]
    attn_all   = (gen_ids != tokenizer.pad_token_id).long()
    # Positions counted over real tokens only (0 for the first non-pad token).
    # generate() derives positions from the attention mask, whereas a plain
    # forward call would number them from column 0 including the left padding;
    # passing them explicitly scores the tokens under the positions they were
    # sampled with.
    position_ids = attn_all.cumsum(dim=1) - attn_all
    gen_part_ids = gen_ids[:, prompt_len:]
    gen_mask     = (gen_part_ids != tokenizer.pad_token_id).float()

    actor_out = actor(input_ids=gen_ids, attention_mask=attn_all, position_ids=position_ids)
    # The reference policy is frozen: skip autograd bookkeeping for it.
    with torch.no_grad():
        init_out = initial_model(input_ids=gen_ids, attention_mask=attn_all, position_ids=position_ids)

    # Logits at position t predict token t+1, so the logits for the generated
    # tokens are those at columns prompt_len-1 .. T-2.
    actor_logits_gen = actor_out.logits[:, prompt_len-1:-1, :]
    init_logits_gen  = init_out.logits[:,  prompt_len-1:-1, :]
    target_ids_gen   = gen_part_ids

    logp_actor = logprobs_from_logits(actor_logits_gen, target_ids_gen)
    logp_init  = logprobs_from_logits(init_logits_gen,  target_ids_gen)

    # Masked mean of the per-token log-ratio, used as the KL estimate.
    kl_mean = ( (logp_actor - logp_init) * gen_mask ).sum() / gen_mask.sum().clamp(min=1)

    # --- reward: score prompt + response text with the frozen reward model ---
    decoded = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
    rewards = rm_score_texts(decoded)

    # Batch-normalised rewards act as advantages.
    adv = (rewards - rewards.mean()) / (rewards.std(unbiased=False)+1e-6)
    logp_actor_sum = (logp_actor * gen_mask).sum(dim=1)

    policy_loss = -(adv * logp_actor_sum).mean()
    loss = policy_loss + beta_kl * kl_mean

    actor_optim.zero_grad(set_to_none=True)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(optim_params, 1.0)
    actor_optim.step()

    # Proportional controller: raise beta_kl when the KL estimate exceeds the
    # target, lower it otherwise, clamped to [1e-5, 1.0].
    if adaptive_kl:
        ratio = (kl_mean.item() / max(1e-8, kl_target)) - 1.0
        beta_kl = float(max(1e-5, min(1.0, beta_kl * (1.0 + kl_lr * ratio))))

    pbar.set_postfix({"loss":f"{loss.item():.3f}","reward":f"{rewards.mean().item():.3f}","kl":f"{kl_mean.item():.3f}","beta":f"{beta_kl:.4f}"})

# ===================== SAVE =====================
# Persist the trained actor and its tokenizer to the output directory.
actor.save_pretrained(PPO_OUTPUT_DIR)
tokenizer.save_pretrained(PPO_OUTPUT_DIR)
print("✅ Saved PPO actor (LoRA) to:", PPO_OUTPUT_DIR)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# ===== TEST PROMPTS =====
# NOTE(review): this rebinds `list_prompt`, the name that holds the training
# prompts in the PPO cell — consider a distinct name for the evaluation set.
list_prompt = [
    '불고기용 고기 한우에요?',
    '리처드 닉슨이 43대 부통령직을 수행한 년도는?',
    '시카고 오헤어 국제공항은 어디에 있어',
    '오늘 미세먼지 어때?',
    '한국에서 가장 높은 산은 어디야?',
    '서울 지하철 2호선은 몇 시에 끊겨?',
    'BTS 멤버 중 막내는 누구야?',
    '코로나19 첫 발생 연도는?',
    '한글날은 언제야?',
    '부산에서 유명한 음식은 뭐야?',
    '애플의 창립자는 누구야?',
    '인공지능과 머신러닝의 차이는 뭐야?',
    '한국의 전통 혼례에서 중요한 의식은?',
    '세계에서 가장 긴 강은 어디야?',
    '올해 한국 프로야구 우승팀은 누구야?',
    '김치찌개 맛있게 끓이는 법 알려줘',
    '삼국시대 고구려의 수도는 어디였어?',
    '테슬라 CEO는 누구야?',
    '아이 공부 집중력을 높이는 방법은?',
    '우주에서 가장 가까운 별 이름은 뭐야?'
]

# ===== ФОРМАТ ПРОМПТА (как при обучении) =====
def format_prompt(p: str) -> str:
    """Wrap a raw prompt in the instruction/response template used during training."""
    header = "### Instruction:\n"
    footer = "\n\n### Response:\n"
    return "{}{}{}".format(header, p, footer)

# ===== GENERATION SETTINGS =====
# mode 1: deterministic (greedy) — for a quick sanity check
gen_kwargs_greedy = {
    "max_new_tokens": 128,
    "do_sample": False,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}

# mode 2: stochastic (sampling) — for when greedy decoding yields too little text
gen_kwargs_sampling = {
    "max_new_tokens": 128,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.7,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}

def generate_batch(prompts, gen_kwargs):
    """Generate one completion per prompt with the actor.

    Args:
        prompts: Raw prompt strings; each is wrapped with `format_prompt`.
        gen_kwargs: Keyword arguments forwarded to `actor.generate`.

    Returns:
        List of decoded sequences (formatted prompt followed by the generated
        continuation), special tokens removed.

    The actor is switched to eval mode for the duration of the call so dropout
    does not perturb generation (the PPO loop leaves it in train mode), and its
    previous mode is restored afterwards.
    """
    texts = [format_prompt(p) for p in prompts]
    enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    was_training = actor.training
    actor.eval()
    try:
        with torch.no_grad():
            out_ids = actor.generate(**enc, **gen_kwargs)
    finally:
        actor.train(was_training)

    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in out_ids]

# ===== RUN THE CHECK =====
# Generate greedily for every test prompt and print prompt/response pairs.
print("=== GREEDY (do_sample=False) ===")
greedy_out = generate_batch(list_prompt, gen_kwargs_greedy)
separator = "=" * 60
for prompt_text, response_text in zip(list_prompt, greedy_out):
    print(separator)
    print("Prompt:", prompt_text)
    print("Response:", response_text)
