In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, DPOTrainer, DPOConfig
from datasets import Dataset
import openai

In [2]:
import os

# Restrict this kernel to the physical GPU with index 5; inside the process it is
# then addressed as device 0.
# NOTE(review): torch is imported in the previous cell. This presumably still takes
# effect because no CUDA call has been made yet — confirm if the cell order changes.
os.environ.update({"CUDA_VISIBLE_DEVICES": "5"})

In [3]:
# GPT-API-driven PPO / DPO alignment example (Mistral-7B)

# OpenAI API credentials: read from the environment so that no key is ever stored
# in the notebook (or in its saved outputs). The value is None when
# OPENAI_API_KEY is not exported.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Model loading
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" lets the loader place the weights on the visible accelerator(s).
# NOTE(review): no dtype is requested for this 7B checkpoint, and the recorded run
# of the DPO training cell ended in a CUDA out-of-memory error on a ~47.5 GiB GPU.
# Consider loading in reduced precision — that must be coordinated with the
# fp16 setting in the training configuration.
model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, device_map="auto")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [4]:
# Frozen reference copy of the same checkpoint, explicitly kept on the CPU here.
# NOTE(review): the trainer that later receives this object may relocate it to the
# training device — verify against the installed TRL version.
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)
ref_model = ref_model.to("cpu")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
# ---------------------------
# Example: collecting pairwise preferences for DPO
# (designed for a GPT-based judge; currently answered manually via stdin)
# ---------------------------

def get_preference_from_gpt(prompt, response_a, response_b):
    """Ask a judge which of two responses to ``prompt`` is better.

    The judge is currently whoever is sitting at the keyboard: the comparison
    text is printed and the verdict is read from stdin. The answer is
    whitespace-stripped and upper-cased, and the question is repeated until
    the answer is exactly ``A`` or ``B``, so a typo or an empty line can no
    longer be mistaken for a vote.

    Args:
        prompt: The instruction both responses were written for.
        response_a: Candidate shown under ``[RESPONSE A]``.
        response_b: Candidate shown under ``[RESPONSE B]``.

    Returns:
        ``"A"`` if the first response was preferred, otherwise ``"B"``.

    Raises:
        EOFError: Propagated from ``input()`` when stdin is closed before a
            valid answer was given.
    """
    user_prompt = f"""
    [PROMPT]
    {prompt}

    [RESPONSE A]
    {response_a}

    [RESPONSE B]
    {response_b}

    Which is better? Return 'A' or 'B'.
    """

    print(user_prompt)
    while True:
        answer = input().strip().upper()
        if answer in ("A", "B"):
            return answer
        # Anything else is not a verdict; ask again instead of guessing.
        print("Please answer 'A' or 'B'.")

    # Reference implementation of the automatic (GPT-4) judge, kept disabled.
    # NOTE(review): this uses the legacy `openai.ChatCompletion` interface;
    # confirm it matches the installed openai package before enabling.
    # completion = openai.ChatCompletion.create(
    #     model="gpt-4",
    #     messages=[
    #         {"role": "system", "content": "You are a helpful assistant."},
    #         {"role": "user", "content": user_prompt}
    #     ],
    #     temperature=0
    # )
    # content = completion['choices'][0]['message']['content'].strip().upper()
    # return "A" if "A" in content else "B"

# Toy comparison pairs for illustration (a real run needs many more pairs).
raw_data = [
    {"prompt": "Explain black holes.", "response_a": "A black hole is...", "response_b": "Black holes are scary things..."},
    {"prompt": "Define relativity.", "response_a": "Relativity is...", "response_b": "Einstein said..."}
]

# Turn each A/B verdict into the prompt / chosen / rejected record used for DPO.
dpo_data = []
for row in raw_data:
    better = get_preference_from_gpt(row['prompt'], row['response_a'], row['response_b'])
    # Normalise before comparing so "a" or " A" is not silently counted as a vote for B.
    better = better.strip().upper()
    if better == "A":
        chosen, rejected = row['response_a'], row['response_b']
    elif better == "B":
        chosen, rejected = row['response_b'], row['response_a']
    else:
        # A label that is neither A nor B must not leak into the training data.
        raise ValueError(
            f"Expected preference 'A' or 'B' for prompt {row['prompt']!r}, got {better!r}"
        )
    dpo_data.append({"prompt": row["prompt"], "chosen": chosen, "rejected": rejected})

# Dataset preparation and DPO configuration.
train_dataset = Dataset.from_list(dpo_data)
training_args = DPOConfig(
    output_dir="models/rlhf/mistralai/Mistral-7B-Instruct-v0.3",
    per_device_train_batch_size=1,
    beta=0.1,
    num_train_epochs=1,
    logging_steps=1,
    fp16=True,
    gradient_accumulation_steps=1,
    # Checkpoint once per epoch; a step interval would not apply with this strategy.
    save_strategy="epoch",
)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# The models above were loaded through the PPO value-head wrapper. DPO needs no
# value head, so hand the trainer the wrapped causal LM when there is one (and the
# object itself when it is already a plain model).
# NOTE(review): confirm against the installed TRL version that DPOTrainer expects
# a plain causal LM here.
policy_model = getattr(model, "pretrained_model", model)
reference_model = getattr(ref_model, "pretrained_model", ref_model)
# Only create the attribute when it is missing instead of clobbering an existing one.
if not hasattr(policy_model, "warnings_issued"):
    policy_model.warnings_issued = {}

# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError on a
# ~47.5 GiB GPU. Both 7B models are loaded without an explicit dtype and no
# parameter-efficient configuration is passed, so presumably the full-precision
# policy plus reference do not fit on one card. Reducing memory has to start at
# model-loading time and is not addressed by this cell.
trainer = DPOTrainer(
    model=policy_model,
    ref_model=reference_model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer
)
trainer.train()

helllllllllllllllllllo

    [PROMPT]
    Explain black holes.

    [RESPONSE A]
    A black hole is...

    [RESPONSE B]
    Black holes are scary things...

    Which is better? Return 'A' or 'B'.
    



    [PROMPT]
    Define relativity.

    [RESPONSE A]
    Relativity is...

    [RESPONSE B]
    Einstein said...

    Which is better? Return 'A' or 'B'.
    
hellllllllllo
hellllo


Extracting prompt in train dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 47.54 GiB of which 59.88 MiB is free. Including non-PyTorch memory, this process has 47.47 GiB memory in use. Of the allocated memory 47.16 GiB is allocated by PyTorch, and 16.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import inspect

# Report which installed module and file actually provide DPOTrainer, to check
# which TRL installation the kernel is using.
dpo_trainer_module = DPOTrainer.__module__
dpo_trainer_file = inspect.getfile(DPOTrainer)
print("DPOTrainer source:", dpo_trainer_module)
print("DPOTrainer file:", dpo_trainer_file)

DPOTrainer source: trl.trainer.dpo_trainer
DPOTrainer file: /src/gs25009/LLM_DAG_ALLIGN/.conda/lib/python3.11/site-packages/trl/trainer/dpo_trainer.py


: 