In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
torch.cuda.current_device()

0

In [4]:
import os
from datasets import config


config.HF_DATASETS_CACHE = "./downloaded_datasets"

In [5]:
from huggingface_hub import login

hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [6]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mst124974[0m ([33mbinit-ait[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
from datasets import Dataset, load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer
)

## Load the Dahoas Dataset

In [8]:
def preprocess_dataset(sample: dict) -> dict:
     return {
        "prompt": sample["prompt"].strip(),
        "chosen": sample["chosen"].strip(),
        "rejected": sample["rejected"].strip(),
    }

# load and preprocess dataset
def load_dataset_split(split: str, sanity_check: bool = False, cache_dir: str = None) -> Dataset:
    dataset = load_dataset("Dahoas/rm-static", split=split, cache_dir=cache_dir)
    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 5)))
    return dataset.map(preprocess_dataset)

In [9]:
sanity_check = True
train_data = load_dataset_split("train", sanity_check=sanity_check)
eval_data = load_dataset_split("test", sanity_check=sanity_check)

In [10]:
# debugging to check preprocessed dataset
print("Prompt:", train_data["prompt"][1])
print("Chosen Response:", train_data["chosen"][1])
print("Rejected Response:", train_data["rejected"][1])

Prompt: Human: What are some foods that are good for diabetics?

Assistant: To be honest, some of these are better than others, and they’re a little more like opinions than facts. For example, many of the diets say to limit vegetables with high sugar content, and there’s some debate on the subject, as far as how much of these vegetables are actually bad for diabetics.

Human: Okay, any other advice?

Assistant:
Chosen Response: What exactly are you asking? There’s a lot of different kinds of diabetic diets. I could try to recommend you some specific foods and recipes. I could help you look up any of the foods, and I could find recipes for them.
Rejected Response: Sure, we’ve got information on common mistakes that diabetic patients make with their diets, and even some specific things to do when you eat out and on the go.  One thing that’s recommended in these articles is just to be very mindful of the timing of food intake.


In [11]:
print("Prompt:", eval_data["prompt"][0])
print("Chosen Response:", eval_data["chosen"][0])
print("Rejected Response:", eval_data["rejected"][0])

Prompt: Human: I am trying to write a fairy tale. What is the most popular plot?

Assistant: The most popular plot might be “a princess goes to a faraway land, falls in love, and has a magic genie that grants her wishes”.  We can find plenty of examples of this if we search for fairy tales using the search engine Google.

Human: You can't look anything up on google.

Assistant: OK, so let’s think about some other popular fairy tales.

Human: Do you like the plot of Hansel and Gretel?

Assistant: Not particularly.

Human: what don't you like?

Assistant: I feel like the children do very stupid things.

Human: Should I include a witch on my story?

Assistant: I think a witch is a fairly common fairy tale element, so yes.  However, what kind of plotline would you like to pursue?

Human: I was thinking I might pursue a feminist plot for a more modern fairytale.

Assistant: I would be happy to help you make your story a more feminist one.  Are you writing a new story, or retelling a traditi

## Load pretrained model and tokenizer

The DPO trainer expects a model of AutoModelForCausalLM, compared to PPO that expects AutoModelForCausalLMWithValueHead for the value function.

In [12]:
model_name = "Qwen/Qwen2-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
ref_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [13]:
print(train_data)

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected'],
    num_rows: 5
})


In [14]:
print(eval_data)

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected'],
    num_rows: 5
})


## Training the model

In [15]:
import itertools

learning_rates = [3e-5, 1e-5]
betas = [0.1, 0.2]
batch_size = 2
num_epochs = 3
results = []
best_loss = float("inf")  # validation loss is smaller than infinity
best_model_path = None
hyperparameter_combinations = list(itertools.product(learning_rates, betas))

In [16]:
from trl import DPOTrainer, DPOConfig

for lr, beta in hyperparameter_combinations:
    run_name = f"st124974-dpo_lr-{lr}_bs-{batch_size}_ep-{num_epochs}_beta-{beta}"
    output_dir = f"models/{run_name}"
    
    # arguments for DPOTrainer
    dpo_config = DPOConfig(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        logging_dir="./logs",
        logging_steps=5,
        save_total_limit=2,
        learning_rate=lr,
        report_to=["wandb"],
        beta=beta,
        run_name=run_name
    )

    # initialize DPOTrainer
    dpo_trainer = DPOTrainer(
        model=model,
        ref_model=ref_model,
        args=dpo_config,
        train_dataset=train_data,
        eval_dataset=eval_data,
        processing_class=tokenizer,
    )

    dpo_trainer.train()

    #evaluate the model
    eval_results = dpo_trainer.evaluate()
    loss = eval_results.get("eval_loss", None)
    if loss and loss < best_loss:
        best_loss = loss
        best_model_path = output_dir
        print(f"New best model found! Saving model at: {best_model_path}")

Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,No log,0.557057,-0.743484,-1.281764,0.5,0.53828,-126.743866,-150.153152,-3.201709,-2.860964
2,0.398400,0.520812,-1.52517,-2.246588,0.666667,0.721418,-134.56073,-159.801376,-3.28067,-2.91784
3,0.398400,0.530221,-1.799008,-2.52351,0.666667,0.724502,-137.299118,-162.570602,-3.289494,-2.933159


New best model found! Saving model at: models/st124974-dpo_lr-3e-05_bs-2_ep-3_beta-0.1


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,No log,0.532556,-3.598132,-5.047195,0.666667,1.449063,-137.299698,-162.571487,-3.289541,-2.933217
2,0.000000,0.532555,-3.598191,-5.047298,0.666667,1.449107,-137.299988,-162.571991,-3.289572,-2.933255
3,0.000000,0.532561,-3.598219,-5.047341,0.666667,1.449122,-137.30011,-162.57222,-3.289589,-2.933276


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,No log,0.513055,-2.473351,-3.381426,0.833333,0.908075,-144.042526,-171.149765,-3.371381,-3.09924
2,0.000000,0.531839,-2.875102,-3.856045,0.833333,0.980943,-148.060043,-175.89595,-3.394658,-3.172605
3,0.000000,0.539542,-2.978933,-3.967966,0.833333,0.989033,-149.098358,-177.015152,-3.400038,-3.186575


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,No log,0.666068,-5.957865,-7.935932,0.833333,1.978067,-149.098358,-177.015152,-3.400038,-3.186575
2,0.000000,0.666068,-5.957865,-7.935932,0.833333,1.978067,-149.098358,-177.015152,-3.400038,-3.186575
3,0.000000,0.666068,-5.957865,-7.935932,0.833333,1.978067,-149.098358,-177.015152,-3.400038,-3.186575


## Upload to Hugging face

In [23]:
if best_model_path:
    model.save_pretrained(best_model_path)
    tokenizer.save_pretrained(best_model_path)
    print(f"Model and tokenizer saved at: {best_model_path}")

Model and tokenizer saved at: models/st124974-dpo_lr-3e-05_bs-2_ep-3_beta-0.1


In [24]:
from huggingface_hub import HfApi

if best_model_path:
    api = HfApi()
    safe_model_name = model_name.replace("/", "-")
    model_repo = f"archx64/best-dpo-{safe_model_name}"
    
    # create the repository if it doesn't exist
    try:
        api.create_repo(repo_id=model_repo, repo_type="model", exist_ok=True)
        print(f"Repository created or already exists: {model_repo}")
    except Exception as e:
        print(f"Error creating repository: {e}")
    
    api.upload_folder(folder_path=best_model_path, repo_id=model_repo, repo_type="model")
    print(f"Model uploaded to Hugging Face Hub: {model_repo}")

Repository created or already exists: archx64/best-dpo-Qwen-Qwen2-0.5B-Instruct


model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Model uploaded to Hugging Face Hub: archx64/best-dpo-Qwen-Qwen2-0.5B-Instruct


## Inference

In [25]:
# for inference default device is cuda
def run_inference(prompt: str, model_name: str, device: str = 'cuda') -> str:
    print(f"running inference on model: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(output[0], skip_special_tokens=True)

if best_model_path:
    test_prompt = "What is the meaning of life?"
    response = run_inference(test_prompt, model_repo, device)
    print(f"Prompt: {test_prompt}\nGenerated Response: {response}")

running inference on model: archx64/best-dpo-Qwen-Qwen2-0.5B-Instruct


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/267 [00:00<?, ?B/s]

Prompt: What is the meaning of life?
Generated Response: What is the meaning of life? The question of what a living being is and how it should be defined has been a source of philosophical debate for centuries. The answer to this question can be very complex, as there are many different kinds of living beings that could possibly be considered "living
