# 🚀 PPO Fine-Tuning with a Custom Reward Function
This notebook shows how to fine-tune a language model using PPO from Hugging Face's `trl`, with a simple reward function.

In [1]:
# PPO Fine-Tuning of a Language Model with a Custom Reward Function

# 🛠️ Setup
!pip install -q transformers datasets trl accelerate bitsandbytes

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/376.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m368.6/376.2 kB[0m [31m17.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.2/376.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import userdata
userdata.get('HF_TOKEN')

In [6]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from datasets import load_dataset
import torch
# from peft import PeftModel
from transformers import pipeline

# ⚙️ Load model & tokenizer
# model_name = "tiiuae/falcon-1b"  # small model for quick test

model_name = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, load_in_8bit=True)#, device_map="auto")

# test original model
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "The purpose of life is"
outputs = pipe(prompt, max_new_tokens=50, do_sample=True, top_k=50, temperature=0.7)
print(outputs[0]["generated_text"])

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 188.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 18.12 MiB is free. Process 10133 has 14.72 GiB memory in use. Of the allocated memory 13.73 GiB is allocated by PyTorch, and 896.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# ⚙️ Load model & tokenizer

# model_name = "tiiuae/falcon-1b"  # small model for quick test
# tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
# tokenizer.pad_token = tokenizer.eos_token

# model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name, load_in_8bit=True, device_map="auto")

# 📄 Load small dataset
dataset = load_dataset("Abirate/english_quotes", split="train[:100]")

# 📈 Define a custom reward function
def simple_reward_fn(query, response):
    # Reward longer responses that contain "life"
    reward = 1.0 if "life" in response.lower() else -1.0
    reward += len(response) / 100.0  # small bonus for length
    return reward

# ⚙️ PPO Config
config = PPOConfig(
    model_name=model_name,
    learning_rate=1.41e-5,
    log_with=None,
    mini_batch_size=1,
    batch_size=2,
)

ppo_trainer = PPOTrainer(config, model, tokenizer)

# 🔁 Training loop
text_prompts = [f"What is the purpose of life?" for _ in range(100)]

for epoch, prompt in enumerate(text_prompts[:5]):  # Keep it short for test
    query_tensors = tokenizer(prompt, return_tensors="pt", padding=True).input_ids.to(model.device)

    response_tensors = model.generate(query_tensors, max_new_tokens=32, pad_token_id=tokenizer.pad_token_id)
    response_text = tokenizer.batch_decode(response_tensors[:, query_tensors.shape[-1]:], skip_special_tokens=True)

    rewards = [simple_reward_fn(prompt, response_text[0])]
    print(f"Epoch {epoch+1}: {response_text[0]} | Reward: {rewards[0]}")

    ppo_trainer.step([prompt], response_text, rewards)

In [None]:
# test RFTed model
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "The purpose of life is"
outputs = pipe(prompt, max_new_tokens=50, do_sample=True, top_k=50, temperature=0.7)
print(outputs[0]["generated_text"])

In [None]:
model.save_pretrained("rft_ppo_falcon7b")

In [None]:
from peft import PeftModel
from transformers import pipeline

# Load base + LoRA
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, "rft_ppo_falcon7b")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "The purpose of life is"
outputs = pipe(prompt, max_new_tokens=50, do_sample=True, top_k=50, temperature=0.7)
print(outputs[0]["generated_text"])