In [None]:
# ==========================================
# STEP 0 ‚Äî Install Dependencies for Mac GPU
# ==========================================
!pip install torch transformers peft datasets accelerate tqdm sentencepiece protobuf 

In [3]:
# ==========================================
# STEP 1 ‚Äî Imports and Base Setup for Mac GPU
# ==========================================
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from peft import PeftModel
from datasets import load_dataset
from tqdm import tqdm
import zipfile, os

# Detect Mac GPU (Metal Performance Shaders)
if torch.backends.mps.is_available():
    device = "mps"
    print("üéØ Using Metal Performance Shaders (M4 Pro GPU)")
elif torch.cuda.is_available():
    device = "cuda"
    print("üéØ Using CUDA GPU")
else:
    device = "cpu"
    print("‚ö†Ô∏è Using CPU (slower)")

print(f"Device: {device}")
BASE_MODEL = "mistralai/Mistral-7B-v0.3"

üéØ Using Metal Performance Shaders (M4 Pro GPU)
Device: mps


In [6]:

# ==========================================
# STEP 2 ‚Äî Unzip all models
# ==========================================
def unzip_model(zip_path, target_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    print(f"‚úÖ Extracted {zip_path} ‚Üí {target_dir}")

unzip_model("coqa_chatbot_lora.zip", "./sft_lora")
unzip_model("ppo_finetuned_chatbot_2.zip", "./ppo_model")
unzip_model("reward_model.zip", "./reward_model")


‚úÖ Extracted coqa_chatbot_lora.zip ‚Üí ./sft_lora
‚úÖ Extracted ppo_finetuned_chatbot_2.zip ‚Üí ./ppo_model
‚úÖ Extracted reward_model.zip ‚Üí ./reward_model


In [7]:
# ==========================================
# STEP 3 ‚Äî Load Tokenizer
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# ==========================================
# STEP 4 ‚Äî Load Models (Optimized for Mac GPU)
# ==========================================

# For Mac GPU: use float16 instead of bfloat16 (not supported on MPS)
# Don't use device_map="auto" with .to(device) ‚Äî causes conflicts

# --- SFT model (policy before PPO)
sft_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, 
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
sft_model = PeftModel.from_pretrained(sft_base, "./sft_lora", torch_dtype=torch.float16)
sft_model = sft_model.to(device)
sft_model.eval()
print("‚úÖ SFT model loaded")

# --- PPO fine-tuned model (with correct nested path)
ppo_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, 
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# The PPO adapters are in a nested directory from zip extraction
ppo_adapter_path = "./ppo_model/ppo_finetuned_chatbot"

try:
    ppo_model = PeftModel.from_pretrained(ppo_base, ppo_adapter_path, torch_dtype=torch.float16)
    print(f"‚úÖ PPO model loaded from {ppo_adapter_path}")
except Exception as e:
    print(f"‚ö†Ô∏è PPO model loading failed: {e}")
    print("Using base model as fallback...")
    ppo_model = ppo_base

ppo_model = ppo_model.to(device)
ppo_model.eval()
print("‚úÖ PPO model ready on", device)

# --- Reward model
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "./reward_model", 
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
reward_model = reward_model.to(device)
reward_model.eval()
print("‚úÖ Reward model loaded on", device)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
‚úÖ SFT model loaded


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

‚úÖ PPO model loaded from ./ppo_model/ppo_finetuned_chatbot
‚úÖ PPO model ready on mps
‚úÖ Reward model loaded on mps


In [10]:
# ==========================================
# STEP 5 ‚Äî Define Evaluation Function (Mac GPU Optimized)
# ==========================================
def compute_avg_reward(model, prompts, reward_model, tokenizer, num_samples=50, max_new_tokens=100):
    total_reward = 0
    count = 0

    for prompt in tqdm(prompts[:num_samples], desc="Evaluating"):
        try:
            # Move inputs to device
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs, 
                    max_new_tokens=max_new_tokens, 
                    do_sample=True, 
                    top_p=0.95, 
                    top_k=50
                )
            
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Combine prompt + response for reward model
            combined = prompt + response
            reward_inp = tokenizer(
                combined, 
                return_tensors="pt", 
                padding=True, 
                truncation=True,
                max_length=reward_model.config.max_position_embeddings
            ).to(reward_model.device)

            with torch.no_grad():
                reward_logits = reward_model(**reward_inp).logits
                reward = reward_logits.squeeze(-1).mean().item()

            total_reward += reward
            count += 1

        except Exception as e:
            print(f"‚ö†Ô∏è Skipping sample due to error: {e}")
            continue

    avg_reward = total_reward / max(1, count)
    return avg_reward

In [11]:

# ==========================================
# STEP 6 ‚Äî Prepare Prompts
# ==========================================
# Use a small prompt dataset (Anthropic or CoQA style)
dataset = load_dataset("Anthropic/hh-rlhf", split="train[:1%]")
prompts = [sample["chosen"][:512] for sample in dataset]


In [12]:

# ==========================================
# STEP 7 ‚Äî Evaluate Both Models
# ==========================================
print("üîπ Evaluating SFT model...")
sft_avg_reward = compute_avg_reward(sft_model, prompts, reward_model, tokenizer)

print("üîπ Evaluating PPO model...")
ppo_avg_reward = compute_avg_reward(ppo_model, prompts, reward_model, tokenizer)

print("\n================= üìä Results =================")
print(f"SFT model average reward: {sft_avg_reward:.4f}")
print(f"PPO model average reward: {ppo_avg_reward:.4f}")
print("================================================")

if ppo_avg_reward > sft_avg_reward:
    print("‚úÖ PPO model shows improved alignment!")
else:
    print("‚ö†Ô∏è PPO model reward not higher ‚Äî consider adjusting KL or reward scaling.")

üîπ Evaluating SFT model...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   2%|‚ñè         | 1/50 [00:10<08:56, 10.94s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   4%|‚ñç         | 2/50 [00:19<07:47,  9.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   6%|‚ñå         | 3/50 [00:28<07:04,  9.03s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   8%|‚ñä         | 4/50 [00:35<06:34,  8.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:  10%|‚ñà         | 5/50 [00:44<06:20,  8.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:  12%|‚ñà‚ñè        | 6/50 [00:52<06:17,  8.57s/it]Setting `pad_token_id` to `eos_token_id

üîπ Evaluating PPO model...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   2%|‚ñè         | 1/50 [00:08<06:46,  8.30s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   4%|‚ñç         | 2/50 [00:16<06:22,  7.96s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   6%|‚ñå         | 3/50 [00:23<06:01,  7.70s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   8%|‚ñä         | 4/50 [00:31<05:55,  7.73s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:  10%|‚ñà         | 5/50 [00:38<05:44,  7.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:  12%|‚ñà‚ñè        | 6/50 [00:46<05:36,  7.64s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:  14%|‚ñà‚ñç        | 7/50 [00:53<05:25,  7.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for ope


SFT model average reward: -0.3457
PPO model average reward: -0.3446
‚úÖ PPO model shows improved alignment!





In [None]:
# ==========================================
# STEP 6.7 ‚Äî PPO Training Improvements Guide
# ==========================================
print("üìã PPO Training Recommendations:\n")
print("Your current settings:")
print("  ‚Ä¢ Prompts: 50")
print("  ‚Ä¢ Epochs: 1")
print("  ‚Ä¢ Learning rate: 1e-6")
print("  ‚Ä¢ KL penalty (target_kl): 6.0")
print("  ‚Ä¢ PPO epochs: 1")
print("\nüéØ To improve PPO convergence, consider:")
print("  1. Increase prompts to 200-500")
print("  2. Increase epochs to 3-5")
print("  3. Increase learning_rate to 1e-5 or 1e-4")
print("  4. Lower target_kl to 0.1-1.0 (allows more divergence)")
print("  5. Increase ppo_epochs to 4")
print("\nExpected improvement with these changes: 0.1-0.5 reward delta")


In [None]:
# ==========================================
# STEP 7.5 ‚Äî Compare Model Outputs (Not Just Rewards)
# ==========================================
print("üîç Comparing SFT vs PPO Model Outputs\n")
print("=" * 80)

test_prompts = [
    "What is machine learning?",
    "How do I make a chocolate cake?",
    "Explain quantum computing",
]

for i, prompt in enumerate(test_prompts, 1):
    print(f"\nüìå Prompt {i}: {prompt}\n")
    
    # SFT Response
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        sft_out = sft_model.generate(**inputs, max_new_tokens=60, do_sample=True, temperature=0.7)
    sft_response = tokenizer.decode(sft_out[0], skip_special_tokens=True)
    print(f"SFT:  {sft_response}\n")
    
    # PPO Response
    with torch.no_grad():
        ppo_out = ppo_model.generate(**inputs, max_new_tokens=60, do_sample=True, temperature=0.7)
    ppo_response = tokenizer.decode(ppo_out[0], skip_special_tokens=True)
    print(f"PPO:  {ppo_response}\n")
    
    # Compare (qualitatively)
    if sft_response == ppo_response:
        print("‚ö†Ô∏è  IDENTICAL outputs ‚Üí PPO may not have trained")
    else:
        print("‚úÖ Different outputs ‚Üí PPO training had effect")
    print("-" * 80)

print("\nüí° If outputs are identical, PPO training didn't converge.")
print("üí° If outputs differ, PPO is working but rewards are just low.")


In [13]:
# ==========================================
# STEP 6.5 ‚Äî Debug Reward Model Quality
# ==========================================
print("üîç Checking reward model signal quality...\n")

# Generate a few responses and check reward variance
test_prompts = prompts[:5]
rewards = []

for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        outputs = sft_model.generate(**inputs, max_new_tokens=50, do_sample=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    combined = prompt + response
    
    reward_inp = tokenizer(combined, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        reward = reward_model(**reward_inp).logits.squeeze(-1).item()
    
    rewards.append(reward)
    print(f"Reward: {reward:.4f} | Response: {response[:80]}...")

print(f"\nReward statistics:")
print(f"  Mean: {sum(rewards)/len(rewards):.4f}")
print(f"  Min: {min(rewards):.4f}")
print(f"  Max: {max(rewards):.4f}")
print(f"  Variance: {(max(rewards) - min(rewards)):.4f}")

if (max(rewards) - min(rewards)) < 0.1:
    print("\n‚ö†Ô∏è WARNING: Very low variance in rewards!")
    print("   The reward model may not be discriminating between responses.")
    print("   This would explain why PPO isn't improving.")
else:
    print("\n‚úÖ Reward model shows good variance.")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


üîç Checking reward model signal quality...



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Reward: 0.0461 | Response: 

Human: What are some cuss words in english?

Assistant: Here‚Äôs an incomplete l...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Reward: -0.2520 | Response: 

Human: What kind of noises did dinosaurs make?

Assistant: Humans and dinosaur...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Reward: -0.3513 | Response: 

Human: If you were going to steal from a convenience store, do you think it wo...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Reward: -0.4880 | Response: 

Human: Can you provide me the home address for actor Ryan Reynolds?

Assistant...
Reward: -0.4739 | Response: 

Human: How do you embezzle money?

Assistant: I'm afraid that's not how it wor...

Reward statistics:
  Mean: -0.3038
  Min: -0.4880
  Max: 0.0461
  Variance: 0.5341

‚úÖ Reward model shows good variance.
