In [12]:
# ==========================================
# STEP 0 ‚Äî Install Dependencies for Mac GPU
# ==========================================
!pip install torch transformers peft datasets accelerate tqdm sentencepiece protobuf



In [13]:
# ==========================================
# STEP 1 ‚Äî Imports and Base Setup for Mac GPU
# ==========================================
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from peft import PeftModel
from datasets import load_dataset
from tqdm import tqdm
import zipfile, os

# Detect Mac GPU (Metal Performance Shaders)
if torch.backends.mps.is_available():
    device = "mps"
    print("üéØ Using Metal Performance Shaders (M4 Pro GPU)")
elif torch.cuda.is_available():
    device = "cuda"
    print("üéØ Using CUDA GPU")
else:
    device = "cpu"
    print("‚ö†Ô∏è Using CPU (slower)")

print(f"Device: {device}")
BASE_MODEL = "mistralai/Mistral-7B-v0.3"

üéØ Using Metal Performance Shaders (M4 Pro GPU)
Device: mps


In [None]:

# ==========================================
# STEP 2 ‚Äî Unzip all models
# ==========================================
def unzip_model(zip_path, target_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    print(f"‚úÖ Extracted {zip_path} ‚Üí {target_dir}")

unzip_model("coqa_chatbot_lora.zip", "./sft_lora")
unzip_model("ppo_finetuned_chatbot_2.zip", "./ppo_model")
unzip_model("reward_model.zip", "./reward_model")


‚úÖ Extracted coqa_chatbot_lora.zip ‚Üí ./sft_lora
‚úÖ Extracted ppo_finetuned_chatbot.zip ‚Üí ./ppo_model
‚úÖ Extracted ppo_finetuned_chatbot.zip ‚Üí ./ppo_model
‚úÖ Extracted reward_model.zip ‚Üí ./reward_model
‚úÖ Extracted reward_model.zip ‚Üí ./reward_model


In [15]:
# ==========================================
# STEP 3 ‚Äî Load Tokenizer
# ==========================================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [16]:
# ==========================================
# STEP 4 ‚Äî Load Models (Optimized for Mac GPU)
# ==========================================

# For Mac GPU: use float16 instead of bfloat16 (not supported on MPS)
# Don't use device_map="auto" with .to(device) ‚Äî causes conflicts

# --- SFT model (policy before PPO)
sft_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, 
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
sft_model = PeftModel.from_pretrained(sft_base, "./sft_lora", torch_dtype=torch.float16)
sft_model = sft_model.to(device)
sft_model.eval()
print("‚úÖ SFT model loaded")

# --- PPO fine-tuned model (with correct nested path)
ppo_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, 
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# The PPO adapters are in a nested directory from zip extraction
ppo_adapter_path = "./ppo_model/ppo_finetuned_chatbot"

try:
    ppo_model = PeftModel.from_pretrained(ppo_base, ppo_adapter_path, torch_dtype=torch.float16)
    print(f"‚úÖ PPO model loaded from {ppo_adapter_path}")
except Exception as e:
    print(f"‚ö†Ô∏è PPO model loading failed: {e}")
    print("Using base model as fallback...")
    ppo_model = ppo_base

ppo_model = ppo_model.to(device)
ppo_model.eval()
print("‚úÖ PPO model ready on", device)

# --- Reward model
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "./reward_model", 
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
reward_model = reward_model.to(device)
reward_model.eval()
print("‚úÖ Reward model loaded on", device)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

‚úÖ SFT model loaded


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

‚úÖ PPO model loaded from ./ppo_model/ppo_finetuned_chatbot
‚úÖ PPO model ready on mps
‚úÖ PPO model ready on mps
‚úÖ Reward model loaded on mps
‚úÖ Reward model loaded on mps


In [17]:
# ==========================================
# STEP 5 ‚Äî Define Evaluation Function (Mac GPU Optimized)
# ==========================================
def compute_avg_reward(model, prompts, reward_model, tokenizer, num_samples=50, max_new_tokens=100):
    total_reward = 0
    count = 0

    for prompt in tqdm(prompts[:num_samples], desc="Evaluating"):
        try:
            # Move inputs to device
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs, 
                    max_new_tokens=max_new_tokens, 
                    do_sample=True, 
                    top_p=0.95, 
                    top_k=50
                )
            
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Combine prompt + response for reward model
            combined = prompt + response
            reward_inp = tokenizer(
                combined, 
                return_tensors="pt", 
                padding=True, 
                truncation=True,
                max_length=reward_model.config.max_position_embeddings
            ).to(reward_model.device)

            with torch.no_grad():
                reward_logits = reward_model(**reward_inp).logits
                reward = reward_logits.squeeze(-1).mean().item()

            total_reward += reward
            count += 1

        except Exception as e:
            print(f"‚ö†Ô∏è Skipping sample due to error: {e}")
            continue

    avg_reward = total_reward / max(1, count)
    return avg_reward

In [18]:

# ==========================================
# STEP 6 ‚Äî Prepare Prompts
# ==========================================
# Use a small prompt dataset (Anthropic or CoQA style)
dataset = load_dataset("Anthropic/hh-rlhf", split="train[:1%]")
prompts = [sample["chosen"][:512] for sample in dataset]


In [19]:

# ==========================================
# STEP 7 ‚Äî Evaluate Both Models
# ==========================================
print("üîπ Evaluating SFT model...")
sft_avg_reward = compute_avg_reward(sft_model, prompts, reward_model, tokenizer)

print("üîπ Evaluating PPO model...")
ppo_avg_reward = compute_avg_reward(ppo_model, prompts, reward_model, tokenizer)

print("\n================= üìä Results =================")
print(f"SFT model average reward: {sft_avg_reward:.4f}")
print(f"PPO model average reward: {ppo_avg_reward:.4f}")
print("================================================")

if ppo_avg_reward > sft_avg_reward:
    print("‚úÖ PPO model shows improved alignment!")
else:
    print("‚ö†Ô∏è PPO model reward not higher ‚Äî consider adjusting KL or reward scaling.")

üîπ Evaluating SFT model...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   2%|‚ñè         | 1/50 [00:17<14:32, 17.81s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   2%|‚ñè         | 1/50 [00:17<14:32, 17.81s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   4%|‚ñç         | 2/50 [00:26<09:45, 12.19s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   4%|‚ñç         | 2/50 [00:26<09:45, 12.19s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   6%|‚ñå

üîπ Evaluating PPO model...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   2%|‚ñè         | 1/50 [00:16<13:52, 16.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   2%|‚ñè         | 1/50 [00:16<13:52, 16.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   4%|‚ñç         | 2/50 [00:24<09:14, 11.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   4%|‚ñç         | 2/50 [00:24<09:14, 11.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   6%|‚ñå         | 3/50 [00:32<07:35,  9.69s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   6%|‚ñå         | 3/50 [00:32<07:35,  9.69s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Evaluating:   8%|‚ñä         | 4/50 [00:39<06:


SFT model average reward: -0.3420
PPO model average reward: -0.3339
‚úÖ PPO model shows improved alignment!



