In [12]:
pip install torch transformers accelerate peft trl datasets bitsandbytes sentencepiece protobuf --quiet

Note: you may need to restart the kernel to use updated packages.


In [None]:
!unzip coqa_chatbot_lora.zip -d ./sft_lora

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

MODEL_NAME = "mistralai/Mistral-7B-v0.3"  # ‚úÖ base model

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [14]:
import torch, os
from transformers import BitsAndBytesConfig

# Base model name used in your SFT notebook
BASE_MODEL = "mistralai/Mistral-7B-v0.3"

# Path where SFT LoRA adapters were saved
SFT_ADAPTER_DIR = "coqa_chatbot_lora"

In [15]:
import torch

# Check for Metal Performance Shaders (Mac GPU)
if torch.backends.mps.is_available():
    device = "mps"
    print("üéØ Using Metal Performance Shaders (Mac GPU)")
elif torch.cuda.is_available():
    device = "cuda"
    print("üéØ Using CUDA GPU")
else:
    device = "cpu"
    print("‚ö†Ô∏è Using CPU (slower)")

print("Device:", device)

üéØ Using Metal Performance Shaders (Mac GPU)
Device: mps


In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from pathlib import Path

def load_sft_model(adapter_dir=SFT_ADAPTER_DIR, base_model=BASE_MODEL):
    """
    Loads the base model (without 4-bit quantization for Mac GPU compatibility)
    and applies the LoRA adapters saved from SFT.
    """
    print(f"Loading base model {base_model} and applying adapters from {adapter_dir}")
    tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token # Set the padding token
    
    # For Mac GPU, use torch_dtype=torch.float16 without 4-bit quantization
    base = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False,
    )
    # Load adapter from local directory
    model = PeftModel.from_pretrained(base, "./sft_lora", device_map="auto", torch_dtype=torch.float16)
    model.to(device)
    model.eval()
    return tokenizer, model

# quick test
tok, sft_model = load_sft_model()
print("‚úÖ Loaded SFT model + tokenizer")

Loading base model mistralai/Mistral-7B-v0.3 and applying adapters from coqa_chatbot_lora


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

‚úÖ Loaded SFT model + tokenizer


In [None]:
!unzip -o reward_model.zip -d ./reward_model


In [17]:

from transformers import AutoModelForSequenceClassification

reward_model = AutoModelForSequenceClassification.from_pretrained(
    "./reward_model",
    torch_dtype=torch.float16,
    device_map="auto"
)
reward_model.to(device)
reward_model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [18]:
from copy import deepcopy

# Instead of deepcopy (uses 2x RAM), use gradient checkpointing
ref_model = None  # Set to None to skip reference model comparison
# This saves ~13GB RAM since we don't duplicate the model

# Enable gradient checkpointing to reduce memory
sft_model.gradient_checkpointing_enable()


In [19]:
pip install trl==0.8.6 accelerate bitsandbytes


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [20]:
from trl import PPOTrainer, PPOConfig

ppo_config = PPOConfig(
    model_name=BASE_MODEL,
    learning_rate=1e-6,
    batch_size=1,  # Keep at 1 to minimize RAM
    mini_batch_size=1,
    gradient_accumulation_steps=1,
    ppo_epochs=1,  # Reduced from 4
    target_kl=6.0,
    log_with=None,
    use_score_scaling=False,  # Disable for lower memory
    use_score_norm=False,  # Disable for lower memory
)


In [21]:
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

# Wrap the SFT model with the value head required for PPO
model_with_value_head = AutoModelForCausalLMWithValueHead.from_pretrained(sft_model)
model_with_value_head.to(device)

ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model_with_value_head,       # trainable policy
    ref_model=ref_model,   # frozen reference
    tokenizer=tok,
)

# Explicitly set trainer device to avoid CUDA issues on Mac
ppo_trainer.current_device = device

In [22]:
ppo_trainer.reward_model = reward_model


In [23]:
import gc
import psutil

# Monitor RAM before cleanup
process = psutil.Process()
ram_before = process.memory_info().rss / 1024 / 1024 / 1024  # Convert to GB

print(f"RAM before cleanup: {ram_before:.2f} GB")

# 1. Delete the original sft_model to free memory (we have model_with_value_head now)
if 'sft_model' in locals():
    del sft_model
    
# 2. Delete reference model since we're not using it
if 'ref_model' in locals():
    del ref_model

# 3. Move models to CPU if not training yet (frees GPU RAM)
# reward_model.to("cpu")  # Uncomment only if you're done with reward computation

# 4. Force garbage collection
gc.collect()
torch.cuda.empty_cache() if torch.cuda.is_available() else None

ram_after = process.memory_info().rss / 1024 / 1024 / 1024
print(f"RAM after cleanup: {ram_after:.2f} GB")
print(f"Freed: {ram_before - ram_after:.2f} GB")


RAM before cleanup: 0.52 GB
RAM after cleanup: 0.76 GB
Freed: -0.25 GB


In [24]:
from tqdm import tqdm
import torch
import gc

# --- Load Dolly 15K dataset and prepare prompts for PPO training ---
from datasets import load_dataset
import random

# Load the Dolly dataset
print("Loading Databricks Dolly 15K dataset...")
ds_dolly = load_dataset("databricks/databricks-dolly-15k")

# Extract the 'instruction' field (used as prompts)
prompts = [d["instruction"].strip() for d in ds_dolly["train"] if d["instruction"].strip()]

# Randomly sample SMALL number of prompts to save RAM
random.seed(42)
ppo_prompts = random.sample(prompts, k=min(500, len(prompts)))  # REDUCED from 500!

# Delete dataset from memory after extracting prompts
del ds_dolly
gc.collect()

print(f"‚úÖ Loaded {len(ppo_prompts)} Dolly prompts for PPO training.")
print("Example prompt:", ppo_prompts[0])


for epoch in range(3):  # Reduced from 3
    for idx, prompt in enumerate(tqdm(ppo_prompts)):
        # 1Ô∏è‚É£ Tokenize prompt WITHOUT padding
        query_tokenized = tok(prompt, return_tensors="pt", truncation=True, max_length=128).input_ids.to(ppo_trainer.current_device)

        # 2Ô∏è‚É£ Generate response from the current policy
        with torch.no_grad():  # Disable gradients during generation
            response_tensors = ppo_trainer.model.generate(
                query_tokenized,
                max_new_tokens=32,  # REDUCED from 128!
                do_sample=True,
                top_p=0.95,
                top_k=50,
                use_cache=True,
            )
        
        # Extract only the newly generated tokens (remove the query part)
        response_only = response_tensors[:, query_tokenized.shape[-1]:]
        responses = [tok.decode(r, skip_special_tokens=True) for r in response_only]

        # 3Ô∏è‚É£ Compute reward using reward model
        combined_texts = [prompt + resp for resp in responses]
        inputs = tok(combined_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(reward_model.device)
        with torch.no_grad():
            reward_tensors = reward_model(**inputs).logits
        
        # Get reward for each sample in batch, then squeeze batch dim
        reward_tensors = reward_tensors.squeeze(-1)  # Remove logit dimension
        if reward_tensors.dim() > 1:
            reward_tensors = reward_tensors.squeeze(0)  # Remove batch dim if needed

        # 4Ô∏è‚É£ PPO optimization step - need to squeeze batch dim from tensors for step
        # PPOTrainer expects list of 1D tensors (tokens), not 2D
        query_for_ppo = [query_tokenized.squeeze(0)]  # Convert from [1, seq_len] to [seq_len]
        response_for_ppo = [response_only.squeeze(0)]  # Convert from [1, seq_len] to [seq_len]
        reward_for_ppo = [reward_tensors]  # Single reward value
        
        stats = ppo_trainer.step(query_for_ppo, response_for_ppo, reward_for_ppo)
        
        # log_stats expects a dict with 'query' and 'response' keys, and numeric rewards
        batch_dict = {"query": [prompt], "response": responses}
        ppo_trainer.log_stats(stats, batch_dict, [reward_tensors])
        
        # Cleanup every 5 iterations to prevent memory leaks
        if (idx + 1) % 5 == 0:
            gc.collect()
            torch.cuda.empty_cache() if torch.cuda.is_available() else None


Loading Databricks Dolly 15K dataset...
‚úÖ Loaded 500 Dolly prompts for PPO training.
Example prompt: What athlete created the 'beast quake' for the Seattle Seahawks?
‚úÖ Loaded 500 Dolly prompts for PPO training.
Example prompt: What athlete created the 'beast quake' for the Seattle Seahawks?


  0%|          | 0/500 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/500 [00:03<29:38,  3.56s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/500 [00:03<29:38,  3.56s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your 

In [29]:
save_dir = Path("./ppo_finetuned_chatbot")
save_dir.mkdir(parents=True, exist_ok=True)  # Create directory if it doesn't exist
model_with_value_head.save_pretrained(save_dir)
tok.save_pretrained(save_dir)
print(f"‚úÖ PPO fine-tuned model saved at {save_dir}")

‚úÖ PPO fine-tuned model saved at ppo_finetuned_chatbot
