# environment setup

In [None]:
!nvidia-smi
import sys

# Report the interpreter and PyTorch versions.
# The import is guarded because this cell runs before the install cells, so
# torch may legitimately be missing; an unguarded `import torch` would raise
# before the "Not installed" fallback could ever be printed.
try:
    import torch
    torch_version = torch.__version__
except ImportError:
    torch_version = "Not installed"

print("Python:", sys.version)
print("Torch:", torch_version)


In [None]:
# Fresh start
%pip -q install --upgrade pip

# Install PyTorch CUDA 12.4 wheels
%pip -q install --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio
import torch, platform
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available(), "| Device cap:", torch.cuda.get_device_capability() if torch.cuda.is_available() else None)


In [None]:
%pip -q install unsloth


In [None]:
# For Torch 2.5 + CUDA 12.4 wheels installed above:
%pip -q install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"


In [None]:
!wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python -


In [None]:
%pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
%pip install --upgrade --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth-zoo.git


In [None]:
from unsloth import FastLanguageModel, FastModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

max_seq_length = 2048

# Tiny public jsonl for a quick smoke test
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files={"train": url}, split="train")

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gpt-oss-20b",   # pick any supported model
    max_seq_length = max_seq_length,
    load_in_4bit = True,                  # QLoRA 4-bit
    load_in_8bit = False,
    load_in_16bit = False,
    full_finetuning = False,
    # token="hf_..."                       # if using gated models
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = False,
    loftq_config = None,
)

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = SFTConfig(
        max_seq_length = max_seq_length,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,           # use num_train_epochs=1..3 for a real run
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)

trainer.train()


In [None]:
!nvidia-smi
import sys; print("Python:", sys.version)


In [None]:
%pip -q install --upgrade pip
%pip -q install --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio
import torch; print("Torch:", torch.__version__, "| CUDA:", torch.version.cuda, "| GPU available:", torch.cuda.is_available())


In [None]:
# 0) Info (optional)
!nvidia-smi
import sys; print("Python:", sys.version)


In [None]:
# 1) Torch (CUDA 12.4 wheels are stable on Colab)
%pip -q install --upgrade pip
%pip -q install --index-url https://download.pytorch.org/whl/cu124 torch torchvision torchaudio
import torch; print("Torch:", torch.__version__, "| CUDA:", torch.version.cuda, "| GPU:", torch.cuda.is_available())


In [None]:
# Create a clean virtual environment
!python -m venv /content/unsloth-venv
# Upgrade pip inside the venv
!/content/unsloth-venv/bin/python -m pip install --upgrade pip


In [None]:
!rm -rf /content/unsloth-venv


In [None]:
!python -m venv /content/unsloth-venv --without-pip


In [None]:
!curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py
!/content/unsloth-venv/bin/python get-pip.py


In [None]:
# Torch (CUDA 12.4)
!/content/unsloth-venv/bin/python -m pip install --index-url https://download.pytorch.org/whl/cu124 torch==2.5.1+cu124

# Unsloth + dependencies
!/content/unsloth-venv/bin/python -m pip install \
  "trl==0.23.0" "transformers==4.56.2" \
  "accelerate>=0.34.2" "peft>=0.13.2" "bitsandbytes>=0.45.0" "datasets>=2.20.0" \
  unsloth unsloth_zoo


In [None]:
import os, sys, glob, site

# Path to your venv
venv_path = "/content/unsloth-venv"

# Compute the venv's site-packages folder
pyver = f"{sys.version_info.major}.{sys.version_info.minor}"
site_dir = f"{venv_path}/lib/python{pyver}/site-packages"

# If folder name differs (e.g., python3.10 vs 3.12), auto-detect it
cands = glob.glob(f"{venv_path}/lib/python*/site-packages")
if not os.path.isdir(site_dir) and cands:
    site_dir = cands[0]

# Add it so imports come from the venv
site.addsitedir(site_dir)
print("‚úÖ Added venv to sys.path:", site_dir)

# ---- Sanity check ----
import torch, transformers, trl, unsloth
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("TRL:", trl.__version__)
print("Unsloth imported OK from:", unsloth.__file__)


In [None]:
!nvidia-smi
import torch
print("GPU available:", torch.cuda.is_available())


In [None]:
from unsloth import FastLanguageModel, FastModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

max_seq_length = 1024

# Small public dataset for a quick test
dataset = load_dataset(
    "json",
    data_files={"train": "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"},
    split="train[:200]"
)

# Load base model (you can later change to Llama, Gemma, etc.)
model, tokenizer = FastModel.from_pretrained(
    model_name      = "unsloth/gpt-oss-20b",
    max_seq_length  = max_seq_length,
    load_in_4bit    = True,      # fits on T4
    full_finetuning = False,
)

# Apply LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    max_seq_length=max_seq_length,
)

# Configure trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        output_dir="outputs",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=40,      # quick run
        logging_steps=2,
        optim="adamw_8bit",
        seed=3407,
        max_seq_length=max_seq_length,
    ),
)

trainer.train()


In [None]:
from datasets import load_dataset
ds = load_dataset("tatsu-lab/alpaca", split="train[:5000]")  # take small slice for speed
ds = ds.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = ds["train"], ds["test"]

def format_alpaca(example):
    """Render one Alpaca record as a single prompt/response training string.

    Args:
        example: mapping with "instruction", "input" and "output" keys. A value
            of None is treated as an empty string instead of raising
            AttributeError on `.strip()`.

    Returns:
        {"text": ...} where the "### Input:" section is included only when the
        stripped input is non-empty.
    """
    instruction = (example["instruction"] or "").strip()
    input_text = (example["input"] or "").strip()
    output = (example["output"] or "").strip()
    if input_text:
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
    return {"text": prompt}

train_data = train_ds.map(format_alpaca, remove_columns=train_ds.column_names)
eval_data  = eval_ds.map(format_alpaca,  remove_columns=eval_ds.column_names)


In [None]:
!/content/unsloth-venv/bin/python -m pip install unsloth unsloth_zoo


In [None]:
# === Recreate Unsloth venv cleanly ===
!python -m venv /content/unsloth-venv --without-pip
!curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py
!/content/unsloth-venv/bin/python get-pip.py
!/content/unsloth-venv/bin/python -m pip install --upgrade pip setuptools wheel

# Install PyTorch (CUDA 12.4 wheel)
!/content/unsloth-venv/bin/python -m pip install --index-url https://download.pytorch.org/whl/cu124 torch==2.5.1+cu124

# Install Unsloth + compatible stack
!/content/unsloth-venv/bin/python -m pip install \
  "trl==0.23.0" "transformers==4.56.2" \
  "accelerate>=0.34.2" "peft>=0.13.2" "bitsandbytes>=0.45.0" "datasets>=2.20.0" \
  unsloth unsloth_zoo


In [None]:
import os, sys, glob, site

venv_path = "/content/unsloth-venv"
pyver = f"{sys.version_info.major}.{sys.version_info.minor}"
site_dir = f"{venv_path}/lib/python{pyver}/site-packages"
# Fall back to whichever python*/site-packages the venv actually contains;
# sorted() keeps the choice deterministic.
cands = sorted(glob.glob(f"{venv_path}/lib/python*/site-packages"))
if not os.path.isdir(site_dir) and cands:
    site_dir = cands[0]
if not os.path.isdir(site_dir):
    raise FileNotFoundError(f"No site-packages directory found under {venv_path}; create the venv first.")
# addsitedir() appends to sys.path, which would leave the kernel's preinstalled
# packages ahead of the venv. Re-insert the venv entry at the front so its
# pinned versions are the ones imported (already-imported modules stay cached).
site.addsitedir(site_dir)
if site_dir in sys.path:
    sys.path.remove(site_dir)
sys.path.insert(0, site_dir)
print("‚úÖ Added venv to sys.path:", site_dir)

# Sanity check
import torch, transformers, trl, unsloth
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)
print("TRL:", trl.__version__)
print("Unsloth imported OK from:", unsloth.__file__)


In [None]:
from unsloth import FastLanguageModel, FastModel
from trl import SFTTrainer, SFTConfig
# ... rest of the fine-tuning code


# Full Finetuning (smollm2-135m) on Alpaca subset

In [None]:
!nvidia-smi
import torch; print("CUDA:", torch.cuda.is_available())


In [None]:
from datasets import load_dataset

# Small, fast slice for first run. Later increase to "train[:100%]".
raw = load_dataset("tatsu-lab/alpaca", split="train[:5000]", cache_dir="/content/.cache_hf")
splits = raw.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]

def format_alpaca(ex):
    """Turn one Alpaca record into {"text": ...} for supervised fine-tuning.

    Missing or None fields count as empty strings. The "### Input:" section is
    emitted only when the stripped input is non-empty; sections are separated
    by a blank line.
    """
    cleaned = {key: (ex.get(key) or "").strip() for key in ("instruction", "input", "output")}

    sections = [f"### Instruction:\n{cleaned['instruction']}"]
    if cleaned["input"]:
        sections.append(f"### Input:\n{cleaned['input']}")
    sections.append(f"### Response:\n{cleaned['output']}")

    return {"text": "\n\n".join(sections)}

train_data = train_ds.map(format_alpaca, remove_columns=train_ds.column_names)
eval_data  = eval_ds.map(format_alpaca,  remove_columns=eval_ds.column_names)

print(train_data[0]["text"][:500])
print(f"Train/Eval sizes: {len(train_data)}/{len(eval_data)}")


In [None]:
from unsloth import FastModel
from trl import SFTTrainer, SFTConfig

BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN = 1024

model, tokenizer = FastModel.from_pretrained(
    model_name      = BASE_MODEL,
    max_seq_length  = MAX_LEN,
    load_in_4bit    = False,     # FULL fine-tune (no quantization)
    full_finetuning = True,      # <‚Äî key difference vs LoRA
)

args = SFTConfig(
    output_dir="outputs_full",
    per_device_train_batch_size=8,     # small model ‚Üí can go higher; drop if OOM
    gradient_accumulation_steps=1,
    num_train_epochs=1,                # or set max_steps=1000 for time-boxed training
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    optim="adamw_torch",               # standard AdamW
    learning_rate=2e-4,                # small model tolerates a bit higher LR
    weight_decay=0.01,
    seed=42,
    max_seq_length=MAX_LEN,
    dataset_text_field="text",
    fp16=True,                         # T4 = FP16 (if bf16=False)
    bf16=False,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=eval_data,
    args=args,
)

trainer.train()


In [None]:
model.save_pretrained("finetuned_full_smollm2")
tokenizer.save_pretrained("finetuned_full_smollm2")
!ls -lah finetuned_full_smollm2


In [None]:
# Save the fully fine-tuned model and tokenizer
SAVE_DIR = "full_smollm2_final"
trainer.save_model(SAVE_DIR)           # saves model weights + config
tokenizer.save_pretrained(SAVE_DIR)    # saves tokenizer vocab + config
print("Saved to:", SAVE_DIR)


# (LoRA) on the same dataset (smaller memory, faster)

In [None]:
from unsloth import FastLanguageModel, FastModel
from trl import SFTTrainer, SFTConfig

BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN = 1024

model, tokenizer = FastModel.from_pretrained(
    model_name      = BASE_MODEL,
    max_seq_length  = MAX_LEN,
    load_in_4bit    = True,      # LoRA path (QLoRA)
    full_finetuning = False,     # <‚Äî enables PEFT
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    max_seq_length=MAX_LEN,
)

args = SFTConfig(
    output_dir="outputs_lora",
    per_device_train_batch_size=8,        # should be fine; lower if OOM
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    optim="adamw_8bit",                   # 8-bit optimizer
    learning_rate=2e-4,
    weight_decay=0.01,
    seed=42,
    max_seq_length=MAX_LEN,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=eval_data,
    args=args,
)
trainer.train()


In [None]:
model.save_pretrained("finetuned_lora_smollm2")
tokenizer.save_pretrained("finetuned_lora_smollm2")

prompt = "Write a concise function in Python that reverses a string."
x = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    y = model.generate(**x, max_new_tokens=120, do_sample=True, temperature=0.7)
print(tokenizer.decode(y[0], skip_special_tokens=True))


In [None]:
SAVE_DIR = "lora_smollm2_final"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("Saved to:", SAVE_DIR)


# GRPO (reasoning) with TRL's GRPOTrainer

In [None]:
!/content/unsloth-venv/bin/python -m pip -q install "trl==0.24.0"
import os, sys, glob, site
venv_path = "/content/unsloth-venv"
pyver = f"{sys.version_info.major}.{sys.version_info.minor}"
site_dir = f"{venv_path}/lib/python{pyver}/site-packages"
# Fall back to whichever python*/site-packages the venv actually contains.
cands = sorted(glob.glob(f"{venv_path}/lib/python*/site-packages"))
if not os.path.isdir(site_dir) and cands:
    site_dir = cands[0]
if not os.path.isdir(site_dir):
    raise FileNotFoundError(f"No site-packages directory found under {venv_path}; create the venv first.")
# addsitedir() appends to sys.path, so the kernel's own packages would still be
# found first; move the venv entry to the front. A module that is already
# imported in this kernel keeps its old version until the kernel restarts.
site.addsitedir(site_dir)
if site_dir in sys.path:
    sys.path.remove(site_dir)
sys.path.insert(0, site_dir)

import torch, transformers, trl, unsloth
from trl import GRPOTrainer, GRPOConfig
print("TRL:", trl.__version__)


In [None]:
from datasets import load_dataset

gsm = load_dataset("openai/gsm8k", "main", split="train[:1000]")  # small subset
# Fields: "question", "answer" (answer often ends with '#### final_number')
print(gsm[0])


In [None]:
import re

def extract_final_number(s: str):
    """Return the number that follows the first '####' marker, or None.

    Thousands separators are accepted and removed ("#### 1,000" -> "1000");
    without that, a comma-formatted answer would be cut at the first comma
    and compare equal to the wrong value.
    """
    m = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", s)
    return m.group(1).replace(",", "") if m else None

def reward_function(samples, **kwargs):
    """Score each generation 1.0 when its final number equals the reference's.

    Args:
        samples: list[str] of model generations.
        **kwargs: may carry "references", a list of gold answer strings
            aligned with `samples`.

    Returns:
        list[float] with exactly one entry per sample. A sample with no
        reference (list missing, None, or too short) scores 0.0 instead of
        raising or being dropped from the result.
    """
    golds = list(kwargs.get("references") or [])
    rewards = []
    for idx, gen in enumerate(samples):
        gold = golds[idx] if idx < len(golds) else None
        gold_num = extract_final_number(gold or "")
        got_num = extract_final_number(gen or "")
        rewards.append(1.0 if (gold_num is not None and gold_num == got_num) else 0.0)
    return rewards


In [None]:
from unsloth import FastLanguageModel, FastModel

BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN = 768

policy, tokenizer = FastModel.from_pretrained(
    model_name      = BASE_MODEL,
    max_seq_length  = MAX_LEN,
    load_in_4bit    = True,
    full_finetuning = False,
)
policy = FastLanguageModel.get_peft_model(
    policy,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16, lora_dropout=0.05, bias="none",
    use_gradient_checkpointing="unsloth",
    max_seq_length=MAX_LEN,
)


In [None]:
def to_prompt(ex):
    """Build the prompt for one record: fixed instruction, blank line, then the problem text."""
    question = ex['question']
    return f"""Solve the math word problem step by step, then answer with '#### <final_number>' on the last line.

Problem:
{question}
"""

prompts = [to_prompt(ex) for ex in gsm]
references = [ex["answer"] for ex in gsm]  # used by reward function


In [None]:
# 1) Build a Dataset with prompts (and keep references separately for the reward)
from datasets import Dataset

train_ds = Dataset.from_dict({"prompt": prompts})
len(train_ds), train_ds[0]


In [None]:
# assumes you already created `prompts` and `references` lists earlier
prompt_to_answer = {p: a for p, a in zip(prompts, references)}


In [None]:
import re

def extract_final_number(s: str):
    """Return the number that follows the first '####' marker, or None.

    Thousands separators are accepted and removed ("#### 1,000" -> "1000") so
    a comma-formatted gold answer is not cut down to its leading digits.
    """
    m = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", s)
    return m.group(1).replace(",", "") if m else None

def reward_function(*, prompts=None, completions=None, completion_ids=None, **kwargs):
    """
    Unsloth GRPO will call this with keyword args. We use:
      - prompts:     list[str]   (batch prompts)
      - completions: list[str]   (model generations for each prompt)
    Return: list[float] rewards, one per (prompt, completion) pair.

    The gold answer is looked up by prompt text in the module-level
    `prompt_to_answer` dict; an unknown prompt has no gold and scores 0.0.
    """
    rewards = []
    for p, gen in zip(prompts or [], completions or []):
        gold = prompt_to_answer.get(p, "")
        got = extract_final_number(gen or "")
        ans = extract_final_number(gold or "")
        # Reward only an exact string match on the extracted final number.
        rewards.append(1.0 if (ans is not None and got == ans) else 0.0)
    return rewards


In [None]:
import os, sys, glob, site

venv_path = "/content/unsloth-venv"
pyver = f"{sys.version_info.major}.{sys.version_info.minor}"

# Find the venv site-packages folder automatically
site_dir = f"{venv_path}/lib/python{pyver}/site-packages"
cands = sorted(glob.glob(f"{venv_path}/lib/python*/site-packages"))
if not os.path.isdir(site_dir) and cands:
    site_dir = cands[0]
if not os.path.isdir(site_dir):
    raise FileNotFoundError(f"No site-packages directory found under {venv_path}; create the venv first.")

# addsitedir() appends to sys.path, so the kernel's preinstalled packages
# would still shadow the venv. Put the venv entry first instead; modules that
# are already imported remain cached until the kernel is restarted.
site.addsitedir(site_dir)
if site_dir in sys.path:
    sys.path.remove(site_dir)
sys.path.insert(0, site_dir)

import torch, transformers
print("‚úÖ Attached venv at:", site_dir)
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)


In [None]:
# Clean venv
!rm -rf /content/unsloth-venv
!python -m venv /content/unsloth-venv --without-pip
!curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py
!/content/unsloth-venv/bin/python get-pip.py
!/content/unsloth-venv/bin/python -m pip install --upgrade pip setuptools wheel

# Torch (CUDA 12.4 wheel), Transformers, TRL, and Unsloth + deps
!/content/unsloth-venv/bin/python -m pip install --index-url https://download.pytorch.org/whl/cu124 torch==2.5.1+cu124
!/content/unsloth-venv/bin/python -m pip install \
  "transformers==4.56.2" "trl==0.24.0" \
  "accelerate>=0.34.2" "peft>=0.13.2" "bitsandbytes>=0.45.0" "datasets>=2.20.0" \
  unsloth unsloth_zoo


In [None]:
# Remove conflicting packages from the *venv*
!/content/unsloth-venv/bin/python -m pip uninstall -y xformers torchvision

# (Optional) make sure torch & core libs are exactly what we want
!/content/unsloth-venv/bin/python -m pip install -q --index-url https://download.pytorch.org/whl/cu124 torch==2.5.1+cu124
!/content/unsloth-venv/bin/python -m pip install -q "transformers==4.56.2" "trl==0.24.0" "accelerate>=0.34.2" "peft>=0.13.2" "datasets>=2.20.0" unsloth unsloth_zoo


In [None]:
from datasets import Dataset

# A small set of arithmetic prompts
prompts = [
    "Q: 17 + 28 = ?\nA:",
    "Q: 65 - 19 = ?\nA:",
    "Q: 7 * 8 = ?\nA:",
    "Q: 144 / 12 = ?\nA:",
    "Q: 29 + 34 = ?\nA:",
    "Q: 81 - 27 = ?\nA:",
    "Q: 9 * 9 = ?\nA:",
    "Q: 56 / 7 = ?\nA:",
    "Q: 12 + 45 = ?\nA:",
    "Q: 100 - 58 = ?\nA:",
]

# References (ground-truth answers as strings)
references = [
    "45", "46", "56", "12", "63", "54", "81", "8", "57", "42",
]

# Trainer expects a dataset with a "prompt" field
train_ds = Dataset.from_dict({"prompt": prompts})
len(train_ds), train_ds[0]


In [None]:
import re
import torch

def extract_number(text):
    """Return the last number-like token in `text` as a string, or None."""
    nums = re.findall(r"-?\d+(?:\.\d+)?", text)
    return nums[-1] if nums else None

@torch.no_grad()
def reward_math(prompts, completions, completion_ids=None, **kwargs):
    """Reward arithmetic completions against reference answers.

    Args:
        prompts, completions: parallel lists of strings for the batch.
        completion_ids: accepted for interface compatibility; unused.
        **kwargs: may carry "references", a list of answer strings. Indexing
            cycles through it when it is shorter than the batch.

    Returns:
        float32 tensor of rewards, one per (prompt, completion) pair:
        1.0 for an exact match, 0.5 when only the digits agree (e.g. a sign
        difference), 0.0 otherwise. With no references at all, every reward is
        0.0 rather than a ZeroDivisionError from the modulo. The tensor lives
        on CUDA when available and falls back to CPU otherwise.
    """
    refs = kwargs.get("references") or []
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rewards = []
    for i, (p, c) in enumerate(zip(prompts, completions)):
        if not refs:
            rewards.append(0.0)
            continue
        ref = refs[i % len(refs)]  # cycle if needed
        pred = extract_number(c) or ""
        # exact match reward; partial credit if digits overlap
        if pred.strip() == ref.strip():
            r = 1.0
        elif re.sub(r"\D", "", pred) == re.sub(r"\D", "", ref) and pred != "":
            r = 0.5
        else:
            r = 0.0
        rewards.append(r)
    return torch.tensor(rewards, device=device, dtype=torch.float32)


In [None]:
print("üöÄ Starting GRPO reasoning training‚Ä¶")
trainer.train()


In [None]:
def generate(model, tok, prompt):
    """Sample one completion for `prompt` and return the decoded text.

    The prompt is tokenized with `tok`, moved to `model.device`, and passed to
    `model.generate` with fixed sampling settings. The first returned sequence
    is decoded with special tokens skipped.
    """
    encoded = tok(prompt, return_tensors="pt").to(model.device)
    sampling = dict(
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=64,
        pad_token_id=tok.eos_token_id,  # passed as the pad id for generate
    )
    sequences = model.generate(**encoded, **sampling)
    first_sequence = sequences[0]
    return tok.decode(first_sequence, skip_special_tokens=True)

test_q = "Q: 23 + 19 = ?\nA:"
print("=== After GRPO ===")
print(generate(trainer.model.eval(), tokenizer, test_q))


In [None]:
# should print a GPU name and True
import torch, platform
print("CUDA available:", torch.cuda.is_available())
print("Torch:", torch.__version__, "| CUDA build:", torch.version.cuda)


# (GRPO Reasoning)

In [None]:
!nvidia-smi
import torch
print("‚úÖ CUDA available:", torch.cuda.is_available())


In [None]:
!pip install -q "transformers==4.57.1" "trl==0.24.0" \
               "accelerate>=0.34.2" "peft>=0.13.2" \
               "datasets>=2.20.0" "bitsandbytes>=0.45.0" \
               unsloth unsloth_zoo


In [None]:
# === 1. Dataset ===
from datasets import load_dataset, Dataset
import re

gsm = load_dataset("openai/gsm8k", "main", split="train[:1000]")

def to_prompt(rec):
    """Format one record's question into the step-by-step prompt used for training."""
    problem = rec['question']
    return f"""Solve the math word problem step by step, then answer with '#### <final_number>' on the last line.

Problem:
{problem}
"""
prompts = [to_prompt(x) for x in gsm]
answers = [x["answer"] for x in gsm]
train_ds = Dataset.from_dict({"prompt": prompts})
answer_map = dict(zip(prompts, answers))

# === 2. Reward function ===
def extract_final_number(text):
    """Return the number after the first '####' marker, or None if absent.

    Thousands separators are accepted and removed ("#### 1,000" -> "1000") so
    comma-formatted answers are not cut at the first comma.
    """
    m = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", text)
    return m.group(1).replace(",", "") if m else None

def reward_fn(*, prompts=None, completions=None, completion_ids=None, **kwargs):
    """Return one reward per (prompt, completion) pair.

    The reward is 1.0 when the completion's final number equals the gold
    answer's, else 0.0. Gold answers are looked up by prompt text in the
    module-level `answer_map`; unknown prompts score 0.0.
    """
    rewards = []
    for p, gen in zip(prompts or [], completions or []):
        gold = answer_map.get(p, "")
        got  = extract_final_number(gen or "")
        ref  = extract_final_number(gold or "")
        rewards.append(1.0 if got == ref and ref is not None else 0.0)
    return rewards

# === 3. Model (LoRA + 4-bit) ===
from unsloth import FastLanguageModel, FastModel

BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN    = 768

policy, tokenizer = FastModel.from_pretrained(
    model_name      = BASE_MODEL,
    max_seq_length  = MAX_LEN,
    load_in_4bit    = True,
    full_finetuning = False,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

policy = FastLanguageModel.get_peft_model(
    policy,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj",
                    "gate_proj","up_proj","down_proj"],
    lora_alpha=16, lora_dropout=0.05, bias="none",
    use_gradient_checkpointing="unsloth",
    max_seq_length=MAX_LEN,
)

# === 4. GRPO training ===
from trl import GRPOTrainer, GRPOConfig

cfg = GRPOConfig(
    output_dir="outputs_grpo",
    per_device_train_batch_size=4,   # raise to 8 on L4/A100
    # The generation batch must be divisible by num_generations. The default
    # (8) does not divide a batch of 4, so set it explicitly; keep the two in
    # step if you raise the batch size.
    num_generations=4,
    gradient_accumulation_steps=1,
    max_steps=100,                   # longer = better; start small
    logging_steps=5,
    save_steps=50,
    learning_rate=5e-6,
    max_prompt_length=384,
    max_completion_length=192,
    beta=0.03,
)

# TRL 0.24 takes the tokenizer as `processing_class` (there is no `tokenizer`
# argument), and GRPOTrainer has no `train_kwargs` parameter.
trainer = GRPOTrainer(
    model=policy,
    processing_class=tokenizer,
    args=cfg,
    reward_funcs=[reward_fn],
    train_dataset=train_ds,
)

print("‚úÖ Starting GRPO training‚Ä¶ (first step compiles kernels, be patient)")
trainer.train()


In [None]:
# 1) Build policy (LoRA) and frozen ref from SAME BASE
from unsloth import FastLanguageModel, FastModel

BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN    = 768

policy, tokenizer = FastModel.from_pretrained(BASE_MODEL, max_seq_length=MAX_LEN, load_in_4bit=True, full_finetuning=False)
ref_policy, _     = FastModel.from_pretrained(BASE_MODEL, max_seq_length=MAX_LEN, load_in_4bit=True, full_finetuning=False)

# pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

# add LoRA
policy = FastLanguageModel.get_peft_model(
    policy,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16, lora_dropout=0.05, bias="none",
    use_gradient_checkpointing="unsloth",
    max_seq_length=MAX_LEN,
)

# 2) Make BOTH models return hidden states; freeze ref; disable cache for training
for m in (policy, ref_policy):
    m.config.output_hidden_states = True
    m.config.use_cache = False
policy.train(); ref_policy.eval()
for p in ref_policy.parameters():
    p.requires_grad = False


In [None]:
# 3) GRPO config with explicit num_generations (batch size must be a multiple)
from trl import GRPOTrainer, GRPOConfig

NUM_GENERATIONS = 2       # try 2 on T4 to reduce memory
PER_DEVICE_BSZ  = 4       # MUST be 2, 4, 6‚Ä¶ (multiple of NUM_GENERATIONS)

cfg = GRPOConfig(
    output_dir="outputs_grpo_unsloth",
    per_device_train_batch_size=PER_DEVICE_BSZ,
    gradient_accumulation_steps=1,
    max_steps=100,
    logging_steps=5,
    save_steps=50,
    learning_rate=5e-6,
    max_prompt_length=320,
    max_completion_length=128,
    beta=0.03,
    num_generations=NUM_GENERATIONS,
    dataloader_num_workers=0,
)


In [None]:
# Must be set BEFORE importing unsloth
import os
os.environ["UNSLOTH_DONT_PATCH_TRAINERS"] = "1"

# Import TRL/Transformers/Torch FIRST
import torch, transformers, trl

# Grab stable references to vanilla TRL classes NOW
VanillaGRPOTrainer = trl.GRPOTrainer
VanillaGRPOConfig  = trl.GRPOConfig

print("Vanilla GRPOTrainer from:", VanillaGRPOTrainer.__module__)

# Now import Unsloth (model speedups), AFTER we saved the vanilla classes
import unsloth
from unsloth import FastLanguageModel, FastModel

print("Torch:", torch.__version__, "| Transformers:", transformers.__version__, "| TRL:", trl.__version__)


In [None]:
!pip -q install "transformers==4.57.1" "trl==0.24.0" \
                "accelerate>=0.34.2" "peft>=0.13.2" \
                "datasets>=2.20.0" "bitsandbytes>=0.45.0" \
                unsloth unsloth_zoo


In [None]:
# MUST be set before importing unsloth
%env UNSLOTH_DONT_PATCH_TRAINERS=1
import os
os.environ["UNSLOTH_DONT_PATCH_TRAINERS"] = "1"

# Import TRL first, and take references to its vanilla GRPO classes
import torch, transformers, trl
VanillaGRPOTrainer = trl.GRPOTrainer
VanillaGRPOConfig  = trl.GRPOConfig
print("Vanilla GRPOTrainer from:", VanillaGRPOTrainer.__module__)

# Now import Unsloth (model speedups); it will NOT patch trainers
import unsloth
from unsloth import FastLanguageModel, FastModel

print("Torch:", torch.__version__, "| Transformers:", transformers.__version__, "| TRL:", trl.__version__)


In [None]:
# Data + reward (same as before)
from datasets import load_dataset, Dataset
import re

gsm = load_dataset("openai/gsm8k", "main", split="train[:1000]")
def to_prompt(rec):
    """Wrap a record's question in the fixed solve-then-answer instruction template."""
    question_text = rec['question']
    return f"""Solve the math word problem step by step, then answer with '#### <final_number>' on the last line.

Problem:
{question_text}
"""
prompts    = [to_prompt(x) for x in gsm]
references = [x["answer"] for x in gsm]
train_ds   = Dataset.from_dict({"prompt": prompts})
prompt2gold = dict(zip(prompts, references))

def extract_final_number(s: str):
    """Return the number after the first '####' marker, or None if absent.

    Thousands separators are accepted and removed ("#### 1,000" -> "1000") so
    comma-formatted answers are not cut at the first comma.
    """
    m = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", s)
    return m.group(1).replace(",", "") if m else None

def reward_fn(*, prompts=None, completions=None, **kwargs):
    """Return one reward per (prompt, completion) pair.

    The reward is 1.0 when the completion's final number equals the gold
    answer's, else 0.0. Gold answers are looked up by prompt text in the
    module-level `prompt2gold`; unknown prompts score 0.0.
    """
    rewards = []
    for p, gen in zip(prompts or [], completions or []):
        gold = prompt2gold.get(p, "")
        got  = extract_final_number(gen or "")
        ans  = extract_final_number(gold or "")
        rewards.append(1.0 if (ans is not None and got == ans) else 0.0)
    return rewards


In [None]:
# Unsloth model (4-bit + LoRA)
BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN    = 768

policy, tokenizer = FastModel.from_pretrained(
    model_name      = BASE_MODEL,
    max_seq_length  = MAX_LEN,
    load_in_4bit    = True,
    full_finetuning = False,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right"

policy = FastLanguageModel.get_peft_model(
    policy,
    r=16,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=16, lora_dropout=0.05, bias="none",
    use_gradient_checkpointing="unsloth",
    max_seq_length=MAX_LEN,
)


In [None]:
# VANILLA TRL GRPO config ‚Äî T4 safe & divisible
cfg = VanillaGRPOConfig(
    output_dir="outputs_grpo_vanilla",
    per_device_train_batch_size=2,   # small to fit T4
    generation_batch_size=2,         # <-- divisible by num_generations
    num_generations=2,               # <-- lower from default 8

    gradient_accumulation_steps=1,
    max_steps=100,
    logging_steps=5,
    save_steps=50,
    learning_rate=5e-6,

    # keep lengths modest for memory
    max_prompt_length=320,
    max_completion_length=128,

    beta=0.03,
    dataloader_num_workers=0,

    # precision for T4
    bf16=False,
    fp16=True,
    tf32=False,
)

# ‚úÖ Correct for trl==0.24.0 (no tokenizer arg)
trainer = VanillaGRPOTrainer(
    model=policy,
    args=cfg,
    reward_funcs=[reward_fn],
    train_dataset=train_ds,
)


print("‚úÖ Starting GRPO (vanilla TRL with Unsloth model)‚Ä¶")
trainer.train()


In [None]:
print("‚úÖ Starting GRPO (vanilla TRL with Unsloth model)‚Ä¶")
trainer.train()


In [None]:
# ‚ö° Fast, batched evaluation on a smaller slice
from datasets import load_dataset
from math import ceil
from tqdm.auto import tqdm
import torch, re, time

# --- knobs you can tweak ---
N_TEST = 80          # try 40‚Äì100; smaller = faster
BATCH  = 8           # 4‚Äì16 depending on VRAM
MAX_PROMPT_LEN = 320
MAX_NEW = 96         # shorter = faster; 64‚Äì128 is fine
USE_FP16 = True      # keep True on T4

# --- helpers ---
def to_prompt(rec):
    """Format one test record's question with the same template used in training."""
    return f"""Solve the math word problem step by step, then answer with '#### <final_number>' on the last line.

Problem:
{rec['question']}
"""

def extract_final_number(s: str):
    """Return the number after the first '####' followed by digits, or None.

    Thousands separators are accepted and removed ("#### 1,000" -> "1000");
    otherwise a comma-formatted gold answer is cut to its leading digits and
    the exact-match accuracy is computed against the wrong value. The literal
    "#### <final_number>" in the prompt template does not match, so decoded
    text that still contains the prompt is safe to pass in.
    """
    m = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", s)
    return m.group(1).replace(",", "") if m else None

# --- data ---
test = load_dataset("openai/gsm8k", "main", split=f"test[:{N_TEST}]")
prompts_eval = [to_prompt(x) for x in test]
answers_eval = [x["answer"] for x in test]

policy.eval()
device = next(policy.parameters()).device

# batch tokenization helper (pad to same length to keep it fast)
def batch_encode(texts):
    """Tokenize a batch of prompts for generation and move it to `device`.

    Prompts are padded to a common length and truncated to MAX_PROMPT_LEN.
    Padding is forced to the LEFT for this call only: the tokenizer is
    configured with right padding earlier in the notebook, and a decoder-only
    model continues from the last position, so right padding would make the
    shorter prompts continue from pad tokens.
    """
    return tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True,
        max_length=MAX_PROMPT_LEN, padding_side="left",
    ).to(device)

from contextlib import nullcontext

correct = 0
gens = []
t0 = time.time()

# fp16 autocast only when requested and a GPU is present; otherwise a no-op.
# torch.autocast(device_type="cuda", ...) replaces the deprecated
# torch.cuda.amp.autocast, and nullcontext() replaces the hand-rolled no-op.
if USE_FP16 and torch.cuda.is_available():
    autocast_ctx = torch.autocast(device_type="cuda", dtype=torch.float16)
else:
    autocast_ctx = nullcontext()

with torch.inference_mode(), autocast_ctx:
    for i in tqdm(range(0, len(prompts_eval), BATCH), desc="Evaluating"):
        batch_prompts = prompts_eval[i:i+BATCH]
        batch_answers = answers_eval[i:i+BATCH]

        x = batch_encode(batch_prompts)

        # greedy decoding is fastest & stable for exact-match metric
        y = policy.generate(
            **x,
            max_new_tokens=MAX_NEW,
            do_sample=False,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        outs = tokenizer.batch_decode(y, skip_special_tokens=True)
        gens.extend(outs)

        # A sample counts as correct only when both final numbers exist and match.
        for out, gold in zip(outs, batch_answers):
            got = extract_final_number(out)
            ref = extract_final_number(gold)
            if got is not None and got == ref:
                correct += 1

dt = time.time() - t0
# Guard against an empty evaluation slice (N_TEST = 0) dividing by zero.
acc = correct / len(prompts_eval) if prompts_eval else 0.0
print(f"\n‚úÖ Exact final-number accuracy: {acc:.3f} on {len(prompts_eval)} problems")
print(f"‚è±Ô∏è Time: {dt:.1f}s | Throughput: {len(prompts_eval)/dt:.2f} samples/sec")

# peek at a few generations
for k in range(min(3, len(gens))):
    print(f"\n--- Example {k+1} ---")
    print(gens[k][:800])


In [None]:
import re, pandas as pd, os

def extract_final_number(s: str):
    """Return the number that follows the first ``####`` marker in ``s``, or ``None``.

    The number may be negative, may have a decimal part, and may use comma
    thousands separators (``1,234``). Separators are removed from the result so
    that ``"#### 1,234"`` and ``"#### 1234"`` compare equal; previously the match
    stopped at the first comma and returned ``"1"``.

    Args:
        s: text to search (a model generation or a reference answer).

    Returns:
        The matched number as a string without commas, or ``None`` when no
        ``####``-prefixed number is present.
    """
    m = re.search(r"####\s*(-?\d+(?:,\d{3})*(?:\.\d+)?)", s)
    return m.group(1).replace(",", "") if m else None

# One record per evaluated problem: the raw texts, the extracted numbers, and whether they match.
rows = []
for prompt_text, gold_text, gen_text in zip(prompts_eval, answers_eval, gens):
    predicted = extract_final_number(gen_text or "")
    reference = extract_final_number(gold_text or "")
    # A row is correct only when both numbers were found and are the same string.
    both_found = predicted is not None and reference is not None
    rows.append(dict(
        prompt=prompt_text,
        gold_answer=gold_text,
        generated=gen_text,
        pred_final_number=predicted,
        ref_final_number=reference,
        correct=(both_found and predicted == reference),
    ))

# Write the table to disk and show the first rows as the cell output.
csv_path = "grpo_eval_results.csv"
df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False)
print("Saved:", os.path.abspath(csv_path))
df.head(3)


In [None]:
import os, shutil, zipfile

artifacts = [
    "grpo_smollm2_lora_adapters",  # comment out if you didn't save adapters
    "grpo_smollm2_merged_fp16",    # comment out if you didn't merge
    "outputs_grpo_vanilla",        # trainer logs/checkpoints
    "grpo_eval_results.csv",
]

# Produce one real .zip per artifact in the working directory.
# Directories are archived recursively; single files are compressed into a one-entry
# archive (previously they were only copied to a ".zip" name, which is not a zip file).
for a in artifacts:
    if not os.path.exists(a):
        print("Skip (not found):", a)
        continue

    # Flatten any path separators so the archive always lands next to the notebook.
    out = a.rstrip("/").replace("/", "_") + ".zip"
    if os.path.isdir(a):
        # make_archive appends the extension itself, so pass the name without ".zip".
        shutil.make_archive(out[:-len(".zip")], "zip", a)
    else:
        with zipfile.ZipFile(out, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            zf.write(a, arcname=os.path.basename(a))
    print("Zipped:", out)


In [None]:
# Write the trainer's model and the tokenizer files into the same output directory.
SAVE_DIR = "grpo_smollm2_final"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("Saved to:", SAVE_DIR)


# Continued Pretraining

In [None]:
!pip -q install "transformers==4.57.1" "peft==0.13.2" \
                "datasets>=2.20.0" "accelerate>=0.34.2" \
                "bitsandbytes>=0.45.0"


In [None]:
# Guard: this section expects a runtime where unsloth cannot be imported.
# The failure must be raised outside the `try`; raising inside it was caught by the
# `except Exception` below, so the check reported success even when unsloth was present.
try:
    import unsloth
except Exception as e:
    print("‚úÖ unsloth not importable (good):", type(e).__name__)
else:
    raise RuntimeError("unsloth is present (should NOT be). Do: Runtime ‚Üí Factory reset runtime, then rerun installs.")


In [None]:
# ✅ Build Hindi text dataset (streamed) → train_ds / valid_ds
from datasets import load_dataset, Dataset
from itertools import islice

N_LINES = 8000   # lower to 6000 if you want it faster

def get_hi_stream():
    """Open a streaming Hindi Wikipedia ``train`` split, trying several sources in turn.

    Order of attempts:
      1. ``wikimedia/wikipedia`` snapshots (parquet-backed, no custom code);
      2. the classic ``wikipedia`` loader with ``trust_remote_code=True``.

    Each failed attempt is reported by exception type and the next candidate is tried.

    Returns:
        The first dataset that ``load_dataset(..., streaming=True)`` opens.

    Raises:
        RuntimeError: if none of the candidates could be opened.
    """
    parquet_snapshots = ("20231101.hi", "20240101.hi")
    legacy_snapshots = ("20231101.hi", "20230601.hi")

    for snap in parquet_snapshots:
        try:
            print(f"Trying wikimedia/wikipedia:{snap} (parquet)‚Ä¶")
            opened = load_dataset("wikimedia/wikipedia", snap, split="train", streaming=True)
        except Exception as err:
            print("  -> failed:", type(err).__name__)
        else:
            return opened

    for snap in legacy_snapshots:
        try:
            print(f"Trying wikipedia:{snap} with trust_remote_code‚Ä¶")
            opened = load_dataset("wikipedia", snap, split="train", streaming=True, trust_remote_code=True)
        except Exception as err:
            print("  -> failed:", type(err).__name__)
        else:
            return opened

    raise RuntimeError("No public Hindi Wikipedia snapshot available in this runtime.")

stream = get_hi_stream()

def cleaned_lines(gen):
    """Lazily yield the stripped ``text`` of each row that is longer than 10 characters.

    Args:
        gen: iterable of mapping-like rows; a missing or ``None`` ``"text"`` value
            is treated as an empty string.

    Yields:
        The whitespace-stripped text of every row whose stripped length exceeds 10.
    """
    stripped = ((record.get("text") or "").strip() for record in gen)
    yield from (text for text in stripped if len(text) > 10)

# Pull at most N_LINES cleaned texts from the stream into memory.
sample = list(islice(cleaned_lines(stream), N_LINES))
# Stop early when the stream produced fewer than 1000 usable texts.
if len(sample) < 1000:
    raise RuntimeError(f"Only {len(sample)} lines collected ‚Äî increase N_LINES or re-run.")

# 98% / 2% split in stream order (no shuffling): the tail of `sample` becomes validation.
cut = int(len(sample) * 0.98)
train_ds = Dataset.from_dict({"text": sample[:cut]})
valid_ds = Dataset.from_dict({"text": sample[cut:]})

print(f"‚úÖ Dataset ready: {len(train_ds)} train / {len(valid_ds)} valid")
print("‚Ä¢ Example:", train_ds[0]["text"][:140].replace("\n"," "))


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# Base checkpoint and the maximum sequence length used for tokenization in this section.
BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN    = 512

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = MAX_LEN

# 4-bit NF4 quantization with double quantization; matmuls are computed in fp16.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, quantization_config=bnb_cfg, device_map="auto",
)
# Turn off the generation cache flag on the config before training.
model.config.use_cache = False

# Rank-16 LoRA adapters on the attention and MLP projection layers.
# NOTE(review): PEFT's docs suggest calling prepare_model_for_kbit_training() on a
# quantized base before get_peft_model(); it is not called here — confirm whether needed.
lora_cfg = LoraConfig(
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

def tok_fn(batch):
    """Tokenize a batch of raw texts to fixed-length rows for causal-LM training.

    Every text in ``batch["text"]`` is truncated and padded to ``MAX_LEN`` tokens
    with the notebook-level ``tokenizer``; ``labels`` is a copy of ``input_ids``.

    NOTE(review): the cell also uses ``DataCollatorForLanguageModeling(mlm=False)``,
    which presumably derives labels itself — confirm whether this column is needed.
    """
    encoded = tokenizer(
        batch["text"],
        padding="max_length",
        max_length=MAX_LEN,
        truncation=True,
    )
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize both splits and drop the original columns so only tokenizer outputs remain.
cols = list(train_ds.column_names)
t_train = train_ds.map(tok_fn, batched=True, remove_columns=cols)
t_eval  = valid_ds.map(tok_fn, batched=True, remove_columns=cols)
t_train.set_format("torch"); t_eval.set_format("torch")

# mlm=False selects causal (next-token) language-modeling collation.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# One epoch, fp16 mixed precision, checkpoints every 200 steps (two most recent kept),
# no external experiment reporting.
args = TrainingArguments(
    output_dir="outputs_cpt_hi_hf",
    per_device_train_batch_size=4,     # if OOM: 2
    gradient_accumulation_steps=2,     # effective batch ~8
    num_train_epochs=1,
    learning_rate=2e-4, warmup_ratio=0.03, weight_decay=0.0,
    logging_steps=20, save_steps=200, save_total_limit=2,
    fp16=True, bf16=False, tf32=False, report_to=[], dataloader_num_workers=0,
)

# eval_dataset is registered here; no evaluation schedule is configured in `args`,
# so it is only consumed by an explicit trainer.evaluate() call.
trainer = Trainer(
    model=model, args=args,
    train_dataset=t_train, eval_dataset=t_eval,
    data_collator=collator,
)

print("‚úÖ Starting Continued Pretraining (Hindi)‚Ä¶")
trainer.train()


In [None]:
import math, os
# Evaluate on the trainer's eval dataset and convert the mean loss to perplexity (exp(loss)).
metrics = trainer.evaluate()
loss = metrics.get("eval_loss", None)
ppl = math.exp(loss) if loss is not None else None
print(f"Validation loss: {loss} | Perplexity: {ppl}")

# Save the PEFT-wrapped model (presumably adapter weights only — confirm) plus the tokenizer files.
out_dir = "cpt_hi_smollm2_lora_adapters"
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)
print("üíæ Saved adapters to:", os.path.abspath(out_dir))


In [None]:
prompt = "‡§≠‡§æ‡§∞‡§§ ‡§ï‡•Ä ‡§∏‡•ç‡§µ‡§§‡§Ç‡§§‡•ç‡§∞‡§§‡§æ ‡§ï‡•á ‡§á‡§§‡§ø‡§π‡§æ‡§∏ ‡§™‡§∞ ‡§§‡•Ä‡§® ‡§µ‡§æ‡§ï‡•ç‡§Ø ‡§≤‡§ø‡§ñ‡§ø‡§è‡•§"
x = tokenizer(prompt, return_tensors="pt").to(next(model.parameters()).device)
with torch.inference_mode():
    y = model.generate(**x, max_new_tokens=120, do_sample=True, temperature=0.8, top_p=0.9)
print(tokenizer.decode(y[0], skip_special_tokens=True))


In [None]:
import os, shutil, zipfile

ARTIFACTS = [
    "cpt_hi_smollm2_lora_adapters",  # adapters folder from step 6
    "outputs_cpt_hi_hf",             # trainer logs/checkpoints
]

# Produce one real .zip per artifact in the working directory.
# Directories are archived recursively; a plain file is compressed into a one-entry
# archive (previously it was only copied to a ".zip" name, which is not a zip file).
for path in ARTIFACTS:
    if not os.path.exists(path):
        print("Skip (not found):", path)
        continue

    # Flatten any path separators so the archive always lands next to the notebook.
    out = path.rstrip("/").replace("/", "_") + ".zip"
    if os.path.isdir(path):
        # make_archive appends the extension itself, so pass the name without ".zip".
        shutil.make_archive(out[:-len(".zip")], "zip", path)
    else:
        with zipfile.ZipFile(out, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            zf.write(path, arcname=os.path.basename(path))
    print("Zipped:", out)


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Inputs: the base checkpoint and the saved adapter folder; output: the merged model folder.
BASE_MODEL = "unsloth/smollm2-135m"
MERGED_DIR = "cpt_hi_smollm2_merged_fp16"
ADAPTERS   = "cpt_hi_smollm2_lora_adapters"

# reload base in fp16 (not 4-bit) → clean merge
base_fp16 = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
)
# Attach the saved adapters, fold them into the base weights, and drop the PEFT wrapper.
merged = PeftModel.from_pretrained(base_fp16, ADAPTERS).merge_and_unload()

# The tokenizer is reloaded from the base checkpoint and saved next to the merged weights.
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
merged.save_pretrained(MERGED_DIR)
tok.save_pretrained(MERGED_DIR)
print("‚úÖ Merged model saved to:", MERGED_DIR)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Use adapters model (PEFT) – or switch to MERGED_DIR if you merged
from peft import PeftModel
BASE_MODEL = "unsloth/smollm2-135m"
ADAPTERS   = "cpt_hi_smollm2_lora_adapters"

# Fresh fp16 base plus the saved adapters.
# Note that this rebinds the notebook-level name `model`.
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map="auto", torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(base, ADAPTERS)

prompt = "‡§≠‡§æ‡§∞‡§§ ‡§ï‡•Ä ‡§∏‡•ç‡§µ‡§§‡§Ç‡§§‡•ç‡§∞‡§§‡§æ ‡§ï‡•á ‡§á‡§§‡§ø‡§π‡§æ‡§∏ ‡§™‡§∞ ‡§§‡•Ä‡§® ‡§µ‡§æ‡§ï‡•ç‡§Ø ‡§≤‡§ø‡§ñ‡§ø‡§è‡•§"
x = tok(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    y = model.generate(**x, max_new_tokens=120, do_sample=True, temperature=0.8, top_p=0.9)
print(tok.decode(y[0], skip_special_tokens=True))


In [None]:
# Save continued-pretraining model on Hindi corpus
# NOTE(review): an earlier cell rebinds `model` to a freshly reloaded PeftModel;
# trainer.save_model() saves whatever model the Trainer itself holds — confirm that
# `trainer` here is still the continued-pretraining Trainer.
SAVE_DIR = "continued_pretraining_hindi_smollm2_final"

trainer.save_model(SAVE_DIR)           # saves model weights + config
tokenizer.save_pretrained(SAVE_DIR)    # saves tokenizer files
print("Saved to:", SAVE_DIR)


# DPO (Direct Preference Optimization)

In [None]:
!pip -q install "transformers==4.57.1" "trl==0.24.0" \
                "peft==0.13.2" "accelerate>=0.34.2" \
                "datasets>=2.20.0" "bitsandbytes>=0.45.0"


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Token budgets: prompt and completion are capped separately; their sum is the total length.
BASE_MODEL = "unsloth/smollm2-135m"
MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256
MAX_LEN = MAX_PROMPT_LEN + MAX_COMPLETION_LEN

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = MAX_LEN

# try 4-bit; fall back to fp16 if bitsandbytes is unavailable
def load_model():
    """Load ``BASE_MODEL`` as a causal LM, preferring 4-bit and falling back to fp16.

    The 4-bit path uses NF4 with double quantization and fp16 compute. If anything
    in that path raises (import, config, or load), the error is printed and the
    model is loaded in plain fp16 instead. In both cases ``config.use_cache`` is
    set to ``False`` before the model is returned.

    Returns:
        The loaded model, placed with ``device_map="auto"``.
    """
    try:
        from transformers import BitsAndBytesConfig

        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        print("‚úÖ Loading 4-bit quantized model‚Ä¶")
        loaded = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL, quantization_config=quant_cfg, device_map="auto"
        )
    except Exception as err:
        print("‚ö†Ô∏è 4-bit load failed, falling back to fp16:", type(err).__name__, "-", err)
        loaded = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
        )

    loaded.config.use_cache = False
    return loaded

model = load_model()
print("Model device:", next(model.parameters()).device)


In [None]:
# --- force PEFT to not use bitsandbytes ---
import os
os.environ["BNB_TRITON_DISABLE"] = "1"   # harmless even if bnb isn't present

# If bnb was ever partially imported this session, nuke it:
# (removes every `bitsandbytes*` entry from sys.modules; objects already created
# from those modules are not affected by this)
import sys
for k in list(sys.modules.keys()):
    if k.startswith("bitsandbytes"):
        sys.modules.pop(k, None)

# Monkeypatch PEFT's bnb detection to always False
# NOTE(review): this only replaces the attribute on peft.tuners.lora.model; whether
# PEFT consults that attribute when wrapping layers depends on the installed version.
# NOTE(review): the preceding load_model() cell may have loaded a 4-bit model —
# confirm PEFT can still wrap it with detection disabled.
import peft
import peft.tuners.lora.model as lora_model
lora_model.is_bnb_available = lambda: False
print("‚úÖ Patched PEFT: is_bnb_available() -> False")


In [None]:
from peft import LoraConfig, get_peft_model

# Rank-16 LoRA adapters (alpha 16, dropout 0.05, no bias training) on the attention
# and MLP projection layers.
lora_cfg = LoraConfig(
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
# Rebinds `model` to the PEFT-wrapped model; re-running this cell would wrap it again.
model = get_peft_model(model, lora_cfg)   # <-- will not import bitsandbytes now
model.print_trainable_parameters()


In [None]:
from datasets import load_dataset

# Small, fast slice for demo; increase later (e.g., train[:8000])
raw = load_dataset("Anthropic/hh-rlhf", split="train[:2000]")

def add_prompt(row):
    """Add a short ``prompt`` field taken from the first non-blank line of ``chosen``.

    Leading whitespace and blank lines are skipped before the first line is taken.
    A transcript that begins with newlines (HH-RLHF transcripts appear to start with
    blank lines before the first "Human:" turn — TODO confirm) would otherwise give
    an empty first line, and every row would receive the generic fallback prompt.

    The prompt is capped at 200 characters. ``chosen`` and ``rejected`` are left
    unchanged, so they still contain the full transcript including the prompt line.

    Args:
        row: mapping with a ``"chosen"`` string (``None`` is treated as empty).

    Returns:
        The same row, with ``row["prompt"]`` set.
    """
    ch = row["chosen"] or ""
    first_line = ch.lstrip().split("\n", 1)[0].strip()
    row["prompt"] = first_line[:200] if first_line else "Please respond helpfully."
    return row

ds = raw.map(add_prompt, desc="Adding prompt")
ds = ds.select_columns(["prompt", "chosen", "rejected"])
print(ds[0])
print("‚úÖ Dataset ready:", ds)


In [None]:
!pip install -q "trl==0.8.6" "transformers==4.44.2" "accelerate" "datasets" "peft==0.12.0"


In [None]:
import trl
print("TRL version:", trl.__version__)


In [None]:
from transformers import AutoTokenizer

BASE_MODEL = "unsloth/smollm2-135m"

# Force the Python (slow) tokenizer — avoids the Rust backend entirely
# NOTE(review): no pad-token fallback is applied in this cell, unlike the other
# tokenizer-loading cells in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    use_fast=False,          # 👈 key change
)
print("Loaded tokenizer (slow). pad_token:", tokenizer.pad_token)


In [None]:
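# pin tokenizers to 0.20.3 (NOTE: the next cell reinstalls 0.22.0, and the later version checks expect tokenizers 0.22.x with transformers 4.57.1 — this pin is likely superseded)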
!pip -q install --no-deps "tokenizers==0.20.3"

# sanity: show versions
import importlib.metadata as im

# Print the installed version of each package in the training stack.
# Only "package not installed" is reported as missing; any other error is allowed
# to surface instead of being hidden by a bare `except:`.
for p in ["transformers","tokenizers","trl","accelerate","peft","datasets"]:
    try:
        print(p, im.version(p))
    except im.PackageNotFoundError:
        print(p, "‚ùå NOT INSTALLED")


In [None]:
!pip -q install --no-deps --force-reinstall "tokenizers==0.22.0"


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

BASE_MODEL = "unsloth/smollm2-135m"
MAX_LEN = 512

# Try fast; if it errors, switch to use_fast=False
try:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)

# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = MAX_LEN

# Plain fp16 weights (no quantization); this replaces any `model` already in memory.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
)
model.config.use_cache = False
print("‚úÖ Loaded. Device:", next(model.parameters()).device)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Token budgets: prompt and completion are capped separately; their sum is the total length.
BASE_MODEL = "unsloth/smollm2-135m"
MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256
MAX_LEN = MAX_PROMPT_LEN + MAX_COMPLETION_LEN

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = MAX_LEN

# Plain fp16 weights (no quantization); this replaces any `model` already in memory.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
)
model.config.use_cache = False
print("‚úÖ Loaded. Device:", next(model.parameters()).device)


In [None]:
# Block any accidental bitsandbytes usage
# Step 1: remove every already-imported `bitsandbytes*` module from sys.modules.
import os, sys
for k in list(sys.modules.keys()):
    if k.startswith("bitsandbytes"):
        sys.modules.pop(k, None)
os.environ["BNB_TRITON_DISABLE"] = "1"

# Step 2: make PEFT's availability probes report "not available".
# NOTE(review): names that other modules already bound with `from ... import`
# are not changed by assigning these attributes.
import peft
import peft.import_utils as peft_import_utils
peft_import_utils.is_bnb_available = lambda: False
peft_import_utils.is_bnb_4bit_available = lambda: False
# The LoRA model module may hold its own references; patch them too, best-effort.
try:
    import peft.tuners.lora.model as lora_model
    lora_model.is_bnb_available = lambda: False
    lora_model.is_bnb_4bit_available = lambda: False
except Exception:
    pass
print("PEFT bnb checks:", peft_import_utils.is_bnb_available(), peft_import_utils.is_bnb_4bit_available())

from peft import LoraConfig, get_peft_model
# Rank-16 LoRA adapters (alpha 16, dropout 0.05, no bias training) on the attention
# and MLP projection layers. Rebinds `model` to the PEFT-wrapped model.
lora_cfg = LoraConfig(
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


In [None]:
from datasets import load_dataset

raw = load_dataset("Anthropic/hh-rlhf", split="train[:2000]")

def add_prompt(row):
    """Add a short ``prompt`` field taken from the first non-blank line of ``chosen``.

    Leading whitespace and blank lines are skipped before the first line is taken.
    A transcript that begins with newlines (HH-RLHF transcripts appear to start with
    blank lines before the first "Human:" turn — TODO confirm) would otherwise give
    an empty first line, and every row would receive the generic fallback prompt.

    The prompt is capped at 200 characters; ``chosen``/``rejected`` are left unchanged.

    Returns:
        The same row, with ``row["prompt"]`` set.
    """
    ch = row["chosen"] or ""
    first = ch.lstrip().split("\n", 1)[0].strip()
    row["prompt"] = first[:200] if first else "Please respond helpfully."
    return row

ds = raw.map(add_prompt, desc="Adding prompt").select_columns(["prompt","chosen","rejected"])
print(ds[0]); print("‚úÖ Dataset size:", len(ds))


In [None]:
!pip -q install --no-deps "accelerate==0.34.2"
import os; os.kill(os.getpid(), 9)


In [None]:
import importlib.metadata as im

# Print the installed version of each package in the training stack.
# Only "package not installed" is reported as missing; any other error is allowed
# to surface instead of being hidden by a bare `except:`.
for p in ["transformers","tokenizers","trl","accelerate","peft","datasets"]:
    try:
        print(p, im.version(p))
    except im.PackageNotFoundError:
        print(p, "‚ùå NOT INSTALLED")
# Expect: accelerate 0.34.2 (with transformers 4.57.1, tokenizers 0.22.x, trl 0.24.0)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Token budgets: prompt and completion are capped separately; their sum is the total length.
BASE_MODEL = "unsloth/smollm2-135m"
MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256
MAX_LEN = MAX_PROMPT_LEN + MAX_COMPLETION_LEN

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = MAX_LEN

# Plain fp16 weights (no quantization); this replaces any `model` already in memory.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
)
model.config.use_cache = False
print("‚úÖ Loaded. Device:", next(model.parameters()).device)


In [None]:
# Block any accidental bitsandbytes usage
# Step 1: remove every already-imported `bitsandbytes*` module from sys.modules.
import os, sys
for k in list(sys.modules.keys()):
    if k.startswith("bitsandbytes"):
        sys.modules.pop(k, None)
os.environ["BNB_TRITON_DISABLE"] = "1"

# Step 2: make PEFT's availability probes report "not available".
# NOTE(review): names that other modules already bound with `from ... import`
# are not changed by assigning these attributes.
import peft
import peft.import_utils as peft_import_utils
peft_import_utils.is_bnb_available = lambda: False
peft_import_utils.is_bnb_4bit_available = lambda: False
# The LoRA model module may hold its own references; patch them too, best-effort.
try:
    import peft.tuners.lora.model as lora_model
    lora_model.is_bnb_available = lambda: False
    lora_model.is_bnb_4bit_available = lambda: False
except Exception:
    pass
print("PEFT bnb checks:", peft_import_utils.is_bnb_available(), peft_import_utils.is_bnb_4bit_available())

from peft import LoraConfig, get_peft_model
# Rank-16 LoRA adapters (alpha 16, dropout 0.05, no bias training) on the attention
# and MLP projection layers. Rebinds `model` to the PEFT-wrapped model.
lora_cfg = LoraConfig(
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


In [None]:
from datasets import load_dataset

raw = load_dataset("Anthropic/hh-rlhf", split="train[:2000]")

def add_prompt(row):
    """Add a short ``prompt`` field taken from the first non-blank line of ``chosen``.

    Leading whitespace and blank lines are skipped before the first line is taken.
    A transcript that begins with newlines (HH-RLHF transcripts appear to start with
    blank lines before the first "Human:" turn — TODO confirm) would otherwise give
    an empty first line, and every row would receive the generic fallback prompt.

    The prompt is capped at 200 characters; ``chosen``/``rejected`` are left unchanged.

    Returns:
        The same row, with ``row["prompt"]`` set.
    """
    ch = row["chosen"] or ""
    first = ch.lstrip().split("\n", 1)[0].strip()
    row["prompt"] = first[:200] if first else "Please respond helpfully."
    return row

ds = raw.map(add_prompt, desc="Adding prompt").select_columns(["prompt","chosen","rejected"])
print(ds[0]); print("‚úÖ Dataset size:", len(ds))


In [None]:
# Keep the earlier ds (columns: prompt/chosen/rejected) and tokenizer
MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256

def _truncate_to(text, max_len):
    """Clip ``text`` to at most ``max_len`` tokens and return it as a string.

    The text is tokenized without special tokens (truncating at ``max_len``) and
    the kept ids are decoded back; uses the notebook-level ``tokenizer``.
    """
    kept_ids = tokenizer(text, add_special_tokens=False, max_length=max_len, truncation=True)["input_ids"]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

def clip_completions(row):
    """Truncate ``chosen`` and ``rejected`` to ``MAX_COMPLETION_LEN`` tokens, in place.

    Returns the same row so it can be used with ``Dataset.map``.
    """
    for field in ("chosen", "rejected"):
        row[field] = _truncate_to(row[field], MAX_COMPLETION_LEN)
    return row

ds = ds.map(clip_completions, desc="Truncating chosen/rejected")


In [None]:
MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256

def _truncate_to(text, max_len):
    """Clip ``text`` to at most ``max_len`` tokens and return it as a string.

    The text is tokenized without special tokens (truncating at ``max_len``) and
    the kept ids are decoded back; uses the notebook-level ``tokenizer``.
    """
    encoding = tokenizer(
        text,
        add_special_tokens=False,
        max_length=max_len,
        truncation=True,
    )
    kept_ids = encoding["input_ids"]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

def clip_all(row):
    """Truncate ``prompt``, ``chosen`` and ``rejected`` to their token budgets, in place.

    ``prompt`` is capped at ``MAX_PROMPT_LEN`` and both completions at
    ``MAX_COMPLETION_LEN``. Returns the same row for use with ``Dataset.map``.
    """
    budgets = (
        ("prompt", MAX_PROMPT_LEN),
        ("chosen", MAX_COMPLETION_LEN),
        ("rejected", MAX_COMPLETION_LEN),
    )
    for field, limit in budgets:
        row[field] = _truncate_to(row[field], limit)
    return row

ds = ds.map(clip_all, desc="Clipping prompt/chosen/rejected")


In [None]:
def lens(row):
    """Return token counts (special tokens excluded) for the three text fields of ``row``.

    Uses the notebook-level ``tokenizer``; the result has the keys ``prompt_len``,
    ``chosen_len`` and ``rejected_len``.
    """
    def _count(text):
        return len(tokenizer(text, add_special_tokens=False)["input_ids"])

    return {
        "prompt_len":   _count(row["prompt"]),
        "chosen_len":   _count(row["chosen"]),
        "rejected_len": _count(row["rejected"]),
    }

# Spot-check token lengths on the first 32 rows only (not the whole dataset) and
# compare them with the tokenizer's configured maximum.
# Note: this rebinds the notebook-level name `sample`.
sample = ds.select(range(32)).map(lens)
max_prompt   = max(sample["prompt_len"])
max_chosen   = max(sample["chosen_len"])
max_rejected = max(sample["rejected_len"])
print("Max prompt/chosen/rejected:", max_prompt, max_chosen, max_rejected)
print("Model max:", tokenizer.model_max_length)


In [None]:
!pip -q install --no-deps "accelerate==0.34.2"
import os; os.kill(os.getpid(), 9)


In [None]:
import importlib.metadata as im

# Print the installed version of each package in the training stack.
# Only "package not installed" is reported as missing; any other error is allowed
# to surface instead of being hidden by a bare `except:`.
for p in ["transformers","tokenizers","trl","accelerate","peft","datasets"]:
    try:
        print(p, im.version(p))
    except im.PackageNotFoundError:
        print(p, "‚ùå NOT INSTALLED")
# Expect: transformers 4.57.1 | tokenizers 0.22.x | trl 0.24.0 | accelerate 0.34.2 | peft 0.13.2


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

BASE_MODEL = "unsloth/smollm2-135m"
# Prompt and completion are capped separately; the tokenizer's max length is their sum.
MAX_PROMPT_LEN, MAX_COMPLETION_LEN = 256, 256
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.model_max_length = MAX_PROMPT_LEN + MAX_COMPLETION_LEN

# Plain fp16 weights (no quantization); this replaces any `model` already in memory.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto"
)
model.config.use_cache = False
print("‚úÖ Loaded. Device:", next(model.parameters()).device)


In [None]:
# Step 1: remove every already-imported `bitsandbytes*` module from sys.modules.
import os, sys
for k in list(sys.modules.keys()):
    if k.startswith("bitsandbytes"):
        sys.modules.pop(k, None)
os.environ["BNB_TRITON_DISABLE"] = "1"

# Step 2: make PEFT's availability probes report "not available".
# NOTE(review): names that other modules already bound with `from ... import`
# are not changed by assigning these attributes.
import peft
import peft.import_utils as peft_import_utils
peft_import_utils.is_bnb_available = lambda: False
peft_import_utils.is_bnb_4bit_available = lambda: False
# The LoRA model module may hold its own references; patch them too, best-effort.
try:
    import peft.tuners.lora.model as lora_model
    lora_model.is_bnb_available = lambda: False
    lora_model.is_bnb_4bit_available = lambda: False
except Exception:
    pass

from peft import LoraConfig, get_peft_model
# Rank-16 LoRA adapters (alpha 16, dropout 0.05, no bias training) on the attention
# and MLP projection layers. Rebinds `model` to the PEFT-wrapped model.
lora_cfg = LoraConfig(
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()


In [None]:
from datasets import load_dataset

raw = load_dataset("Anthropic/hh-rlhf", split="train[:2000]")

def add_prompt(row):
    """Add a short ``prompt`` field taken from the first non-blank line of ``chosen``.

    Leading whitespace and blank lines are skipped before the first line is taken.
    A transcript that begins with newlines (HH-RLHF transcripts appear to start with
    blank lines before the first "Human:" turn — TODO confirm) would otherwise give
    an empty first line, and every row would receive the generic fallback prompt.

    The prompt is capped at 200 characters; ``chosen``/``rejected`` are left unchanged.

    Returns:
        The same row, with ``row["prompt"]`` set.
    """
    ch = row["chosen"] or ""
    first = ch.lstrip().split("\n", 1)[0].strip()
    row["prompt"] = first[:200] if first else "Please respond helpfully."
    return row

ds = raw.map(add_prompt, desc="Adding prompt").select_columns(["prompt","chosen","rejected"])

def _truncate_to(text, max_len):
    """Clip ``text`` to at most ``max_len`` tokens and return it as a string.

    The text is tokenized without special tokens (truncating at ``max_len``) and
    the kept ids are decoded back; uses the notebook-level ``tokenizer``.
    """
    encoding = tokenizer(
        text,
        add_special_tokens=False,
        max_length=max_len,
        truncation=True,
    )
    kept_ids = encoding["input_ids"]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

def clip_all(row):
    """Truncate ``prompt``, ``chosen`` and ``rejected`` to their token budgets, in place.

    ``prompt`` is capped at ``MAX_PROMPT_LEN`` and both completions at
    ``MAX_COMPLETION_LEN``. Returns the same row for use with ``Dataset.map``.
    """
    budgets = (
        ("prompt", MAX_PROMPT_LEN),
        ("chosen", MAX_COMPLETION_LEN),
        ("rejected", MAX_COMPLETION_LEN),
    )
    for field, limit in budgets:
        row[field] = _truncate_to(row[field], limit)
    return row

ds = ds.map(clip_all, desc="Clipping prompt/chosen/rejected")
print(ds[0]); print("‚úÖ Dataset size:", len(ds))


In [None]:
# Make accelerate compatible with transformers 4.57.1
!pip -q install --no-deps --force-reinstall "accelerate==0.34.2"

# Hard-restart Python so the new wheel is actually used
import os; os.kill(os.getpid(), 9)


In [None]:
import importlib.metadata as im

# Print the installed version of each package in the training stack.
# Only "package not installed" is reported as missing; any other error is allowed
# to surface instead of being hidden by a bare `except:`.
for p in ["transformers","tokenizers","trl","accelerate","peft","datasets"]:
    try:
        print(p, im.version(p))
    except im.PackageNotFoundError:
        print(p, "‚ùå NOT INSTALLED")
# Expect: transformers 4.57.1 | tokenizers 0.22.x | trl 0.24.0 | accelerate 0.34.2 | peft 0.13.2


In [None]:
# Define these again before your DPOConfig
MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256

from trl import DPOConfig

# Short demo run: 200 optimizer steps at batch size 4, checkpoint every 100 steps
# (two most recent kept), fp16 mixed precision, no external experiment reporting.
# NOTE(review): the DPOTrainer cell further down describes its args as "fp16=False",
# while this config sets fp16=True — confirm which is intended.
dpo_cfg = DPOConfig(
    output_dir="outputs_dpo_smollm2",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    max_steps=200,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    learning_rate=5e-6,
    beta=0.1,  # Goes inside config
    max_prompt_length=MAX_PROMPT_LEN,
    max_completion_length=MAX_COMPLETION_LEN,
    fp16=True,
    bf16=False,
    tf32=False,
    report_to=[],
    dataloader_num_workers=0,
    remove_unused_columns=False,
)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

BASE_MODEL = "unsloth/smollm2-135m"
MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token when the tokenizer ships without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Total length budget = prompt budget + completion budget.
tokenizer.model_max_length = MAX_PROMPT_LEN + MAX_COMPLETION_LEN

# Plain fp16 weights (no quantization); this replaces any `model` already in memory,
# including a LoRA-wrapped one from an earlier cell.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.config.use_cache = False

print("‚úÖ Model + Tokenizer ready on", next(model.parameters()).device)


In [None]:
# === Rebuild the tiny preference dataset for DPO ===
from datasets import load_dataset

# If tokenizer isn't in scope (after a restart), quickly reload it
# NOTE(review): MAX_PROMPT_LEN / MAX_COMPLETION_LEN are only (re)defined inside the
# fallback branch; when `tokenizer` already exists they must come from an earlier cell.
try:
    tokenizer
except NameError:
    from transformers import AutoTokenizer
    BASE_MODEL = "unsloth/smollm2-135m"
    MAX_PROMPT_LEN, MAX_COMPLETION_LEN = 256, 256
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.model_max_length = MAX_PROMPT_LEN + MAX_COMPLETION_LEN

# First 2000 rows of the HH-RLHF train split.
raw = load_dataset("Anthropic/hh-rlhf", split="train[:2000]")

def add_prompt(row):
    """Add a short ``prompt`` field taken from the first non-blank line of ``chosen``.

    Leading whitespace and blank lines are skipped before the first line is taken.
    A transcript that begins with newlines (HH-RLHF transcripts appear to start with
    blank lines before the first "Human:" turn — TODO confirm) would otherwise give
    an empty first line, and every row would receive the generic fallback prompt.

    The prompt is capped at 200 characters; ``chosen``/``rejected`` are left unchanged.

    Returns:
        The same row, with ``row["prompt"]`` set.
    """
    ch = row["chosen"] or ""
    first = ch.lstrip().split("\n", 1)[0].strip()
    row["prompt"] = first[:200] if first else "Please respond helpfully."
    return row

ds = raw.map(add_prompt, desc="Adding prompt").select_columns(["prompt","chosen","rejected"])

def _truncate_to(text, max_len):
    """Clip ``text`` to at most ``max_len`` tokens and return it as a string.

    The text is tokenized without special tokens (truncating at ``max_len``) and
    the kept ids are decoded back; uses the notebook-level ``tokenizer``.
    """
    encoding = tokenizer(
        text,
        add_special_tokens=False,
        max_length=max_len,
        truncation=True,
    )
    kept_ids = encoding["input_ids"]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

def clip_all(row):
    """Truncate ``prompt``, ``chosen`` and ``rejected`` to their token budgets, in place.

    ``prompt`` is capped at ``MAX_PROMPT_LEN`` and both completions at
    ``MAX_COMPLETION_LEN``. Returns the same row for use with ``Dataset.map``.
    """
    budgets = (
        ("prompt", MAX_PROMPT_LEN),
        ("chosen", MAX_COMPLETION_LEN),
        ("rejected", MAX_COMPLETION_LEN),
    )
    for field, limit in budgets:
        row[field] = _truncate_to(row[field], limit)
    return row

ds = ds.map(clip_all, desc="Clipping prompt/chosen/rejected")
print("‚úÖ ds ready with columns:", ds.column_names, "| size:", len(ds))
print(ds[0])


In [None]:
# === One-shot compatibility patch for accelerate builds whose
# === Accelerator.unwrap_model() does not accept `keep_torch_compile` ===
# The decision is made by inspecting the installed function's signature, not by
# comparing version numbers.
import inspect, accelerate

sig = inspect.signature(accelerate.Accelerator.unwrap_model)
needs_patch = "keep_torch_compile" not in sig.parameters

if needs_patch:
    # Wrap the original method: drop the unsupported keyword and forward everything else.
    _old_unwrap = accelerate.Accelerator.unwrap_model
    def _unwrap_compat(self, model, *args, **kwargs):
        kwargs.pop("keep_torch_compile", None)  # drop unknown kwarg
        return _old_unwrap(self, model, *args, **kwargs)
    accelerate.Accelerator.unwrap_model = _unwrap_compat
    print("ü©π Patched accelerate.Accelerator.unwrap_model (no restart needed).")
else:
    print("‚úÖ accelerate already supports keep_torch_compile.")

# (Optional) show versions once so we know the runtime state
try:
    import importlib.metadata as im
    print("versions:",
          "transformers", im.version("transformers"),
          "| tokenizers", im.version("tokenizers"),
          "| trl", im.version("trl"),
          "| accelerate", im.version("accelerate"),
          "| peft", im.version("peft"))
except Exception as _e:
    print("version-check skipped:", _e)


In [None]:
# --- Minimal collator for TRL DPOTrainer (version-proof) ---
import torch

class SimpleDPOCollator:
    """
    Batch collator for preference triples.

    Each feature must provide 'prompt', 'chosen' and 'rejected' strings. The prompt
    is joined to each completion with a newline, both variants are tokenized
    (padded, truncated to ``max_len``), and the result is returned as Long tensors:
      chosen_input_ids / chosen_attention_mask
      rejected_input_ids / rejected_attention_mask
    """

    def __init__(self, tokenizer, max_len: int):
        self.tok = tokenizer
        self.max_len = max_len

    def _encode(self, texts):
        """Tokenize ``texts`` with padding/truncation; return (input_ids, attention_mask) as long."""
        enc = self.tok(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Long dtype is what embedding lookups expect.
        return enc["input_ids"].long(), enc["attention_mask"].long()

    def __call__(self, features):
        # Pull all three columns up front so a malformed feature fails before tokenizing.
        prompt_col = [item["prompt"] for item in features]
        chosen_col = [item["chosen"] for item in features]
        rejected_col = [item["rejected"] for item in features]

        # Prompt and completion are joined by a single newline.
        chosen_ids, chosen_mask = self._encode(
            [f"{p}\n{c}" for p, c in zip(prompt_col, chosen_col)]
        )
        rejected_ids, rejected_mask = self._encode(
            [f"{p}\n{r}" for p, r in zip(prompt_col, rejected_col)]
        )

        return {
            "chosen_input_ids":   chosen_ids,
            "chosen_attention_mask": chosen_mask,
            "rejected_input_ids": rejected_ids,
            "rejected_attention_mask": rejected_mask,
        }

# build the collator
MAX_TOTAL_LEN = tokenizer.model_max_length  # e.g. 512
collator = SimpleDPOCollator(tokenizer, MAX_TOTAL_LEN)
print("‚úÖ SimpleDPOCollator ready (max_len =", MAX_TOTAL_LEN, ")")


In [None]:
# Patch unwrap_model only if your accelerate doesn't know the kwarg
# (decided by inspecting the installed signature; the wrapper drops
# `keep_torch_compile` and forwards every other argument unchanged)
import inspect, accelerate
if "keep_torch_compile" not in inspect.signature(accelerate.Accelerator.unwrap_model).parameters:
    _old = accelerate.Accelerator.unwrap_model
    def _unwrap_compat(self, model, *args, **kwargs):
        kwargs.pop("keep_torch_compile", None)
        return _old(self, model, *args, **kwargs)
    accelerate.Accelerator.unwrap_model = _unwrap_compat
    print("ü©π Patched accelerate.Accelerator.unwrap_model")


In [None]:
from datasets import load_dataset

# If not already in memory, re-create tokenizer, model_max_length, etc.
# NOTE(review): MAX_PROMPT_LEN / MAX_COMPLETION_LEN are only (re)defined inside the
# fallback branch; when `tokenizer` already exists they must come from an earlier cell.
try:
    tokenizer
except NameError:
    from transformers import AutoTokenizer
    BASE_MODEL = "unsloth/smollm2-135m"
    MAX_PROMPT_LEN, MAX_COMPLETION_LEN = 256, 256
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.model_max_length = MAX_PROMPT_LEN + MAX_COMPLETION_LEN

# First 2000 rows of the HH-RLHF train split.
raw = load_dataset("Anthropic/hh-rlhf", split="train[:2000]")

def add_prompt(row):
    """Add a short ``prompt`` field taken from the first non-blank line of ``chosen``.

    Leading whitespace and blank lines are skipped before the first line is taken.
    A transcript that begins with newlines (HH-RLHF transcripts appear to start with
    blank lines before the first "Human:" turn — TODO confirm) would otherwise give
    an empty first line, and every row would receive the generic fallback prompt.

    The prompt is capped at 200 characters; ``chosen``/``rejected`` are left unchanged.

    Returns:
        The same row, with ``row["prompt"]`` set.
    """
    ch = row["chosen"] or ""
    first = ch.lstrip().split("\n", 1)[0].strip()
    row["prompt"] = first[:200] if first else "Please respond helpfully."
    return row

ds_text = raw.map(add_prompt).select_columns(["prompt","chosen","rejected"])

def _truncate_to(text, max_len):
    """Clip ``text`` to at most ``max_len`` tokens and return it as a string.

    The text is tokenized without special tokens (truncating at ``max_len``) and
    the kept ids are decoded back; uses the notebook-level ``tokenizer``.
    """
    encoding = tokenizer(
        text,
        add_special_tokens=False,
        max_length=max_len,
        truncation=True,
    )
    kept_ids = encoding["input_ids"]
    return tokenizer.decode(kept_ids, skip_special_tokens=True)

def clip_all(row):
    """Truncate ``prompt``, ``chosen`` and ``rejected`` to their token budgets, in place.

    ``prompt`` is capped at ``MAX_PROMPT_LEN`` and both completions at
    ``MAX_COMPLETION_LEN``. Returns the same row for use with ``Dataset.map``.
    """
    budgets = (
        ("prompt", MAX_PROMPT_LEN),
        ("chosen", MAX_COMPLETION_LEN),
        ("rejected", MAX_COMPLETION_LEN),
    )
    for field, limit in budgets:
        row[field] = _truncate_to(row[field], limit)
    return row

ds_text = ds_text.map(clip_all, desc="Clipping prompt/chosen/rejected")
print("‚úÖ ds_text columns:", ds_text.column_names, "| size:", len(ds_text))


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

BASE_MODEL = "unsloth/smollm2-135m"

# ⬇️ key change: use_fast=False
# NOTE(review): unlike the other loading cells, tokenizer.model_max_length is not
# set here — confirm whether the default is acceptable downstream.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Plain fp16 weights (no quantization, no LoRA wrapper); this replaces any `model`
# already in memory.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False


In [None]:
from datasets import load_dataset

# Small subset of HH-RLHF dataset
raw = load_dataset("Anthropic/hh-rlhf", split="train[:2000]")

# Add a short "prompt" extracted from the chosen text
def add_prompt(row):
    """Add a short ``prompt`` field taken from the first non-blank line of ``chosen``.

    Leading whitespace and blank lines are skipped before the first line is taken.
    A transcript that begins with newlines (HH-RLHF transcripts appear to start with
    blank lines before the first "Human:" turn — TODO confirm) would otherwise give
    an empty first line, and every row would receive the generic fallback prompt.

    The prompt is capped at 200 characters; ``chosen``/``rejected`` are left unchanged.

    Returns:
        The same row, with ``row["prompt"]`` set.
    """
    ch = row["chosen"] or ""
    first = ch.lstrip().split("\n", 1)[0].strip()
    row["prompt"] = first[:200] if first else "Please respond helpfully."
    return row

ds = raw.map(add_prompt).select_columns(["prompt", "chosen", "rejected"])
print("‚úÖ Dataset columns:", ds.column_names)
print("Example:\n", ds[0])


In [None]:
import torch

MAX_PROMPT_LEN, MAX_COMPLETION_LEN = 256, 256  # keep same as earlier

def dpo_collator(features):
    """Collate preference examples into separately tokenized, padded batches.

    Each of the ``prompt``, ``chosen`` and ``rejected`` fields is gathered across
    ``features`` and passed to the module-level ``tokenizer`` with padding and
    truncation enabled and ``return_tensors="pt"``. Prompts use
    ``MAX_PROMPT_LEN`` as ``max_length``; both completions use
    ``MAX_COMPLETION_LEN``.

    Args:
        features: Sequence of mappings, each with ``"prompt"``, ``"chosen"`` and
            ``"rejected"`` entries.

    Returns:
        Dict holding ``<field>_input_ids`` and ``<field>_attention_mask`` for
        each of the three fields.

    NOTE(review): the key names are meant to match what DPOTrainer consumes when
    it does no tokenization itself (per the original author's note) - confirm
    against the installed TRL version.
    """
    def _encode(field, budget):
        # One tokenizer call per field, so each field is padded independently.
        texts = [example[field] for example in features]
        return tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=budget,
            return_tensors="pt",
        )

    prompt_enc = _encode("prompt", MAX_PROMPT_LEN)
    chosen_enc = _encode("chosen", MAX_COMPLETION_LEN)
    rejected_enc = _encode("rejected", MAX_COMPLETION_LEN)

    batch = {}
    batch["prompt_input_ids"] = prompt_enc["input_ids"]
    batch["prompt_attention_mask"] = prompt_enc["attention_mask"]
    batch["chosen_input_ids"] = chosen_enc["input_ids"]
    batch["chosen_attention_mask"] = chosen_enc["attention_mask"]
    batch["rejected_input_ids"] = rejected_enc["input_ids"]
    batch["rejected_attention_mask"] = rejected_enc["attention_mask"]
    return batch


In [None]:
import torch
from trl import DPOTrainer
from transformers import TrainingArguments

# --- if you *don't* already have dpo_cfg, uncomment this minimal one ---
# dpo_cfg = TrainingArguments(
#     output_dir="outputs_dpo_smollm2",
#     per_device_train_batch_size=4,   # if OOM: 2
#     gradient_accumulation_steps=1,
#     num_train_epochs=1,
#     logging_steps=10,
#     save_steps=50,
#     learning_rate=5e-6,
#     fp16=False, bf16=False,          # we disable AMP
#     report_to=[],
#     remove_unused_columns=False,
#     optim="adamw_torch",
# )

MAX_PROMPT_LEN = 256
MAX_COMPLETION_LEN = 256

# Cast the policy to fp32 *before* the trainer exists. No ref_model is passed
# below, so DPOTrainer is expected to derive its frozen reference model from
# `model` at construction time; casting only afterwards would upgrade the policy
# but leave that reference copy in whatever dtype the checkpoint was loaded in.
model.to(torch.float32)

trainer = DPOTrainer(
    model=model,
    args=dpo_cfg,              # your TrainingArguments from before (fp16=False)
    train_dataset=ds,          # columns: prompt / chosen / rejected
    tokenizer=tokenizer,
    max_prompt_length=MAX_PROMPT_LEN,
    max_target_length=MAX_COMPLETION_LEN,
    max_length=MAX_PROMPT_LEN + MAX_COMPLETION_LEN,
    precompute_ref_log_probs=False,
)

# ------------------ HARD DISABLE GradScaler / AMP in Accelerate ------------------
# Some Accelerate/Trainer combos still build a GradScaler. We disable it safely.
acc = trainer.accelerator

# 1) Turn off the scaler if present
if getattr(acc, "scaler", None) is not None:
    try:
        # Preferred: mark scaler as disabled (private attribute, hence the guard)
        acc.scaler._enabled = False
    except Exception:
        # Fallback: remove reference
        acc.scaler = None

# 2) No-op unscale to avoid "Attempting to unscale FP16 gradients."
def _noop_unscale(*args, **kwargs):
    """Stand-in for Accelerator.unscale_gradients that does nothing."""
    return None
acc.unscale_gradients = _noop_unscale

# 3) Make sure no autocast sneaks in. This stays best-effort (the call's
#    signature differs between torch releases), but a failure is now reported
#    instead of being swallowed silently.
try:
    torch.set_autocast_enabled(False)
except Exception as exc:
    print("Skipping torch.set_autocast_enabled(False):", exc)

# 4) The fp32 cast already happened above, before the trainer was built.

print("üöÄ Starting DPO training (AMP force-disabled)‚Ä¶")
trainer.train()


In [None]:
import torch
from transformers import TextStreamer

# Sampling options forwarded to every generate() call made by chat().
gen_kwargs = {
    "max_new_tokens": 200,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.05,
    # Pad with the tokenizer's EOS id.
    "pad_token_id": tokenizer.eos_token_id,
}

def chat(prompt):
    """Generate a reply to ``prompt`` with the trained model, streaming the output.

    The prompt is tokenized with the module-level ``tokenizer``, moved to the
    device of ``trainer.model``, and passed to ``generate`` together with a
    ``TextStreamer`` (prompt and special tokens skipped) and the shared
    ``gen_kwargs``. Generation runs under ``torch.no_grad()``.

    Args:
        prompt: Text to send to the model.

    Returns:
        None. The generated ids are not kept; the text is expected to reach
        stdout through the streamer, followed by a printed ``"\\n"``.
    """
    policy = trainer.model
    encoded = tokenizer(prompt, return_tensors="pt").to(policy.device)
    live_printer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    with torch.no_grad():
        policy.generate(**encoded, streamer=live_printer, **gen_kwargs)
    print("\n")

# Prompts used for a quick qualitative look at the trained model.
test_prompts = [
    "Explain what DPO (Direct Preference Optimization) is in 2-3 sentences.",
    "Write a short, friendly email to thank a mentor for their help.",
]

# Echo each prompt, then hand it to chat() for a reply.
for p in test_prompts:
    print("### PROMPT:", p)
    chat(p)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, copy

BASE_MODEL = "unsloth/smollm2-135m"  # the same one you trained from

# Reuse the tokenizer object that is already loaded.
# NOTE(review): assumes it was loaded from this same checkpoint - confirm.
base_tok = tokenizer

# Pick the device at runtime instead of hard-coding "cuda", so this cell also
# runs on a machine where no GPU is visible. With a GPU present it is unchanged.
base_device = "cuda" if torch.cuda.is_available() else "cpu"
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float32).to(base_device)

def generate(model, tok, prompt):
    """Sample one completion for ``prompt`` from ``model`` and return it as text.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tok: Tokenizer used to encode the prompt and decode the output; its
            ``eos_token_id`` is passed as the pad token id.
        prompt: Text to complete.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    encoded = tok(prompt, return_tensors="pt").to(model.device)
    # Fixed sampling settings for this helper.
    sampling = {
        "max_new_tokens": 150,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "pad_token_id": tok.eos_token_id,
    }
    sequences = model.generate(**encoded, **sampling)
    first_sequence = sequences[0]
    return tok.decode(first_sequence, skip_special_tokens=True)

cmp_prompt = "Give three tips to stay focused while studying."
CMP_SEED = 0  # fixed seed so the side-by-side comparison can be reproduced

# generate() samples (do_sample=True). Re-seeding before each call makes the
# cell reproducible and starts both models from the same random state, so
# differences in the text come from the weights rather than from sampling luck.
torch.manual_seed(CMP_SEED)
print("----- BASE -----")
print(generate(base_model, base_tok, cmp_prompt))

torch.manual_seed(CMP_SEED)
print("\n----- DPO-TUNED -----")
print(generate(trainer.model, tokenizer, cmp_prompt))


In [None]:
# Write the trained model and the tokenizer into one output directory.
SAVE_DIR = "dpo_smollm2_final"
trainer.save_model(SAVE_DIR)           # model saved through the trainer
tokenizer.save_pretrained(SAVE_DIR)    # tokenizer files go to the same directory
print("Saved to:", SAVE_DIR)
