In [1]:
!pip -q install "trl>=0.20.0" "peft>=0.17.0" "transformers>=4.55.0" datasets

import torch, random, os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Mxfp4Config
from peft import LoraConfig
from trl import ORPOTrainer, ORPOConfig

jsonl_path = "/content/orpo_pairs_best_contact.jsonl"   # {"prompt","chosen","rejected"}
output_dir = "/content/oss20b-orpo"
model_name = "openai/gpt-oss-20b"
seed = 7

random.seed(seed); torch.manual_seed(seed)

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token

quantization_config = Mxfp4Config(dequantize=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,  # MXFP4 → dequantize to bf16 for LoRA
    use_cache=False,
    device_map="auto",
)

peft_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules="all-linear",  # works with oss-20b; expand if you later target MoE experts
    bias="none", task_type="CAUSAL_LM"
)

ds = load_dataset("json", data_files=jsonl_path, split="train")

args = ORPOConfig(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    logging_steps=20,
    save_steps=500,
    beta=0.1,
    gradient_checkpointing=True,
    bf16=True, tf32=True,
    report_to=[],
)

trainer = ORPOTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=ds,
)

trainer.train()
trainer.save_model(os.path.join(output_dir, "checkpoint-final"))
tokenizer.save_pretrained(os.path.join(output_dir, "checkpoint-final"))

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/564.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m471.0/564.7 kB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00000-of-00002.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.17G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.80G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.98 GiB. GPU 0 has a total capacity of 14.74 GiB of which 1.97 GiB is free. Process 5423 has 12.77 GiB memory in use. Of the allocated memory 10.15 GiB is allocated by PyTorch, and 2.51 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

use_4bit = True
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
)

cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
quant_in_repo = getattr(cfg, "quantization_config", None)
quant_name = getattr(getattr(quant_in_repo, "__class__", None), "__name__", "None")

if quant_name in ("None", "NoneType"):
    # safe: repo is not pre-quantized → use bnb 4-bit or full precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config if use_4bit else None,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    )
elif quant_name == "BitsAndBytesConfig":
    # repo already set up for bnb; don’t pass your own config
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    )
else:
    # e.g., Mxfp4Config/AwqConfig/GptqConfig (pre-quantized). Not suitable for LoRA training.
    raise ValueError(f"Repo is pre-quantized with {quant_name}. Choose a non-quantized base (e.g., Qwen2.5-3B-Instruct or Mistral-7B-Instruct) for ORPO LoRA.")

ValueError: Repo is pre-quantized with dict. Choose a non-quantized base (e.g., Qwen2.5-3B-Instruct or Mistral-7B-Instruct) for ORPO LoRA.