In [5]:
# 📦 Install Unsloth and dependencies PROPERLY
%%capture
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [6]:
# load model
from unsloth import FastLanguageModel, PatchDPOTrainer
import torch

# Important: Patch DPO trainer for Unsloth compatibility
PatchDPOTrainer()

# --- Load model ---
max_seq_length = 4096
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",  # 🆕 Different model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.4.1: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [7]:
# add LORA
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 42,
    use_rslora = False,
    loftq_config = None,
)

In [8]:
# dataset
from datasets import load_dataset

# 🆕 Use a different dataset: OpenAssistant's 200k
dataset = load_dataset("OpenAssistant/oasst1", split="train")
dataset = dataset.select(range(500))  # 🔥 500 samples for a fast Colab demo

# --- Format Dataset for DPO ---
EOS_TOKEN = tokenizer.eos_token

def format_prompt(sample):
    instruction = sample["prompt"] if "prompt" in sample else sample["text"]  # fallback
    input_text = ""
    accepted = sample["text"]
    rejected = "I'm sorry, but I can't help with that."  # Dummy negative

    sample["prompt"] = f"Below is an instruction. Complete it.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n"
    sample["chosen"] = accepted + EOS_TOKEN
    sample["rejected"] = rejected + EOS_TOKEN
    return sample

dataset = dataset.map(format_prompt)
dataset = dataset.remove_columns([col for col in dataset.column_names if col not in ["prompt", "chosen", "rejected"]])

# --- Check sample ---
import pprint
pprint.pprint(dataset[0])

README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'chosen': 'Can you write a short introduction about the relevance of the term '
           '"monopsony" in economics? Please use examples related to potential '
           'monopsonies in the labour market and cite relevant research.</s>',
 'prompt': 'Below is an instruction. Complete it.\n'
           '\n'
           '### Instruction:\n'
           'Can you write a short introduction about the relevance of the term '
           '"monopsony" in economics? Please use examples related to potential '
           'monopsonies in the labour market and cite relevant research.\n'
           '\n'
           '### Input:\n'
           '\n'
           '\n'
           '### Response:\n',
 'rejected': "I'm sorry, but I can't help with that.</s>"}


In [9]:
# train with orpo
from trl import DPOTrainer, DPOConfig
from unsloth import is_bfloat16_supported

trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = DPOConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_ratio = 0.1,
        num_train_epochs = 1,  # 🔥 Adjust as needed
        learning_rate = 5e-6,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.0,
        lr_scheduler_type = "linear",
        seed = 42,
        output_dir = "outputs",
        report_to = "none",
    ),
    beta = 0.1,
    train_dataset = dataset,
    tokenizer = tokenizer,
    max_length = 1024,
    max_prompt_length = 512,
)

trainer.train()

Extracting prompt in train dataset (num_proc=12):   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=12):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=12):   0%|          | 0/500 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 62
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 83,886,080/7,000,000,000 (1.20% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.6931,0.0,0.0,0.0,0.0,-89.980812,-91.749756,-2.140055,-2.540161,0,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-78.720428,-93.605804,-1.9393,-2.460853,No Log,No Log,No Log,No Log
3,0.6931,0.009004,0.00823,0.375,0.000774,-75.937157,-97.214111,-2.303941,-2.571723,No Log,No Log,No Log,No Log
4,0.6711,0.025328,-0.02014,0.75,0.045468,-50.049755,-86.55481,-2.397795,-2.615207,No Log,No Log,No Log,No Log
5,0.6762,-0.005252,-0.039715,0.875,0.034463,-55.78981,-99.438599,-2.243343,-2.542661,No Log,No Log,No Log,No Log
6,0.6546,0.017451,-0.061454,1.0,0.078905,-58.722321,-87.987938,-2.081402,-2.568997,No Log,No Log,No Log,No Log
7,0.6049,0.05677,-0.128867,1.0,0.185637,-77.092896,-105.220428,-1.552527,-2.548612,No Log,No Log,No Log,No Log
8,0.5626,0.10423,-0.182141,0.875,0.286371,-138.085098,-79.871201,-2.380984,,No Log,No Log,No Log,No Log
9,0.5299,0.074952,-0.296856,0.875,0.371807,-131.768051,-75.275101,-2.289973,-2.705167,No Log,No Log,No Log,No Log
10,0.4,0.193581,-0.542943,1.0,0.736523,-72.919884,-104.112877,-1.672633,-2.492047,No Log,No Log,No Log,No Log


TrainOutput(global_step=62, training_loss=0.14335419213515002, metrics={'train_runtime': 223.2164, 'train_samples_per_second': 2.24, 'train_steps_per_second': 0.278, 'total_flos': 0.0, 'train_loss': 0.14335419213515002, 'epoch': 0.992})

In [10]:
# inference
FastLanguageModel.for_inference(model)

test_prompt = "Explain why rainbows are curved."

inputs = tokenizer(
    [
        f"Below is an instruction. Complete it.\n\n### Instruction:\n{test_prompt}\n\n### Input:\n\n### Response:\n"
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Below is an instruction. Complete it.\n\n### Instruction:\nExplain why rainbows are curved.\n\n### Input:\n\n### Response:\n\nRainbows are curved because they are formed by refracting light in water droplets. When sunlight enters a water droplet, it is refracted, or bent, and separates into its component colors due to differing wavelengths. This separation of light is known as dispersion. The light then reflects off the inside surface of the droplet and refracts again as it exits the droplet. The curvature of rainbows results from the']


In [11]:
# save model
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Save merged model in 16bit
model.save_pretrained_merged("merged_model", tokenizer, save_method="merged_16bit")

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 4.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 53.6 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 52.81it/s]


Unsloth: Saving tokenizer... Done.
Done.
