In [1]:
import gc
import os
import warnings

import torch
# Unsloth stays ahead of transformers / peft / trl, matching the original import order.
from unsloth import FastLanguageModel

import transformers
from datasets import load_dataset
from peft import LoraConfig
from trl import DPOTrainer, DPOConfig, SFTConfig, SFTTrainer
import bitsandbytes as bnb

from tqdm import tqdm
import numpy as np


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library usage/deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-13 17:28:08 [__init__.py:239] Automatically detected platform cuda.


In [None]:
# Sequence length and LoRA rank reused by the loading and adapter calls below.
max_seq_length = 2048
lora_rank = 32

# Load Qwen2.5-3B-Instruct in 4-bit through Unsloth, together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-3B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.8,
)

# Wrap the model with LoRA adapters on the attention projections (q/k/v/o)
# and the MLP projections (gate/up/down); alpha is set equal to the rank.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    lora_alpha=lora_rank,
    target_modules=[
        f"{prefix}_proj"
        for prefix in ("q", "k", "v", "o", "gate", "up", "down")
    ],
    use_gradient_checkpointing="unsloth",
    random_state=2811,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 05-13 15:00:59 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.4.1: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.1 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [3]:
# Load the "train" split of the Vietnamese multi-turn chat dataset used for SFT.
dataset = load_dataset("5CD-AI/Vietnamese-Multi-turn-Chat-Alpaca", split="train")

In [4]:
# Display the dataset summary together with the first raw example.
dataset, dataset[0]

(Dataset({
     features: ['conversations', 'id'],
     num_rows: 12697
 }),
 {'conversations': [{'from': 'human',
    'value': 'H√£y ch·ªânh s·ª≠a c√¢u n√†y ƒë·ªÉ ng·∫Øn g·ªçn h∆°n m√† kh√¥ng m·∫•t ƒëi √Ω nghƒ©a: "Tr·∫≠n ƒë·∫•u l√† m·ªôt th·∫•t b·∫°i n·∫∑ng n·ªÅ m·∫∑c d√π th·ª±c t·∫ø l√† c·∫£ ƒë·ªôi ƒë√£ t·∫≠p luy·ªán trong nhi·ªÅu tu·∫ßn."'},
   {'from': 'gpt',
    'value': 'Nhi·ªÅu tu·∫ßn hu·∫•n luy·ªán c·ªßa ƒë·ªôi ƒë√£ d·∫´n ƒë·∫øn m·ªôt th·∫•t b·∫°i n·∫∑ng n·ªÅ.'},
   {'from': 'human',
    'value': 'B·∫°n c√≥ th·ªÉ ƒë·ªÅ xu·∫•t m·ªôt s·ªë chi·∫øn l∆∞·ª£c m√† nh√≥m c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ c·∫£i thi·ªán hi·ªáu su·∫•t c·ªßa h·ªç trong tr·∫≠n ƒë·∫•u ti·∫øp theo kh√¥ng?'},
   {'from': 'gpt',
    'value': 'Ch·∫Øc ch·∫Øn, ƒë√¢y l√† m·ªôt s·ªë chi·∫øn l∆∞·ª£c m√† nh√≥m c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ c·∫£i thi·ªán hi·ªáu su·∫•t c·ªßa m√¨nh trong tr·∫≠n ƒë·∫•u ti·∫øp theo: 1. Ph√¢n t√≠ch tr·∫≠n ƒë·∫•u tr∆∞·ªõc b·∫±ng c√°ch xem l·∫°i c·∫£nh quay tr·∫≠n ƒë·∫•u ƒë·ªÉ x√°c ƒë·ªãnh ƒëi·ªÉm y·∫

## **SFT**

In [5]:
# System prompt placed at the start of every SFT conversation by convert_to_chat_format.
SYS_INSTRUCT = "B·∫°n l√† m·ªôt tr·ª£ l√Ω AI th√¢n thi·ªán, h√£y tr·∫£ l·ªùi b·∫±ng ti·∫øng Vi·ªát."

def convert_to_chat_format(conversations):
    """Turn a list of ``{"from", "value"}`` turns into chat messages.

    The result always starts with a system message carrying ``SYS_INSTRUCT``.
    A turn whose ``"from"`` is ``"human"`` becomes a ``"user"`` message; any
    other sender is labelled ``"assistant"``.

    Args:
        conversations: iterable of dicts with ``"from"`` and ``"value"`` keys.

    Returns:
        A list of ``{"role", "content"}`` dicts, system message first.
    """
    system_turn = {"role": "system", "content": SYS_INSTRUCT}
    chat_turns = [
        {
            "role": "user" if turn["from"] == "human" else "assistant",
            "content": turn["value"],
        }
        for turn in conversations
    ]
    return [system_turn, *chat_turns]

def format_prompt(example):
    """Render one dataset example as a single chat-template string.

    Args:
        example: mapping with a ``"conversations"`` entry accepted by
            ``convert_to_chat_format``.

    Returns:
        ``{"text": <rendered string>}``. The template is applied with
        ``tokenize=False`` and without a trailing generation prompt.
    """
    chat_messages = convert_to_chat_format(example["conversations"])
    rendered = tokenizer.apply_chat_template(
        chat_messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch, truncating to ``max_seq_length``.

    Args:
        examples: mapping whose ``"text"`` entry is passed to the tokenizer.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    tokenizer_options = {"truncation": True, "max_length": max_seq_length}
    return tokenizer(examples["text"], **tokenizer_options)

# Replace the raw columns with the rendered "text" column, then tokenize in batches.
# NOTE(review): `dataset` is overwritten, so re-running this cell operates on the
# already-converted data rather than the raw conversations.
dataset = dataset.map(
    format_prompt,
    remove_columns=dataset.column_names,
).map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/12697 [00:00<?, ? examples/s]

In [6]:
# Print the chat-template rendering of the first example as a sanity check.
print(dataset[0]['text'])

<|im_start|>system
B·∫°n l√† m·ªôt tr·ª£ l√Ω AI th√¢n thi·ªán, h√£y tr·∫£ l·ªùi b·∫±ng ti·∫øng Vi·ªát.<|im_end|>
<|im_start|>user
H√£y ch·ªânh s·ª≠a c√¢u n√†y ƒë·ªÉ ng·∫Øn g·ªçn h∆°n m√† kh√¥ng m·∫•t ƒëi √Ω nghƒ©a: "Tr·∫≠n ƒë·∫•u l√† m·ªôt th·∫•t b·∫°i n·∫∑ng n·ªÅ m·∫∑c d√π th·ª±c t·∫ø l√† c·∫£ ƒë·ªôi ƒë√£ t·∫≠p luy·ªán trong nhi·ªÅu tu·∫ßn."<|im_end|>
<|im_start|>assistant
Nhi·ªÅu tu·∫ßn hu·∫•n luy·ªán c·ªßa ƒë·ªôi ƒë√£ d·∫´n ƒë·∫øn m·ªôt th·∫•t b·∫°i n·∫∑ng n·ªÅ.<|im_end|>
<|im_start|>user
B·∫°n c√≥ th·ªÉ ƒë·ªÅ xu·∫•t m·ªôt s·ªë chi·∫øn l∆∞·ª£c m√† nh√≥m c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ c·∫£i thi·ªán hi·ªáu su·∫•t c·ªßa h·ªç trong tr·∫≠n ƒë·∫•u ti·∫øp theo kh√¥ng?<|im_end|>
<|im_start|>assistant
Ch·∫Øc ch·∫Øn, ƒë√¢y l√† m·ªôt s·ªë chi·∫øn l∆∞·ª£c m√† nh√≥m c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ c·∫£i thi·ªán hi·ªáu su·∫•t c·ªßa m√¨nh trong tr·∫≠n ƒë·∫•u ti·∫øp theo: 1. Ph√¢n t√≠ch tr·∫≠n ƒë·∫•u tr∆∞·ªõc b·∫±ng c√°ch xem l·∫°i c·∫£nh quay tr·∫≠n ƒë·∫•u ƒë·ªÉ x√°c ƒë·ªãnh ƒëi·ªÉm y·∫øu v√† c√°c lƒ©nh 

In [7]:
# Data collator for causal-LM training (mlm=False: no masked-language-model objective).
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [8]:
# Training arguments for the SFT run; they are expanded into SFTConfig later.
hyperparameters = dict(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    learning_rate=3e-5,
    logging_steps=100,
    num_train_epochs=1,
    save_strategy="steps",
    overwrite_output_dir=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.05,
    bf16=True,
    report_to="none",
)
# Maximum sequence length handed to the SFT config.
MAX_LENGTH = 2048

In [9]:
import os
# Set the UNSLOTH_RETURN_LOGITS flag before training starts.
# NOTE(review): presumably makes Unsloth's patched forward pass return logits — confirm against the Unsloth docs.
os.environ['UNSLOTH_RETURN_LOGITS'] = '1'

In [10]:
# Directory that receives the SFT checkpoints (plain string: there is nothing to interpolate).
SFT_OUTPUT_DIR = "Qwen2.5-3B-Instruct-multi-conversation"

In [11]:
# Shared hyperparameters plus the SFT-only options (output directory,
# sequence length, packing); the explicit keywords win on any key clash.
sft_config = SFTConfig(
    **dict(
        hyperparameters,
        output_dir=SFT_OUTPUT_DIR,
        max_seq_length=MAX_LENGTH,
        packing=True,
    )
)

# Supervised fine-tuning on the prepared chat dataset.
sft_trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

sft_trainer.train()

Converting train dataset to ChatML (num_proc=20):   0%|          | 0/12697 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=20):   0%|          | 0/12697 [00:00<?, ? examples/s]

Packing train dataset (num_proc=20):   0%|          | 0/12697 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,354 | Num Epochs = 1 | Total steps = 397
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 59,867,136/3,000,000,000 (2.00% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
100,1.3574
200,1.2292
300,1.2072


TrainOutput(global_step=397, training_loss=1.2482317677072674, metrics={'train_runtime': 4142.437, 'train_samples_per_second': 1.534, 'train_steps_per_second': 0.096, 'total_flos': 2.212532916804649e+17, 'train_loss': 1.2482317677072674})

In [45]:
# Collect unreachable Python objects first so tensors they still hold are freed,
# then release PyTorch's cached, unused GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload model and tokenizer from the last SFT checkpoint directory (step 397),
# again in 4-bit with bfloat16 as the dtype.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="./Qwen2.5-3B-Instruct-multi-conversation/checkpoint-397",
    dtype=torch.bfloat16,
    load_in_4bit=True,
    max_seq_length=2048,
)

==((====))==  Unsloth 2025.4.1: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Hub repository that receives both the merged SFT weights and the tokenizer.
sft_repo_id = "binhphap5/Qwen2.5-3B-Instruct-Chat-sft"

# Read the access token from the environment instead of embedding it in the notebook.
# NOTE(review): when HF_TOKEN is unset this passes None — presumably the Hub client
# then falls back to a cached `huggingface-cli login`; confirm before relying on it.
hf_token = os.environ.get("HF_TOKEN")

# Merge the LoRA weights into the base model and upload the result.
model.push_to_hub_merged(
    sft_repo_id,
    commit_message="Merge lora weights to push to hub",
    token=hf_token,
)

# Upload the tokenizer to the same repository.
tokenizer.push_to_hub(
    sft_repo_id,
    commit_message="Push tokenizer to hub",
    token=hf_token,
)

In [50]:
from transformers import GenerationConfig

# Sampling settings for the smoke-test generation.
# do_sample=True is set explicitly: without it generation is greedy and the
# temperature / top_p / top_k values below are ignored.
generation_config = GenerationConfig(
    max_new_tokens=128,
    do_sample=True,
    temperature=0.8,
    top_p=0.95,
    top_k=10,
    # The EOS token doubles as the padding token for generation.
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Multi-turn test conversation: system prompt, one completed user/assistant
# exchange, and a final user question for the model to answer.
prompt = [
    dict(
        role="system",
        content=(
            "B·∫°n l√† m·ªôt tr·ª£ l√Ω AI th√¢n thi·ªán, "
            "h√£y tr·∫£ l·ªùi b·∫±ng ti·∫øng Vi·ªát."
        ),
    ),
    dict(
        role="user",
        content=(
            "H√£y ch·ªânh s·ª≠a c√¢u n√†y ƒë·ªÉ ng·∫Øn g·ªçn h∆°n m√† kh√¥ng m·∫•t ƒëi √Ω nghƒ©a: "
            "\"Tr·∫≠n ƒë·∫•u l√† m·ªôt th·∫•t b·∫°i n·∫∑ng n·ªÅ "
            "m·∫∑c d√π th·ª±c t·∫ø l√† c·∫£ ƒë·ªôi ƒë√£ t·∫≠p luy·ªán trong nhi·ªÅu tu·∫ßn.\""
        ),
    ),
    dict(
        role="assistant",
        content=(
            "Nhi·ªÅu tu·∫ßn hu·∫•n luy·ªán c·ªßa ƒë·ªôi ƒë√£ d·∫´n ƒë·∫øn m·ªôt th·∫•t b·∫°i n·∫∑ng n·ªÅ."
        ),
    ),
    dict(
        role="user",
        content=(
            "B·∫°n c√≥ th·ªÉ ƒë·ªÅ xu·∫•t m·ªôt s·ªë chi·∫øn l∆∞·ª£c m√† "
            "nh√≥m c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ c·∫£i thi·ªán hi·ªáu su·∫•t "
            "c·ªßa h·ªç trong tr·∫≠n ƒë·∫•u ti·∫øp theo kh√¥ng?"
        ),
    ),
]

# Render the conversation as text, ending with the generation prompt so the
# model continues with a new assistant turn.
chat_text = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt=True,
    tokenize=False
)

# Tokenize and move the tensors to the model's own device rather than a hard-coded one.
inputs = tokenizer(
    chat_text,
    return_tensors="pt"
).to(model.device)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        generation_config=generation_config,
    )

# Decode only the tokens produced after the prompt. Slicing by prompt length
# (as get_model_response does) avoids splitting the text on "assistant\n",
# which breaks whenever the reply itself contains that substring.
prompt_length = inputs["input_ids"].shape[1]
output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
answer = output_text.strip()

print("Assistant reply:", answer)

Assistant reply: Ch·∫Øc ch·∫Øn! D∆∞·ªõi ƒë√¢y l√† m·ªôt s·ªë chi·∫øn l∆∞·ª£c m√† nh√≥m c√≥ th·ªÉ s·ª≠ d·ª•ng ƒë·ªÉ c·∫£i thi·ªán hi·ªáu su·∫•t c·ªßa h·ªç trong tr·∫≠n ƒë·∫•u ti·∫øp theo: 1. X√°c ƒë·ªãnh ƒëi·ªÉm y·∫øu: ƒê√°nh gi√° l·∫°i tr·∫≠n ƒë·∫•u v√† x√°c ƒë·ªãnh nh·ªØng ƒëi·ªÉm y·∫øu c·ªßa nh√≥m. ƒêi·ªÅu n√†y c√≥ th·ªÉ gi√∫p nh√≥m x√°c ƒë·ªãnh nh·ªØng ƒëi·ªÉm c·∫ßn c·∫£i thi·ªán v√† t·∫≠p trung v√†o nh·ªØng ƒëi·ªÉm ƒë√≥ trong tr·∫≠n ƒë·∫•u ti·∫øp theo. 2. TƒÉng c∆∞·ªùng t·∫≠p luy·ªán: ƒê·∫∑t ra m·ª•c ti√™u t·∫≠p luy·ªán c·ª• th·ªÉ v√† t·∫≠p trung v√†o nh·ªØng k·ªπ nƒÉng c·ª• th·ªÉ m√† nh√≥m c·∫ßn c·∫£i thi·ªán. ƒêi·ªÅu n√†y c√≥ th·ªÉ bao g·ªìm vi·ªác t·∫≠p luy·ªán


## **DPO**

In [2]:
# Load the Vietnamese preference dataset for DPO; no split is selected here
# (the "train" split is indexed later).
dataset = load_dataset("thainq107/Vi-Alpaca-Preference")

In [3]:
# Start DPO from the merged SFT model on the Hub, loaded in 4-bit with bfloat16.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="binhphap5/Qwen2.5-3B-Instruct-Chat-sft",
    dtype=torch.bfloat16,
    load_in_4bit=True,
    max_seq_length=2048,
)

# Fresh LoRA adapters for the DPO stage; alpha is set equal to the rank.
lora_rank = 16
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    lora_alpha=lora_rank,
    target_modules=[
        f"{prefix}_proj"
        for prefix in ("q", "k", "v", "o", "gate", "up", "down")
    ],
    use_gradient_checkpointing="unsloth",
    random_state=2811,
)

==((====))==  Unsloth 2025.4.1: Fast Qwen2 patching. Transformers: 4.51.3. vLLM: 0.8.4.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth 2025.4.1 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [4]:
from unsloth import PatchDPOTrainer

# Apply Unsloth's DPO trainer patch before the trainer is constructed.
# NOTE(review): DPOTrainer is already imported from trl in the first cell — confirm
# the patch still takes effect when it is applied after that import.
PatchDPOTrainer()

In [5]:
def convert_to_conversational_preference_format(example):
    """Reshape one preference record into conversational prompt/chosen/rejected lists.

    Args:
        example: mapping with ``"id"``, ``"question"``, ``"chosen"`` and
            ``"rejected"`` entries.

    Returns:
        A dict with the original ``"id"``, a ``"prompt"`` made of a fixed system
        message followed by the question as a user message, and ``"chosen"`` /
        ``"rejected"`` each wrapped as a single assistant message.
    """
    def _assistant_reply(text):
        return [{"role": "assistant", "content": text}]

    converted = {"id": example["id"]}
    converted["prompt"] = [
        {
            "role": "system",
            "content": "B·∫°n l√† m·ªôt tr·ª£ l√Ω AI th√¢n thi·ªán, h√£y tr·∫£ l·ªùi b·∫±ng ti·∫øng Vi·ªát.",
        },
        {"role": "user", "content": example["question"]},
    ]
    converted["chosen"] = _assistant_reply(example["chosen"])
    converted["rejected"] = _assistant_reply(example["rejected"])
    return converted

# Convert every preference record to the conversational format; the result is
# indexed by split later on (dpo_dataset['train']).
dpo_dataset = dataset.map(convert_to_conversational_preference_format)


In [15]:
def get_model_response(model, tokenizer, instruction, max_new_tokens=1000):
    """Generate the model's reply to a single user instruction.

    The instruction is wrapped in a two-message conversation (fixed system
    prompt + user turn), rendered to token ids with the tokenizer's chat
    template, and passed to ``model.generate``. Only the tokens produced after
    the prompt are decoded.

    Args:
        model: object exposing ``device`` and ``generate(...)``.
        tokenizer: object exposing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        instruction: user message text.
        max_new_tokens: upper bound on generated tokens (default 1000, the
            value that was previously hard-coded).

    Returns:
        The decoded reply, with special tokens skipped.
    """
    conversation = [
        {"role": "system", "content": "B·∫°n l√† m·ªôt tr·ª£ l√Ω AI th√¢n thi·ªán, h√£y tr·∫£ l·ªùi b·∫±ng ti·∫øng Vi·ªát."},
        {"role": "user", "content": instruction},
    ]
    prompt_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=True
    )
    # Batch of one; ids are created directly on the model's device.
    input_ids = torch.tensor([prompt_ids], dtype=torch.long, device=model.device)
    output_ids = model.generate(
        input_ids=input_ids,
        # Integer mask with the same shape/device as the ids (was a float tensor).
        attention_mask=torch.ones_like(input_ids),
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the echoed prompt so only the newly generated tokens are decoded.
    generated_ids = output_ids[0][len(prompt_ids):]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

# Generate a reply to one sample instruction and print both.
# NOTE(review): this cell's execution count (15) is out of sequence with its
# neighbours, so which model state produced the saved output is unclear.
instruction = "T·∫°o m·ªôt meme b·∫±ng c√°ch s·ª≠ d·ª•ng c·ª•m t·ª´ 'th·ªùi trang mu·ªôn'."
response = get_model_response(model, tokenizer, instruction)
print(f"Instruction: {instruction}")
print(f"Response: {response}")

Instruction: T·∫°o m·ªôt meme b·∫±ng c√°ch s·ª≠ d·ª•ng c·ª•m t·ª´ 'th·ªùi trang mu·ªôn'.
Response: T·∫°o meme v·ªõi c√¢u "th·ªùi trang mu·ªôn" kh√¥ng kh√≥, nh∆∞ng t√¥i kh√¥ng th·ªÉ t·∫°o h√¨nh ·∫£nh hay video meme ƒë∆∞·ª£c v√¨ t√¥i l√† tr·ª£ l√Ω AI d·ª±a tr√™n vƒÉn b·∫£n. Tuy nhi√™n, t√¥i c√≥ th·ªÉ m√¥ t·∫£ cho b·∫°n m·ªôt meme c√≥ th·ªÉ l√†m t·ª´ c·ª•m t·ª´ n√†y:

Meme: 
- H√¨nh n·ªÅn l√† m·ªôt ng∆∞·ªùi m·∫∑c qu·∫ßn √°o th·ªùi trang hi·ªán ƒë·∫°i (v√≠ d·ª•: √°o kho√°c da, qu·∫ßn skinny jeans, gi√†y cao g√≥t).
- Trong g√≥c tr√°i c·ªßa h√¨nh n·ªÅn, xu·∫•t hi·ªán m·ªôt ng∆∞·ªùi kh√°c ƒëang m·∫∑c ƒë·ªì th·ªùi trang th·∫≠p ni√™n 90 (v√≠ d·ª•: √°o s∆° mi r·ªông, qu·∫ßn jean looser, gi√†y cao c·ªï).
- C√¢u meme "Th·ªùi trang mu·ªôn" ƒë∆∞·ª£c in to v√† n·ªïi b·∫≠t ·ªü gi·ªØa hai ng∆∞·ªùi.

Meme n√†y s·∫Ω n√≥i l√™n √Ω nghƒ©a r·∫±ng m·∫∑c th·ªùi trang mu·ªôn t·ª©c l√† m·∫∑c nh·ªØng th·ª© ƒë√£ l·ªói th·ªùi so v·ªõi xu h∆∞·ªõng hi·ªán t·∫°i. 

B·∫°n c√≥ mu·ªën t√¥i v·∫Ω ra h√¨nh ·∫£nh n√†y kh√¥ng

In [6]:
# Inspect one converted training example (index 1000).
dpo_dataset['train'][1000]

{'id': 'alpaca-14472',
 'question': 'T·∫°o m·ªôt meme b·∫±ng c√°ch s·ª≠ d·ª•ng c·ª•m t·ª´ "th·ªùi trang mu·ªôn".',
 'chosen': [{'content': '[Th√™m h√¨nh ·∫£nh ng∆∞·ªùi n·ªïi ti·∫øng m·∫∑c ƒë·ªì l·ªói th·ªùi, nh√¨n ch·∫≥ng ƒë·∫πp m·∫Øt, ph√π h·ª£p v·ªõi √Ω t∆∞·ªüng c·ªßa meme] \n\nH√£y tr√°nh th·ªùi trang mu·ªôn, v√¨ ƒë√¥i khi n√≥ c√≥ th·ªÉ l√† ƒëi·ªÅu t·ªìi t·ªá nh·∫•t m√† b·∫°n c√≥ th·ªÉ l√†m cho b·∫£n th√¢n m√¨nh.',
   'role': 'assistant'}],
 'rejected': [{'content': 'Xin l·ªói, nh∆∞ng v√¨ t√¥i l√† m·ªôt tr√≠ tu·ªá nh√¢n t·∫°o v√† kh√¥ng c√≥ kh·∫£ nƒÉng th·ª±c hi·ªán c√°c t√°c v·ª• ƒë·ªì h·ªça, t√¥i kh√¥ng th·ªÉ t·∫°o ra m·ªôt thi·∫øt k·∫ø √°o ph√¥ng cho b·∫°n. Tuy nhi√™n, b·∫°n c√≥ th·ªÉ tham kh·∫£o c√°c trang web thi·∫øt k·∫ø √°o ph√¥ng tr·ª±c tuy·∫øn nh∆∞ Custom Ink, Printful ho·∫∑c Teespring ƒë·ªÉ t·∫°o ra thi·∫øt k·∫ø √°o ph√¥ng c·ªßa ri√™ng b·∫°n d·ª±a tr√™n ti·ªÉu bang ho·∫∑c qu·ªëc gia c·ªßa b·∫°n. Ch√∫c may m·∫Øn!',
   'role': 'assistant'}],
 'prompt': [{'content': 'B·∫°n l√

In [7]:
# Training arguments for the DPO run; they are expanded into DPOConfig later.
# The run is capped by max_steps rather than by a number of epochs.
hyperparameters = dict(
    per_device_train_batch_size=3,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    learning_rate=3e-5,
    logging_steps=100,
    max_steps=500,
    save_strategy="steps",
    overwrite_output_dir=True,
    optim="paged_adamw_8bit",
    warmup_ratio=0.05,
    bf16=True,
)
# Maximum length handed to the DPO config.
MAX_LENGTH = 512

In [8]:
# Start a Weights & Biases run for the DPO stage (project and run name set below).
# NOTE(review): the DPO hyperparameters do not set "report_to" — presumably the
# trainer's default reporting picks this run up; confirm.
import wandb
wandb.init(
    project="vi-alpaca-preference",
    name="Qwen2.5-3B-4bit-chat-dpo"
)


[34m[1mwandb[0m: Currently logged in as: [33mtridungluong123[0m ([33mtridungluong123-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
# Directory that receives the DPO checkpoints.
DPO_OUTPUT_DIR = "Qwen2.5-3B-Instruct-chat-dpo"

# Shared hyperparameters plus the DPO-only options; the explicit keywords win
# on any key clash.
dpo_args = DPOConfig(
    **dict(
        hyperparameters,
        output_dir=DPO_OUTPUT_DIR,
        max_length=MAX_LENGTH,
    )
)

# No separate reference model is supplied (ref_model=None).
dpo_trainer = DPOTrainer(
    model,
    ref_model=None,
    args=dpo_args,
    processing_class=tokenizer,
    train_dataset=dpo_dataset["train"],
)
dpo_trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 65,017 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 3 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (3 x 8 x 1) = 24
 "-____-"     Trainable parameters = 29,933,568/3,000,000,000 (1.00% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
100,0.3533,-1.060731,-4.226553,0.7875,3.165823,-211.543243,-358.595703,,,0,0,0,0
200,0.1947,-2.453786,-10.323666,0.874583,7.86988,-227.215591,-413.57663,,-3.68035,No Log,No Log,No Log,No Log
300,0.1743,-2.679446,-11.215009,0.884583,8.535563,-234.342728,-424.946289,,,No Log,No Log,No Log,No Log
400,0.176,-2.190362,-11.647231,0.88625,9.456869,-230.944473,-433.805481,,,No Log,No Log,No Log,No Log
500,0.1621,-2.011829,-12.378806,0.889583,10.366978,-227.305801,-440.325043,,,No Log,No Log,No Log,No Log


TrainOutput(global_step=500, training_loss=0.2120833625793457, metrics={'train_runtime': 4202.3252, 'train_samples_per_second': 2.856, 'train_steps_per_second': 0.119, 'total_flos': 0.0, 'train_loss': 0.2120833625793457, 'epoch': 0.18456143588797122})

In [None]:
# Collect unreachable Python objects first so tensors they still hold are freed,
# then release PyTorch's cached, unused GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [18]:
# Re-run the same sample instruction after DPO training and print the reply.
instruction = "T·∫°o m·ªôt meme b·∫±ng c√°ch s·ª≠ d·ª•ng c·ª•m t·ª´ 'th·ªùi trang mu·ªôn'."
response = get_model_response(model, tokenizer, instruction)
print(f"Instruction: {instruction}")
print(f"Response: {response}")

Instruction: T·∫°o m·ªôt meme b·∫±ng c√°ch s·ª≠ d·ª•ng c·ª•m t·ª´ 'th·ªùi trang mu·ªôn'.
Response: T·∫°o meme v·ªõi c·ª•m t·ª´ 'th·ªùi trang mu·ªôn' kh√¥ng ph·∫£i l√† kh·∫£ thi b·∫±ng l·ªùi vƒÉn ƒë∆°n gi·∫£n v√¨ meme th∆∞·ªùng c·∫ßn h√¨nh ·∫£nh ho·∫∑c video. Tuy nhi√™n, t√¥i c√≥ th·ªÉ t·∫°o ra m·ªôt meme b·∫±ng c√°ch s·ª≠ d·ª•ng c√¢u n√†y:

[·∫¢nh: M·ªôt ng∆∞·ªùi m·∫∑c ƒë·ªì th·ªùi trang c≈©, nh∆∞ng ƒëang ƒë·ª©ng tr∆∞·ªõc g∆∞∆°ng, ph√≠a sau l√† d√≤ng ch·ªØ "Th·ªùi trang mu·ªôn".]

Meme n√†y c√≥ th·ªÉ s·∫Ω g√¢y c∆∞·ªùi khi th·∫•y m·ªôt ng∆∞·ªùi c·ªë g·∫Øng c·∫≠p nh·∫≠t th·ªùi trang nh∆∞ng ƒë√£ qu√° mu·ªôn so v·ªõi xu h∆∞·ªõng hi·ªán t·∫°i. 

Ho·∫∑c n·∫øu b·∫°n mu·ªën m·ªôt meme ng·∫Øn g·ªçn h∆°n, c√≥ th·ªÉ nh∆∞ sau: 
[·∫¢nh: M·ªôt ng∆∞·ªùi ph·ª• n·ªØ ƒëang th·ª≠ qu·∫ßn √°o m·ªõi, ph√≠a sau l√† d√≤ng ch·ªØ "Th·ªùi trang mu·ªôn" v·ªõi h√¨nh ·∫£nh c·ªßa m·ªôt ng∆∞·ªùi ƒë√†n √¥ng ƒëang ƒÉn b√°nh ng·ªçt.] 

Trong c·∫£ hai meme n√†y, c√¢u "th·ªùi trang mu·ªôn" ƒë∆∞·ª£c d√πng ƒë·ªÉ t·∫°o ra s·

In [None]:
# Reload model and tokenizer from the last DPO checkpoint directory (step 500) in 4-bit.
model, tokenizer = FastLanguageModel.from_pretrained(
    './Qwen2.5-3B-Instruct-chat-dpo/checkpoint-500',
    max_seq_length = 2048,
    load_in_4bit = True,
    dtype=torch.bfloat16,
)

# Hub repository that receives both the merged DPO weights and the tokenizer.
rlhf_repo_id = "binhphap5/Qwen2.5-3B-Instruct-Chat-RLHF"

# Read the access token from the environment instead of embedding it in the notebook.
# NOTE(review): when HF_TOKEN is unset this passes None — presumably the Hub client
# then falls back to a cached `huggingface-cli login`; confirm before relying on it.
hf_token = os.environ.get("HF_TOKEN")

# Merge the LoRA weights into the base model and upload the result.
model.push_to_hub_merged(
    rlhf_repo_id,
    commit_message="Merge lora weights to push to hub",
    token=hf_token,
)

# Upload the tokenizer to the same repository.
tokenizer.push_to_hub(
    rlhf_repo_id,
    commit_message="Push tokenizer to hub",
    token=hf_token,
)

In [None]:
# Reference shell command for serving the pushed model with vLLM. It is kept as
# a string literal, so running this cell launches nothing.
# NOTE(review): --api-key is a hard-coded literal; consider supplying it from the
# environment when the command is actually used.
"""
vllm serve binhphap5/Qwen2.5-3B-Instruct-Chat-RLHF \
--api-key binhphap5 \
--port 8000 \
--quantization bitsandbytes \
--enable-prefix-caching \
--swap-space 16 \
--gpu-memory-utilization 0.9 \
--disable-log-requests \
--max-model-len 2048 
"""