Source: https://wandb.ai/byyoung3/Generative-AI/reports/How-to-fine-tune-and-evaluate-Qwen3-with-Unsloth---VmlldzoxMjU3OTI0Ng#fine-tuning-qwen3-

In [None]:
import torch 
from unsloth import FastLanguageModel
import weave
from pprint import pprint
# Initialise Weave for this session under the project name 'think_test'.
# The @weave.op-decorated function defined further down is called after this.
weave.init('think_test')

In [None]:
import gc
# Run a Python garbage-collection pass first, then ask PyTorch to release the
# CUDA memory its caching allocator is holding but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Checkpoint used for the thinking-mode demo. The commented line is the larger
# alternative; only one of the two assignments should be active.
# BASE_MODEL_NAME = "unsloth/Qwen3-8B"
BASE_MODEL_NAME = "unsloth/Qwen3-0.6B"
max_seq_length = 2048
dtype = None  # presumably lets Unsloth choose the dtype itself - TODO confirm
load_in_4bit = False

# Load model and tokenizer together; both are reused by the demo cells below.
BASE_MODEL, TOKENIZER = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

In [None]:
# Put the model in eval mode and move it to the GPU, then switch it to
# Unsloth's inference mode before any generate() call.
BASE_MODEL.eval().to("cuda")
FastLanguageModel.for_inference(BASE_MODEL)

In [None]:
def make_prompt(instruction):
    """Wrap *instruction* as a one-message chat: a list holding a single user turn."""
    user_turn = dict(role="user", content=instruction)
    return [user_turn]

def apply_chat_template(prompt, tokenizer, enable_thinking=True):
    """Render a single user message through the tokenizer's chat template.

    Args:
        prompt: Text of the user message.
        tokenizer: Object exposing ``apply_chat_template``.
        enable_thinking: Forwarded unchanged to the chat template.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for
        ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    # Build the message here rather than through the module-level
    # ``make_prompt``: a later cell redefines that name with two required
    # arguments, which would make this helper raise TypeError afterwards.
    messages = [{"role": "user", "content": prompt}]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )

In [None]:
@weave.op
def generate_response(prompt, enable_thinking=True):
    """Generate a reply to *prompt* with the globally loaded base model.

    Args:
        prompt: Text of the user message.
        enable_thinking: Forwarded to the chat template via
            ``apply_chat_template``.

    Returns:
        The decoded sequence returned by ``generate`` with special tokens
        stripped. The whole sequence is decoded, not only the new tokens.
    """
    prompt_text = apply_chat_template(prompt, TOKENIZER, enable_thinking)
    inputs = TOKENIZER([prompt_text], return_tensors="pt").to("cuda")
    with torch.no_grad():
        gen_output = BASE_MODEL.generate(
            **inputs,
            max_new_tokens=1000,
            # Request sampling explicitly, as the post-training cell does:
            # temperature/top_p/top_k only matter when sampling is on, and
            # relying on the checkpoint's default makes that implicit.
            do_sample=True,
            use_cache=False,
            temperature=0.7,
            top_p=0.8,
            top_k=20,
            min_p=0.0
        )
    output_text = TOKENIZER.decode(gen_output[0], skip_special_tokens=True)
    return output_text

In [None]:
# One arithmetic question in two forms: as-is, and prefixed with the
# "/no_think" tag that a later cell sends together with enable_thinking=True.
math_question = "What is 256 multiplied by 17?"
math_question_no_think = "/no_think\n" + math_question

In [None]:
# Case 1: plain question, enable_thinking=True passed to the chat template.
print("=== enable_thinking=True (default) ===")
output1 = generate_response(math_question, enable_thinking=True)
pprint(output1)

In [None]:
# Case 2: same question, enable_thinking=False passed to the chat template.
print('=== enable_thinking=False ===')
output2 = generate_response(math_question, enable_thinking=False)
pprint(output2)

In [None]:
# Case 3: enable_thinking=True in the template, with the "/no_think" tag
# placed inside the prompt text itself.
print('=== enable_thinking=True + /no_think in prompt ===')
output3 = generate_response(math_question_no_think, enable_thinking=True)
pprint(output3)

In [None]:
import random
import numpy as np 
import torch 

# Single seed for the training section; also passed to get_peft_model and
# TrainingArguments further down.
SEED = 3278

# Seed every RNG in use: Python, NumPy, PyTorch (CPU) and all CUDA devices.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic algorithms and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True 
torch.backends.cudnn.benchmark = False

In [None]:
# Drop the demo model and tokenizer, collect garbage, and release cached CUDA
# memory before the training model is loaded.
del BASE_MODEL, TOKENIZER
gc.collect()
torch.cuda.empty_cache()

In [None]:
from unsloth import FastLanguageModel, is_bf16_supported
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Settings shared by the training, save and reload cells below.
max_seq_length = 2048
dtype = None
# 4-bit loading is on for training; the demo section above used False.
load_in_4bit = True
MODEL_NAME = "unsloth/Qwen3-0.6B"
# Directory the adapter and tokenizer are written to and later reloaded from.
SAVE_DIR = "lora_model"

In [None]:
# Load the model to fine-tune together with its tokenizer, using the training
# settings defined in the previous cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Wrap the loaded model with LoRA adapters: rank 16, alpha 16, no dropout and
# no bias terms, applied to the seven projection modules listed below.
# random_state reuses SEED so adapter setup is seeded like the rest of the run.
model = FastLanguageModel.get_peft_model(
    model, 
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False, 
    loftq_config=None,
)

In [None]:
def formatting_prompts_func(examples):
    """Turn a batch of instruction records into chat-formatted training text.

    *examples* is indexed by "instruction", "input" and "output"; the three
    columns are walked in lockstep. Each row becomes a user turn followed by
    an assistant turn, rendered with the module-level ``tokenizer``. Returns
    ``{"text": [...]}`` with one rendered string per row.
    """

    def _render(instruction, input_text, output):
        # The input is appended only when it is not blank.
        if input_text.strip():
            user_message = f"{instruction}\n\n{input_text}"
        else:
            user_message = instruction
        conversation = [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": output},
        ]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
            enable_thinking=False,
        )

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {"text": [_render(*row) for row in rows]}

In [None]:
# Train on the first half of the "train" split only.
dataset = load_dataset("yahma/alpaca-cleaned", split="train")
half_len = len(dataset) // 2
dataset = dataset.select(range(half_len))
# Add the chat-formatted "text" column, in batches, with two worker processes.
dataset = dataset.map(formatting_prompts_func, batched=True, num_proc=2)

In [None]:
# Show the first record to check the "text" column produced by the map above.
dataset[0]

In [None]:
# Supervised fine-tuning on the pre-rendered "text" column, without packing.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # 2 samples per device x 4 accumulation steps per optimizer update.
        per_device_train_batch_size=2, 
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # Stops after 60 steps regardless of dataset size.
        max_steps=60,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on bf16 support.
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit", 
        weight_decay=0.01,
        lr_scheduler_type="linear", 
        seed=SEED,
        output_dir="outputs", 
        # No experiment-tracker reporting from the trainer itself.
        report_to="none"
    )
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Sample once from the model that was just trained, before it is saved.
FastLanguageModel.for_inference(model)
user_query = "Continue the Fibonacci sequence.\n\n1, 1, 2, 3, 5, 8"
# `messages` is reused by the reload cell further down for the same prompt.
messages = [{"role": "user", "content": user_query}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False
)
inputs = tokenizer([prompt], return_tensors='pt').to('cuda')
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=True,
    use_cache=False,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    min_p=0.0
)

# print, not pprint: pprint would emit the banner's repr, i.e. surrounding
# quotes and a literal "\n" instead of a blank line.
print("\n============= Output from in-memory model (just trained)")
pprint(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Persist the model and tokenizer to SAVE_DIR so they can be reloaded below.
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

# The trainer was constructed with this model and tokenizer, so it must be
# dropped as well; otherwise deleting the two names alone frees nothing.
del trainer
del model
del tokenizer
# Collect garbage before emptying the CUDA cache, as the earlier cleanup
# cell does.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload from SAVE_DIR (written by the previous cell) with the same settings
# used for training, then switch to Unsloth's inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=SAVE_DIR,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)

In [None]:
# Same conversation as the in-memory test (`messages` comes from that cell),
# now rendered with the reloaded tokenizer.
prompt2 = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False
)

inputs2 = tokenizer([prompt2], return_tensors="pt").to("cuda")
# Keep prompt + completion within the max_seq_length the model was loaded
# with, generating at least one token.
max_new_tokens2 = max(1, max_seq_length - inputs2["input_ids"].shape[1])
outputs2 = model.generate(
    **inputs2,
    max_new_tokens=max_new_tokens2,
    # Sample explicitly, as the in-memory run does, so both outputs are
    # produced by the same decoding strategy.
    do_sample=True,
    use_cache=False,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    min_p=0.0
)

print("\n============== Output from reloaded model (after save / load)")
print(tokenizer.decode(outputs2[0], skip_special_tokens=True))

### Evaluate with Weave

In [None]:
import random 
import numpy as np 
import torch 

# Seed for the evaluation section. This rebinds SEED, which was 3278 in the
# training section.
SEED = 3407
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic algorithms and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True 
torch.backends.cudnn.benchmark = False 

In [None]:
# Settings for the evaluation section.
max_seq_length = 2048
dtype = None 
load_in_4bit = True 
BASE_MODEL_NAME = "unsloth/Qwen3-0.6B"
# Same directory the training section saved the adapter to.
LORA_MODEL_DIR = "lora_model"
# Number of held-out samples to evaluate.
N = 30
# Evaluation runs are logged to a different Weave project than the demo.
weave.init("q3")


In [None]:
# === GLOBAL: LOAD MODELS ONLY ONCE ===
# Both models stay resident as module-level globals so the predict methods
# below do not reload them per sample.
BASE_MODEL, TOKENIZER = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME, 
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# The tokenizer returned with the LoRA model is discarded; TOKENIZER from the
# base load is used for both models.
LORA_MODEL, _ = FastLanguageModel.from_pretrained(
    model_name=LORA_MODEL_DIR,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# Eval mode plus Unsloth's inference mode for both models.
BASE_MODEL.eval()
LORA_MODEL.eval()
FastLanguageModel.for_inference(BASE_MODEL)
FastLanguageModel.for_inference(LORA_MODEL)

def make_prompt(instruction, input_text):
    """Build a one-message chat from an instruction and an optional input.

    When *input_text* is non-blank it is appended to the instruction after a
    blank line; otherwise the instruction alone is the user message.
    """
    has_input = bool(input_text.strip())
    content = f"{instruction}\n\n{input_text}" if has_input else instruction
    return [dict(role="user", content=content)]

In [None]:
def apply_chat_template_loss(sample, tokenizer):
    """Render a full user + assistant exchange as text for loss computation.

    Args:
        sample: Mapping with "instruction", "input" and "output" entries.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The rendered conversation, ending with the reference answer.
    """
    messages = make_prompt(sample["instruction"], sample["input"])
    messages.append({"role": "assistant", "content": sample["output"]})
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        # The conversation already ends with the assistant's answer. Asking
        # for a generation prompt here would append a second, empty assistant
        # header after it, and those tokens would be scored as target text.
        # This also matches how the training text is rendered.
        add_generation_prompt=False,
        enable_thinking=False,
    )

In [None]:
def apply_chat_template_generation(sample, tokenizer):
    """Render only the user turn of *sample*, ending with a generation prompt.

    Reads ``sample["instruction"]`` and ``sample["input"]`` and returns the
    text produced by the tokenizer's chat template.
    """
    chat = make_prompt(sample["instruction"], sample["input"])
    template_options = dict(
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )
    return tokenizer.apply_chat_template(chat, **template_options)

In [None]:
def output_only_loss(toknizer, model, sample, device="cuda"):
    """Compute the model's loss on the reference answer tokens only.

    Args:
        toknizer: Tokenizer used for templating and tokenization. The
            misspelled parameter name is kept so existing calls still work.
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)``; its output must expose ``.loss``.
        sample: Mapping with "instruction", "input" and "output" entries.
        device: Device the input tensors are moved to.

    Returns:
        The loss as a Python float, or ``nan`` when truncation leaves no
        answer tokens to score.
    """
    # Use the tokenizer that was passed in. The body previously read a
    # module-level `tokenizer`, silently ignoring this argument.
    tokenizer = toknizer

    # 1. Full text (prompt + answer) that the loss is computed over.
    full_text = apply_chat_template_loss(sample, tokenizer)

    # 2. Prompt-only text, used solely to measure the prefix to mask.
    #    It is rendered WITH the generation prompt so the assistant header is
    #    masked too rather than scored as answer text.
    #    Assumes that header is a prefix of the rendered assistant turn -
    #    TODO confirm for the tokenizer in use.
    prompt_messages = make_prompt(sample["instruction"], sample["input"])
    prompt_text = tokenizer.apply_chat_template(
        prompt_messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # 3. Tokenize both. No padding: a single sequence needs none, and padding
    #    to max_length forced a full-length forward pass with no attention
    #    mask. With left padding it would also shift the prompt away from
    #    position 0 and break the prefix mask below.
    tok_full = tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    )
    tok_prompt = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    )
    input_ids = tok_full["input_ids"].to(device)
    attention_mask = tok_full["attention_mask"].to(device)
    labels = input_ids.clone()

    # 4. Score answer tokens only: -100 excludes the prompt prefix.
    prompt_len = tok_prompt["input_ids"].shape[-1]
    labels[:, :prompt_len] = -100

    # Nothing left to score (the prompt alone filled the window).
    if prompt_len >= input_ids.shape[-1]:
        return float("nan")

    with torch.no_grad():
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
    return output.loss.item()
    

In [None]:
def safe_generate(model, tokenizer, prompt, device="cuda"):
    """Generate from *prompt* and return the decoded sequence.

    The prompt is truncated to ``max_seq_length`` tokens. The generation
    budget is whatever remains of that length, with a floor of one token.
    Special tokens are stripped from the decoded text.
    """
    encoded = tokenizer(
        [prompt],
        return_tensors='pt',
        truncation=True,
        max_length=max_seq_length,
    ).to(device)

    # Room left after the prompt; always allow at least one new token.
    remaining = max_seq_length - encoded['input_ids'].shape[1]
    generation_kwargs = dict(
        max_new_tokens=remaining if remaining > 1 else 1,
        use_cache=False,
        temperature=0.7,
        top_p=0.8,
        top_k=20,
        min_p=0.0,
    )

    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
class QwenBaseModel(weave.Model):
    """Weave model wrapper around the globally loaded BASE_MODEL."""

    @weave.op()
    async def predict(self, instruction, input, output):
        """Score and answer one record.

        Returns a dict with "loss" (from ``output_only_loss``) and "output"
        (text from ``safe_generate``).
        """
        record = dict(instruction=instruction, input=input, output=output)
        # Loss on the reference answer first, then a fresh generation.
        answer_loss = output_only_loss(TOKENIZER, BASE_MODEL, record)
        generation_prompt = apply_chat_template_generation(record, TOKENIZER)
        generated_text = safe_generate(BASE_MODEL, TOKENIZER, generation_prompt)
        return {"loss": answer_loss, "output": generated_text}

In [None]:
class QwenLoraModel(weave.Model):
    """Weave model wrapper around the globally loaded LORA_MODEL."""

    @weave.op()
    async def predict(self, instruction, input, output):
        """Score and answer one record.

        Returns a dict with "loss" (from ``output_only_loss``) and "output"
        (text from ``safe_generate``).
        """
        record = dict(instruction=instruction, input=input, output=output)
        # Loss on the reference answer first, then a fresh generation.
        answer_loss = output_only_loss(TOKENIZER, LORA_MODEL, record)
        generation_prompt = apply_chat_template_generation(record, TOKENIZER)
        generated_text = safe_generate(LORA_MODEL, TOKENIZER, generation_prompt)
        return {"loss": answer_loss, "output": generated_text}

In [None]:
@weave.op()
def loss_only_scorer(output):
    """Scorer that passes the "loss" entry of *output* through unchanged."""
    loss_value = output["loss"]
    return dict(loss=loss_value)

### Load the last 10% of the training split and pick its first 30 samples

In [None]:
# Evaluation data comes from the last 10% of the split; fine-tuning used only
# the first half, so these rows were not trained on.
full_ds = load_dataset("yahma/alpaca-cleaned", split="train")
length = len(full_ds)
start, end = int(length * 0.9), length
ds_last10 = full_ds.select(range(start, end))
# Keep the first N rows of that slice as plain dicts with just three fields.
samples = [
    {field: row[field] for field in ("instruction", "input", "output")}
    for row in ds_last10.select(range(N))
]

async def main():
    """Evaluate the base and LoRA models on `samples` with the loss scorer.

    One ``weave.Evaluation`` is built and awaited per model; each result is
    printed. Returns None.
    """
    # Label runs after the checkpoint actually loaded. The labels used to be
    # hard-coded as "Qwen3-8B-..." although BASE_MODEL_NAME is a different
    # checkpoint.
    checkpoint_label = BASE_MODEL_NAME.split("/")[-1]
    models = {
        f"{checkpoint_label}-base": QwenBaseModel(),
        f"{checkpoint_label}-LoRA": QwenLoraModel(),
    }
    scorers = [loss_only_scorer]
    for model_name, candidate in models.items():
        print(f"==== Evaluating {model_name} ====")
        evaluation = weave.Evaluation(
            dataset=samples,
            scorers=scorers,
            name=f"{model_name} LossEval",
        )
        results = await evaluation.evaluate(candidate)
        print(results)

In [None]:
# Top-level await: valid in a notebook / IPython cell. A plain script would
# need asyncio.run(main()) instead.
await main()