# LoRA Fine-Tuning for Qwen3-VL on Dots-and-Boxes
This notebook fine-tunes `Qwen/Qwen3-VL-4B-Instruct` with LoRA adapters via the PEFT library. It includes a custom multimodal collator so that the Hugging Face `Trainer` handles both images and text correctly.


In [None]:
import json
from pathlib import Path
from typing import List, Dict
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# Everything is resolved relative to the directory the notebook runs from.
BASE_DIR = Path.cwd()
CHECKPOINTS_DIR = BASE_DIR / 'checkpoints'
DATA_DIR = BASE_DIR / 'data' / 'prepared'
TRAIN_FILE = DATA_DIR / 'dataset_train.json'
EVAL_FILE = DATA_DIR / 'dataset_eval.json'

MODEL_ID = 'Qwen/Qwen3-VL-4B-Instruct'

# Probe lora_0, lora_1, ... and claim the first directory that does not
# exist yet, so each run writes into its own fresh folder.
seq = 0
OUTPUT_DIR = CHECKPOINTS_DIR / f'lora_{seq}'
while OUTPUT_DIR.exists():
    seq += 1
    OUTPUT_DIR = CHECKPOINTS_DIR / f'lora_{seq}'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# bf16 is enabled only when device 0 reports a major compute capability >= 8.
bf16 = device == 'cuda' and torch.cuda.get_device_capability(0)[0] >= 8
print('Device:', device, 'BF16:', bf16, 'OUTPUT_DIR:', OUTPUT_DIR)

In [None]:
# Load the processor and the base model.
# Weight dtype: bfloat16 when bf16 was detected, otherwise float16 on CUDA,
# otherwise float32 on CPU.
if bf16:
    model_dtype = torch.bfloat16
elif device == 'cuda':
    model_dtype = torch.float16
else:
    model_dtype = torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, dtype=model_dtype, device_map='auto'
)
# Turn off `use_cache` on the model config before training.
model.config.use_cache = False

print('Model and processor loaded.')

In [None]:
# Read the prepared train/eval splits from disk and parse them as JSON.
train_dataset = json.loads(TRAIN_FILE.read_text(encoding='utf-8'))
eval_dataset = json.loads(EVAL_FILE.read_text(encoding='utf-8'))

print('Train/Eval sizes:', len(train_dataset), len(eval_dataset))

def multimodal_collator(batch: List[Dict]):
    """Collate chat samples into one padded multimodal batch with labels.

    For every sample the first ``'image'`` item found in the content of the
    first message is opened from disk (relative paths are resolved against
    ``BASE_DIR``) and converted to RGB. The conversation is rendered twice
    with the processor's chat template: once in full, and once without the
    last message but with the generation prompt appended. The token count of
    the second rendering tells how many leading tokens of the first one are
    prompt tokens.

    The input samples are never modified. The same sample dicts are collated
    again on later passes, so writing the loaded image back into them would
    make the next ``Path(...)`` call fail.

    Args:
        batch: Samples, each a dict with a ``'messages'`` list whose last
            entry is the turn to learn.

    Returns:
        A dict holding everything the processor returned for the full
        conversations plus ``'labels'``: a copy of ``input_ids`` in which
        prompt tokens and padding positions are set to -100.
    """
    images = []
    full_texts = []
    prompt_texts = []
    for sample in batch:
        msgs = sample['messages']

        for content_item in msgs[0]['content']:
            if content_item.get('type') == 'image':
                img_path = Path(content_item.get('image'))
                if not img_path.is_absolute():
                    img_path = (BASE_DIR / img_path).resolve()
                # convert() returns an independent in-memory copy, so the
                # file handle can be released right away.
                with Image.open(img_path) as img_file:
                    images.append(img_file.convert('RGB'))
                break

        full_texts.append(processor.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=False
        ))
        prompt_texts.append(processor.apply_chat_template(
            msgs[:-1], tokenize=False, add_generation_prompt=True
        ))

    # Samples without an image contribute nothing to `images`; pass None
    # rather than an empty list or a list containing None.
    # NOTE(review): assumes the processor takes a flat image list matched to
    # the image placeholders in text order - confirm for the processor in use.
    image_arg = images or None
    full_out = processor(
        text=full_texts, images=image_arg, return_tensors='pt', padding=True
    )
    prompt_out = processor(
        text=prompt_texts, images=image_arg, return_tensors='pt', padding=True
    )

    attn = full_out['attention_mask']
    labels = full_out['input_ids'].clone()
    # Padding must never contribute to the loss.
    labels[attn == 0] = -100

    prompt_lens = prompt_out['attention_mask'].sum(dim=1).tolist()
    # Index of the first real token in each row: 0 with right padding, the
    # pad count with left padding (argmax returns the first maximal index).
    starts = attn.long().argmax(dim=1).tolist()
    for row, (start, prompt_len) in enumerate(zip(starts, prompt_lens)):
        labels[row, start:start + int(prompt_len)] = -100

    batch_out = dict(full_out)
    batch_out['labels'] = labels
    return batch_out


# Alias for the collate function; this name is what gets passed to the
# Trainer as `data_collator`.
collator = multimodal_collator

In [None]:
# Describe the LoRA adapters, then wrap the base model with them.
# Names of the modules that receive adapters.
lora_targets = [
    'q_proj', 'k_proj', 'v_proj', 'o_proj',
    'gate_proj', 'up_proj', 'down_proj',
]
lora_cfg = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=lora_targets,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias='none',
)
model = get_peft_model(model, lora_cfg)
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Build the training arguments and the Trainer.
# fp16 mixed precision is the CUDA fallback when bf16 is not in use.
use_fp16 = device == 'cuda' and not bf16

args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),
    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=False,
    # Precision
    bf16=bf16,
    fp16=use_fp16,
    # Logging, evaluation and checkpointing cadence
    logging_steps=1,
    eval_strategy='steps',
    eval_steps=4,
    save_strategy='epoch',
    report_to=['none'],
    # Data pipeline: keep every column so the collator receives the raw
    # sample dicts.
    remove_unused_columns=False,
    dataloader_num_workers=2,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
print('Trainer ready.')

In [None]:
# Run training, then write the model, the processor files and the training
# metrics into the run's output directory.
train_result = trainer.train()

save_dir = str(OUTPUT_DIR)
trainer.save_model(save_dir)
processor.save_pretrained(save_dir)

results = {
    'metrics': train_result.metrics,
    'output_dir': save_dir,
}
with (OUTPUT_DIR / 'train_results.json').open('w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print('Saved to', OUTPUT_DIR)