In [None]:
import os
from typing import Optional

from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

# Keyword options for loading the pretrained checkpoint.
load_options = dict(
    load_in_4bit = False, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

# from_pretrained returns the model together with its tokenizer object;
# both names are reused by the later cells (LoRA setup, collator, saving).
model, tokenizer = FastVisionModel.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    **load_options,
)

==((====))==  Unsloth 2025.4.4: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors.index.json:   0%|          | 0.00/57.6k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00005.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

We now add LoRA adapters for parameter-efficient finetuning, which lets us train only about 1% of all parameters.

**[NEW]** We also support finetuning ONLY the vision part of the model, or ONLY the language part. Or you can select both! You can also choose whether to finetune the attention layers, the MLP layers, or both!

In [None]:
# Which parts of the network get LoRA adapters.
adapter_targets = dict(
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers
)

# LoRA hyperparameters.
lora_settings = dict(
    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

# Rebinds `model` to the adapter-wrapped model; run this cell once per kernel.
model = FastVisionModel.get_peft_model(model, **adapter_targets, **lora_settings)

In [None]:
import json
# System prompt inserted as the "system" turn of every sample built by format_data.
# The triple-quoted literal keeps its leading and trailing newline as written.
SYSTEM_MESSAGE = """
You are a vision-language assistant specialized in answering questions based on document page images.
Given a question about the document, use the provided page images to only generate accurate, short and concise answers.
"""

def load_candidates(cands_path: str) -> dict:
    """Read the candidate-page mapping from a JSON file.

    Args:
        cands_path: Path to the JSON file to read.

    Returns:
        The parsed JSON content. ``format_data`` looks entries up by the
        question id converted to a string.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding: without it, open() uses the platform's locale default,
    # which can mis-decode non-ASCII page ids on systems that are not UTF-8.
    with open(cands_path, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def format_data(example: dict, image_dir: str, candidates: dict, top_k: int) -> dict:
    """Convert one question record into a three-turn chat sample.

    Args:
        example: Record with ``'questionId'`` and ``'question'`` keys and an
            optional ``'answers'`` list; only the first answer is used.
        image_dir: Root directory; page images are looked up under
            ``<image_dir>/images/<page_id>.jpg``.
        candidates: Mapping from ``str(questionId)`` to a ranked sequence of
            ``(page_id, <second item>)`` pairs. The second item is ignored
            here (presumably a retrieval score -- confirm with the producer).
        top_k: Number of leading candidate pages to attach as images.

    Returns:
        ``{"messages": [system, user, assistant]}``. Every turn's ``content``
        is a list of typed parts (``{"type": "text", ...}`` or
        ``{"type": "image", ...}``). The assistant text is ``""`` when the
        record has no answers.

    Raises:
        KeyError: If ``example`` lacks ``'questionId'`` or ``'question'``.
    """
    qid = example['questionId']
    question = example['question']
    answers = example.get('answers') or []
    answer = answers[0] if answers else ""

    # Keep only the first top_k candidate pages; unknown question ids get none.
    cand_pages = candidates.get(str(qid), [])[:top_k]
    image_paths = [os.path.join(image_dir, 'images', f"{pid}.jpg") for pid, _ in cand_pages]

    # The user turn is the question text followed by one image part per page.
    user_messages_content = [{"type": "text", "text": question}]
    user_messages_content.extend(
        {"type": "image", "image": image_path} for image_path in image_paths
    )

    # System and assistant content are lists of parts, same as the user turn.
    # A bare dict here would be iterated key-by-key by code that expects a
    # list of parts, losing the system prompt and the target answer.
    messages = [
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_MESSAGE}]},
        {"role": "user", "content": user_messages_content},
        {"role": "assistant", "content": [{"type": "text", "text": answer}]},
    ]
    return {"messages": messages}

In [None]:
from dataclasses import dataclass
from datasets import load_dataset
@dataclass
class SFTArgs:
    """Paths and hyperparameters for the supervised fine-tuning run.

    Read by the visible cells: ``train_json`` and ``eval_json`` (dataset
    files), ``candidates_json`` (candidate-page mapping), ``root_dir``
    (passed to ``format_data`` as ``image_dir``) and ``top_k``.

    NOTE(review): ``model_id``, ``output_dir``, ``num_epochs``, ``batch_size``
    and ``lr`` are not referenced by the cells shown in this notebook; the
    model-loading and trainer cells hard-code their own values.
    """
    train_json: str = "mpdocvqa/question_answers/val.json"
    candidates_json: str = "close_vanilla_colqwen_val_eval_4.json"
    root_dir: str = "mpdocvqa"
    # Annotated so it is a real dataclass field. Without the annotation it is
    # a plain class attribute and SFTArgs(eval_json=...) raises TypeError.
    eval_json: Optional[str] = None
    model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    output_dir: str = "./qwen2vl-sft-mpdocvqa"
    num_epochs: int = 1
    batch_size: int = 1
    lr: float = 2e-4
    top_k: int = 1
args = SFTArgs()

In [None]:
# Each split is read from the "data" field of its JSON file.
raw_train = load_dataset('json', data_files={'train': args.train_json}, field='data')['train']
if args.eval_json:
    raw_eval = load_dataset('json', data_files={'eval': args.eval_json}, field='data')['eval']
else:
    raw_eval = None
print(raw_train)

# Question-id -> candidate pages mapping consumed by format_data.
candidates = load_candidates(args.candidates_json)

# Build one chat sample per record; the eval list exists only when an eval split was loaded.
train_samples = [
    format_data(record, args.root_dir, candidates, args.top_k)
    for record in raw_train
]
if raw_eval:
    eval_samples = [
        format_data(record, args.root_dir, candidates, args.top_k)
        for record in raw_eval
    ]
else:
    eval_samples = None

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

# Collator that batches the chat-style samples for the vision model. Must use!
vision_collator = UnslothVisionDataCollator(model, tokenizer)

# Use bf16 where is_bf16_supported() reports it, otherwise fall back to fp16.
use_bf16 = is_bf16_supported()

training_config = SFTConfig(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # max_steps = 30,
    num_train_epochs = 1, # Set this instead of max_steps for full training runs
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",     # For Weights and Biases

    # You MUST put the below items for vision finetuning:
    remove_unused_columns = False,
    dataset_text_field = "",
    dataset_kwargs = {"skip_prepare_dataset": True},
    dataset_num_proc = 4,
    # max_seq_length = 2048,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = vision_collator,
    train_dataset = train_samples,
    args = training_config,
)

In [None]:
# @title Show current memory stats
# Baseline GPU memory figures, in GiB rounded to 3 decimals; the final-stats
# cell subtracts start_gpu_memory to estimate what training added.
BYTES_PER_GIB = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training. The result is kept because the final-stats cell reads
# trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved GPU memory so far, in GiB, and the part above the pre-training baseline.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a share of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
train_minutes = round(train_seconds / 60, 2)

report_lines = (
    f"{train_seconds} seconds used for training.",
    f"{train_minutes} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

In [None]:
# Local saving: write the model first, then the tokenizer, into "lora_model".
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

In [None]:
# Select ONLY 1 to save! (Both not needed!)
# Both export paths are off by default; flip exactly one flag to enable it.
SAVE_MERGED_LOCALLY = False
PUSH_MERGED_TO_HUB = False

# Save locally to 16bit
if SAVE_MERGED_LOCALLY:
    model.save_pretrained_merged("unsloth_finetune", tokenizer,)

# To export and save to your Hugging Face account
# (replace the placeholders before enabling; do not commit a real token)
if PUSH_MERGED_TO_HUB:
    model.push_to_hub_merged("YOUR_USERNAME/unsloth_finetune", tokenizer, token = "PUT_HERE")