In [1]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

model, tokenizer = FastVisionModel.from_pretrained(
    "Unsloth/Qwen2.5-VL-7B-Instruct",
    load_in_4bit = False, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Qwen2_5_Vl patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

In [3]:
import wandb
wandb.init(project="my-ms-thesis")
wandb.config.update(model.config.to_dict(), allow_val_change=True)

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mak11089[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
import json
SYSTEM_MESSAGE = """
You are a vision-language assistant specialized in answering questions based on document page images.
Given a question about the document, use the provided page images to only generate accurate, short and concise answers.
"""

def load_candidates(cands_path: str) -> dict:
    with open(cands_path, 'r') as f:
        return json.load(f)
    
def format_data(example: dict, image_dir: str, candidates: dict, top_k: int) -> dict:
    # Extract QID, question, and answer
    qid = example['questionId']
    question = example['question']
    answers = example.get('answers', [])
    answer = answers[0] if answers else ""

    # Construct chat messages

    # Select top-k candidate page IDs
    cand_pages = candidates.get(str(qid), [])[:top_k]
    # print(cand_pages, qid)
    image_paths = [os.path.join(image_dir, 'images', f"{pid}.jpg") for pid, _ in cand_pages]
    # print(image_paths)
    user_messages_content = [
        {
                "type": "text",
                "text": f"{SYSTEM_MESSAGE}\nQuestion: {question}"
        }
    ]
    for image_path in image_paths:
        user_messages_content.append(
            {
                "type": "image",
                "image": image_path
            }
        )

    messages = [
        
        {
            "role": "user",
            "content": user_messages_content
        },
        {"role": "assistant", "content": {"type": "text", "text": answer}},
    ]
    return {"messages": messages}

In [5]:
from dataclasses import dataclass
from datasets import load_dataset
import os
@dataclass
class SFTArgs:
    train_json: str = "mpdocvqa/question_answers/train.json"
    candidates_json: str = "sft-colqwen-train-retrieved-candidates_5.json"
    root_dir: str= "mpdocvqa"
    eval_json  = None
    # model_id: str = "Qwen/Qwen2.5-VL-7B-Instruct"
    output_dir: str = "qwen2vl-sft-mpdocvqa-train"
    # num_epochs: int = 1
    # batch_size: int = 1
    # lr: float = 2e-4
    top_k: int = 2
args = SFTArgs()

In [6]:
! ls ../../

bin   dev  home  lib32	libx32	    media  opt	 root  sbin  srv  tmp  var
boot  etc  lib	 lib64	lost+found  mnt    proc  run   snap  sys  usr


In [7]:
raw_train = load_dataset('json', data_files={'train': args.train_json}, field='data')['train']
raw_eval  = load_dataset('json', data_files={'eval': args.eval_json}, field='data')['eval'] if args.eval_json else None
print(raw_train)
# Load candidate-page mapping
# Load candidate mappings and prepare samples
candidates = load_candidates(args.candidates_json)
# print(candidates['49153'])
train_samples = [format_data(ex, args.root_dir, candidates, args.top_k) for ex in raw_train][:10000]
eval_samples  = [format_data(ex, args.root_dir, candidates, args.top_k) for ex in raw_eval] if raw_eval else None

Dataset({
    features: ['questionId', 'question', 'doc_id', 'page_ids', 'answers', 'answer_page_idx', 'data_split'],
    num_rows: 36230
})


In [8]:
train_samples[0]

{'messages': [{'role': 'user',
   'content': [{'type': 'text',
     'text': '\nYou are a vision-language assistant specialized in answering questions based on document page images.\nGiven a question about the document, use the provided page images to only generate accurate, short and concise answers.\n\nQuestion: what is the date mentioned in this letter?'},
    {'type': 'image', 'image': 'mpdocvqa/images/xnbl0037_p0.jpg'},
    {'type': 'image', 'image': 'mpdocvqa/images/xnbl0037_p1.jpg'}]},
  {'role': 'assistant', 'content': {'type': 'text', 'text': '1/8/93'}}]}

In [9]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model) # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = train_samples,
    args = SFTConfig(
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 30,
        num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb",     # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        # max_seq_length = 2048,
    ),
)

Unsloth: Model does not have a default image size - using 512


In [10]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.495 GB.
15.709 GB of memory reserved.


In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 312
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 51,521,536/8,343,688,192 (0.62% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,8.8289
2,8.8112
3,8.4966


In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

In [None]:
# Select ONLY 1 to save! (Both not needed!)

# Save locally to 16bit
if False: model.save_pretrained_merged("unsloth_finetune", tokenizer,)

# To export and save to your Hugging Face account
if False: model.push_to_hub_merged("YOUR_USERNAME/unsloth_finetune", tokenizer, token = "PUT_HERE")