# Qwen3 VL 2B QLoRA Finetuning with Unsloth
Finetuning on the `wrath/well-log-headers-ocr` dataset for well log OCR.

In [1]:
# !pip install uv && uv pip install unsloth
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
import torch
from unsloth import FastVisionModel

model, tokenizer = FastVisionModel.from_pretrained(
    # Model identifier from HuggingFace Hub.
    # Unsloth provides optimized versions: 'unsloth/Qwen2.5-VL-3B-Instruct',
    # 'unsloth/Qwen2.5-VL-7B-Instruct', 'unsloth/Llama-3.2-11B-Vision-Instruct', etc.
    # Use 'unsloth/' prefix for 2x faster loading with Unsloth optimizations.
    model_name="unsloth/Qwen3-VL-2B-Instruct-unsloth-bnb-4bit",

    # QLoRA 4-bit quantization. Reduces VRAM by ~75% with minimal quality loss.
    # True = 4-bit (QLoRA), False = 16-bit (full precision, needs more VRAM).
    # Unsloth recommends True for Colab free tier (T4 GPU).
    load_in_4bit=True,

    # Gradient checkpointing trades compute for memory.
    # 'unsloth' = Unsloth's optimized implementation (recommended, 30% less VRAM).
    # True = standard HF implementation. False = disabled (fastest but most VRAM).
    use_gradient_checkpointing="unsloth",
)

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.9: Fast Qwen3_Vl patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.41G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/782 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

In [5]:
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

RuntimeError: Unsloth: You already added LoRA adapters to your model!

In [4]:
import base64
import gc
from io import BytesIO

from PIL import Image

from datasets import load_dataset

# Load dataset with streaming to avoid RAM crash
raw_dataset = load_dataset("wrath/well-log-headers-ocr")

def b64_to_image(b64_str):
    return Image.open(BytesIO(base64.b64decode(b64_str))).convert("RGB")

# MEMORY-EFFICIENT: Create lightweight dataset with lazy image loading
class LazyVisionDataset:
    def __init__(self, hf_dataset):
        self.data = hf_dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return {
            "messages": [
                {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": sample["instruction"]}]},
                {"role": "assistant", "content": [{"type": "text", "text": sample["answer"]}]},
            ],
            "images": [b64_to_image(sample["image_base64"])],
        }

train_dataset = LazyVisionDataset(raw_dataset["train"])
eval_dataset = LazyVisionDataset(raw_dataset["eval"])

print(f"Train: {len(train_dataset)} samples")
print(f"Eval: {len(eval_dataset)} samples")
gc.collect()  # Free memory

Train: 501 samples
Eval: 126 samples


75

## Before Finetuning - Test Baseline on Eval Set

In [None]:
import random

random.seed(42)
test_indices = random.sample(range(len(eval_dataset)), min(3, len(eval_dataset)))

default_instruction= """Convert the following document to markdown.
Return only the markdown with no explanation text. Do not include delimiters like ```markdown or ```html.

RULES:
- You must include all information on the page. Do not exclude headers, footers, or subtext.
- Return tables in an HTML format.
- Charts & infographics must be interpreted to a markdown format. Prefer table format when applicable.
- Prefer using ‚òê and ‚òë for check boxes."""

def run_inference(model, tokenizer, image, instruction=default_instruction):
    FastVisionModel.for_inference(model)
    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": instruction}]}]
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=512, use_cache=True, temperature=0.7, min_p=0.1)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print("=== BEFORE FINETUNING (on EVAL set) ===")
baseline_outputs = []
for i in test_indices:
    sample = eval_dataset[i]
    img = sample["images"][0]
    instruction = sample["messages"][0]["content"][1]["text"]
    output = run_inference(model, tokenizer, img, instruction)
    baseline_outputs.append(output)
    print(f"\n--- Eval Sample {i} ---")
    print(f"Output: {output}...")
    del img, sample  # Free memory
gc.collect()

=== BEFORE FINETUNING (on EVAL set) ===

--- Eval Sample 81 ---
Output: user
Convert the following document to markdown.
Return only the markdown with no explanation text. Do not include delimiters like ```markdown or ```html.

RULES:
  - You must include all information on the page. Do not exclude headers, footers, or subtext.
  - Return tables in an HTML format.
  - Charts & infographics must be interpreted to a markdown format. Prefer table format when applicable.
  - Prefer using ‚òê and ‚òë for check boxes.
assistant
```html
<!DOCTYPE html>
<html>
<body>
<h1> LATER...

--- Eval Sample 14 ---
Output: user
Convert the following document to markdown.
Return only the markdown with no explanation text. Do not include delimiters like ```markdown or ```html.

RULES:
  - You must include all information on the page. Do not exclude headers, footers, or subtext.
  - Return tables in an HTML format.
  - Charts & infographics must be interpreted to a markdown format. Prefer table format when 

## Training

In [None]:
from trl import SFTConfig, SFTTrainer
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator

FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=UnslothVisionDataCollator(model, tokenizer),
    train_dataset=train_dataset,  # Train on train split
    eval_dataset=eval_dataset,     # Validate on eval split
    args=SFTConfig(
        # === BATCH SIZE ===
        # Unsloth recommends 2 for vision on Colab free tier. Increase to 4-8 on A100/H100.
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,

        # Effective batch = batch_size √ó gradient_accumulation_steps. Default: 4.
        gradient_accumulation_steps=4,

        # === TRAINING DURATION ===
        # Warmup: prevents early instability. Default: 5 or 5-10% of max_steps.
        warmup_steps=5,

        # Total steps. 30 for demo, 100-500+ for real finetuning.
        max_steps=30,

        # === LEARNING RATE ===
        # Unsloth STRONGLY recommends 2e-4 for LoRA/QLoRA. Range: 1e-4 to 5e-4.
        learning_rate=2e-4,

        # === PRECISION ===
        # bf16 preferred (better stability), fp16 fallback for older GPUs.
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),

        # === LOGGING & EVAL ===
        logging_steps=1,
        eval_strategy="steps",  # Evaluate during training
        eval_steps=10,           # Evaluate every 10 steps

        # === OPTIMIZER ===
        # 'adamw_8bit' saves ~30% memory. Alt: 'adamw_torch'.
        optim="adamw_8bit",
        weight_decay=0.01,       # L2 regularization. Range: 0.0 to 0.1.
        lr_scheduler_type="linear",  # Alt: 'cosine', 'constant'.

        # === MISC ===
        seed=3407,
        output_dir="outputs",
        report_to="none",

        # === VISION-SPECIFIC (REQUIRED) ===
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    ),
)

In [None]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU: {gpu_stats.name}, Max Memory: {max_memory} GB")

trainer_stats = trainer.train()

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
print(f"\nTraining Time: {round(trainer_stats.metrics['train_runtime']/60, 2)} min")
print(f"Peak Memory: {used_memory} GB ({round(used_memory/max_memory*100, 1)}%)")

## After Finetuning - Compare Results on Eval Set

In [None]:
print("=== AFTER FINETUNING (on EVAL set - unseen data) ===")
for idx, i in enumerate(test_indices):
    img = eval_dataset[i]["images"][0]
    expected = eval_dataset[i]["messages"][1]["content"][0]["text"]
    output = run_inference(model, tokenizer, img)

    print(f"\n{'='*60}")
    print(f"EVAL SAMPLE {i}")
    print(f"{'='*60}")
    print(f"\n[BEFORE]:\n{baseline_outputs[idx][:400]}...")
    print(f"\n[AFTER]:\n{output[:400]}...")
    print(f"\n[EXPECTED]:\n{expected[:400]}...")

## Save Model

In [None]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
print("LoRA adapters saved to ./lora_model")

# Optional: Push to HuggingFace Hub
# model.push_to_hub("your_name/well-log-ocr-lora", token="...")
# tokenizer.push_to_hub("your_name/well-log-ocr-lora", token="...")