# PaddleOCR-VL Finetuning with Unsloth
Finetuning on the `wrath/well-log-headers-ocr` dataset for well log OCR.

This notebook uses the PaddleOCR-VL 1B model, which works reliably with Unsloth's SFTTrainer.

In [None]:
# !pip install uv && uv pip install unsloth
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
%%capture
# Dependency setup. %%capture silences the installer output of this cell.
!pip install uv
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No environment variable name contains "COLAB_": use the plain unsloth install.
    !uv pip install unsloth
else:
    # Colab branch: choose the xformers pin from the installed torch's major.minor
    # version (2.9 -> 0.0.33.post1, 2.8 -> 0.0.32.post2, anything else -> 0.0.29.post3).
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps: install exactly the listed packages without resolving their dependencies.
    !uv pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !uv pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !uv pip install --no-deps unsloth
# Both branches end by pinning transformers and trl to fixed versions.
!uv pip install transformers==4.56.2
!uv pip install --no-deps trl==0.22.2

## Load Model

In [None]:
from unsloth import FastVisionModel
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

# Hub repository id; used for both the model and the separately loaded processor.
model_path = "unsloth/PaddleOCR-VL"

# Load the model and its tokenizer through Unsloth's vision wrapper.
model, tokenizer = FastVisionModel.from_pretrained(
    # Model identifier from HuggingFace Hub.
    model_path,

    # Maximum sequence length for input text + generated tokens.
    # The data collator and SFTConfig in the training cell use the same value (2048).
    max_seq_length=2048,

    # 4-bit quantized loading is disabled.
    load_in_4bit=False,

    # 8-bit quantized loading is disabled as well.
    load_in_8bit=False,

    # Enable full finetuning of all parameters (not just LoRA adapters).
    # NOTE(review): this conflicts with the rest of the notebook - the next cell calls
    # FastVisionModel.get_peft_model() and the save cell prints "LoRA adapters saved".
    # Presumably only one of the two modes is intended; confirm which one, and whether
    # get_peft_model() is a no-op while full finetuning is on.
    full_finetuning=True,

    # Model class used to instantiate the checkpoint.
    auto_model=AutoModelForCausalLM,

    # Allows the custom modelling code shipped with the checkpoint to run.
    # WARNING: Only use with trusted models from HuggingFace.
    trust_remote_code=True,

    # Passed through to Unsloth; presumably forces its compilation path for this
    # model - TODO confirm against the Unsloth documentation.
    unsloth_force_compile=True,
)

# Load the processor separately from the same repository.
# It is what run_inference and the training collator use for chat templating,
# image preprocessing and tokenization.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

## Apply LoRA Adapters

In [None]:
# Wrap the loaded model with LoRA adapters.
# NOTE(review): the model was loaded with full_finetuning=True in the previous cell;
# presumably this call has no effect (or is unintended) in that mode - confirm.
model = FastVisionModel.get_peft_model(
    model,

    # LoRA rank. Higher = more trainable parameters and more VRAM.
    r=64,

    # LoRA alpha scaling factor. Set equal to r here.
    lora_alpha=64,

    # Dropout probability for LoRA layers. 0 = no dropout.
    lora_dropout=0,

    # Whether to add bias to LoRA layers.
    # "none" = no bias.
    # "all" or "lora_only" = add bias.
    bias="none",

    # Random seed for reproducibility (the same value is used as the SFTConfig seed).
    random_state=3407,

    # Rank-Stabilized LoRA is disabled.
    use_rslora=False,

    # Target modules for LoRA adaptation, matched by layer name.
    # NOTE(review): the grouping in the inline comments below is taken from the
    # names only; verify them against the actual PaddleOCR-VL module names.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",  # Attention layers
        "gate_proj", "up_proj", "down_proj",      # MLP layers
        "out_proj", "fc1", "fc2",                 # Vision encoder layers (presumably)
        "linear_1", "linear_2"                    # Vision projector layers (presumably)
    ]
)

## Load Dataset (Lazy Loading for Memory Efficiency)

In [None]:
from datasets import load_dataset
from PIL import Image
import base64
from io import BytesIO
import gc

# Load raw dataset from HuggingFace.
# Later cells read its "train" and "eval" splits and the "image_base64" and
# "answer" columns of each row.
raw_dataset = load_dataset("wrath/well-log-headers-ocr")

# Default instruction for the OCR task. It is the user-turn text of every training
# sample built by LazyVisionDataset and the default prompt of run_inference.
# The last rule names the ballot-box glyphs U+2610 (unchecked) and U+2611 (checked);
# they must be the real characters, since the model is told to emit them verbatim.
default_instruction = """Convert the following document to markdown.
Return only the markdown with no explanation text. Do not include delimiters like ```markdown or ```html.

RULES:
- You must include all information on the page. Do not exclude headers, footers, or subtext.
- Return tables in an HTML format.
- Charts & infographics must be interpreted to a markdown format. Prefer table format when applicable.
- Prefer using ☐ and ☑ for check boxes."""

def b64_to_image(b64_str):
    """Turn a base64-encoded image payload into an RGB PIL image."""
    raw_bytes = base64.b64decode(b64_str)
    decoded = Image.open(BytesIO(raw_bytes))
    return decoded.convert("RGB")


class LazyVisionDataset:
    """
    Map-style wrapper around a HuggingFace split that decodes images on demand.

    Nothing is decoded at construction time. Each ``__getitem__`` call decodes the
    base64 image of that one row and builds a two-turn chat sample from it, so only
    the images currently in use are held as PIL objects.
    """
    def __init__(self, hf_dataset, instruction=default_instruction):
        # instruction: text placed in the user turn of every sample.
        self.instruction = instruction
        self.data = hf_dataset

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        picture = b64_to_image(row["image_base64"])

        # User turn: instruction text followed by the page image.
        user_turn = {"role": "user", "content": [
            {"type": "text", "text": self.instruction},
            {"type": "image", "image": picture}
        ]}
        # Assistant turn: the ground-truth transcription from the "answer" column.
        assistant_turn = {"role": "assistant", "content": [
            {"type": "text", "text": row["answer"]}
        ]}

        # The same PIL object is exposed under "images" and inside the user turn.
        return {
            "images": [picture],
            "messages": [user_turn, assistant_turn]
        }


# Wrap both splits; images stay base64-encoded until a sample is indexed.
train_dataset = LazyVisionDataset(raw_dataset["train"])
eval_dataset = LazyVisionDataset(raw_dataset["eval"])

print(f"Train: {len(train_dataset)} samples")
print(f"Eval: {len(eval_dataset)} samples")
# Last expression of the cell, so the notebook also displays collect()'s return value.
gc.collect()

## Before Finetuning - Test Baseline on Eval Set

In [None]:
from IPython.display import display, HTML
import random

def pil_to_base64(img):
    """Serialize an image as PNG and return the bytes base64-encoded as a str."""
    png_buffer = BytesIO()
    img.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue())
    return encoded.decode()

def display_side_by_side(img, output, title="Sample"):
    """
    Show one image next to the model output as a single HTML panel.

    ``output`` and ``title`` are interpolated into the markup as-is, so any HTML
    in the model output (e.g. tables) is rendered rather than shown as text.
    The image is embedded inline as a base64 PNG.
    """
    display(HTML(f"""
    <div style="display:flex;gap:20px;padding:15px;background:#1e1e1e;border-radius:10px;margin:10px 0;">
        <div><h4 style="color:#888;">{title}</h4>
            <img src="data:image/png;base64,{pil_to_base64(img)}" style="max-width:400px;border-radius:5px;"/></div>
        <div style="flex:1;background:white;color:black;padding:15px;border-radius:5px;overflow:auto;max-height:600px;">
            {output}</div>
    </div>"""))

# Seed the global RNG so the same eval rows are picked on every run, then draw
# up to three distinct indices (fewer if the eval split is smaller).
random.seed(42)
eval_count = len(eval_dataset)
test_indices = random.sample(range(eval_count), min(3, eval_count))

def run_inference(model, image, instruction=default_instruction):
    """
    Generate the model's transcription for a single image.

    Args:
        model: Model to run; it is switched to inference mode through
            FastVisionModel.for_inference on every call.
        image: Image handed to the processor (callers in this notebook pass PIL images).
        instruction: User prompt text; defaults to ``default_instruction``.

    Returns:
        str: Only the newly generated text. ``generate`` returns the prompt tokens
        followed by the continuation, so the prompt is sliced off before decoding;
        otherwise the instruction would be echoed into every displayed result.

    Notes:
        Uses the notebook-level ``processor`` and moves the inputs to "cuda".
        NOTE(review): ``temperature`` and ``min_p`` are sampling parameters and
        ``do_sample`` is not set here - confirm whether sampling is actually active
        (and wanted) for before/after comparisons.
    """
    FastVisionModel.for_inference(model)
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]
    text_prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        image, text_prompt,
        add_special_tokens=False,
        return_tensors="pt"
    ).to("cuda")
    output = model.generate(
        **inputs,
        max_new_tokens=512,
        use_cache=False,
        temperature=1.5,
        min_p=0.1
    )
    # Drop the echoed prompt: keep only the tokens produced after the input ids.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = output[0][prompt_length:]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

# Baseline pass: run the untrained model on the fixed eval indices and keep the
# outputs (in test_indices order) for the after-training comparison.
print("=== BEFORE FINETUNING (on EVAL set) ===")
baseline_outputs = []
for i in test_indices:
    # Index the sample without binding it, so the decoded row is released right away.
    img = eval_dataset[i]["images"][0]
    output = run_inference(model, img)
    baseline_outputs.append(output)
    display_side_by_side(img, output, title=f"Eval Sample {i} - BEFORE")
gc.collect()

## Training

In [None]:
from trl import SFTTrainer, SFTConfig
from unsloth.trainer import UnslothVisionDataCollator

# Switch the model back to training mode; run_inference put it into inference mode.
FastVisionModel.for_training(model)

# Collator that turns the chat-format samples (images + messages) into model batches.
custom_collator = UnslothVisionDataCollator(
    # The model being trained
    model=model,

    # Processor for image/text preprocessing
    processor=processor,

    # Label value excluded from the loss
    ignore_index=-100,

    # Maximum sequence length for inputs (same value as the model load and SFTConfig)
    max_seq_length=2048,

    # Only compute loss on assistant responses, not user prompts.
    train_on_responses_only=True,

    # String that marks the start of the user instruction in the chat template.
    # Must match the model's chat template format - TODO verify against the template.
    instruction_part="User: ",

    # String that marks the start of the assistant response in the chat template.
    # Must match the model's chat template format - TODO verify against the template.
    response_part="\nAssistant:",

    # Pad sequences to a multiple of this value.
    pad_to_multiple_of=8,
)

# Only train_dataset is passed: eval_dataset is not used during training, it is
# reserved for the before/after comparison cells.
trainer = SFTTrainer(
    model=model,
    tokenizer=processor.tokenizer,
    data_collator=custom_collator,
    train_dataset=train_dataset,
    args=SFTConfig(
        # === BATCH SIZE ===
        # Samples per GPU per forward pass. Higher = faster but more VRAM.
        per_device_train_batch_size=4,

        # Accumulate gradients over N steps before updating weights.
        # Effective batch size = per_device_train_batch_size × gradient_accumulation_steps
        # (4 × 2 = 8 here).
        gradient_accumulation_steps=2,

        # === TRAINING DURATION ===
        # Number of warmup steps.
        warmup_steps=5,

        # Total training steps (a short demo run).
        # Alternatively, use num_train_epochs=1 for a full dataset pass.
        max_steps=60,

        # === LEARNING RATE ===
        # Learning rate for the optimizer. Lower = more stable, slower convergence.
        # NOTE(review): the original comment gave 5e-5 for full finetuning and 2e-4 for
        # LoRA; which applies depends on whether full_finetuning=True or the LoRA
        # adapters are actually in effect - confirm.
        learning_rate=5e-5,

        # === LOGGING ===
        # Log training metrics every N steps.
        logging_steps=1,

        # === OPTIMIZER ===
        # 8-bit AdamW variant, chosen to reduce optimizer memory.
        optim="adamw_8bit",

        # Weight decay (L2-style regularization).
        weight_decay=0.001,

        # Learning rate schedule. 'linear' decreases LR linearly to 0.
        # Alternatives: 'cosine', 'constant', 'polynomial'.
        lr_scheduler_type="linear",

        # === MISC ===
        # Random seed for reproducibility (same value as the LoRA random_state).
        seed=3407,

        # Directory to save checkpoints and logs.
        output_dir="outputs",

        # Disable reporting to wandb/tensorboard. Use "wandb" to enable.
        report_to="none",

        # === VISION-SPECIFIC ===
        # Keep columns that are not in the model signature ("images", "messages");
        # the custom collator needs them.
        remove_unused_columns=False,

        # Left empty because the custom collator builds the inputs.
        dataset_text_field="",

        # Skip TRL's dataset preparation (the collator handles it).
        dataset_kwargs={"skip_prepare_dataset": True},

        # Maximum sequence length for model inputs.
        max_length=2048,

        # === PRECISION ===
        # Use fp16 only when the GPU does not support bf16.
        fp16=not torch.cuda.is_bf16_supported(),

        # Use bf16 whenever the GPU supports it.
        bf16=torch.cuda.is_bf16_supported(),
    ),
)

In [None]:
# Snapshot GPU capacity and the memory already reserved before training starts
gib = 1024 ** 3  # bytes per GiB
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Train!
trainer_stats = trainer.train()

# Peak reservation after training: absolute, and the part added on top of the
# pre-training snapshot, each also expressed as a share of total GPU memory
used_memory = round(torch.cuda.max_memory_reserved() / gib, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Runtime and memory report
print(f"\n{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## After Finetuning - Compare Results on Eval Set

In [None]:
def display_comparison(img, before, after, expected, title):
    """
    Display a 4-column comparison: input image, output before finetuning,
    output after finetuning, and the expected transcription.

    Args:
        img: Image to embed (inlined as a base64 PNG via pil_to_base64).
        before: Model output captured before training.
        after: Model output produced after training.
        expected: Ground-truth text for the sample.
        title: Heading shown above the comparison.

    The three text columns are cut to their first 1500 characters and inserted
    into the markup as-is, so HTML in them is rendered rather than shown as text.
    """
    # Column headings use real emoji characters (cross mark, check mark, clipboard).
    html = f"""
    <div style="padding:15px;background:#1e1e1e;border-radius:10px;margin:20px 0;">
        <h3 style="color:#4CAF50;">{title}</h3>
        <div style="display:flex;gap:10px;flex-wrap:wrap;">
            <div><h4 style="color:#888;">Input Image</h4>
                <img src="data:image/png;base64,{pil_to_base64(img)}" style="max-width:200px;"/></div>
            <div style="flex:1;min-width:180px;"><h4 style="color:#f44336;">❌ Before</h4>
                <div style="background:white;color:black;padding:8px;max-height:300px;overflow:auto;font-size:10px;">{before[:1500]}</div></div>
            <div style="flex:1;min-width:180px;"><h4 style="color:#4CAF50;">✅ After</h4>
                <div style="background:white;color:black;padding:8px;max-height:300px;overflow:auto;font-size:10px;">{after[:1500]}</div></div>
            <div style="flex:1;min-width:180px;"><h4 style="color:#2196F3;">📋 Expected</h4>
                <div style="background:white;color:black;padding:8px;max-height:300px;overflow:auto;font-size:10px;">{expected[:1500]}</div></div>
        </div>
    </div>"""
    display(HTML(html))

# Re-run the same eval indices with the trained model and show each result next
# to the stored baseline output and the ground-truth answer.
print("=== AFTER FINETUNING (on EVAL set - unseen data) ===")
for idx, i in enumerate(test_indices):
    sample = eval_dataset[i]
    img = sample["images"][0]
    # messages[1] is the assistant turn; its first content item holds the answer text.
    expected = sample["messages"][1]["content"][0]["text"]
    after_output = run_inference(model, img)
    display_comparison(
        img,
        before=baseline_outputs[idx],
        after=after_output,
        expected=expected,
        title=f"Eval Sample {i}",
    )
    del sample
gc.collect()

## Save Model

In [None]:
# Write the model and the tokenizer returned by FastVisionModel to ./lora_model.
# The separately loaded `processor` is not saved here.
# NOTE(review): the message below says "LoRA adapters", but the model was loaded
# with full_finetuning=True; presumably what gets written depends on which mode is
# actually in effect - confirm and adjust the message/directory name if needed.
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
print("LoRA adapters saved to ./lora_model")

# Optional: Push to HuggingFace Hub (replace the placeholder repo name and token)
# model.push_to_hub("your_name/well-log-ocr-paddleocr", token="...")
# tokenizer.push_to_hub("your_name/well-log-ocr-paddleocr", token="...")

## Load Saved Model for Inference

In [None]:
if False:  # Set to True to load saved model
    from unsloth import FastVisionModel
    # NOTE(review): the initial load passes trust_remote_code=True and
    # auto_model=AutoModelForCausalLM; presumably the reload needs them too - confirm.
    model, tokenizer = FastVisionModel.from_pretrained(
        model_name="lora_model",  # Your saved model
        load_in_4bit=False,
    )
    FastVisionModel.for_inference(model)

    # Build the generation inputs in this cell: run_inference keeps its own `inputs`
    # local, so nothing at notebook scope defines that name.
    demo_image = eval_dataset[test_indices[0]]["images"][0]
    demo_messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": default_instruction}
        ]}
    ]
    demo_prompt = processor.tokenizer.apply_chat_template(
        demo_messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        demo_image, demo_prompt,
        add_special_tokens=False,
        return_tensors="pt"
    ).to("cuda")

    # Test inference: stream the generated text (prompt skipped) as it is produced
    from transformers import TextStreamer
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128,
                       use_cache=False, temperature=1.5, min_p=0.1)