In [None]:
import os
from unsloth import FastVisionModel, is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
import torch
import docdataset as dd
import wandb
from dotenv import load_dotenv
# Read key/value pairs from a local .env file (when one is found) into the
# process environment. Later cells look up WANDB_API_KEY and HF_TOKEN there.
load_dotenv()

In [1]:
# Hub coordinates: the trainer cell combines these as
# f"{your_name}/{model_save_name}" to form the repository id.
your_name = "ansu0122"
model_save_name = "uadoc-ada-qwen2.5vl"

In [None]:
# Hyperparameters -- every tunable value for the run is collected in this cell.

# Accelerator preference: CUDA first, then Apple MPS, otherwise the CPU.
# NOTE(review): `device` is not referenced by any other cell visible in this notebook.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# One seed, reused for dataset shuffling, LoRA initialisation and the trainer.
seed = 3407

# Base checkpoint and whether to load it 4-bit quantised.
model_name = "unsloth/Qwen2.5-VL-7B-Instruct"
load_in_4bit = False

# LoRA adapter rank and scaling factor.
lora_r = 16
lora_alpha = 16

# Training length. The trainer cell passes `max_steps`; its `num_train_epochs`
# argument is commented out there, so this epoch count is currently only logged.
max_steps = 30
num_train_epochs = 1
warmup_steps = 5

# Optimiser and learning-rate schedule.
learning_rate = 2e-4
weight_decay = 0.01
optim = "adamw_8bit"
lr_scheduler_type = "linear"

# Batch geometry: 2 samples per device, gradients accumulated over 4 batches.
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
max_seq_length = 2048

# Worker processes for dataset preprocessing.
dataset_num_proc = 4

In [None]:
# Authenticate with Weights & Biases. The key is read from the environment
# (os.getenv returns None when it is unset).
WANDB_API_KEY = os.getenv("WANDB_API_KEY")
wandb.login(key=WANDB_API_KEY)

# Start the run and record every hyperparameter that shapes it, including the
# quantisation flag and the LoRA settings, so runs can be compared and reproduced
# from the W&B config alone.
run = wandb.init(
    entity="uadoc-ada-qwen2.5vl",
    project="uadoc-ada",
    name="uadoc-ada-qwen2.5vl",
    tags=["docai", "qwen2.5vl"],
    config={
        "seed": seed,

        "model_name": model_name,
        "load_in_4bit": load_in_4bit,

        "lora_r": lora_r,
        "lora_alpha": lora_alpha,

        "max_steps": max_steps,
        "num_train_epochs": num_train_epochs,
        "warmup_steps": warmup_steps,

        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "optim": optim,
        "lr_scheduler_type": lr_scheduler_type,

        "per_device_train_batch_size": per_device_train_batch_size,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "max_seq_length": max_seq_length,

        "dataset_num_proc": dataset_num_proc,
    },
)


### Downloading the Model

In [None]:
# Fetch the base checkpoint together with its tokenizer.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name,
    # 4-bit loading lowers memory use; False selects 16-bit LoRA.
    load_in_4bit=load_in_4bit,
    # Either True or "unsloth" may be passed; intended for long contexts.
    use_gradient_checkpointing="unsloth",
)

### Adding LoRA

In [None]:
# Wrap the base model with LoRA adapters.
model = FastVisionModel.get_peft_model(
    model,
    # Which parts of the network receive adapters; set a flag to False to skip that part.
    finetune_vision_layers=True,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    # Adapter capacity: a larger rank can raise accuracy but might overfit.
    r=lora_r,
    # Recommended to be at least equal to the rank.
    lora_alpha=lora_alpha,
    lora_dropout=0,
    bias="none",
    random_state=seed,
    # Rank-stabilised LoRA and LoftQ are supported but switched off here.
    use_rslora=False,
    loftq_config=None,
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

### Data Prep

In [27]:
import docdataset as dd
import prompt_templates as pt
import string_util as su
from datasets import Dataset

In [32]:
# Take the "train" entry of the downloaded dataset, then partition its rows
# by the value stored in their own "split" column.
dataset = dd.download_dataset()["train"]
train_dataset, test_dataset = (
    dataset.filter(lambda row: row["split"] == "train"),
    dataset.filter(lambda row: row["split"] == "test"),
)

In [None]:
# Run the project's prep step on the training rows, then shuffle with the shared seed.
# NOTE(review): prep_train_data presumably expands each row into several samples -- confirm in docdataset.
train_dataset_inflated = dd.prep_train_data(train_dataset).shuffle(seed=seed)
len(train_dataset_inflated)

7200

In [None]:
# The held-out rows go through the same prep function as the training rows,
# followed by the same seeded shuffle.
test_dataset_inflated = dd.prep_train_data(test_dataset).shuffle(seed=seed)
len(test_dataset_inflated)

1800

In [41]:
# Display the first prepared training sample to check which fields it carries.
train_dataset_inflated[0]

{'id': 'e6a6214169b7469299f4eb91a7b356fe',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=764x161>,
 'focus': 'table',
 'content': '<table><tbody><tr><td>Загальна кількість країн</td><td></td><td></td><td></td><td></td><td></td><td>50</td></tr><tr><td>Кількість очок до зміни зон</td><td></td><td></td><td></td><td></td><td></td><td>2</td></tr><tr><td>Кількість очок перед зміною сторін після першої частини гри</td><td></td><td></td><td></td><td></td><td></td><td>незалежно</td></tr></tbody></table>',
 'prompt': '\n        Витягни таблицю з даними, що міститься на зображенні, у форматі HTML.\n\n        Вимоги:\n        – Поверни лише HTML-код таблиці, без додаткового тексту чи опису.\n        – Залиш текст у комірках таким, як він є на зображенні.\n        – Збережи структуру таблиці, включно з усіма рядками (`<tr>`) та комірками (`<td>`), відповідно до оригінального вигляду.\n        - Уникай використання тегів для заголовків (`<thead>`, `<th>`).\n        – Використовуй те

In [42]:
def convert_to_conversation(sample):
    """Wrap one dataset row as a two-turn chat under the "messages" key.

    Args:
        sample: Mapping providing "prompt", "image" and "content" entries.

    Returns:
        A dict ``{"messages": [user_turn, assistant_turn]}``. The user turn
        carries the prompt text followed by the image; the assistant turn
        carries the target content as text.

    Raises:
        KeyError: If any of the three entries is missing from ``sample``.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": sample["prompt"]},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": sample["content"]},
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

In [45]:
# Turn every prepared training row into chat format and collect the results in a Dataset.
train_dataset_converted = Dataset.from_list(
    list(map(convert_to_conversation, train_dataset_inflated))
)
len(train_dataset_converted)

7200

In [46]:
# Same chat-format conversion for the held-out rows.
test_dataset_converted = Dataset.from_list(
    list(map(convert_to_conversation, test_dataset_inflated))
)
len(test_dataset_converted)

1800

### Training Setup

In [None]:
from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments


class EmptyCacheCallback(TrainerCallback):
    """Trainer hook that calls ``torch.cuda.empty_cache()`` after every training step."""

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Empty the CUDA cache, then hand ``control`` back unchanged."""
        torch.cuda.empty_cache()
        return control


# Callback list handed to the trainer.
callbacks = [EmptyCacheCallback()]

In [None]:
# Put the model into training mode before building the trainer.
FastVisionModel.for_training(model)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    # The vision data collator is mandatory for this kind of fine-tuning.
    data_collator = UnslothVisionDataCollator(model, tokenizer),
    train_dataset = train_dataset_converted,
    eval_dataset = test_dataset_converted,
    callbacks = callbacks,
    args = SFTConfig(
        # Batch geometry and run length.
        per_device_train_batch_size = per_device_train_batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        max_steps = max_steps,
        # num_train_epochs = num_train_epochs, # Set this instead of max_steps for full training runs
        learning_rate = learning_rate,
        # Mixed precision: bf16 when the hardware supports it, fp16 otherwise.
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),

        optim = optim,
        weight_decay = weight_decay,
        lr_scheduler_type = lr_scheduler_type,
        seed = seed,
        output_dir = "train_outputs",

        # Logging and evaluation.
        report_to = "wandb",      # For Weights and Biases
        logging_steps = 1,        # Log every N steps
        # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword,
        # which newer transformers releases no longer accept.
        eval_strategy = "steps",  # Required for validation logging
        eval_steps = 10,          # Evaluate every N steps

        # Checkpointing and Hub upload.
        save_steps = 10,          # Save checkpoint every N steps
        save_total_limit = 3,     # Keep only the last 3 checkpoints
        push_to_hub = True,
        hub_model_id = f"{your_name}/{model_save_name}",
        hub_token = os.environ["HF_TOKEN"],  # raises KeyError if HF_TOKEN is unset
        hub_strategy = "checkpoint",

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = dataset_num_proc,
        max_seq_length = max_seq_length,
    ),
)

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning loop. The returned object is kept because the next cell
# reads trainer_stats.metrics['train_runtime'] from it.
trainer_stats = trainer.train()

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

### Saving the Latest Model

In [None]:
# Publish the final adapter and tokenizer to the same Hub repository the trainer
# used for its checkpoints (hub_model_id = f"{your_name}/{model_save_name}").
# Both names come from the configuration cell near the top of the notebook and
# are deliberately NOT reassigned here: doing so would send the final weights to
# a different repository than the checkpoints and clobber the configured values.

# Local alternative to pushing:
# model.save_pretrained(model_save_name)  # Local saving
# tokenizer.save_pretrained(model_save_name)

model.push_to_hub(f"{your_name}/{model_save_name}", token = os.environ["HF_TOKEN"], private=True)
tokenizer.push_to_hub(f"{your_name}/{model_save_name}", token = os.environ["HF_TOKEN"], private=True)