In [1]:
%pip install "transformers>=4.49.0" accelerate peft bitsandbytes datasets "qwen-vl-utils[decord]==0.0.8" "comet-ml>=3.31.0"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import PIL
import comet_ml

In [3]:
!nvidia-smi

Thu Mar 13 18:10:48 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L40S                    Off | 00000000:34:00.0 Off |                    0 |
| N/A   29C    P0              75W / 350W |    558MiB / 46068MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
from datasets import load_dataset

dataset = load_dataset("zackriya/diagramJSON")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'filename', 'json_string'],
        num_rows: 199
    })
    test: Dataset({
        features: ['image', 'filename', 'json_string'],
        num_rows: 20
    })
})

In [None]:
dataset["train"][0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=504x271>,
 'filename': '548.png',
 'json_string': '{"nodes": [{"id": "1", "type_of_node": "start", "shape": "start_event", "label": "Start"}, {"id": "2", "type_of_node": "decision", "shape": "gateway", "label": "Make decision"}, {"id": "3", "type_of_node": "process", "shape": "task", "label": "Do 1"}, {"id": "4", "type_of_node": "process", "shape": "task", "label": "Do 2"}, {"id": "5", "type_of_node": "process", "shape": "task", "label": "Do 3"}, {"id": "6", "type_of_node": "terminator", "shape": "end_event", "label": "Stop"}], "edges": [{"source": "1", "source_type": "start", "source_label": "Start", "target": "2", "target_type": "decision", "target_label": "Make decision", "type_of_edge": "solid", "relationship_value": "", "relationship_type": "follows"}, {"source": "2", "source_type": "decision", "source_label": "Make decision", "target": "3", "target_type": "process", "target_label": "Do 1", "type_of_edge": "solid", "r

In [9]:
SYSTEM_MESSAGE = """You are a Vision Language Model specialized in extracting structured data from visual representations of process and flow diagrams.
Your task is to analyze the provided image of a diagram and extract the relevant information into a well-structured JSON format.
The diagram includes details such as nodes and edges. each of them have their own attributes.
Focus on identifying key data fields and ensuring the output adheres to the requested JSON structure.
Provide only the JSON output based on the extracted information. Avoid additional explanations or comments."""

In [10]:
def format_data(entry):
    return [
        {
            "role": "system",
            "content": [{"type": "text", "text": SYSTEM_MESSAGE}],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    # this image is handled by qwen_vl_utils's process_visio_Info so no need to worry about pil image or path
                    "image": entry["image"],
                },
                {
                    "type": "text",
                    "text": "Extract data in JSON format",
                },
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": entry["json_string"]}],
        },
    ]

In [None]:
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [None]:
train_dataset = [(entry, format_data(entry)) for entry in train_dataset]
eval_dataset = [(entry, format_data(entry)) for entry in eval_dataset]

In [13]:
# 2 values, first value is dataset entry, the second one is the chat template with dataset applied
train_dataset[0]

({'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=504x271>,
  'filename': '548.png',
  'json_string': '{"nodes": [{"id": "1", "type_of_node": "start", "shape": "start_event", "label": "Start"}, {"id": "2", "type_of_node": "decision", "shape": "gateway", "label": "Make decision"}, {"id": "3", "type_of_node": "process", "shape": "task", "label": "Do 1"}, {"id": "4", "type_of_node": "process", "shape": "task", "label": "Do 2"}, {"id": "5", "type_of_node": "process", "shape": "task", "label": "Do 3"}, {"id": "6", "type_of_node": "terminator", "shape": "end_event", "label": "Stop"}], "edges": [{"source": "1", "source_type": "start", "source_label": "Start", "target": "2", "target_type": "decision", "target_label": "Make decision", "type_of_edge": "solid", "relationship_value": "", "relationship_type": "follows"}, {"source": "2", "source_type": "decision", "source_label": "Make decision", "target": "3", "target_type": "process", "target_label": "Do 1", "type_of_edge": "solid",

In [None]:
import torch
from peft import get_peft_model, LoraConfig
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor


MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

lora_config = LoraConfig(
    lora_alpha=16,  # how much the adapted parms contribute
    lora_dropout=0.05,  # Dropout for lora layers
    r=8,  # lower mean fewer trainable params
    bias="none",
    target_modules=["q_proj", "v_proj"],  # Query and Value project(common in GPT's)
    task_type="CAUSAL_LM",  # For referring predicting next tokens in seq
)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 1,843,200 || all params: 3,756,466,176 || trainable%: 0.0491


In [None]:
# Min and Max pixels for balancing memory usage
MAX_PIXELS = 1280 * 28 * 28
MIN_PIXELS = 256 * 28 * 28

processor = Qwen2_5_VLProcessor.from_pretrained(
    MODEL_ID, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
from qwen_vl_utils import process_vision_info


def training_collate_fn(batch):
    # Referring to the dataset applied [{'role': 'system....}...]
    _, formatted_data = zip(*batch)

    texts = [processor.apply_chat_template(entry) for entry in formatted_data]

    image_inputs = [
        process_vision_info(entry)[0]  # Only takes the PIL image
        for entry in formatted_data
    ]

    model_inputs = processor(
        text=texts,
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    )

    labels = model_inputs["input_ids"].clone()  # Cloning for later masking use

    # mask system message and image token IDs in the labels
    for i, example in enumerate(formatted_data):
        sysuser_conv = example[:-1]
        sysuser_text = processor.apply_chat_template(sysuser_conv, tokenize=False)
        sysuser_img, _ = process_vision_info(sysuser_conv)

        sysuser_inputs = processor(
            text=[sysuser_text],
            images=[sysuser_img],
            return_tensors="pt",
            padding=True,
        )

        sysuser_len = sysuser_inputs["input_ids"].shape[1]
        labels[i, :sysuser_len] = -100

    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]
    pixel_values = model_inputs["pixel_values"]
    image_grid_thw = model_inputs["image_grid_thw"]

    return input_ids, attention_mask, pixel_values, image_grid_thw, labels


In [None]:
def evaluating_collate_fn(batch):
    data, formatted_data = zip(*batch)
    ground_truth = [entry["json_string"] for entry in data]

    # Removing the assistant answer section from the formatted data
    formatted_data = [entry[:2] for entry in formatted_data]

    texts = [
        processor.apply_chat_template(entry, tokenize=False) for entry in formatted_data
    ]

    image_inputs = [process_vision_info(entry)[0] for entry in formatted_data]

    model_inputs = processor(
        text=texts,
        images=image_inputs,
        return_tensors="pt",
        padding=True,
    )
    input_ids = model_inputs["input_ids"]
    attention_mask = model_inputs["attention_mask"]
    pixel_values = model_inputs["pixel_values"]
    image_grid_thw = model_inputs["image_grid_thw"]

    return input_ids, attention_mask, pixel_values, image_grid_thw, ground_truth

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 1
NUM_WORKERS = 4

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=training_collate_fn,
    num_workers=NUM_WORKERS,
)
eval_loader = DataLoader(
    eval_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=evaluating_collate_fn,
    num_workers=NUM_WORKERS,
)

In [None]:
%pip install -q lightning nltk

In [None]:
from lightning.pytorch.loggers import CometLogger

comet_logger = CometLogger(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="qwen2-5-vl-finetune",
    workspace="mohammedsafvan",
)

CometLogger will be initialized in online mode


In [None]:
import lightning as L
from nltk import edit_distance
from torch.optim import AdamW


class Qwen2_5_Trainer(L.LightningModule):
    def __init__(self, model, processor, config):
        super().__init__()
        self.model = model
        self.processor = processor
        self.config = config

    def training_step(self, batch):
        input_ids, attention_mask, pixel_values, image_grid_thw, labels = batch
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            labels=labels,  # Masked labels
        )
        loss = outputs.loss
        self.log("train_loss", loss, logger=True, prog_bar=True)
        return loss

    def validation_step(self, batch):
        input_ids, attention_mask, pixel_values, image_grid_thw, ground_truths = batch
        generated_ids = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            max_new_tokens=1024,
        )

        # The output(generated) tokens includes the input tokens. So trimming out the input_ids from the output_ids
        trimmed_generated_ids = [
            out_ids[len(in_ids) :] for in_ids, out_ids in zip(input_ids, generated_ids)
        ]

        generated_json = self.processor.batch_decode(
            trimmed_generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        scores = []
        for generated, ground_truth in zip(generated_json, ground_truths):
            score = edit_distance(generated, ground_truth)
            score = score / max(len(generated), len(ground_truth))
            scores.append(score)

            print(f"Generated JSON : {generated}")
            print(f"Ground Truth(JSON): {ground_truth}")
            print(f"Score: {score}")

        score = sum(scores) / len(scores)
        self.log(
            "val_edit_distance",
            score,
            logger=True,
            prog_bar=True,
            batch_size=self.config.get("batch_size"),
        )
        return scores

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.config.get("lr"))
        return optimizer

    def train_dataloader(self):
        return DataLoader(
            train_dataset,
            batch_size=self.config.get("batch_size"),
            collate_fn=training_collate_fn,
            shuffle=True,
            num_workers=10,
        )

    def val_dataloader(self):
        return DataLoader(
            eval_dataset,
            batch_size=self.config.get("batch_size"),
            collate_fn=evaluating_collate_fn,
            num_workers=10,
        )


In [None]:
config = {
    "max_epochs": 10,
    "batch_size": 2,
    "lr": 2e-4,  # 5
    "check_val_every_n_epoch": 2,
    "gradient_clip_val": 1.0,
    "accumulate_grad_batches": 8,
    # "accumulate_grad_batches":2,
    "num_nodes": 1,
    "warmup_steps": 50,
    "result_path": "qwen2.5-3b-instruct-diagram-json(second)",
}

In [23]:
model_module = Qwen2_5_Trainer(model, processor, config)

In [None]:
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import Callback

# minimum val_edit_distance is good
early_stopping_callback = EarlyStopping(
    monitor="val_edit_distance", patience=3, verbose=False, mode="min"
)


class SaveCheckpoint(Callback):
    def __init__(self, result_path):
        self.result_path = result_path
        self.epoch = 0

    def on_train_epoch_end(self, trainer, pl_module):
        checkpoint_path = f"{self.result_path}/{self.epoch}"
        os.makedirs(checkpoint_path, exist_ok=True)

        pl_module.processor.save_pretrained(checkpoint_path)
        pl_module.model.save_pretrained(checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")

        self.epoch += 1

    def on_train_end(self, trainer, pl_module):
        checkpoint_path = f"{self.result_path}/latest"
        os.makedirs(checkpoint_path, exist_ok=True)

        pl_module.processor.save_pretrained(checkpoint_path)
        pl_module.model.save_pretrained(checkpoint_path)
        print(f"(Train Ended) -- Checkpoint saved at {checkpoint_path}")


In [None]:
trainer = L.Trainer(
    accelerator="gpu",
    devices=[0],  # -1 refers to use all available gpu's
    max_epochs=config.get("max_epochs"),
    check_val_every_n_epoch=config.get("check_val_every_n_epoch"),
    gradient_clip_val=config.get("gradient_clip_val"),
    accumulate_grad_batches=config.get("accumulate_grad_batches"),
    limit_val_batches=1,
    num_sanity_val_steps=0,
    log_every_n_steps=1,
    callbacks=[early_stopping_callback, SaveCheckpoint(config.get("result_path"))],
    logger=comet_logger,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(limit_val_batches=1)` was configured so 1 batch will be used.


In [26]:
trainer.fit(model_module)

You are using a CUDA device ('NVIDIA L40S') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/mohammedsafvan/qwen2-5-vl-finetune/1bf2a0473f5c4c4cb18e2a22a4eda764

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                 | Params | Mode 
-------------------------------------------------------
0 | model | PeftModelForCausalLM | 3.8 B  | train
-------------------------------------------------------
1.8 M     Trainable params
3.8 B     Non-trainable params
3.8 B     Total params
15,025.865Total estimated model params size (MB)
722       Modules in train mode
874       Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

Checkpoint saved at qwen2.5-3b-instruct-diagram-json/0


Validation: |          | 0/? [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated JSON : nodes [
    {
        "name": "Start",
        "type": "start"
    },
    {
        "name": "Order 1",
        "type": "process"
    },
    {
        "name": "Create",
        "type": "process"
    },
    {
        "name": "Flowchart",
        "type": "process"
    },
    {
        "name": "Document",
        "type": "process"
    },
    {
        "name": "Code",
        "type": "process"
    },
    {
        "name": "End",
        "type": "end"
    }
]
Ground Truth(JSON): {"nodes": [{"id": "1", "type_of_node": "start", "shape": "start_event", "label": "Start"}, {"id": "2", "type_of_node": "process", "shape": "task", "label": "Order 1"}, {"id": "3", "type_of_node": "decision", "shape": "gateway", "label": "Accept?"}, {"id": "4", "type_of_node": "process", "shape": "task", "label": "Create"}, {"id": "5", "type_of_node": "process", "shape": "task", "label": "Flowchart"}, {"id": "6", "type_of_node": "process", "shape": "task", "label": "Document"}, {"id": "7", "type_of_no

Validation: |          | 0/? [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated JSON : assistant
{"nodes": [{"id": "1", "type": "start", "label": "Start"}, {"id": "2", "type": "process", "label": "Order 1"}, {"id": "3", "type": "decision", "label": "Accept?"}, {"id": "4", "type": "process", "label": "Create"}, {"id": "5", "type": "process", "label": "Flowchart"}, {"id": "6", "type": "process", "label": "Document"}, {"id": "7", "type": "process", "label": "Code"}, {"id": "8", "type": "end", "label": "End"}], "edges": [{"source": "1", "target": "2", "type": "flow", "label": ""}, {"source": "2", "target": "3", "type": "flow", "label": ""}, {"source": "3", "target": "4", "type": "flow", "label": "Yes"}, {"source": "3", "target": "6", "type": "flow", "label": "No"}, {"source": "4", "target": "5", "type": "flow", "label": ""}, {"source": "5", "target": "6", "type": "flow", "label": ""}, {"source": "6", "target": "7", "type": "flow", "label": ""}, {"source": "7", "target": "8", "type": "flow", "label": ""}]}
Ground Truth(JSON): {"nodes": [{"id": "1", "type_of_n

Validation: |          | 0/? [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated JSON : assistant
{"nodes": [{"id": "1", "type_of_node": "start", "shape": "start_event", "label": "Start"}, {"id": "2", "type_of_node": "process", "shape": "task", "label": "Order 1"}, {"id": "3", "type_of_node": "decision", "shape": "gateway", "label": "Accept?"}, {"id": "4", "type_of_node": "process", "shape": "task", "label": "Create"}, {"id": "5", "type_of_node": "process", "shape": "task", "label": "Flowchart"}, {"id": "6", "type_of_node": "process", "shape": "task", "label": "Document"}, {"id": "7", "type_of_node": "process", "shape": "task", "label": "Code"}, {"id": "8", "type_of_node": "terminator", "shape": "end_event", "label": "End"}], "edges": [{"source": "1", "source_type": "start", "source_label": "Start", "target": "2", "target_type": "process", "target_label": "Order 1", "type_of_edge": "solid", "relationship_value": "", "relationship_type": "follows"}, {"source": "2", "source_type": "process", "source_label": "Order 1", "target": "3", "target_type": "decision



Checkpoint saved at qwen2.5-3b-instruct-diagram-json/6


Validation: |          | 0/? [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated JSON : assistant
{"nodes": [{"id": "1", "type_of_node": "start", "shape": "start_event", "label": "Start"}, {"id": "2", "type_of_node": "process", "shape": "task", "label": "Order 1"}, {"id": "3", "type_of_node": "decision", "shape": "gateway", "label": "Accept?"}, {"id": "4", "type_of_node": "process", "shape": "task", "label": "Create"}, {"id": "5", "type_of_node": "process", "shape": "task", "label": "Flowchart"}, {"id": "6", "type_of_node": "process", "shape": "task", "label": "Document"}, {"id": "7", "type_of_node": "process", "shape": "task", "label": "Code"}, {"id": "8", "type_of_node": "terminator", "shape": "end_event", "label": "End"}], "edges": [{"source": "1", "source_type": "start", "source_label": "Start", "target": "2", "target_type": "process", "target_label": "Order 1", "type_of_edge": "solid", "relationship_value": "", "relationship_type": "follows"}, {"source": "2", "source_type": "process", "source_label": "Order 1", "target": "3", "target_type": "decision

Validation: |          | 0/? [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated JSON : assistant
{"nodes": [{"id": "1", "type_of_node": "start", "shape": "start_event", "label": "Start"}, {"id": "2", "type_of_node": "process", "shape": "task", "label": "Order 1"}, {"id": "3", "type_of_node": "decision", "shape": "gateway", "label": "Accept?"}, {"id": "4", "type_of_node": "process", "shape": "task", "label": "Create"}, {"id": "5", "type_of_node": "process", "shape": "task", "label": "Flowchart"}, {"id": "6", "type_of_node": "process", "shape": "task", "label": "Document"}, {"id": "7", "type_of_node": "process", "shape": "task", "label": "Code"}, {"id": "8", "type_of_node": "terminator", "shape": "end_event", "label": "End"}], "edges": [{"source": "1", "source_type": "start", "source_label": "Start", "target": "2", "target_type": "process", "target_label": "Order 1", "type_of_edge": "solid", "relationship_value": "", "relationship_type": "follows"}, {"source": "2", "source_type": "process", "source_label": "Order 1", "target": "3", "target_type": "decision

`Trainer.fit` stopped: `max_epochs=10` reached.
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : wise_pear_5739
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/mohammedsafvan/qwen2-5-vl-finetune/1bf2a0473f5c4c4cb18e2a22a4eda764
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     loss [100]            : (0.00012576776498463005, 0.3202984631061554)
[1;38;5;39mCOMET INFO:[0m     train_loss [130]      : (0.001706618582829833, 2.562387704849243)
[1;38;5;39mCOMET INFO:[0m     val_edit_distance [5] : (0.5111111402511597, 0.9231404662132263)

Checkpoint saved at qwen2.5-3b-instruct-diagram-json/9


NameError: name 'exist_ok' is not defined

In [None]:
# The last epoch(9) is same as the latest checkpoint; so no need to load the checkpoint for more training