In [1]:
# Install the notebook's runtime dependencies into the kernel's environment (-q keeps pip output quiet).
%pip install -q "transformers>=4.49.0" accelerate "qwen-vl-utils[decord]==0.0.8"

In [2]:
import json
import PIL

In [3]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor


# Checkpoint identifier handed to both from_pretrained calls below.
MODEL_ID = "zackriya/diagram2graph"

# Pixel bounds forwarded to the processor, written as multiples of 28 * 28.
# NOTE(review): 28 * 28 is presumably the pixel area of one visual token for
# this model family -- confirm against the Qwen2.5-VL documentation.
MAX_PIXELS = 1280 * 28 * 28
MIN_PIXELS = 256 * 28 * 28

# Load the weights in bfloat16 and let device_map="auto" decide placement.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto"
)

# The processor is loaded from the same checkpoint with the pixel bounds above.
processor = Qwen2_5_VLProcessor.from_pretrained(
    MODEL_ID, max_pixels=MAX_PIXELS, min_pixels=MIN_PIXELS
)

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/7.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [28]:
from qwen_vl_utils import process_vision_info

# System prompt sent as the first ("system") chat message by run_inference.
# NOTE(review): the wording is left untouched on purpose -- it may need to match
# the prompt the checkpoint was fine-tuned with; confirm before editing it.
SYSTEM_MESSAGE = """You are a Vision Language Model specialized in extracting structured data from visual representations of process and flow diagrams.
Your task is to analyze the provided image of a diagram and extract the relevant information into a well-structured JSON format.
The diagram includes details such as nodes and edges. each of them have their own attributes.
Focus on identifying key data fields and ensuring the output adheres to the requested JSON structure.
Provide only the JSON output based on the extracted information. Avoid additional explanations or comments."""

def run_inference(image, max_new_tokens=1024):
  """
  Run the diagram-to-graph model on one image and return the decoded text.

  Args:
    image: Image reference placed in the user message and resolved by
      qwen_vl_utils.process_vision_info (this notebook passes a URL).
    max_new_tokens: Upper bound on the number of generated tokens. Defaults
      to 1024, the value that used to be hard-coded; raise it when the
      returned JSON is cut off for large diagrams.

  Returns:
    A list of decoded strings, one per prompt (a single-element list here),
    containing only the newly generated tokens, not the echoed prompt.
  """
  messages = [
      {
          "role": "system",
          "content": [{"type": "text", "text": SYSTEM_MESSAGE}],
      },
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  # Resolved by qwen_vl_utils.process_vision_info below, so the
                  # value is passed through as given.
                  "image": image,
              },
              {
                  "type": "text",
                  "text": "Extract data in JSON format, Only give the JSON",
              },
          ],
      },
  ]

  # Render the chat into a prompt string that ends with the generation prompt.
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
  # Second return value (video inputs) is unused: the messages hold one image.
  image_inputs, _ = process_vision_info(messages)

  inputs = processor(
      text=[text],
      images=image_inputs,
      return_tensors="pt",
  )
  # The model is loaded with device_map="auto", so follow the device it was
  # actually placed on instead of assuming CUDA is available.
  inputs = inputs.to(model.device)

  generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
  # generate() returns prompt + continuation; drop the prompt tokens per row.
  generated_ids_trimmed = [
      out_ids[len(in_ids):]
      for in_ids, out_ids
      in zip(inputs.input_ids, generated_ids)
  ]

  output_text = processor.batch_decode(
      generated_ids_trimmed,
      skip_special_tokens=True,
      clean_up_tokenization_spaces=False
  )
  return output_text


In [29]:
# Per the Qwen documentation, the image may be given as a PIL.Image, a relative path, or a URL.
output = run_inference(
    "https://uwaterloo.ca/vpaf-project-management-office/sites/default/files/uploads/images/process-flow-2013-11-15.png"
)
output

['{"nodes": [{"id": "1", "type_of_node": "start", "shape": "start_event", "label": "Phone call is placed."}, {"id": "2", "type_of_node": "decision", "shape": "gateway", "label": "Is ringer turned on?"}, {"id": "3", "type_of_node": "process", "shape": "task", "label": "Voice mail picks up."}, {"id": "4", "type_of_node": "process", "shape": "task", "label": "Phone rings."}, {"id": "5", "type_of_node": "decision", "shape": "gateway", "label": "Does user pick up?"}, {"id": "6", "type_of_node": "decision", "shape": "gateway", "label": "Is it ring #4?"}, {"id": "7", "type_of_node": "terminator", "shape": "end_event", "label": "Phone call is complete."}], "edges": [{"source": "1", "source_type": "start", "source_label": "Phone call is placed.", "target": "2", "target_type": "decision", "target_label": "Is ringer turned on?", "type_of_edge": "solid", "relationship_value": "", "relationship_type": "follows"}, {"source": "2", "source_type": "decision", "source_label": "Is ringer turned on?", "ta

In [30]:
# run_inference returns a list of decoded strings; parse the first entry as JSON.
# json.loads raises json.JSONDecodeError if the text is not valid JSON, e.g. when
# generation hit the max_new_tokens limit before the JSON was complete.
json.loads(output[0])

{'nodes': [{'id': '1',
   'type_of_node': 'start',
   'shape': 'start_event',
   'label': 'Phone call is placed.'},
  {'id': '2',
   'type_of_node': 'decision',
   'shape': 'gateway',
   'label': 'Is ringer turned on?'},
  {'id': '3',
   'type_of_node': 'process',
   'shape': 'task',
   'label': 'Voice mail picks up.'},
  {'id': '4',
   'type_of_node': 'process',
   'shape': 'task',
   'label': 'Phone rings.'},
  {'id': '5',
   'type_of_node': 'decision',
   'shape': 'gateway',
   'label': 'Does user pick up?'},
  {'id': '6',
   'type_of_node': 'decision',
   'shape': 'gateway',
   'label': 'Is it ring #4?'},
  {'id': '7',
   'type_of_node': 'terminator',
   'shape': 'end_event',
   'label': 'Phone call is complete.'}],
 'edges': [{'source': '1',
   'source_type': 'start',
   'source_label': 'Phone call is placed.',
   'target': '2',
   'target_type': 'decision',
   'target_label': 'Is ringer turned on?',
   'type_of_edge': 'solid',
   'relationship_value': '',
   'relationship_type': 