## Windows omni_env

## 00 Install & Import Libraries

In [None]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont, ImageColor
from IPython.display import display

## 01 Import Model

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# bitsandbytes quantization settings: 4-bit weights in the NF4 format with
# double quantization enabled, and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)

In [None]:
# Local directory holding the 4-bit Qwen2.5-VL-7B-Instruct checkpoint.
model_path = './00_Model/Qwen2.5-VL-7B-Instruct-unsloth-bnb-4bit'

# `device_map = 'auto'` already places the weights on the available device(s) while
# loading, so the model is deliberately NOT moved again with `.to(device)`: doing so
# on a bitsandbytes-quantized / dispatched model is redundant and is rejected by some
# transformers / bitsandbytes versions.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
)
# The processor bundles the tokenizer and the image preprocessing for this checkpoint.
processor = AutoProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
# Every color name known to PIL, in colormap order; appended to the hand-picked
# palettes below so there are plenty of distinct colors to cycle through.
additional_colors = list(ImageColor.colormap)

def decode_xml_points(text):
    """
    Parse a single XML element that carries numbered point coordinates.

    Coordinates are read from the attribute pairs x1/y1, x2/y2, ... in order,
    stopping at the first index for which either attribute is missing. The
    result therefore does not depend on how many other attributes (such as
    `alt`) the element happens to have.

    Args:
        text: XML source of one element, e.g.
            '<points x1="10" y1="20" alt="cup">cup</points>'.

    Returns:
        A dict with "points" (list of [x, y] pairs, kept as the attribute
        strings), "alt" (the `alt` attribute or None) and "phrase" (the
        stripped element text or None). Returns None, after printing the
        error, when `text` cannot be parsed as XML.
    """
    # Only the parse itself is expected to fail; keep the try body minimal.
    try:
        root = ET.fromstring(text)
    except (ET.ParseError, TypeError, ValueError) as e:
        print(e)
        return None

    points = []
    index = 1
    while True:
        x = root.attrib.get(f'x{index}')
        y = root.attrib.get(f'y{index}')
        if x is None or y is None:
            break
        points.append([x, y])
        index += 1

    return {
        "points": points,
        "alt": root.attrib.get('alt'),
        "phrase": root.text.strip() if root.text else None
    }

def plot_bounding_boxes(im, bounding_boxes, input_width, input_height):
    """
    Draw the model's bounding boxes (and labels, when present) on an image and display it.

    Args:
        im: PIL image to annotate. It is drawn on in place.
        bounding_boxes: Raw model response: a JSON-style list of objects (or a single
            object), optionally wrapped in a ```json fence. Each object carries a
            "bbox_2d" entry in [x1, y1, x2, y2] order and an optional "label".
        input_width: Width that the "bbox_2d" x coordinates are relative to.
        input_height: Height that the "bbox_2d" y coordinates are relative to.

    Raises:
        SyntaxError / ValueError: if the response cannot be parsed even after the
            truncation repair below.
    """

    # The image is annotated in place
    img = im
    width, height = img.size
    print(img.size)
    # Create a drawing object
    draw = ImageDraw.Draw(img)

    # Colors are cycled through, one per box
    colors = [
        'red', 'green', 'blue', 'yellow', 'orange', 'pink', 'purple', 'brown', 'gray',
        'beige', 'turquoise', 'cyan', 'magenta', 'lime', 'navy', 'maroon', 'teal',
        'olive', 'coral', 'lavender', 'violet', 'gold', 'silver',
    ] + additional_colors

    # Parsing out the markdown fencing
    bounding_boxes = parse_json(bounding_boxes)

    # The CJK font lets non-Latin labels render; fall back to PIL's built-in font
    # when the font file is not available instead of failing before drawing anything.
    try:
        font = ImageFont.truetype('./00_Dataset/NotoSansCJK-Regular.ttc', size = 14)
    except OSError:
        font = ImageFont.load_default()

    try:
        json_output = ast.literal_eval(bounding_boxes)
    except (SyntaxError, ValueError, TypeError):
        # The response was probably cut off mid-list: keep everything up to the last
        # complete `"}` and close the list by hand.
        end_idx = bounding_boxes.rfind('"}') + len('"}')
        truncated_text = bounding_boxes[:end_idx] + "]"
        json_output = ast.literal_eval(truncated_text)

    # A single detection may come back as one object rather than a list of objects.
    if isinstance(json_output, dict):
        json_output = [json_output]

    # Iterate over the bounding boxes
    for i, bounding_box in enumerate(json_output):
        # Skip entries that do not describe a box (e.g. no "bbox_2d" key)
        bbox = bounding_box.get("bbox_2d") if isinstance(bounding_box, dict) else None
        if bbox is None or len(bbox) < 4:
            continue

        # Select a color from the list
        color = colors[i % len(colors)]

        # Rescale from the model's input resolution to this image's resolution
        abs_x1 = int(bbox[0]/input_width * width)
        abs_y1 = int(bbox[1]/input_height * height)
        abs_x2 = int(bbox[2]/input_width * width)
        abs_y2 = int(bbox[3]/input_height * height)

        # Make sure (x1, y1) is the top-left corner, as draw.rectangle expects
        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1

        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1

        # Draw the bounding box
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline = color, width = 4
        )

        # Draw the text
        if "label" in bounding_box:
            draw.text((abs_x1 + 8, abs_y1 + 6), bounding_box["label"], fill = color, font = font)

    # Display the image
    display(img)


def plot_points(im, text, input_width, input_height):
    """
    Draw the points described by the model's XML response on an image and display it.

    Args:
        im: PIL image to annotate. It is drawn on in place.
        text: Raw model response containing one XML element (optionally wrapped in
            a ```xml fence) in the form understood by `decode_xml_points`.
        input_width: Width that the point x coordinates are relative to.
        input_height: Height that the point y coordinates are relative to.
    """
    img = im
    width, height = img.size
    draw = ImageDraw.Draw(img)
    colors = [
        'red', 'green', 'blue', 'yellow', 'orange', 'pink', 'purple', 'brown', 'gray',
        'beige', 'turquoise', 'cyan', 'magenta', 'lime', 'navy', 'maroon', 'teal',
        'olive', 'coral', 'lavender', 'violet', 'gold', 'silver',
    ] + additional_colors

    # Strip the markdown fencing around the XML
    xml_text = text.replace('```xml', '').replace('```', '').strip()
    data = decode_xml_points(xml_text)
    if data is None:
        # Nothing parseable: show the unannotated image inline, like the success path
        display(img)
        return
    points = data['points']
    # Label each point with the element text, falling back to the `alt` attribute
    description = data['phrase'] or data['alt']

    # The CJK font lets non-Latin labels render; fall back to PIL's built-in font
    # when the font file is not available.
    try:
        font = ImageFont.truetype('./00_Dataset/NotoSansCJK-Regular.ttc', size = 14)
    except OSError:
        font = ImageFont.load_default()

    radius = 2
    for i, point in enumerate(points):
        # Coordinates arrive as attribute strings; skip anything non-numeric
        try:
            x_in, y_in = float(point[0]), float(point[1])
        except (TypeError, ValueError):
            continue

        color = colors[i % len(colors)]
        # Rescale from the model's input resolution to this image's resolution;
        # float() also turns a tensor-valued result into a plain number for PIL.
        abs_x1 = float(x_in/input_width * width)
        abs_y1 = float(y_in/input_height * height)
        draw.ellipse([(abs_x1 - radius, abs_y1 - radius), (abs_x1 + radius, abs_y1 + radius)], fill = color)
        if description:
            draw.text((abs_x1 + 8, abs_y1 + 6), description, fill = color, font = font)

    display(img)
  

# @title Parsing JSON output
def parse_json(json_output):
    """
    Extract the contents of the first ```json fenced block from a model response.

    The opening fence is matched ignoring surrounding whitespace and letter case,
    so lines such as "```json " or "```JSON" are recognized as well.

    Args:
        json_output: Raw response text.

    Returns:
        The text between the opening ```json line and the next ``` marker, or the
        input unchanged when no ```json fence line is present.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line.strip().lower() == "```json":
            # Drop everything up to and including the opening fence,
            # then everything from the closing fence onwards.
            fenced = "\n".join(lines[i + 1:])
            return fenced.split("```")[0]
    return json_output

In [None]:
def inference(
    prompt,
    image_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 32000,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28
):
    """
    Run one image + text query through the globally loaded `model` / `processor`.

    The generated text is streamed to stdout while it is produced and printed
    again in full at the end.

    Args:
        prompt: User instruction for the image.
        image_path: Image reference placed in the message's 'image' field.
        system_prompt: Text of the system message that precedes the user turn.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_pixels: Lower pixel budget attached to the image entry.
        max_pixels: Upper pixel budget attached to the image entry.

    Returns:
        A tuple `(output_text, input_height, input_width)`. Note the order: height
        comes before width. Both sizes are derived from `image_grid_thw` and are
        the resolution the model's coordinates refer to.
    """
    messages = [
        # The system message is what makes `system_prompt` take effect.
        {
            'role' : 'system',
            'content' : system_prompt,
        },
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'image',
                    'image' : image_path,
                    'min_pixels' : min_pixels,
                    'max_pixels' : max_pixels,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )
    print('input:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # Follow the model's own placement instead of assuming a CUDA device.
    inputs = inputs.to(model.device)

    streamer = TextStreamer(processor.tokenizer, skip_special_tokens = True, skip_prompt = True)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens, streamer = streamer)
    # Keep only the newly generated tokens by cutting off the prompt prefix.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )

    print('output:\n', output_text[0])

    # Grid counts times 14 give the resized input size in pixels
    # (14 is presumably the vision patch size - TODO confirm).
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14

    return output_text[0], input_height, input_width

## 03 Run Inference

### 03.01 Detect Certain Object in the Image

In [None]:
%%time

# Image to run detection on.
image_path = './00_Dataset/cakes.png'
# Prompt in Chinese (overwritten by the English prompt assigned right after it).
prompt = '框出每一个小蛋糕的位置，以json格式输出所有的坐标'
# Prompt in English (the one actually sent, because it is assigned last).
prompt = 'Outline the position of each small cake and output all the coordinates in JSON format.'

# inference() returns (text, height, width) - height comes before width.
response, input_height, input_width = inference(prompt, image_path)

# Downscale the image in place for display, then draw the detected boxes on it.
image = Image.open(image_path)
print(image.size)
image.thumbnail([640,640], Image.Resampling.LANCZOS)
plot_bounding_boxes(image, response, input_width, input_height)

### 03.02 Detect a Specific Object Using Descriptions

In [None]:
%%time

# Image to run detection on.
image_path = './00_Dataset/cakes.png'
# Prompt in Chinese (overwritten by the English prompt assigned right after it).
prompt = '定位最右上角的棕色蛋糕，以JSON格式输出其bbox坐标'
# Prompt in English (the one actually sent, because it is assigned last).
prompt = 'Locate the top right brown cake, output its bbox coordinates using JSON format.'

# inference() returns (text, height, width) - height comes before width.
response, input_height, input_width = inference(prompt, image_path)

# Downscale the image in place for display, then draw the detected box on it.
image = Image.open(image_path)
print(image.size)
image.thumbnail([640,640], Image.Resampling.LANCZOS)
plot_bounding_boxes(image, response, input_width, input_height)

### 03.03 Point to Certain Objects in XML Format

In [None]:
%%time

# Image to point into.
image_path = './00_Dataset/cakes.png'
# Prompt in Chinese (overwritten by the English prompt assigned right after it).
prompt = '以点的形式定位图中桌子远处的擀面杖，以XML格式输出其坐标'
# Prompt in English (the one actually sent, because it is assigned last).
prompt = 'point to the rolling pin on the far side of the table, output its coordinates in XML format <points x y>object</points>'

# inference() returns (text, height, width) - height comes before width.
response, input_height, input_width = inference(prompt, image_path)

# Downscale the image in place for display, then draw the returned point(s) on it.
image = Image.open(image_path)
print(image.size)
image.thumbnail([640,640], Image.Resampling.LANCZOS)
plot_points(image, response, input_width, input_height)

### 03.04 Reasoning Capability

In [None]:
%%time

# Image to run detection on.
image_path = './00_Dataset/Origamis.jpg'
# Prompt in Chinese (overwritten by the English prompt assigned right after it).
prompt = '框出图中纸狐狸的影子，以json格式输出其bbox坐标'
# Prompt in English (the one actually sent, because it is assigned last).
prompt = 'Locate the shadow of the paper fox, report the bbox coordinates in JSON format.'

# inference() returns (text, height, width) - height comes before width.
response, input_height, input_width = inference(prompt, image_path)

# Downscale the image in place for display, then draw the detected box on it.
image = Image.open(image_path)
print(image.size)
image.thumbnail([640,640], Image.Resampling.LANCZOS)
plot_bounding_boxes(image, response, input_width, input_height)

### 03.05 Understand Relationships Across Different Instances

In [None]:
%%time

# Image to run detection on.
image_path = './00_Dataset/cartoon_brave_person.jpeg'
# Prompt in Chinese (overwritten by the English prompt assigned right after it).
prompt = '框出图中见义勇为的人，以json格式输出其bbox坐标'
# Prompt in English (the one actually sent, because it is assigned last).
prompt = 'Locate the person who act bravely, report the bbox coordinates in JSON format.'

# inference() returns (text, height, width) - height comes before width.
response, input_height, input_width = inference(prompt, image_path)

# Downscale the image in place for display, then draw the detected box on it.
image = Image.open(image_path)
print(image.size)
image.thumbnail([640,640], Image.Resampling.LANCZOS)
plot_bounding_boxes(image, response, input_width, input_height)

### 03.06 Find a Special Instance with Unique Characteristic (Color, Location, Utility, ...)

In [None]:
%%time

# Image to run detection on.
image_path = './00_Dataset/multiple_items.png'
# Prompt in Chinese (overwritten by the English prompt assigned right after it).
prompt = '如果太阳很刺眼，我应该用这张图中的什么物品，框出该物品在图中的bbox坐标，并以json格式输出'
# Prompt in English (the one actually sent, because it is assigned last).
prompt = 'If the sun is very glaring, which item in this image should I use? Please locate it in the image with its bbox coordinates and its name and output in JSON format.'

# inference() returns (text, height, width) - height comes before width.
response, input_height, input_width = inference(prompt, image_path)

# Downscale the image in place for display, then draw the detected box on it.
image = Image.open(image_path)
print(image.size)
image.thumbnail([640,640], Image.Resampling.LANCZOS)
plot_bounding_boxes(image, response, input_width, input_height)

### 03.07 Use Qwen2.5-VL Grounding Capabilities to Help Counting

In [None]:
%%time

# Image to run detection on.
image_path = './00_Dataset/multiple_items.png'
# Prompt in Chinese (overwritten by the English prompt assigned right after it).
prompt = '请以JSON格式输出图中所有物体bbox的坐标以及它们的名字，然后基于检测结果回答以下问题：图中物体的数目是多少？'
# Prompt in English (the one actually sent, because it is assigned last).
prompt = 'Please first output bbox coordinates and names of every item in this image in JSON format, and then answer how many items are there in the image.'

# inference() returns (text, height, width) - height comes before width.
response, input_height, input_width = inference(prompt, image_path)

# Downscale the image in place for display, then draw the detected boxes on it.
image = Image.open(image_path)
print(image.size)
image.thumbnail([640,640], Image.Resampling.LANCZOS)
plot_bounding_boxes(image, response, input_width, input_height)

In [None]:
%%time

# Image to run detection on.
image_path = './00_Dataset/cakes.png'
# Custom system prompt asking for plain-text 'x1,y1,x2,y2 object' lines instead of JSON.
system_prompt = "As an AI assistant, you specialize in accurate image object detection, delivering coordinates in plain text format 'x1,y1,x2,y2 object'."
prompt = 'find all cakes'
# Only the raw response is collected here; nothing is plotted in this cell.
response, input_height, input_width = inference(prompt, image_path, system_prompt = system_prompt)