## Windows omni_env

## 00 Install & Import Libraries

In [None]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont, ImageColor
from IPython.display import display, Markdown

## 01 Import Model

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# bitsandbytes settings for loading the model weights in 4-bit:
# NF4 quantization type, float16 compute dtype, double quantization enabled.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Local directory holding the Qwen2.5-VL checkpoint and its processor config.
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'

# Load the weights quantized with bnb_config. device_map='auto' already places
# the model on the available device(s), so no extra .to(...) call is made:
# moving a bitsandbytes-quantized / accelerate-dispatched model afterwards is
# redundant and is rejected with a ValueError by many transformers versions.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
)
processor = AutoProcessor.from_pretrained(model_path)

## 02 Define Inference Function

In [None]:
def plot_text_bounding_boxes(image_path, bounding_boxes, input_width, input_height):
    """
    Draw text-spotting boxes (and their text, when present) on an image and display it.

    Args:
        image_path: The path to the image file.
        bounding_boxes: The model response as a string, optionally wrapped in a
            ```json fence. It must decode to a list of dicts; each dict needs a
            "bbox_2d" entry read as [x1, y1, x2, y2] in the coordinate space of
            the model input, and may carry a "text_content" entry to draw.
        input_width: Width of the model input that the box coordinates refer to.
        input_height: Height of the model input that the box coordinates refer to.

    Raises:
        SyntaxError, ValueError: If the response is neither valid JSON nor a
            valid Python literal.
    """

    # Load the image
    img = Image.open(image_path)
    width, height = img.size
    print(img.size)
    # Create a drawing object
    draw = ImageDraw.Draw(img)

    # Parsing out the markdown fencing
    bounding_boxes = parse_json(bounding_boxes)

    # The model is asked for JSON, so decode it as JSON first (this accepts
    # true/false/null). Fall back to a Python literal for responses that use
    # single quotes or other Python-only syntax.
    try:
        boxes = json.loads(bounding_boxes)
    except json.JSONDecodeError:
        boxes = ast.literal_eval(bounding_boxes)

    font = ImageFont.truetype('./00_Dataset/NotoSansCJK-Regular.ttc', size = 10)
    color = 'green'

    # Iterate over the bounding boxes
    for bounding_box in boxes:
        x1, y1, x2, y2 = bounding_box["bbox_2d"][:4]

        # Rescale from model-input coordinates to the original image size
        abs_x1 = int(x1 / input_width * width)
        abs_y1 = int(y1 / input_height * height)
        abs_x2 = int(x2 / input_width * width)
        abs_y2 = int(y2 / input_height * height)

        # Make sure (x1, y1) is the top-left and (x2, y2) the bottom-right corner
        abs_x1, abs_x2 = sorted((abs_x1, abs_x2))
        abs_y1, abs_y2 = sorted((abs_y1, abs_y2))

        # Draw the bounding box
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline = color, width = 1
        )

        # Draw the text just below the box
        if 'text_content' in bounding_box:
            draw.text((abs_x1, abs_y2), bounding_box['text_content'], fill = color, font = font)

    # Display the image
    display(img)

# @title Parsing JSON output
def parse_json(json_output):
    """
    Return the payload inside the first ```json fenced block of a response.

    Args:
        json_output: Raw model response text.

    Returns:
        The text between the first line that is a ```json fence and the next
        ``` marker. If no such opening fence exists, the input is returned
        unchanged.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        # Compare leniently: models often emit trailing spaces, indentation or
        # a different case on the fence line, which an exact match would miss.
        if line.strip().lower() == "```json":
            fenced = "\n".join(lines[i + 1:])  # Drop everything up to the opening fence
            return fenced.split("```")[0]  # Drop the closing fence and anything after it
    return json_output

In [None]:
def inference(
    prompt,
    image_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 4096,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28,
    return_input = False
):
    """
    Run one image + text prompt through the global `model` / `processor`.

    The rendered prompt and the decoded answer are printed, and generated
    tokens are streamed to stdout while they are produced.

    Args:
        prompt: User instruction sent together with the image.
        image_path: Image reference placed in the 'image' field of the message.
        system_prompt: Text of the system message. When empty or None no system
            message is added and the chat template decides what to use.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_pixels: 'min_pixels' value attached to the image entry.
        max_pixels: 'max_pixels' value attached to the image entry.
        return_input: When True, also return the processor output.

    Returns:
        The decoded response string, or a `(response, inputs)` tuple when
        `return_input` is True.
    """
    messages = []
    if system_prompt:
        # Honour the caller's system prompt; it used to be silently ignored.
        messages.append({'role' : 'system', 'content' : system_prompt})
    messages.append(
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'image',
                    'image' : image_path,
                    'min_pixels' : min_pixels,
                    'max_pixels' : max_pixels,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    )

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )
    print('input:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # Follow the model's device instead of hard-coding 'cuda'.
    inputs = inputs.to(model.device)

    streamer = TextStreamer(processor.tokenizer, skip_special_tokens = True, skip_prompt = True)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens, streamer = streamer)
    # Keep only the newly generated tokens by cutting off the prompt prefix.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )

    print('output:\n', output_text[0])

    if return_input:
        return output_text[0], inputs
    return output_text[0]

## 03 Run Inference

### 03.01 Full-page OCR for English Text

In [None]:
%%time
# The %%time cell magic has to stay on the first line of the cell.
# Demo: ask the model to transcribe all text of one image.

image_path = './00_Dataset/ocr_example2.jpg'
prompt = 'Read all the text in the image.'

# Preview only: the resized copy is displayed, while inference() below
# receives the original file path.
image = Image.open(image_path)
display(image.resize((400, 600)))

# inference() already prints the response; print() shows the returned string again.
response = inference(prompt, image_path)
print(response)

### 03.02 Full-page OCR for Multilingual Text

In [None]:
%%time
# The %%time cell magic has to stay on the first line of the cell.
# Demo: request only the text content of the image, without extra description.

image_path = './00_Dataset/ocr_example1.jpg'
prompt = 'Please output only the text content from the image without any additional descriptions or formatting.'

# Preview only: the resized copy is displayed, while inference() below
# receives the original file path.
image = Image.open(image_path)
display(image.resize((400, 400)))

# inference() already prints the response; print() shows the returned string again.
response = inference(prompt, image_path)
print(response)

In [None]:
%%time
# The %%time cell magic has to stay on the first line of the cell.
# Demo: same OCR request, issued with a Chinese prompt
# ("Please recognise all the text in the image.").

image_path = './00_Dataset/ocr_example5.jpg'
prompt = '请识别出图中所有的文字。'

# Preview only: the resized copy is displayed, while inference() below
# receives the original file path.
image = Image.open(image_path)
display(image.resize((512, 512)))

# inference() already prints the response; print() shows the returned string again.
response = inference(prompt, image_path)
print(response)

### 03.03 Text Spotting

In [None]:
%%time
# The %%time cell magic has to stay on the first line of the cell.
# Demo: line-level text spotting, then drawing the returned boxes on the image.

image_path = './00_Dataset/ocr_example3.jpg'
prompt = 'Spotting all the text in the image with line-level, and output in JSON format.'

# Preview only: the resized copy is displayed, while inference() below
# receives the original file path.
image = Image.open(image_path)
display(image.resize((512, 512)))

# return_input = True makes inference() return (response, inputs) so the
# processor output can be inspected below.
response, inputs = inference(prompt, image_path, return_input = True)

display(Markdown(response))
# Size of the model input derived from the first image's grid entry.
# NOTE(review): 14 is presumably the vision patch size in pixels - confirm
# against the processor configuration.
input_height = inputs['image_grid_thw'][0][1] * 14
input_width = inputs['image_grid_thw'][0][2] * 14
print(input_height, input_width)
# The plot function uses these sizes to rescale the boxes to the original image.
plot_text_bounding_boxes(image_path, response, input_width, input_height)

### 03.04 Visual Information Extraction

In [None]:
%%time
# The %%time cell magic has to stay on the first line of the cell.
# Demo: key-value extraction; the prompt embeds the JSON template to fill in.

image_path = './00_Dataset/ocr_example3.jpg'
prompt = "Extract the key-value information in the format:{\"company\": \"\", \"date\": \"\", \"address\": \"\", \"total\": \"\"}"

# Preview only: the resized copy is displayed, while inference() below
# receives the original file path.
image = Image.open(image_path)
display(image.resize((300, 500)))

# Render the response as Markdown in the notebook output.
response = inference(prompt, image_path)
display(Markdown(response))

In [None]:
%%time
# The %%time cell magic has to stay on the first line of the cell.
# Demo: key-value extraction with a Chinese prompt. It asks for the listed
# fields (invoice code, invoice number, arrival station, fuel surcharge,
# fare, travel date, departure time, train number, seat number) as JSON.

image_path = './00_Dataset/ocr_example4.jpg'
prompt = "提取图中的：['发票代码','发票号码','到站','燃油费','票价','乘车日期','开车时间','车次','座号']，并且按照json格式输出。"

# Preview only: the resized copy is displayed, while inference() below
# receives the original file path.
image = Image.open(image_path)
display(image.resize((400, 400)))

# Render the response as Markdown in the notebook output.
response = inference(prompt, image_path)
display(Markdown(response))