## 00 Setup & Import Libraries

In [None]:
%%capture
!pip install qwen-vl-utils
!pip install qwen_agent
!pip install openai
!pip install icecream
!pip install dotenv
!pip install bitsandbytes accelerate

In [None]:
%%capture
!git clone https://github.com/QwenLM/Qwen2.5-VL.git
%cd Qwen2.5-VL

In [None]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET
import os.path as osp
import math
import numpy as np
import base64
import hashlib
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from enum import Enum
from typing import Optional, Dict, Any, List, Tuple, Union
from copy import deepcopy

from icecream import ic
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info, smart_resize
from PIL import Image, ImageDraw, ImageFont, ImageColor
from IPython.display import display, Markdown, Video
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup
import re

from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
    NousFnCallPrompt,
    Message,
    ContentItem,
)
from cookbooks.utils.agent_function_call import ComputerUse, MobileUse

## 01 Load Model

In [None]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Identifier of the checkpoint handed to from_pretrained() below.
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

In [None]:
# Disabled alternative: load the model without a quantization config.
# The code is wrapped in a string literal, so this cell only evaluates a string
# and loads nothing. Remove the surrounding quotes to enable it.
'''model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)

processor = AutoProcessor.from_pretrained(model_id) #'''

In [None]:
# Quantization settings for loading the checkpoint: 4-bit weights using the
# 'nf4' quant type with double quantization enabled and a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the model with the quantization config above. Placement is requested via
# device_map='auto'; the explicit .to(device) call is left commented out.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
)#.to(device)

# Processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(model_id) #'''

## 02 Define Inference Function

In [None]:
# Task Types Enumeration
class TaskType(Enum):
    """Closed set of task kinds understood by the unified inference entry point.

    Each member's value is the lowercase string name of the task.
    """

    # Agent tasks driven by a screenshot and a user query.
    COMPUTER_AGENT = "computer_agent"
    DOCUMENT_PARSING = "document_parsing"
    MOBILE_AGENT = "mobile_agent"
    # Single-image tasks.
    OCR = "ocr"
    RECOGNITION = "recognition"
    SPATIAL_UNDERSTANDING = "spatial_understanding"
    # Video tasks.
    VIDEO_INFERENCE = "video_inference"
    VIDEO_UNDERSTANDING = "video_understanding"

# System Prompts for Different Tasks
# Maps every TaskType member to the system prompt text associated with it.
# UnifiedQwenInference.unified_inference looks entries up with
# SYSTEM_PROMPTS.get(task_type, <generic default>).
SYSTEM_PROMPTS = {
    TaskType.COMPUTER_AGENT: "You are a helpful assistant that can control computer interfaces through function calls.",
    TaskType.DOCUMENT_PARSING: "You are an AI specialized in recognizing and extracting text from images. Your mission is to analyze the image document and generate the result in QwenVL Document Parser HTML format using specified tags while maintaining user privacy and data integrity.",
    TaskType.MOBILE_AGENT: "You are a helpful assistant that can control mobile interfaces through function calls.",
    TaskType.OCR: "You are a helpful assistant specialized in optical character recognition and text extraction from images.",
    TaskType.RECOGNITION: "You are a helpful assistant specialized in image recognition and detailed description.",
    TaskType.SPATIAL_UNDERSTANDING: "You are a helpful assistant specialized in understanding spatial relationships and object positioning in images.",
    TaskType.VIDEO_INFERENCE: "You are a helpful assistant specialized in analyzing and describing video content.",
    TaskType.VIDEO_UNDERSTANDING: "You are a helpful assistant specialized in detailed video analysis and understanding temporal relationships."
}

# Utility Functions
# Every colour name known to PIL's ImageColor, in colormap order; appended to the
# hand-picked palette when drawing bounding boxes.
additional_colors = list(ImageColor.colormap)

def download_font_if_needed():
    """Return the path of the cached Noto Sans CJK font, downloading it if absent.

    The font is cached at ``./fonts/NotoSansCJK-Regular.ttc``. The download is
    written to a temporary ``.part`` file and moved into place only after it
    completes, so an interrupted transfer never leaves a truncated file that
    later calls would mistake for a usable font.

    Returns:
        The font path when the file exists or was downloaded, otherwise ``None``.
    """
    font_dir = './fonts'
    font_path = os.path.join(font_dir, 'NotoSansCJK-Regular.ttc')

    if os.path.exists(font_path):
        return font_path

    partial_path = font_path + '.part'
    try:
        import urllib.request
        os.makedirs(font_dir, exist_ok=True)

        # Noto Sans CJK, fetched from the googlefonts/noto-cjk GitHub repository
        font_url = "https://github.com/googlefonts/noto-cjk/raw/main/Sans/OTC/NotoSansCJK-Regular.ttc"
        print(f"Downloading font from {font_url}...")
        urllib.request.urlretrieve(font_url, partial_path)
        os.replace(partial_path, font_path)
        print(f"Font downloaded successfully to {font_path}")
        return font_path
    except Exception as e:
        # Deliberately best-effort: callers fall back to other fonts on None.
        print(f"Failed to download font: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or cleanup is not possible
        return None

def get_font(size=14):
    """Return a PIL font of the requested size, trying several sources in order.

    Order of attempts:
      1. The font returned by ``download_font_if_needed()``.
      2. A list of well-known font locations. Entries may contain glob
         wildcards (the conda entry does), which are expanded here.
      3. The first five fonts reported by matplotlib's font manager.
      4. PIL's built-in default font.

    Args:
        size: Point size passed to ``ImageFont.truetype``.

    Returns:
        A PIL font object, or ``None`` if even the default font cannot be loaded.
    """
    import glob

    # Try to download font if in cloud environment (like Kaggle)
    downloaded_font = download_font_if_needed()

    candidates = [
        downloaded_font,  # Downloaded font (if available)
        './00_Dataset/NotoSansCJK-Regular.ttc',  # Original path
        './fonts/NotoSansCJK-Regular.ttc',  # Downloaded location
        '/System/Library/Fonts/Arial.ttf',  # macOS
        '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',  # Linux
        'C:/Windows/Fonts/arial.ttf',  # Windows
        '/usr/share/fonts/TTF/DejaVuSans.ttf',  # Some Linux distributions
        '/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf',  # Common on Linux
        '/opt/conda/lib/python3.*/site-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf',  # Conda environments
    ]

    for candidate in candidates:
        if candidate is None:
            continue
        # glob.glob yields only existing paths: a literal path comes back as-is,
        # and the wildcard entry is expanded (os.path.exists never matched it).
        for font_path in sorted(glob.glob(candidate)):
            try:
                return ImageFont.truetype(font_path, size)
            except (OSError, IOError):
                continue

    # Try matplotlib fonts (common in Kaggle/Colab)
    try:
        import matplotlib.font_manager as fm
        for font_file in fm.findSystemFonts()[:5]:  # Try first 5 fonts
            try:
                return ImageFont.truetype(font_file, size)
            except Exception:
                continue
    except ImportError:
        pass

    # If no truetype font is found, use default font
    try:
        return ImageFont.load_default()
    except Exception:
        return None

def draw_point(image: Image.Image, point: list, color=None):
    """Return an RGB copy of *image* with a translucent marker drawn at *point*.

    The marker is a filled circle whose radius is 5% of the image's shorter
    side, with a small opaque green dot at its centre.

    Args:
        image: Image to annotate; it is not modified.
        point: ``[x, y]`` position of the marker in *image* pixel coordinates.
        color: Colour name/spec understood by ``ImageColor.getrgb``. Anything
            that is not a string, or a string that cannot be parsed, falls back
            to red.

    Returns:
        A new RGB image with the marker composited on top.
    """
    default_fill = (255, 0, 0, 128)
    fill = default_fill
    if isinstance(color, str):
        try:
            # getrgb() may return RGB or RGBA; keep the RGB part only so the
            # overlay colour is always a valid 4-tuple with alpha 128.
            fill = ImageColor.getrgb(color)[:3] + (128,)
        except ValueError:
            fill = default_fill

    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    radius = min(image.size) * 0.05
    x, y = point

    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        fill=fill
    )

    center_radius = radius * 0.1
    overlay_draw.ellipse(
        [(x - center_radius, y - center_radius),
         (x + center_radius, y + center_radius)],
        fill=(0, 255, 0, 255)
    )

    # alpha_composite requires both layers to be RGBA.
    combined = Image.alpha_composite(image.convert('RGBA'), overlay)
    return combined.convert('RGB')

def parse_json(json_output):
    """Extract the body of the first ```json fenced block from a model response.

    The opening fence is recognised even when the line carries leading or
    trailing whitespace. Everything after it, up to the next ``` (or the end of
    the text when the fence is never closed), is returned.

    Args:
        json_output: Raw text produced by the model.

    Returns:
        The fenced payload, or *json_output* unchanged when no ```json fence
        is present.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line.strip() == "```json":
            remainder = "\n".join(lines[i + 1:])
            return remainder.split("```")[0]
    return json_output

def decode_xml_points(text):
    """Decode a single XML element carrying ``x1``/``y1``, ``x2``/``y2``, ... attributes.

    Points are read by looking up consecutive ``x<i>``/``y<i>`` attribute pairs
    starting at 1, so the result does not depend on how many other attributes
    (such as ``alt``) the element has.

    Args:
        text: XML text containing one root element.

    Returns:
        A dict with ``points`` (list of ``[x, y]`` attribute strings), ``alt``
        (the ``alt`` attribute or ``None``) and ``phrase`` (stripped element
        text or ``None``). Returns ``None`` when *text* cannot be parsed; the
        error is printed.
    """
    try:
        root = ET.fromstring(text)
    except Exception as e:
        # Best-effort decoding: report the problem and signal failure with None.
        print(e)
        return None

    attrs = root.attrib
    points = []
    index = 1
    while f'x{index}' in attrs and f'y{index}' in attrs:
        points.append([attrs[f'x{index}'], attrs[f'y{index}']])
        index += 1

    return {
        "points": points,
        "alt": attrs.get('alt'),
        "phrase": root.text.strip() if root.text else None
    }

def clean_and_format_html(full_predict):
    """Strip layout/visual annotations from document-parsing HTML output.

    Steps applied to the parsed markup:
      * remove ``color:`` declarations from inline styles (dropping the
        ``style`` attribute when nothing else remains);
      * remove ``data-bbox`` and ``data-polygon`` attributes;
      * collapse the ``formula.machine_printed`` / ``formula.handwritten``
        classes into ``formula``;
      * empty ``image caption`` divs and relabel them ``image``;
      * empty ``music sheet``, ``chemical formula`` and ``chart`` elements and
        drop their ``format`` attribute.

    Args:
        full_predict: HTML text produced by the model.

    Returns:
        The cleaned top-level elements wrapped in ``<html><body>`` inside a
        ```html fenced block.
    """
    soup = BeautifulSoup(full_predict, 'html.parser')

    color_pattern = re.compile(r'\bcolor:[^;]+;?')
    for tag in soup.find_all(style=True):
        new_style = color_pattern.sub('', tag.get('style', ''))
        if not new_style.strip():
            del tag['style']
        else:
            tag['style'] = new_style.rstrip(';')

    for attr in ["data-bbox", "data-polygon"]:
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]

    classes_to_update = ['formula.machine_printed', 'formula.handwritten']
    for tag in soup.find_all(class_=True):
        if 'class' in tag.attrs:
            new_classes = [cls if cls not in classes_to_update else 'formula' for cls in tag.get('class', [])]
            # dict.fromkeys de-duplicates while keeping the original order.
            tag['class'] = list(dict.fromkeys(new_classes))

    for div in soup.find_all('div', class_='image caption'):
        div.clear()
        div['class'] = ['image']

    classes_to_clean = ['music sheet', 'chemical formula', 'chart']
    for class_name in classes_to_clean:
        for tag in soup.find_all(class_=class_name):
            tag.clear()
            if 'format' in tag.attrs:
                del tag['format']

    # html.parser does not synthesise a <body>; when the model emits a bare
    # fragment, fall back to the document root instead of failing on None.
    root = soup.body if soup.body is not None else soup

    output = []
    for child in root.children:
        # Keep element nodes only; bare text between elements is dropped.
        if hasattr(child, 'name') and child.name:
            output.append(str(child))
            output.append('\n')
    complete_html = f"""```html\n<html><body>\n{"".join(output)}</body></html>\n```"""
    return complete_html

# Visualization Functions
def draw_bbox(image_path, resized_width, resized_height, full_predict):
    """Draw the ``data-bbox`` rectangles found in parsed-document HTML and display the image.

    Box coordinates in *full_predict* are expressed in the resized
    (``resized_width`` x ``resized_height``) image space and are scaled back to
    the size of the image loaded from *image_path* before drawing.

    Args:
        image_path: Local path, or a URL starting with ``http``, of the image.
        resized_width: Width of the coordinate space used by the boxes.
        resized_height: Height of the coordinate space used by the boxes.
        full_predict: HTML text whose elements carry ``data-bbox="x1 y1 x2 y2"``.
    """
    if image_path.startswith('http'):
        # Standard-library download; the file never imports `requests`.
        import urllib.request
        with urllib.request.urlopen(image_path) as response:
            image = Image.open(io.BytesIO(response.read()))
    else:
        image = Image.open(image_path)

    # Loop-invariant scale from original-image pixels to resized-image pixels.
    scale_x = resized_width / image.width
    scale_y = resized_height / image.height

    soup = BeautifulSoup(full_predict, 'html.parser')
    # <ol> containers are skipped; every other element carrying a bbox is drawn.
    elements = [el for el in soup.find_all(attrs={'data-bbox': True}) if el.name != 'ol']

    font = get_font(20)
    draw = ImageDraw.Draw(image)

    for element in elements:
        try:
            x1, y1, x2, y2 = map(int, element['data-bbox'].split())
        except ValueError:
            # Malformed box (non-integer or wrong number of values): skip it
            # rather than abandoning the whole visualisation.
            continue

        # Map to original-image pixels and normalise the corner order.
        left, right = sorted((int(x1 / scale_x), int(x2 / scale_x)))
        top, bottom = sorted((int(y1 / scale_y), int(y2 / scale_y)))

        draw.rectangle([left, top, right, bottom], outline='red', width=2)
        text = element.get_text(strip=True)
        if font:
            draw.text((left, bottom), text, fill='black', font=font)
        else:
            draw.text((left, bottom), text, fill='black')

    display(image)

def plot_bounding_boxes(im, bounding_boxes, input_width, input_height):
    """Draw labelled boxes from a model response onto *im* and display it.

    Args:
        im: PIL image to draw on (it is drawn on in place).
        bounding_boxes: Model response text; a ```json fence is stripped first.
            Each entry must provide ``bbox_2d`` as ``[x1, y1, x2, y2]`` and may
            provide ``label``.
        input_width: Width of the coordinate space the boxes are expressed in.
        input_height: Height of the coordinate space the boxes are expressed in.
    """
    img_w, img_h = im.size
    canvas = ImageDraw.Draw(im)

    palette = [
        'red', 'green', 'blue', 'yellow', 'orange', 'pink', 'purple', 'brown', 'gray',
        'beige', 'turquoise', 'cyan', 'magenta', 'lime', 'navy', 'maroon', 'teal',
        'olive', 'coral', 'lavender', 'violet', 'gold', 'silver',
    ] + additional_colors

    payload = parse_json(bounding_boxes)
    label_font = get_font(14)
    # Only pass a font argument when one could be loaded.
    font_kwargs = {'font': label_font} if label_font else {}

    try:
        boxes = ast.literal_eval(payload)
    except Exception:
        # Output was probably cut off: keep everything up to the last complete
        # entry and close the list by hand.
        cut = payload.rfind('"}') + len('"}')
        boxes = ast.literal_eval(payload[:cut] + "]")

    for idx, box in enumerate(boxes):
        colour = palette[idx % len(palette)]

        # Rescale from model-input coordinates to this image's pixel grid.
        top = int(box["bbox_2d"][1]/input_height * img_h)
        left = int(box["bbox_2d"][0]/input_width * img_w)
        bottom = int(box["bbox_2d"][3]/input_height * img_h)
        right = int(box["bbox_2d"][2]/input_width * img_w)

        # Normalise so that (left, top) is the upper-left corner.
        if left > right:
            left, right = right, left
        if top > bottom:
            top, bottom = bottom, top

        canvas.rectangle(((left, top), (right, bottom)), outline=colour, width=4)

        if "label" in box:
            canvas.text((left + 8, top + 6), box["label"], fill=colour, **font_kwargs)

    display(im)

def plot_text_bounding_boxes(image_path, bounding_boxes, input_width, input_height):
    """Draw OCR text boxes from a model response onto an image and display it.

    Args:
        image_path: Path of the image to open and draw on.
        bounding_boxes: Model response text; a ```json fence is stripped first.
            Each entry must provide ``bbox_2d`` as ``[x1, y1, x2, y2]`` and may
            provide ``text_content``.
        input_width: Width of the coordinate space the boxes are expressed in.
        input_height: Height of the coordinate space the boxes are expressed in.
    """
    img = Image.open(image_path)
    width, height = img.size
    draw = ImageDraw.Draw(img)

    bounding_boxes = parse_json(bounding_boxes)
    font = get_font(10)

    try:
        boxes = ast.literal_eval(bounding_boxes)
    except Exception:
        # Same recovery as plot_bounding_boxes: when generation stopped
        # mid-list, keep everything up to the last complete entry and close
        # the list by hand.
        end_idx = bounding_boxes.rfind('"}') + len('"}')
        boxes = ast.literal_eval(bounding_boxes[:end_idx] + "]")

    color = 'green'
    for bounding_box in boxes:
        # Rescale from model-input coordinates to this image's pixel grid.
        abs_y1 = int(bounding_box["bbox_2d"][1] / input_height * height)
        abs_x1 = int(bounding_box["bbox_2d"][0] / input_width * width)
        abs_y2 = int(bounding_box["bbox_2d"][3] / input_height * height)
        abs_x2 = int(bounding_box["bbox_2d"][2] / input_width * width)

        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1

        draw.rectangle(((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=1)

        if 'text_content' in bounding_box:
            # The recognised text is written just below the box.
            if font:
                draw.text((abs_x1, abs_y2), bounding_box['text_content'], fill=color, font=font)
            else:
                draw.text((abs_x1, abs_y2), bounding_box['text_content'], fill=color)

    display(img)

# Main Unified Inference Class
class UnifiedQwenInference:
    """Route Qwen2.5-VL inference requests to a task-specific handler.

    Holds a loaded model, its processor and the device that prepared inputs are
    moved to. ``unified_inference`` is the public entry point; the ``_handle_*``
    methods implement the agent, video and image pipelines.
    """

    def __init__(self, model, processor, device):
        """Store the model, the processor and the device used for inputs."""
        self.model = model
        self.processor = processor
        self.device = device

    def unified_inference(
        self,
        task_type: TaskType,
        prompt: Optional[str] = None,
        image_path: Optional[str] = None,
        video_path: Optional[str] = None,
        screenshot: Optional[str] = None,
        user_query: Optional[str] = None,
        max_new_tokens: int = 4096,
        min_pixels: int = 512 * 28 * 28,
        max_pixels: int = 2048 * 28 * 28,
        total_pixels: int = 20480 * 28 * 28,
        visualize: bool = True,
        return_additional_info: bool = False
    ) -> Union[str, Tuple[str, Dict[str, Any]]]:
        """
        Unified inference function that handles all task types

        Args:
            task_type: Type of task to perform
            prompt: Text prompt for the task (optional, can use user_query instead)
            image_path: Path to input image (required for image tasks)
            video_path: Path to input video (required for video tasks)
            screenshot: Path to screenshot (required for agent tasks)
            user_query: User query (preferred over prompt for agent tasks,
                used as the prompt elsewhere when prompt is None)
            max_new_tokens: Maximum tokens to generate (image and video tasks;
                agent tasks use a fixed budget of 2048)
            min_pixels: Minimum pixels for processing (image and video tasks)
            max_pixels: Maximum pixels for processing (image tasks)
            total_pixels: Total pixels for video processing
            visualize: Whether to show visualizations
            return_additional_info: Whether to return additional processing info
                (honoured by image tasks only)

        Returns:
            Model output text. For image tasks with return_additional_info=True,
            a ``(text, info)`` tuple where ``info`` holds ``input_height`` and
            ``input_width`` (plus ``cleaned_html`` for document parsing).

        Raises:
            ValueError: If neither prompt nor user_query is given, or the media
                path required by the task type is missing.
        """
        # Handle prompt/user_query flexibility
        if prompt is None and user_query is None:
            raise ValueError("Either 'prompt' or 'user_query' must be provided")

        # Use user_query if prompt is not provided
        effective_prompt = prompt if prompt is not None else user_query

        if task_type in (TaskType.COMPUTER_AGENT, TaskType.MOBILE_AGENT):
            if screenshot is None:
                raise ValueError("'screenshot' must be provided for agent tasks")
            return self._handle_agent_task(task_type, user_query or effective_prompt, screenshot, visualize)

        if task_type in (TaskType.VIDEO_INFERENCE, TaskType.VIDEO_UNDERSTANDING):
            if video_path is None:
                raise ValueError("'video_path' must be provided for video tasks")
            return self._handle_video_task(task_type, effective_prompt, video_path, max_new_tokens,
                                           min_pixels, total_pixels, visualize)

        if image_path is None:
            raise ValueError("'image_path' must be provided for image tasks")
        system_prompt = SYSTEM_PROMPTS.get(task_type, "You are a helpful assistant")
        return self._handle_image_task(task_type, effective_prompt, image_path, system_prompt,
                                       max_new_tokens, min_pixels, max_pixels,
                                       visualize, return_additional_info)

    def _generate(self, inputs, max_new_tokens: int, clean_up_tokenization_spaces: bool = False) -> str:
        """Run generation through a TextStreamer and return only the newly generated text."""
        streamer = TextStreamer(self.processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
        output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)
        # Each output sequence starts with its prompt; slice the prompt tokens off.
        new_token_ids = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output_ids)
        ]
        decoded = self.processor.batch_decode(
            new_token_ids, skip_special_tokens=True,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        )
        return decoded[0]

    def _handle_agent_task(self, task_type: TaskType, user_query: str, screenshot: str, visualize: bool):
        """Handle computer and mobile agent tasks.

        Builds a function-calling prompt around the screenshot, generates the
        tool call and, when *visualize* is set, displays the screenshot with
        the predicted click position marked.

        Returns:
            The raw model output text (including the ``<tool_call>`` block).
        """
        dummy_image = Image.open(screenshot)
        image_processor = self.processor.image_processor
        resized_height, resized_width = smart_resize(
            dummy_image.height, dummy_image.width,
            factor=image_processor.patch_size * image_processor.merge_size,
            min_pixels=image_processor.min_pixels,
            max_pixels=image_processor.max_pixels,
        )

        # NOTE: MOBILE_AGENT currently falls back to ComputerUse as well; the
        # dedicated MobileUse tool is not wired in yet.
        agent_use = ComputerUse(
            cfg={'display_width_px': resized_width, 'display_height_px': resized_height}
        )

        # Build messages
        prompt_builder = NousFnCallPrompt()
        message = prompt_builder.preprocess_fncall_messages(
            messages=[
                Message(role='system', content=[ContentItem(text='You are a helpful assistant.')]),
                Message(role='user', content=[
                    ContentItem(text=user_query),
                    ContentItem(image=f"file://{screenshot}")
                ]),
            ],
            functions=[agent_use.function],
            lang=None,
        )
        message = [msg.model_dump() for msg in message]

        text = self.processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(text=[text], images=[dummy_image], padding=True, return_tensors='pt').to(self.device)

        output_text = self._generate(inputs, max_new_tokens=2048, clean_up_tokenization_spaces=True)

        # Process action and visualize
        if visualize:
            try:
                action = json.loads(output_text.split('<tool_call>\n')[1].split('\n</tool_call>')[0])
                display_image = dummy_image.resize((resized_width, resized_height))

                if 'click' in action['arguments']['action']:
                    # The tool was configured with the resized display size, so
                    # the coordinate must be drawn on the resized image, not on
                    # the original-size screenshot.
                    display_image = draw_point(display_image, action['arguments']['coordinate'], color='green')
                display(display_image)
            except Exception:
                # Output without a parsable tool call: show the plain screenshot.
                display(dummy_image)

        return output_text

    def _handle_video_task(self, task_type: TaskType, prompt: str, video_path: str,
                          max_new_tokens: int, min_pixels: int, total_pixels: int, visualize: bool):
        """Handle video inference and understanding tasks.

        VIDEO_UNDERSTANDING additionally forwards the fps reported by
        ``process_vision_info`` to the processor. *visualize* is accepted for
        signature parity and is not used.

        Returns:
            The generated text.
        """
        messages = [
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'video',
                        'video': video_path,
                        'min_pixels': min_pixels,
                        'total_pixels': total_pixels,
                    },
                    {'type': 'text', 'text': prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        if task_type == TaskType.VIDEO_UNDERSTANDING:
            image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
            fps_inputs = video_kwargs['fps']
            inputs = self.processor(
                text=[text], images=image_inputs, videos=video_inputs,
                fps=fps_inputs, padding=True, return_tensors='pt',
            )
        else:
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = self.processor(
                text=[text], images=image_inputs, videos=video_inputs,
                padding=True, return_tensors='pt',
            )

        inputs = inputs.to(self.device)
        return self._generate(inputs, max_new_tokens=max_new_tokens)

    def _handle_image_task(self, task_type: TaskType, prompt: str, image_path: str,
                          system_prompt: str, max_new_tokens: int, min_pixels: int,
                          max_pixels: int, visualize: bool, return_additional_info: bool):
        """Handle image-based tasks.

        Sends *system_prompt* (when non-empty) and the image/prompt pair to the
        model, optionally visualises the result according to *task_type*, and
        returns the generated text, plus an info dict when
        *return_additional_info* is set.
        """
        messages = []
        if system_prompt:
            # The task-specific system prompt is only effective if it is
            # actually part of the conversation.
            messages.append({
                'role': 'system',
                'content': [{'type': 'text', 'text': system_prompt}],
            })
        messages.append({
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': image_path,
                    'min_pixels': min_pixels,
                    'max_pixels': max_pixels,
                },
                {'type': 'text', 'text': prompt},
            ],
        })

        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text], images=image_inputs, videos=video_inputs,
            padding=True, return_tensors='pt',
        )
        inputs = inputs.to(self.device)

        result_text = self._generate(inputs, max_new_tokens=max_new_tokens)

        # Get input dimensions for visualization: grid rows/columns times 14,
        # converted to plain ints so callers do not receive tensors.
        grid = inputs['image_grid_thw'][0]
        input_height = int(grid[1]) * 14
        input_width = int(grid[2]) * 14

        # Handle visualization based on task type
        if visualize:
            if task_type == TaskType.DOCUMENT_PARSING:
                draw_bbox(image_path, input_width, input_height, result_text)

            elif task_type == TaskType.SPATIAL_UNDERSTANDING:
                image = Image.open(image_path)
                image.thumbnail([640, 640], Image.Resampling.LANCZOS)
                plot_bounding_boxes(image, result_text, input_width, input_height)

            elif task_type == TaskType.OCR and 'bbox_2d' in result_text:
                plot_text_bounding_boxes(image_path, result_text, input_width, input_height)

            elif task_type == TaskType.RECOGNITION:
                image = Image.open(image_path)
                display(image.resize((400, 600)) if image.size[0] > 400 else image)

        if return_additional_info:
            if task_type == TaskType.DOCUMENT_PARSING:
                # Provided whether or not visualisation was requested.
                cleaned_html = clean_and_format_html(result_text)
                return result_text, {'cleaned_html': cleaned_html, 'input_height': input_height, 'input_width': input_width}
            return result_text, {'input_height': input_height, 'input_width': input_width}

        return result_text

## 03 Initialize Inference

In [None]:
# Build the shared inference wrapper around the model and processor loaded above.
unified_system = UnifiedQwenInference(model, processor, device)

# The check-mark was stored as mis-decoded UTF-8 ("âœ…"); restored to the intended character.
print("✅ Unified Qwen2.5-VL Multi-Modal Inference System initialized successfully!")
print("\nAvailable task types:")
for task in TaskType:
    print(f"  - {task.value}")

## 04 Run Inference

In [None]:
"""Example: Computer Agent Task"""
# Screenshot shipped with the cloned Qwen2.5-VL repository (path is relative to it).
screenshot = './cookbooks/assets/computer_use/computer_use2.jpeg'

# Natural-language instruction for the agent.
user_query = 'open the third issue'

# Agent tasks take the instruction via user_query and the image via screenshot;
# `result` is the raw model output text.
result = unified_system.unified_inference(
    task_type=TaskType.COMPUTER_AGENT,
    user_query=user_query,
    screenshot=screenshot,
    visualize=True
)

In [None]:
"""Example: Document Parsing Task"""
# Sample document image from the cloned repository.
image_path = './cookbooks/assets/document_parsing/docparsing_example1.jpg'
prompt = 'QwenVL HTML'

# With return_additional_info=True the call returns a (text, info) tuple; for
# document parsing with visualize=True the info dict includes 'cleaned_html'.
result, additional_info = unified_system.unified_inference(
    task_type=TaskType.DOCUMENT_PARSING,
    prompt=prompt,
    image_path=image_path,
    visualize=True,
    return_additional_info=True
)

print("Cleaned HTML:")
print(additional_info['cleaned_html'])

In [None]:
"""Example: OCR Task"""
# Sample OCR image from the cloned repository.
image_path = './cookbooks/assets/ocr/ocr_example2.jpg'
prompt = 'Read all the text in the image.'

# Show the input image first, using matplotlib.
img = mpimg.imread(image_path)
plt.imshow(img)
plt.axis('off')  # Hide axis
plt.show()

# For OCR, boxes are only plotted when the model output contains 'bbox_2d'.
result = unified_system.unified_inference(
    task_type=TaskType.OCR,
    prompt=prompt,
    image_path=image_path,
    visualize=True
)

In [None]:
"""Example: Recognition Task"""
# Sample recognition image from the cloned repository.
image_path = './cookbooks/assets/universal_recognition/unireco_birds_example.jpg'
prompt = 'What is the main subject of this image? Describe it in detail.'

# For recognition tasks, visualize=True displays the input image alongside the answer.
result = unified_system.unified_inference(
    task_type=TaskType.RECOGNITION,
    prompt=prompt,
    image_path=image_path,
    visualize=True
)

In [None]:
"""Example: Spatial Understanding Task"""
# Sample image from the cloned repository.
image_path = './cookbooks/assets/spatial_understanding/cakes.png'
# The prompt asks for JSON coordinates, which the visualisation step parses and draws.
prompt = 'Outline the position of each small cake and output all the coordinates in JSON format.'

result = unified_system.unified_inference(
    task_type=TaskType.SPATIAL_UNDERSTANDING,
    prompt=prompt,
    image_path=image_path,
    visualize=True
)

In [None]:
"""Example: Video Inference Task"""
# Remote sample video, passed to the model by URL.
video_path = 'https://duguang-labelling.oss-cn-shanghai.aliyuncs.com/qiansun/video_ocr/videos/50221078283.mp4'
prompt = 'What is the main subject of this video? Describe it in detail.'

# Short answer: generation is capped at 128 new tokens.
result = unified_system.unified_inference(
    task_type=TaskType.VIDEO_INFERENCE,
    prompt=prompt,
    video_path=video_path,
    max_new_tokens=128
)

In [None]:
"""Example: Video Understanding Task"""
# Same remote sample video as the video-inference example.
video_path = 'https://duguang-labelling.oss-cn-shanghai.aliyuncs.com/qiansun/video_ocr/videos/50221078283.mp4'
prompt = 'Analyze this video in detail and describe what happens.'

# VIDEO_UNDERSTANDING takes the code path that also passes the video's fps to
# the processor; generation is capped at 2048 new tokens.
result = unified_system.unified_inference(
    task_type=TaskType.VIDEO_UNDERSTANDING,
    prompt=prompt,
    video_path=video_path,
    max_new_tokens=2048
)