## Windows omni_env

## v1

In [None]:
import gradio as gr
import torch
import json
import os
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer,
    TextIteratorStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image
from typing import List, Tuple, Dict, Any, Optional
import io
import sys
import re
import threading

# Load model and processor
# `device` is where the *input tensors* are sent before generation; the model's
# own placement is delegated to `device_map='auto'` below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization and a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'

# `os.path.isdir` is already False for a missing path, so a single check
# covers both "does not exist" and "is not a directory".
if not os.path.isdir(model_path):
    print(f"Error: Model path '{model_path}' does not exist or is not a directory.")
    print("Please ensure your model files are in the specified directory.")
    sys.exit(1)

try:
    # Do NOT chain `.to(device)` onto the quantized model: `device_map='auto'`
    # has already placed the weights, and moving a bitsandbytes-quantized model
    # afterwards is rejected (or warned about) depending on the installed
    # transformers/bitsandbytes versions.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='auto',
        torch_dtype=torch.float16,
    )
    processor = AutoProcessor.from_pretrained(model_path)
    tokenizer = processor.tokenizer
except Exception as e:
    # Top-level boundary: report the failure and stop, nothing below can run
    # without a model and processor.
    print(f"Error loading model or processor: {e}")
    print("Please ensure the model path is correct and all necessary files are present.")
    sys.exit(1)

# Re-save the processor configuration into the model directory. This is
# best-effort: a failure only produces a warning and the app keeps running.
print("Updating processor config files...")
try:
    processor.save_pretrained(model_path)
    print("Processor config files updated.")
except Exception as e:
    print(f"Warning: Could not save processor config files. This might be due to permissions or an incomplete model download: {e}")

# Maps each selectable usage mode to the system prompt sent to the model.
# Insertion order matters: the keys are listed in this order in the UI.
SYSTEM_PROMPTS = {
    "General Assistant": (
        "You are a helpful assistant that can analyze images "
        "and answer questions."
    ),
    "Image Analyzer": (
        "You are an expert image analyst. "
        "Provide detailed descriptions and analysis of images."
    ),
    "OCR Reader": (
        "You are an OCR specialist. "
        "Extract and transcribe all text from images accurately."
    ),
    "Medical Assistant": (
        "You are a medical AI assistant. "
        "Analyze medical images and provide insights "
        "(for educational purposes only)."
    ),
    "Educational Tutor": (
        "You are an educational tutor. "
        "Help explain concepts shown in images and answer related questions."
    ),
}

def clean_response(text: str) -> str:
    """Strip chat-template markers from generated text.

    Removes every ``<|im_end|>``, a leading ``<|im_start|>assistant`` header,
    and every ``<|im_start|>user`` marker (with adjacent whitespace), then
    trims surrounding whitespace.
    """
    # Order matters: the assistant header is anchored to the start of the text,
    # so it is checked after the end markers are gone and before the final strip.
    marker_patterns = (
        r'<\|im_end\|>',
        r'^\s*<\|im_start\|>assistant\s*',
        r'\s*<\|im_start\|>user\s*',
    )
    for pattern in marker_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()

def inference_with_streaming(
    prompt,
    image_path,
    system_prompt='You are a helpful assistant',
    max_new_tokens=32000,
    min_pixels=512 * 28 * 28,
    max_pixels=2048 * 28 * 28
):
    """Stream the model's answer to one image + text prompt.

    This is a generator: each yielded value is the *cumulative* text generated
    so far, so a consumer can simply display the latest value.

    Args:
        prompt: User text placed after the image in the user turn.
        image_path: Image reference stored under the ``'image'`` key of the
            user message.
        system_prompt: Content of the system message.
        max_new_tokens: Passed to ``model.generate``.
        min_pixels: Stored alongside the image entry as ``'min_pixels'``.
        max_pixels: Stored alongside the image entry as ``'max_pixels'``.

    Yields:
        The accumulated generated text after each streamed chunk.

    Raises:
        Exception: Any exception raised by ``model.generate`` in the worker
            thread is re-raised here once the stream has been drained.
    """
    messages = [
        {
            'role': 'system',
            'content': system_prompt
        },
        {
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': image_path,
                    'min_pixels': min_pixels,
                    'max_pixels': max_pixels,
                },
                {'type': 'text', 'text': prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print('input:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors='pt',
    )
    inputs = inputs.to(device)

    # The streamer is an iterator fed by the generation thread.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generation_kwargs = {
        **inputs,
        'max_new_tokens': max_new_tokens,
        'streamer': streamer,
        'do_sample': True,
        'temperature': 0.7,
        'pad_token_id': tokenizer.eos_token_id,
    }

    # Exceptions raised inside a thread do not propagate to the caller, and a
    # failed generate() never signals the end of the stream, which would leave
    # the loop below blocked forever. Capture the error and end the stream.
    worker_errors = []

    def _run_generation():
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            streamer.end()

    # Start generation in a separate thread so tokens can be consumed here.
    thread = threading.Thread(target=_run_generation)
    thread.start()

    # Yield the accumulated text as chunks arrive.
    generated_text = ""
    for new_token in streamer:
        generated_text += new_token
        yield generated_text

    thread.join()
    if worker_errors:
        # Surface the worker failure to the caller instead of hiding it.
        raise worker_errors[0]
    print('output:\n', generated_text)
    return generated_text

def chat_with_history(
    message: str,
    image: Optional[Image.Image],
    history: List[Dict[str, str]],
    usage_mode: str,
    custom_system_prompt: str,
    max_tokens: int,
    min_pixels: int,
    max_pixels: int
):
    """Handle one chat turn and stream the assistant reply into ``history``.

    The uploaded image (if any) is written to a temporary PNG, referenced from
    the user message as markdown, passed to ``inference_with_streaming``, and
    removed again when the turn ends, whether it succeeds, fails, or the
    generator is closed early.

    Args:
        message: Text typed by the user; may be empty when an image is given.
        image: Uploaded PIL image, or ``None``.
        history: Chat history as a list of ``{"role", "content"}`` dicts; it is
            mutated in place.
        usage_mode: Key into ``SYSTEM_PROMPTS``; unknown keys fall back to
            "General Assistant".
        custom_system_prompt: Overrides the usage mode when non-blank.
        max_tokens: Forwarded as ``max_new_tokens``.
        min_pixels: Forwarded to the inference call.
        max_pixels: Forwarded to the inference call.

    Yields:
        ``(history, "")`` tuples; the empty string clears the message box.
    """
    # Defined before the try so the cleanup in `finally` can always see it.
    temp_image_path = None
    try:
        current_system_prompt = custom_system_prompt.strip() or SYSTEM_PROMPTS.get(usage_mode, SYSTEM_PROMPTS["General Assistant"])

        # Treat a whitespace-only message as "no message" everywhere, so it is
        # neither displayed nor sent to the model as the prompt.
        user_text = (message or "").strip()

        if not user_text and image is None:
            yield history, ""
            return

        # Handle image upload
        if image is not None:
            temp_image_dir = "temp_images"
            os.makedirs(temp_image_dir, exist_ok=True)
            # NOTE(review): pid + thread ident is not guaranteed unique across
            # overlapping requests (thread identifiers can be reused); consider
            # a random suffix if concurrent use is expected.
            temp_image_path = os.path.join(temp_image_dir, f"uploaded_image_{os.getpid()}_{threading.get_ident()}.png")
            image.save(temp_image_path)

            # Add to Gradio history with markdown image for display.
            # NOTE(review): the file is deleted at the end of the turn, so this
            # link presumably stops resolving afterwards; confirm intended.
            image_md_for_gradio = f'![Uploaded Image](file={temp_image_path})'
            if user_text:
                history.append({"role": "user", "content": f"{image_md_for_gradio}\n\n{user_text}"})
            else:
                history.append({"role": "user", "content": image_md_for_gradio})
        else:
            # user_text is guaranteed non-empty here by the early return above.
            history.append({"role": "user", "content": user_text})

        # Add assistant placeholder that the streaming loop fills in.
        history.append({"role": "assistant", "content": ""})
        yield history, ""

        # Call inference function with streaming
        if temp_image_path:
            for partial_response in inference_with_streaming(
                prompt=user_text or "Describe this image.",
                image_path=temp_image_path,
                system_prompt=current_system_prompt,
                max_new_tokens=max_tokens,
                min_pixels=min_pixels,
                max_pixels=max_pixels
            ):
                history[-1]["content"] = clean_response(partial_response)
                yield history, ""
        else:
            history[-1]["content"] = "Image is required for this model."
            yield history, ""

    except Exception as e:
        error_msg = f"An unexpected error occurred: {str(e)}"
        print(f"Error in chat_with_history: {e}")
        # Reuse the assistant placeholder if one exists, otherwise add a new
        # assistant message carrying the error.
        if not history or history[-1]["role"] != "assistant":
            history.append({"role": "assistant", "content": error_msg})
        else:
            history[-1]["content"] = error_msg
        yield history, ""
    finally:
        # Clean up the temporary image on every exit path, including errors
        # and early generator close.
        if temp_image_path and os.path.exists(temp_image_path):
            try:
                os.remove(temp_image_path)
            except OSError as e:
                print(f"Error deleting temp file {temp_image_path}: {e}")

def clear_history():
    """Return reset values: empty chat history, no image, blank message."""
    empty_chat = []
    no_image = None
    blank_message = ""
    return empty_chat, no_image, blank_message

# Gradio Interface
# Two-column layout: the left column (scale=1) holds the image upload and the
# settings accordion, the right column (scale=2) holds the chat and the input.
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-VL Conversational Assistant")

    with gr.Row():
        with gr.Column(scale=1):
            # The handler receives the upload as a PIL image (type="pil").
            image_upload = gr.Image(type="pil", label="Upload Image", sources=["upload"], interactive=True)
            gr.Markdown("Image will be displayed in the chat history.")

            with gr.Accordion("System Settings", open=False):
                # Choices are the keys of SYSTEM_PROMPTS, in dict order.
                usage_mode = gr.Radio(
                    list(SYSTEM_PROMPTS.keys()),
                    value="General Assistant",
                    label="Usage Mode",
                    interactive=True
                )
                custom_system_prompt = gr.Textbox(
                    label="Custom System Prompt (Overrides Usage Mode)",
                    placeholder="Enter your custom system prompt here...",
                    lines=2,
                    interactive=True
                )
                max_tokens = gr.Slider(
                    minimum=100,
                    maximum=32000,
                    value=800,
                    label="Max New Tokens",
                    step=100,
                    interactive=True
                )
                # Default is 224 * 224 = 50176 pixels.
                # NOTE(review): this default is not a multiple of `step`; confirm
                # how the slider displays/snaps it.
                min_pixels = gr.Slider(
                    minimum=1000,
                    maximum=1000000,
                    value=224 * 224,
                    label="Min Pixels for Image Processing",
                    step=1000,
                    interactive=True
                )
                # Default is 1280 * 28 * 28 = 1003520 pixels.
                max_pixels = gr.Slider(
                    minimum=1000000,
                    maximum=50000000,
                    value=1280 * 28 * 28,
                    label="Max Pixels for Image Processing",
                    step=100000,
                    interactive=True
                )
            
            clear_btn = gr.Button("Clear Chat")

        with gr.Column(scale=2):
            # type='messages' means history is a list of {"role", "content"} dicts.
            chatbot = gr.Chatbot(
                label="Conversation History",
                elem_id="chatbot",
                height=500,
                render_markdown=True,
                type='messages' 
            )
            msg = gr.Textbox(label="Your Message", placeholder="Type your message here...", lines=2)
            send_btn = gr.Button("Send")
    
    # Submitting the textbox and clicking "Send" run the same handler with the
    # same inputs; the handler's second output ("") clears the textbox.
    msg.submit(
        chat_with_history,
        inputs=[msg, image_upload, chatbot, usage_mode, custom_system_prompt, max_tokens, min_pixels, max_pixels],
        outputs=[chatbot, msg],
    )
    send_btn.click(
        chat_with_history,
        inputs=[msg, image_upload, chatbot, usage_mode, custom_system_prompt, max_tokens, min_pixels, max_pixels],
        outputs=[chatbot, msg],
    )
    # clear_history returns three values, one per output component listed here.
    clear_btn.click(clear_history, outputs=[chatbot, image_upload, msg])

# Launch the app only when run as a script.
if __name__ == "__main__":
    demo.launch(
        #server_name = "0.0.0.0",
        #server_port = 7860,
        #share = False,
        debug = True
    )

## v2

In [None]:
import gradio as gr
import torch
import json
import os
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextIteratorStreamer
)
# Make sure qwen_vl_utils.py is in the same directory as this script.
# This file contains the 'process_vision_info' function crucial for handling
# vision inputs for Qwen-VL.
from qwen_vl_utils import process_vision_info 
from PIL import Image
from typing import List, Tuple, Dict, Any, Optional
import io
import sys
import re
import threading
import shutil
import tempfile

# --- Model Loading and Configuration ---
# `device` is where the *input tensors* are sent before generation; the model's
# own placement is delegated to `device_map='auto'` below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization and a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'

# Check that the model directory exists. `os.path.isdir` is already False for
# a missing path, so a single check covers both failure cases.
if not os.path.isdir(model_path):
    print(f"Error: Model path '{model_path}' does not exist or is not a directory.")
    print("Please ensure your model files are in the specified directory.")
    sys.exit(1)

try:
    # Do NOT chain `.to(device)` onto the quantized model: `device_map='auto'`
    # has already placed the weights, and moving a bitsandbytes-quantized model
    # afterwards is rejected (or warned about) depending on the installed
    # transformers/bitsandbytes versions.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='auto',
        torch_dtype=torch.float16,
    )
    processor = AutoProcessor.from_pretrained(model_path)
    tokenizer = processor.tokenizer
except Exception as e:
    # Top-level boundary: report the failure and stop, nothing below can run
    # without a model and processor.
    print(f"Error loading model or processor: {e}")
    print("Please ensure the model path is correct and all necessary files are present.")
    sys.exit(1)

# Re-save the processor configuration into the model directory. This is
# best-effort: a failure only produces a warning and the app keeps running.
print("Updating processor config files...")
try:
    processor.save_pretrained(model_path)
    print("Processor config files updated.")
except Exception as e:
    print(f"Warning: Could not save processor config files. This might be due to permissions or an incomplete model download: {e}")

# --- System Prompts for Different Modes ---
# Each key is a selectable usage mode; the value is the system prompt used for
# that mode. Key order is the order in which the modes are listed in the UI.
SYSTEM_PROMPTS = {
    "General Assistant": (
        "You are a helpful assistant that can analyze images "
        "and answer questions."
    ),
    "Image Analyzer": (
        "You are an expert image analyst. "
        "Provide detailed descriptions and analysis of images."
    ),
    "OCR Reader": (
        "You are an OCR specialist. "
        "Extract and transcribe all text from images accurately."
    ),
    "Medical Assistant": (
        "You are a medical AI assistant. "
        "Analyze medical images and provide insights "
        "(for educational purposes only)."
    ),
    "Educational Tutor": (
        "You are an educational tutor. "
        "Help explain concepts shown in images and answer related questions."
    ),
}

# --- Utility Functions ---
def clean_response(text: str) -> str:
    """Return ``text`` with chat-template markers removed and edges trimmed.

    Three substitutions run in sequence: every ``<|im_end|>`` is dropped, a
    ``<|im_start|>assistant`` header is dropped only when it opens the text,
    and every ``<|im_start|>user`` marker is dropped together with the
    whitespace around it.
    """
    without_end_markers = re.sub(r'<\|im_end\|>', '', text)
    without_assistant_header = re.sub(
        r'^\s*<\|im_start\|>assistant\s*', '', without_end_markers
    )
    without_user_markers = re.sub(
        r'\s*<\|im_start\|>user\s*', '', without_assistant_header
    )
    return without_user_markers.strip()

def validate_and_process_image_paths(image_paths: Optional[List[str]]) -> List[str]:
    """
    Filter ``image_paths`` down to files that can be opened as images.

    Each non-empty entry is converted to an absolute path and kept only if it
    exists, is a regular file, and ``Image.open`` succeeds on it. Rejected
    entries are reported with a printed warning. Returns the accepted
    absolute paths in input order (an empty list for ``None``/empty input).
    """
    accepted: List[str] = []

    for candidate in image_paths or []:
        # Skip blank / None entries silently.
        if not candidate:
            continue

        # Absolute paths keep later consumers independent of the working dir.
        abs_path = os.path.abspath(candidate)

        if not os.path.exists(abs_path):
            print(f"Warning: Image path does not exist: {abs_path}")
        elif not os.path.isfile(abs_path):
            print(f"Warning: Path is not a file: {abs_path}")
        else:
            try:
                # Opening is the validation step; the image itself is unused.
                with Image.open(abs_path):
                    pass
            except Exception as e:
                print(f"Warning: Cannot open image {abs_path}: {e}")
            else:
                accepted.append(abs_path)
                print(f"Valid image path added: {abs_path}")

    return accepted

def inference_with_streaming(
    prompt: Optional[str],
    image_paths: Optional[List[str]],
    system_prompt: str,
    max_new_tokens: int,
    min_pixels: int,
    max_pixels: int
):
    """
    Performs streaming inference with the Qwen-VL model.

    Builds a system + user message pair (validated images first, then the
    text prompt), runs generation in a worker thread, and yields the
    *cumulative* generated text after every streamed chunk.

    Failures are reported by yielding a human-readable error string rather
    than raising, so the caller can display them in the chat. This includes
    errors raised by ``model.generate`` inside the worker thread.

    Args:
        prompt: User text, or ``None``/empty for an image-only turn.
        image_paths: Candidate image paths; only those accepted by
            ``validate_and_process_image_paths`` are used.
        system_prompt: Content of the system message.
        max_new_tokens: Passed to ``model.generate``.
        min_pixels: Stored with each image entry as ``'min_pixels'``.
        max_pixels: Stored with each image entry as ``'max_pixels'``.

    Yields:
        The accumulated generated text, or a single error/help message.
    """
    try:
        messages = [
            {
                'role': 'system',
                'content': system_prompt
            }
        ]

        # Validate and process image paths
        valid_image_paths = validate_and_process_image_paths(image_paths)

        # One image entry per validated path, in the format consumed by
        # `process_vision_info`.
        user_content_for_model = [
            {
                'type': 'image',
                'image': path,  # Use the validated absolute path
                'min_pixels': min_pixels,
                'max_pixels': max_pixels,
            }
            for path in valid_image_paths
        ]

        if prompt:
            user_content_for_model.append({'type': 'text', 'text': prompt})

        # If no text or image provided, model has nothing to process
        if not user_content_for_model:
            yield "Please provide a message or an image to the model."
            return

        messages.append({'role': 'user', 'content': user_content_for_model})

        print(f"DEBUG: Messages sent to process_vision_info and processor:")
        for i, entry in enumerate(messages):
            print(f"  Message {i}: {entry}")

        # Preparation for inference
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        print(f'Applied chat template:\n{text}')

        # Process vision info; reported separately so image problems are
        # distinguishable from other failures.
        try:
            image_inputs, video_inputs = process_vision_info(messages)
            print(f"DEBUG: Processed vision - Images: {len(image_inputs) if image_inputs else 0}, Videos: {len(video_inputs) if video_inputs else 0}")
        except Exception as e:
            print(f"Error in process_vision_info: {e}")
            yield f"Error processing images: {str(e)}"
            return

        # Process inputs for the model
        try:
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors='pt',
            )
            inputs = inputs.to(device)
        except Exception as e:
            print(f"Error in processor: {e}")
            yield f"Error processing inputs for model: {str(e)}"
            return

        # Create TextIteratorStreamer for token-by-token generation
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        # Generation parameters
        generation_kwargs = {
            **inputs,
            'max_new_tokens': max_new_tokens,
            'streamer': streamer,
            'do_sample': True,
            'temperature': 0.7,
            'pad_token_id': tokenizer.eos_token_id,
        }

        # Exceptions raised inside a thread do not propagate to this
        # generator, and a failed generate() never signals the end of the
        # stream, which would leave the loop below blocked forever. Capture
        # the error and end the stream explicitly.
        worker_errors = []

        def _run_generation():
            try:
                model.generate(**generation_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()

        # Start generation in a separate thread to allow streaming
        thread = threading.Thread(target=_run_generation)
        thread.start()

        # Yield the accumulated text as chunks arrive
        generated_text = ""
        for new_token in streamer:
            generated_text += new_token
            yield generated_text

        thread.join()  # Wait for the generation thread to complete
        if worker_errors:
            # Handled by the outer `except`, which yields the error message.
            raise worker_errors[0]
        print(f'Final output:\n{generated_text}')

    except Exception as e:
        error_msg = f"Error in inference: {str(e)}"
        print(error_msg)
        yield error_msg

def chat_with_history(
    message: Dict[str, Any],  # Input from gr.MultimodalTextbox
    history: List[Dict[str, Any]],  # History for gr.Chatbot(type='messages')
    usage_mode: str,
    custom_system_prompt: str,
    max_tokens: int,
    min_pixels: int,
    max_pixels: int
):
    """
    Manages one chat turn: records the user input (text and/or image files)
    in the chatbot history, streams the model reply into an assistant
    placeholder, and clears the input box.

    Each uploaded file becomes its own user message whose content is a
    ``{"path": ...}`` dict, followed by a separate user message for the text.
    A single message holding a mixed list of files and text is not a content
    shape the messages-format chatbot renders.

    Args:
        message: ``{"text": str, "files": [path, ...]}``; ``None`` and missing
            keys are treated as empty.
        history: List of ``{"role", "content"}`` dicts; mutated in place.
        usage_mode: Key into ``SYSTEM_PROMPTS``; unknown keys fall back to
            "General Assistant".
        custom_system_prompt: Overrides the usage mode when non-blank.
        max_tokens: Forwarded as ``max_new_tokens``.
        min_pixels: Forwarded to the inference call.
        max_pixels: Forwarded to the inference call.

    Yields:
        ``(history, {"text": "", "files": []})`` tuples; the second element
        resets the multimodal input.
    """
    # Normalise before the try so the error handler can always append.
    if history is None:
        history = []

    try:
        message = message or {}
        current_system_prompt = (custom_system_prompt or "").strip() or SYSTEM_PROMPTS.get(usage_mode, SYSTEM_PROMPTS["General Assistant"])

        user_text = (message.get("text") or "").strip()
        user_files = list(message.get("files") or [])

        print(f"DEBUG: Received message - Text: '{user_text}', Files: {user_files}")

        if not user_text and not user_files:
            # If nothing is provided, return current history and clear input
            yield history, {"text": "", "files": []}
            return

        # One history entry per displayable file, then one for the text.
        for file_path in user_files:
            if os.path.exists(file_path):
                history.append({"role": "user", "content": {"path": file_path}})
            else:
                print(f"Warning: File does not exist for display: {file_path}")

        if user_text:
            history.append({"role": "user", "content": user_text})

        # Add an empty assistant placeholder for streaming
        history.append({"role": "assistant", "content": ""})

        # Yield history immediately to show user's message and clear the input textbox
        yield history, {"text": "", "files": []}

        # Call inference function with streaming. All submitted paths are
        # passed on; the inference function validates them itself.
        response_generator = inference_with_streaming(
            prompt=user_text if user_text else None,
            image_paths=user_files,
            system_prompt=current_system_prompt,
            max_new_tokens=max_tokens,
            min_pixels=min_pixels,
            max_pixels=max_pixels
        )

        for partial_response in response_generator:
            # Each partial response is cumulative, so overwrite the placeholder.
            history[-1]["content"] = clean_response(partial_response)
            yield history, {"text": "", "files": []}

    except Exception as e:
        error_msg = f"An unexpected error occurred: {str(e)}"
        print(f"Error in chat_with_history: {e}")
        import traceback
        traceback.print_exc()

        # Fill the empty assistant placeholder if there is one; otherwise
        # append a new assistant message so partial output is not overwritten.
        if history and history[-1]["role"] == "assistant" and history[-1]["content"] == "":
            history[-1]["content"] = error_msg
        else:
            history.append({"role": "assistant", "content": error_msg})
        yield history, {"text": "", "files": []}

def clear_history():
    """Return reset values: an empty chat history and an empty multimodal input."""
    empty_chat = []
    empty_input = {"text": "", "files": []}
    return empty_chat, empty_input

# --- Gradio Interface Definition ---
# Single-column layout: chat history, one multimodal input (text + image
# files), and a collapsed settings accordion.
with gr.Blocks(title="Qwen2.5-VL Assistant") as demo:
    gr.Markdown("# Qwen2.5-VL Conversational Assistant")
    gr.Markdown("Upload images and ask questions about them, or have a regular text conversation.")

    with gr.Column():
        gr.Markdown("## Conversation")
        # type='messages' means history is a list of {"role", "content"} dicts.
        chatbot = gr.Chatbot(
            label="Conversation History",
            elem_id="chatbot",
            height=500,
            render_markdown=True,
            type='messages'
        )
        
        # MultimodalTextbox for input; uploads are restricted to image files.
        msg = gr.MultimodalTextbox(
            label="Your Message", 
            placeholder="Type your message here or upload an image...", 
            interactive=True,
            file_types=["image"],
        )
        
        # Settings section
        with gr.Accordion("Settings", open=False): 
            with gr.Row():
                with gr.Column():
                    # Choices are the keys of SYSTEM_PROMPTS, in dict order.
                    usage_mode = gr.Radio(
                        list(SYSTEM_PROMPTS.keys()),
                        value="General Assistant",
                        label="Usage Mode",
                        interactive=True
                    )
                    custom_system_prompt = gr.Textbox(
                        label="Custom System Prompt (Overrides Usage Mode)",
                        placeholder="Enter your custom system prompt here...",
                        lines=2,
                        interactive=True
                    )
            with gr.Row():
                with gr.Column():
                    max_tokens = gr.Slider(
                        minimum=100,
                        maximum=32000,
                        value=800,
                        label="Max New Tokens",
                        step=100,
                        interactive=True
                    )
                with gr.Column():
                    # Default is 224 * 224 = 50176 pixels.
                    # NOTE(review): this default is not a multiple of `step`;
                    # confirm how the slider displays/snaps it.
                    min_pixels = gr.Slider(
                        minimum=1000,
                        maximum=1000000,
                        value=224 * 224,
                        label="Min Pixels for Image Processing",
                        step=1000,
                        interactive=True
                    )
                with gr.Column():
                    # Default is 1280 * 28 * 28 = 1003520 pixels.
                    max_pixels = gr.Slider(
                        minimum=1000000,
                        maximum=50000000,
                        value=1280 * 28 * 28,
                        label="Max Pixels for Image Processing",
                        step=100000,
                        interactive=True
                    )
            
            # The clear button is created inside the Settings accordion.
            clear_btn = gr.Button("Clear Chat", variant="secondary")
    
    # --- Event Handlers ---
    # Submitting the input runs the chat handler; its second output resets the
    # multimodal textbox.
    msg.submit(
        chat_with_history,
        inputs=[msg, chatbot, usage_mode, custom_system_prompt, max_tokens, min_pixels, max_pixels],
        outputs=[chatbot, msg],
    )
    
    # clear_history returns two values, one per output component listed here.
    clear_btn.click(clear_history, outputs=[chatbot, msg])

# --- Launch the Gradio App ---
# Only launched when run as a script.
if __name__ == "__main__":
    demo.launch(
        debug=True,
        # Uncomment for public access:
        # server_name="0.0.0.0",
        # server_port=7860,
        # share=True,
    )