## v1

In [None]:
import gradio as gr
import torch
import json
import os
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display
from typing import List, Tuple, Dict, Any, Optional

# Load the 4-bit quantized model and its processor.
# `device` is kept for code that wants to know the preferred device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NF4 4-bit weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'
# device_map='auto' already places the quantized weights, so no `.to(device)`
# is chained here: transformers rejects `.to()` on bitsandbytes-quantized
# models in some versions and it is redundant in the others.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map='auto',
)
processor = AutoProcessor.from_pretrained(model_path)


# Preset system prompts, keyed by the usage-mode name shown in the UI.
# Insertion order is the order in which the modes are listed.
SYSTEM_PROMPTS = {
    "General Assistant": (
        "You are a helpful assistant that can analyze images "
        "and answer questions."
    ),
    "Image Analyzer": (
        "You are an expert image analyst. "
        "Provide detailed descriptions and analysis of images."
    ),
    "OCR Reader": (
        "You are an OCR specialist. "
        "Extract and transcribe all text from images accurately."
    ),
    "Medical Assistant": (
        "You are a medical AI assistant. "
        "Analyze medical images and provide insights "
        "(for educational purposes only)."
    ),
    "Educational Tutor": (
        "You are an educational tutor. "
        "Help explain concepts shown in images and answer related questions."
    ),
}

def inference(
    prompt,
    image_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 32000,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28
):
    """Generate a reply to a single image + text prompt.

    Args:
        prompt: User text placed after the image in the user turn.
        image_path: Image reference handed to ``process_vision_info``.
        system_prompt: Text of the system turn that precedes the user turn;
            a falsy value omits the system turn.
        max_new_tokens: Cap on generated tokens (coerced to ``int``).
        min_pixels: Lower pixel bound attached to the image entry.
        max_pixels: Upper pixel bound attached to the image entry.

    Returns:
        ``(reply, input_height, input_width)``: the decoded text of the first
        sequence, and the height and width entries of the first row of
        ``image_grid_thw`` multiplied by 14.
    """
    messages = []
    if system_prompt:
        # Emit the system turn explicitly so the caller's prompt reaches the model.
        messages.append(
            {
                'role': 'system',
                'content': [{'type': 'text', 'text': system_prompt}],
            }
        )
    messages.append(
        {
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': image_path,
                    'min_pixels': min_pixels,
                    'max_pixels': max_pixels,
                },
                {'type': 'text', 'text': prompt},
            ],
        }
    )

    # Render the conversation to prompt text and collect the vision inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print('input:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors='pt',
    )
    # Follow the model's own placement instead of assuming 'cuda'.
    inputs = inputs.to(model.device)

    streamer = TextStreamer(
        processor.tokenizer, skip_special_tokens=True, skip_prompt=True
    )

    # UI widgets may deliver the token budget as a float; generate() wants an int.
    generated_ids = model.generate(
        **inputs, max_new_tokens=int(max_new_tokens), streamer=streamer
    )
    # Slice off the prompt ids so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    print('output:\n', output_text[0])

    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14

    return output_text[0], input_height, input_width

def chat_with_history(
    message: str,
    image: Optional[str],
    history: List[Tuple[str, str]],
    usage_mode: str,
    custom_system_prompt: str,
    max_tokens: int,
    min_pixels: int,
    max_pixels: int
) -> Tuple[List[Tuple[str, str]], str]:
    """Answer one chat message and append the exchange to the history.

    Args:
        message: Text typed by the user; blank input is ignored.
        image: Path of the uploaded image, or a falsy value when none is set.
        history: Existing ``(user, assistant)`` pairs; ``None`` is treated as
            an empty conversation. A provided list is extended in place.
        usage_mode: Key into ``SYSTEM_PROMPTS`` used when no custom prompt is given.
        custom_system_prompt: Overrides the preset prompt when non-blank.
        max_tokens: Token budget forwarded to ``inference`` as an int.
        min_pixels: Lower pixel bound forwarded to ``inference`` as an int.
        max_pixels: Upper pixel bound forwarded to ``inference`` as an int.

    Returns:
        ``(history, "")`` - the updated history and an empty string that
        clears the message box. Failures are reported as an ``"Error: ..."``
        assistant entry rather than raised.
    """
    # Tolerate widgets that hand over None instead of an empty value.
    history = [] if history is None else history
    message = message or ""

    try:
        # Nothing to send: leave the conversation untouched.
        if not message.strip():
            return history, ""

        override = (custom_system_prompt or "").strip()
        if override:
            system_prompt = override
        else:
            system_prompt = SYSTEM_PROMPTS.get(usage_mode, SYSTEM_PROMPTS["General Assistant"])

        if image:
            # Only the reply text is needed; the image dimensions are ignored.
            response, _, _ = inference(
                prompt=message,
                image_path=image,
                system_prompt=system_prompt,
                max_new_tokens=int(max_tokens),
                min_pixels=int(min_pixels),
                max_pixels=int(max_pixels),
            )
        else:
            # Text-only turns are not supported by inference().
            response = "Image is required for this model."

        history.append((message, response))
    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing the app.
        history.append((message, f"Error: {str(e)}"))

    return history, ""

def clear_history():
    """Reset the conversation: a fresh empty chat log and a blank message box."""
    empty_log = []
    blank_message = ""
    return empty_log, blank_message

def update_system_prompt(usage_mode: str) -> str:
    """Look up the preset prompt for ``usage_mode``, defaulting to "General Assistant"."""
    fallback_prompt = SYSTEM_PROMPTS["General Assistant"]
    return SYSTEM_PROMPTS.get(usage_mode, fallback_prompt)

# Assemble the Gradio app: configuration column on the left, chat column on the right
with gr.Blocks(title="Qwen2.5 VL Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen2.5 VL Chatbot with Conversation History")
    gr.Markdown("Upload an image and chat with the AI assistant. The conversation history is maintained throughout the session.")

    with gr.Row():
        # Left column: mode selection, prompt override, generation settings
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")

            usage_mode = gr.Dropdown(
                label="Usage Mode",
                info="Select the AI's behavior mode",
                choices=list(SYSTEM_PROMPTS),
                value="General Assistant",
            )

            custom_system_prompt = gr.Textbox(
                label="Custom System Prompt (Optional)",
                placeholder="Enter custom system prompt to override the selected mode...",
                lines=3,
            )

            with gr.Accordion("Advanced Settings", open=False):
                max_tokens = gr.Slider(
                    label="Max New Tokens",
                    minimum=100,
                    maximum=32000,
                    step=100,
                    value=2000,
                )
                min_pixels = gr.Number(
                    label="Min Pixels",
                    info="Minimum image resolution",
                    value=512 * 28 * 28,
                )
                max_pixels = gr.Number(
                    label="Max Pixels",
                    info="Maximum image resolution",
                    value=2048 * 28 * 28,
                )

            clear_btn = gr.Button("Clear History", variant="secondary")

        # Right column: conversation, image upload, message box
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Conversation", height=400, show_copy_button=True)

            with gr.Row():
                image_input = gr.Image(type="filepath", label="Upload Image")

            with gr.Row():
                msg_input = gr.Textbox(
                    label="Message",
                    placeholder="Type your message here...",
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

    # Choosing a usage mode writes its preset prompt into the custom-prompt box
    usage_mode.change(
        fn=update_system_prompt,
        inputs=[usage_mode],
        outputs=[custom_system_prompt],
    )

    # The Send button and Enter in the message box share one handler and wiring
    chat_inputs = [
        msg_input,
        image_input,
        chatbot,
        usage_mode,
        custom_system_prompt,
        max_tokens,
        min_pixels,
        max_pixels,
    ]
    chat_outputs = [chatbot, msg_input]
    send_btn.click(fn=chat_with_history, inputs=chat_inputs, outputs=chat_outputs)
    msg_input.submit(fn=chat_with_history, inputs=chat_inputs, outputs=chat_outputs)

    # Clearing resets both the conversation and the message box
    clear_btn.click(fn=clear_history, outputs=[chatbot, msg_input])

if __name__ == "__main__":
    # Optional overrides left at Gradio defaults:
    #   server_name="0.0.0.0", server_port=7860, share=False
    demo.launch(debug=True)

## v2

In [None]:
import gradio as gr
import torch
import json
import os
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display
from typing import List, Tuple, Dict, Any, Optional

# Load the 4-bit quantized model and its processor.
# `device` is kept for code that wants to know the preferred device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NF4 4-bit weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'
# device_map='auto' already places the quantized weights, so no `.to(device)`
# is chained here: transformers rejects `.to()` on bitsandbytes-quantized
# models in some versions and it is redundant in the others.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map='auto',
)
processor = AutoProcessor.from_pretrained(model_path)

# Custom stylesheet string for the chat UI: caps the container width at
# 1200px, forces a 10px font on Gradio elements, styles the settings / chat /
# input / attachment areas, and sets explicit heading sizes.
# NOTE(review): the `.svelte-1lcyrx4` selectors look like build-generated
# class names - confirm they match the installed Gradio version.
css = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}

.message-wrap.svelte-1lcyrx4 {
    font-size: 10px !important;
}

.message.user.svelte-1lcyrx4 {
    font-size: 10px !important;
}

.message.bot.svelte-1lcyrx4 {
    font-size: 10px !important;
}

.chatbot {
    font-size: 10px !important;
}

.textbox_container {
    border-radius: 8px !important;
}

.settings-panel {
    background: #f8f9fa;
    padding: 15px;
    border-radius: 8px;
    margin-bottom: 15px;
}

.chat-container {
    display: flex;
    flex-direction: column;
    height: 80vh;
}

.input-area {
    border-top: 1px solid #e0e0e0;
    padding: 10px 0;
    background: white;
}

.attachment-area {
    padding: 5px 0;
    border-bottom: 1px solid #f0f0f0;
    margin-bottom: 10px;
}

#chatbot .message {
    font-size: 10px !important;
}

body, .gradio-container, .gradio-container * {
    font-size: 10px !important;
}

.gr-button {
    font-size: 10px !important;
}

.gr-textbox {
    font-size: 10px !important;
}

.gr-dropdown {
    font-size: 10px !important;
}

h1, h2, h3 {
    font-size: 18px !important;
}

.gr-markdown h1 {
    font-size: 20px !important;
}

.gr-markdown h2 {
    font-size: 18px !important;
}

.gr-markdown h3 {
    font-size: 16px !important;
}
"""

# Preset system prompts, keyed by the usage-mode name shown in the UI.
# Insertion order is the order in which the modes are listed.
SYSTEM_PROMPTS = {
    "General Assistant": (
        "You are a helpful assistant that can analyze images "
        "and answer questions."
    ),
    "Image Analyzer": (
        "You are an expert image analyst. "
        "Provide detailed descriptions and analysis of images."
    ),
    "OCR Reader": (
        "You are an OCR specialist. "
        "Extract and transcribe all text from images accurately."
    ),
    "Medical Assistant": (
        "You are a medical AI assistant. "
        "Analyze medical images and provide insights "
        "(for educational purposes only)."
    ),
    "Educational Tutor": (
        "You are an educational tutor. "
        "Help explain concepts shown in images and answer related questions."
    ),
}

def inference(
    prompt,
    image_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 32000,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28
):
    """Generate a reply to a single image + text prompt.

    Args:
        prompt: User text placed after the image in the user turn.
        image_path: Image reference handed to ``process_vision_info``.
        system_prompt: Text of the system turn that precedes the user turn;
            a falsy value omits the system turn.
        max_new_tokens: Cap on generated tokens (coerced to ``int``).
        min_pixels: Lower pixel bound attached to the image entry.
        max_pixels: Upper pixel bound attached to the image entry.

    Returns:
        ``(reply, input_height, input_width)``: the decoded text of the first
        sequence, and the height and width entries of the first row of
        ``image_grid_thw`` multiplied by 14.
    """
    messages = []
    if system_prompt:
        # Emit the system turn explicitly so the caller's prompt reaches the model.
        messages.append(
            {
                'role': 'system',
                'content': [{'type': 'text', 'text': system_prompt}],
            }
        )
    messages.append(
        {
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': image_path,
                    'min_pixels': min_pixels,
                    'max_pixels': max_pixels,
                },
                {'type': 'text', 'text': prompt},
            ],
        }
    )

    # Render the conversation to prompt text and collect the vision inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print('input:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors='pt',
    )
    # Follow the model's own placement instead of assuming 'cuda'.
    inputs = inputs.to(model.device)

    streamer = TextStreamer(
        processor.tokenizer, skip_special_tokens=True, skip_prompt=True
    )

    # UI widgets may deliver the token budget as a float; generate() wants an int.
    generated_ids = model.generate(
        **inputs, max_new_tokens=int(max_new_tokens), streamer=streamer
    )
    # Slice off the prompt ids so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    print('output:\n', output_text[0])

    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14

    return output_text[0], input_height, input_width

def chat_with_history(
    message: str,
    image: Optional[str],
    history: List[Tuple[str, str]],
    usage_mode: str,
    custom_system_prompt: str,
    max_tokens: int,
    min_pixels: int,
    max_pixels: int
) -> Tuple[List[Tuple[str, str]], str]:
    """Answer one chat message and append the exchange to the history.

    Args:
        message: Text typed by the user; blank input is ignored.
        image: Path of the uploaded image, or a falsy value when none is set.
        history: Existing ``(user, assistant)`` pairs; ``None`` is treated as
            an empty conversation. A provided list is extended in place.
        usage_mode: Key into ``SYSTEM_PROMPTS`` used when no custom prompt is given.
        custom_system_prompt: Overrides the preset prompt when non-blank.
        max_tokens: Token budget forwarded to ``inference`` as an int.
        min_pixels: Lower pixel bound forwarded to ``inference`` as an int.
        max_pixels: Upper pixel bound forwarded to ``inference`` as an int.

    Returns:
        ``(history, "")`` - the updated history and an empty string that
        clears the message box. Failures are reported as an ``"Error: ..."``
        assistant entry rather than raised.
    """
    # Tolerate widgets that hand over None instead of an empty value.
    history = [] if history is None else history
    message = message or ""

    try:
        # Nothing to send: leave the conversation untouched.
        if not message.strip():
            return history, ""

        override = (custom_system_prompt or "").strip()
        if override:
            system_prompt = override
        else:
            system_prompt = SYSTEM_PROMPTS.get(usage_mode, SYSTEM_PROMPTS["General Assistant"])

        if image:
            # Only the reply text is needed; the image dimensions are ignored.
            response, _, _ = inference(
                prompt=message,
                image_path=image,
                system_prompt=system_prompt,
                max_new_tokens=int(max_tokens),
                min_pixels=int(min_pixels),
                max_pixels=int(max_pixels),
            )
        else:
            # Text-only turns are not supported by inference().
            response = "Image is required for this model."

        history.append((message, response))
    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing the app.
        history.append((message, f"Error: {str(e)}"))

    return history, ""

def clear_history():
    """Reset the conversation: a fresh empty chat log and a blank message box."""
    empty_log = []
    blank_message = ""
    return empty_log, blank_message

def update_system_prompt(usage_mode: str) -> str:
    """Look up the preset prompt for ``usage_mode``, defaulting to "General Assistant"."""
    fallback_prompt = SYSTEM_PROMPTS["General Assistant"]
    return SYSTEM_PROMPTS.get(usage_mode, fallback_prompt)

# Create Gradio interface.
# `css=css` applies the custom stylesheet defined earlier in this cell.
with gr.Blocks(css=css, title="Qwen2.5 VL Chatbot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen2.5 VL Chatbot with Conversation History")
    gr.Markdown("Upload an image and chat with the AI assistant. The conversation history is maintained throughout the session.")

    with gr.Row():
        # Left column: mode selection, prompt override, generation settings
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")

            usage_mode = gr.Dropdown(
                choices=list(SYSTEM_PROMPTS.keys()),
                value="General Assistant",
                label="Usage Mode",
                info="Select the AI's behavior mode",
            )

            custom_system_prompt = gr.Textbox(
                label="Custom System Prompt (Optional)",
                placeholder="Enter custom system prompt to override the selected mode...",
                lines=3,
            )

            with gr.Accordion("Advanced Settings", open=False):
                max_tokens = gr.Slider(
                    minimum=100,
                    maximum=32000,
                    value=2000,
                    step=100,
                    label="Max New Tokens",
                )

                min_pixels = gr.Number(
                    value=512 * 28 * 28,
                    label="Min Pixels",
                    info="Minimum image resolution",
                )

                max_pixels = gr.Number(
                    value=2048 * 28 * 28,
                    label="Max Pixels",
                    info="Maximum image resolution",
                )

            clear_btn = gr.Button("Clear History", variant="secondary")

        # Right column: conversation, image upload, message box
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Conversation",
                height=400,
                show_copy_button=True,
            )

            with gr.Row():
                image_input = gr.Image(
                    type="filepath",
                    label="Upload Image",
                )

            with gr.Row():
                msg_input = gr.Textbox(
                    label="Message",
                    placeholder="Type your message here...",
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

    # Choosing a usage mode writes its preset prompt into the custom-prompt box
    usage_mode.change(
        fn=update_system_prompt,
        inputs=[usage_mode],
        outputs=[custom_system_prompt],
    )

    # The Send button and Enter in the message box share one handler and wiring
    chat_inputs = [
        msg_input,
        image_input,
        chatbot,
        usage_mode,
        custom_system_prompt,
        max_tokens,
        min_pixels,
        max_pixels,
    ]
    chat_outputs = [chatbot, msg_input]
    send_btn.click(fn=chat_with_history, inputs=chat_inputs, outputs=chat_outputs)
    msg_input.submit(fn=chat_with_history, inputs=chat_inputs, outputs=chat_outputs)

    # Clearing resets both the conversation and the message box
    clear_btn.click(
        fn=clear_history,
        outputs=[chatbot, msg_input],
    )

if __name__ == "__main__":
    # Optional overrides left at Gradio defaults:
    #   server_name="0.0.0.0", server_port=7860, share=False
    demo.launch(debug=True)

## v3

In [None]:
import gradio as gr
import torch
import json
import os
import asyncio
import time

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display
from typing import List, Tuple, Dict, Any, Optional

# Load the 4-bit quantized model and its processor.
# `device` is kept for code that wants to know the preferred device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NF4 4-bit weights with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'
# device_map='auto' already places the quantized weights, so no `.to(device)`
# is chained here: transformers rejects `.to()` on bitsandbytes-quantized
# models in some versions and it is redundant in the others.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map='auto',
)
processor = AutoProcessor.from_pretrained(model_path)

# Custom stylesheet handed to gr.Blocks(css=custom_css) below: a light sidebar,
# a full-height chat column, rounded message input and send button, a fixed
# sidebar-toggle button, a dashed file-upload area, and a global 14px font.
# The class names correspond to the `elem_classes` given to the components.
custom_css = """
/* Main container */
.gradio-container {
    font-size: 14px !important;
    max-width: 100% !important;
}

/* Sidebar styling */
.sidebar {
    background-color: #f7f7f8;
    border-right: 1px solid #e5e5e5;
    padding: 16px;
    height: 100vh;
    overflow-y: auto;
}

/* Chat container */
.chat-container {
    height: calc(100vh - 100px);
    display: flex;
    flex-direction: column;
}

/* Chatbot styling */
.chatbot {
    flex-grow: 1;
    border: none !important;
    font-size: 14px !important;
}

/* Message input area */
.message-input-container {
    padding: 16px;
    border-top: 1px solid #e5e5e5;
    background-color: white;
}

/* Input styling */
.message-input {
    border-radius: 24px !important;
    border: 2px solid #e5e5e5 !important;
    padding: 12px 16px !important;
    font-size: 14px !important;
}

/* Button styling */
.send-button {
    border-radius: 20px !important;
    background: #10a37f !important;
    border: none !important;
    padding: 8px 16px !important;
    font-size: 14px !important;
}

/* Sidebar toggle button */
.sidebar-toggle {
    position: fixed;
    top: 16px;
    left: 16px;
    z-index: 1000;
    background: #f7f7f8 !important;
    border: 1px solid #e5e5e5 !important;
    border-radius: 6px !important;
    padding: 8px !important;
}

/* Hide sidebar when collapsed */
.sidebar-hidden {
    display: none !important;
}

/* Adjust main content when sidebar is hidden */
.main-content-expanded {
    margin-left: 0 !important;
}

/* Image attachment styling */
.image-attachment {
    max-width: 150px;
    max-height: 150px;
    border-radius: 8px;
    margin: 5px 0;
    cursor: pointer;
}

/* Smaller text for all components */
* {
    font-size: 14px !important;
}

/* File upload area */
.file-upload {
    border: 2px dashed #e5e5e5 !important;
    border-radius: 12px !important;
    padding: 20px !important;
    text-align: center !important;
    background-color: #fafafa !important;
}
"""

# Preset system prompts, keyed by the usage-mode name shown in the UI.
# Insertion order is the order in which the modes are listed.
SYSTEM_PROMPTS = {
    "General Assistant": (
        "You are a helpful assistant that can analyze images "
        "and answer questions."
    ),
    "Image Analyzer": (
        "You are an expert image analyst. "
        "Provide detailed descriptions and analysis of images."
    ),
    "OCR Reader": (
        "You are an OCR specialist. "
        "Extract and transcribe all text from images accurately."
    ),
    "Medical Assistant": (
        "You are a medical AI assistant. "
        "Analyze medical images and provide insights "
        "(for educational purposes only)."
    ),
    "Educational Tutor": (
        "You are an educational tutor. "
        "Help explain concepts shown in images and answer related questions."
    ),
}

def inference(
    prompt,
    image_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 32000,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28
):
    """Generate a reply to a single image + text prompt.

    Args:
        prompt: User text placed after the image in the user turn.
        image_path: Image reference handed to ``process_vision_info``.
        system_prompt: Text of the system turn that precedes the user turn;
            a falsy value omits the system turn.
        max_new_tokens: Cap on generated tokens (coerced to ``int``).
        min_pixels: Lower pixel bound attached to the image entry.
        max_pixels: Upper pixel bound attached to the image entry.

    Returns:
        ``(reply, input_height, input_width)``: the decoded text of the first
        sequence, and the height and width entries of the first row of
        ``image_grid_thw`` multiplied by 14.
    """
    messages = []
    if system_prompt:
        # Emit the system turn explicitly so the caller's prompt reaches the model.
        messages.append(
            {
                'role': 'system',
                'content': [{'type': 'text', 'text': system_prompt}],
            }
        )
    messages.append(
        {
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': image_path,
                    'min_pixels': min_pixels,
                    'max_pixels': max_pixels,
                },
                {'type': 'text', 'text': prompt},
            ],
        }
    )

    # Render the conversation to prompt text and collect the vision inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print('input:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors='pt',
    )
    # Follow the model's own placement instead of assuming 'cuda'.
    inputs = inputs.to(model.device)

    streamer = TextStreamer(
        processor.tokenizer, skip_special_tokens=True, skip_prompt=True
    )

    # UI widgets may deliver the token budget as a float; generate() wants an int.
    generated_ids = model.generate(
        **inputs, max_new_tokens=int(max_new_tokens), streamer=streamer
    )
    # Slice off the prompt ids so only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    print('output:\n', output_text[0])

    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14

    return output_text[0], input_height, input_width

def stream_response(text: str):
    """Simulate streaming response"""
    words = text.split()
    current_text = ""
    for word in words:
        current_text += word + " "
        yield current_text.strip()
        time.sleep(0.05)  # Adjust speed as needed

def chat_with_history(
    message: str,
    image: Optional[str],
    history: List[Tuple[str, str]],
    usage_mode: str,
    custom_system_prompt: str,
    max_tokens: int,
    min_pixels: int,
    max_pixels: int
):
    """Handle one chat turn, yielding UI updates while the reply is revealed.

    This is a generator: every yielded value is a
    ``(history, message_box_value, image_value)`` tuple, always with ``""``
    and ``None`` in the last two slots.

    Args:
        message: Text typed by the user; may be blank when an image is attached.
        image: Path of the attached image, or a falsy value when none is set.
        history: Existing ``(user, assistant)`` pairs; ``None`` is treated as
            an empty conversation. A provided list is updated in place.
        usage_mode: Key into ``SYSTEM_PROMPTS`` used when no custom prompt is given.
        custom_system_prompt: Overrides the preset prompt when non-blank.
        max_tokens: Token budget forwarded to ``inference`` as an int.
        min_pixels: Lower pixel bound forwarded to ``inference`` as an int.
        max_pixels: Upper pixel bound forwarded to ``inference`` as an int.

    Yields:
        The history after the user entry is added, then once per partial
        reply produced by ``stream_response``. Failures are shown as an
        ``"Error: ..."`` assistant entry rather than raised.
    """
    # Tolerate widgets that hand over None instead of an empty value.
    history = [] if history is None else history
    message = message or ""

    try:
        if not message.strip() and not image:
            # A `return value` inside a generator is discarded, so the
            # unchanged state has to be yielded explicitly.
            yield history, "", None
            return

        override = (custom_system_prompt or "").strip()
        if override:
            system_prompt = override
        else:
            system_prompt = SYSTEM_PROMPTS.get(usage_mode, SYSTEM_PROMPTS["General Assistant"])

        # An image with no text gets a default instruction.
        user_message = message if message.strip() else "Please analyze this image."
        if image:
            # NOTE(review): `src` is the raw file path - confirm the browser
            # can load it with the installed Gradio version.
            user_message_display = f'<img src="{image}" style="max-width: 150px; max-height: 150px; margin: 5px 0; border-radius: 8px; cursor: pointer;" onclick="window.open(this.src)"> <br>{user_message}'
        else:
            user_message_display = user_message

        # Show the user's turn immediately, with an empty assistant slot.
        history.append((user_message_display, ""))
        yield history, "", None

        if image:
            # Only the reply text is needed; the image dimensions are ignored.
            response, _, _ = inference(
                prompt=user_message,
                image_path=image,
                system_prompt=system_prompt,
                max_new_tokens=int(max_tokens),
                min_pixels=int(min_pixels),
                max_pixels=int(max_pixels),
            )
        else:
            response = "Please upload an image to continue the conversation."

        # Reveal the finished reply progressively in the last history entry.
        for partial_response in stream_response(response):
            history[-1] = (user_message_display, partial_response)
            yield history, "", None

    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing the app.
        error_msg = f"Error: {str(e)}"
        if history and history[-1][1] == "":
            # Fill the pending assistant slot of the turn that failed.
            history[-1] = (history[-1][0], error_msg)
        else:
            history.append((message, error_msg))
        yield history, "", None

def clear_history():
    """Reset the chat: empty log, blank message box, no attached image."""
    empty_log = []
    blank_message = ""
    no_image = None
    return empty_log, blank_message, no_image

def update_system_prompt(usage_mode: str) -> str:
    """Look up the preset prompt for ``usage_mode``, defaulting to "General Assistant"."""
    fallback_prompt = SYSTEM_PROMPTS["General Assistant"]
    return SYSTEM_PROMPTS.get(usage_mode, fallback_prompt)

def toggle_sidebar():
    """Return two updates: the first sets visible=False, the second visible=True."""
    hide_update = gr.update(visible=False)
    reveal_update = gr.update(visible=True)
    return hide_update, reveal_update

def show_sidebar():
    """Return two updates: the first sets visible=True, the second visible=False."""
    reveal_update = gr.update(visible=True)
    hide_update = gr.update(visible=False)
    return reveal_update, hide_update

# Create Gradio interface: collapsible settings sidebar plus main chat area
with gr.Blocks(css=custom_css, title="Qwen2.5 VL Chat", theme=gr.themes.Soft()) as demo:

    # State for sidebar visibility
    sidebar_visible = gr.State(True)

    with gr.Row(elem_classes="main-container"):
        # Sidebar
        with gr.Column(scale=1, elem_classes="sidebar", visible=True) as sidebar:
            gr.Markdown("### ⚙️ Settings", elem_classes="sidebar-title")

            # Hide sidebar button
            hide_sidebar_btn = gr.Button("← Hide", size="sm", elem_classes="hide-sidebar-btn")

            usage_mode = gr.Dropdown(
                choices=list(SYSTEM_PROMPTS.keys()),
                value="General Assistant",
                label="Usage Mode",
                info="Select AI behavior",
                elem_classes="dropdown-small",
            )

            custom_system_prompt = gr.Textbox(
                label="Custom System Prompt",
                placeholder="Override with custom prompt...",
                lines=3,
                elem_classes="textbox-small",
            )

            with gr.Accordion("Advanced", open=False):
                max_tokens = gr.Slider(
                    minimum=100,
                    maximum=32000,
                    value=2000,
                    step=100,
                    label="Max Tokens",
                    elem_classes="slider-small",
                )

                min_pixels = gr.Number(
                    value=512 * 28 * 28,
                    label="Min Pixels",
                    elem_classes="number-small",
                )

                max_pixels = gr.Number(
                    value=2048 * 28 * 28,
                    label="Max Pixels",
                    elem_classes="number-small",
                )

            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary", elem_classes="clear-btn")

        # Show sidebar button (hidden until the sidebar is collapsed)
        show_sidebar_btn = gr.Button("☰", elem_classes="sidebar-toggle", visible=False)

        # Main chat area
        with gr.Column(scale=4, elem_classes="chat-container"):
            # Header
            gr.Markdown("# Qwen2.5 VL Chat", elem_classes="chat-header")

            # Chatbot
            chatbot = gr.Chatbot(
                label="",
                height=600,
                show_copy_button=True,
                elem_classes="chatbot",
                avatar_images=None,
                bubble_full_width=False,
            )

            # Message input area
            with gr.Row(elem_classes="message-input-container"):
                with gr.Column(scale=1):
                    # Image attachment (always visible)
                    image_input = gr.File(
                        file_types=["image"],
                        label="📎",
                        elem_classes="file-upload",
                        visible=True,
                        height=60,
                    )

                with gr.Column(scale=8):
                    msg_input = gr.Textbox(
                        placeholder="Message Qwen2.5 VL...",
                        lines=1,
                        max_lines=4,
                        elem_classes="message-input",
                        show_label=False,
                        container=False,
                    )

                with gr.Column(scale=1):
                    send_btn = gr.Button("Send", variant="primary", elem_classes="send-button")

    # Event handlers

    # Choosing a usage mode writes its preset prompt into the custom-prompt box
    usage_mode.change(
        fn=update_system_prompt,
        inputs=[usage_mode],
        outputs=[custom_system_prompt],
    )

    # Sidebar collapse / expand: each handler updates the sidebar and the show button
    hide_sidebar_btn.click(
        fn=toggle_sidebar,
        outputs=[sidebar, show_sidebar_btn],
    )

    show_sidebar_btn.click(
        fn=show_sidebar,
        outputs=[sidebar, show_sidebar_btn],
    )

    # The Send button and Enter in the message box share one streaming handler
    chat_inputs = [
        msg_input,
        image_input,
        chatbot,
        usage_mode,
        custom_system_prompt,
        max_tokens,
        min_pixels,
        max_pixels,
    ]
    chat_outputs = [chatbot, msg_input, image_input]
    send_btn.click(fn=chat_with_history, inputs=chat_inputs, outputs=chat_outputs)
    msg_input.submit(fn=chat_with_history, inputs=chat_inputs, outputs=chat_outputs)

    # Clearing resets the conversation, the message box and the attachment
    clear_btn.click(
        fn=clear_history,
        outputs=[chatbot, msg_input, image_input],
    )

if __name__ == "__main__":
    # Optional overrides left at Gradio defaults:
    #   server_name="0.0.0.0", server_port=7860, share=False
    demo.launch(debug=True)

## v4

In [None]:
import gradio as gr
import torch
import json
import os
import asyncio
import time
import base64
import io
import tempfile

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display
from typing import List, Tuple, Dict, Any, Optional

# Load model and processor
# Preferred accelerator; kept as a module-level name for other code to use.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'
# device_map='auto' already places the quantized weights, so no `.to(device)`
# is chained here: moving a bitsandbytes-quantized / accelerate-dispatched
# model afterwards is redundant at best and is rejected (or warned about) by
# transformers/accelerate depending on the installed versions.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
)
processor = AutoProcessor.from_pretrained(model_path)

# Custom CSS with dark purple Qwen-inspired theme
custom_css = """
/* Import Google Fonts */
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600&display=swap');

/* Global styles */
* {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
    font-size: 13px !important;
    line-height: 1.4 !important;
}

/* Main container */
.gradio-container {
    background-color: #0f0f23 !important;
    color: #e2e8f0 !important;
    max-width: 100% !important;
}

/* Sidebar styling */
.sidebar {
    background: linear-gradient(180deg, #1a1a2e 0%, #16213e 100%) !important;
    border-right: 1px solid #2d1b69 !important;
    padding: 16px !important;
    min-height: 100vh !important;
    overflow-y: auto !important;
    color: #e2e8f0 !important;
    box-shadow: 2px 0 10px rgba(45, 27, 105, 0.3) !important;
}

.sidebar h3 {
    color: #a78bfa !important;
    font-size: 14px !important;
    font-weight: 600 !important;
    margin-bottom: 16px !important;
    padding-bottom: 8px !important;
    border-bottom: 1px solid #2d1b69 !important;
}

.sidebar label {
    color: #cbd5e1 !important;
    font-weight: 400 !important;
    font-size: 12px !important;
}

/* Chat container */
.chat-container {
    background-color: #0f0f23 !important;
    min-height: 100vh !important;
    display: flex !important;
    flex-direction: column !important;
}

/* Header */
.chat-header {
    background: linear-gradient(135deg, #5b21b6 0%, #7c3aed 50%, #8b5cf6 100%) !important;
    color: #f8fafc !important;
    padding: 12px 20px !important;
    margin: 0 !important;
    border-radius: 0 !important;
    box-shadow: 0 2px 8px rgba(91, 33, 182, 0.3) !important;
    border-bottom: 1px solid #6d28d9 !important;
}

.chat-header h1 {
    font-size: 16px !important;
    font-weight: 600 !important;
    margin: 0 !important;
    color: #f8fafc !important;
}

/* Chatbot container */
.chatbot {
    flex-grow: 1 !important;
    border: none !important;
    background-color: #0f0f23 !important;
    padding: 16px !important;
    overflow-y: auto !important;
}

/* Chat message styling */
.message {
    border-radius: 12px !important;
    padding: 12px 16px !important;
    margin: 6px 0 !important;
    max-width: 85% !important;
    word-wrap: break-word !important;
    font-size: 13px !important;
    line-height: 1.5 !important;
}

/* User messages */
.message.user {
    background: linear-gradient(135deg, #5b21b6 0%, #7c3aed 100%) !important;
    color: #f8fafc !important;
    margin-left: auto !important;
    margin-right: 0 !important;
    border: 1px solid #6d28d9 !important;
}

/* Bot messages */
.message.bot {
    background: linear-gradient(135deg, #1e293b 0%, #334155 100%) !important;
    color: #e2e8f0 !important;
    margin-left: 0 !important;
    margin-right: auto !important;
    border: 1px solid #475569 !important;
    border-left: 3px solid #8b5cf6 !important;
}

/* Message input area */
.message-input-container {
    padding: 16px !important;
    background: linear-gradient(180deg, #0f0f23 0%, #1a1a2e 100%) !important;
    border-top: 1px solid #2d1b69 !important;
    box-shadow: 0 -2px 8px rgba(0, 0, 0, 0.2) !important;
}

/* Input styling */
.message-input textarea {
    border-radius: 8px !important;
    border: 1px solid #475569 !important;
    padding: 10px 14px !important;
    font-size: 13px !important;
    background-color: #1e293b !important;
    color: #e2e8f0 !important;
    box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.2) !important;
    transition: all 0.2s ease !important;
}

.message-input textarea:focus {
    border-color: #8b5cf6 !important;
    box-shadow: 0 0 0 2px rgba(139, 92, 246, 0.2) !important;
    outline: none !important;
}

.message-input textarea::placeholder {
    color: #94a3b8 !important;
}

/* Button styling */
.send-button {
    border-radius: 8px !important;
    background: linear-gradient(135deg, #5b21b6 0%, #7c3aed 100%) !important;
    border: none !important;
    padding: 10px 16px !important;
    font-size: 13px !important;
    color: #f8fafc !important;
    font-weight: 500 !important;
    box-shadow: 0 2px 4px rgba(91, 33, 182, 0.3) !important;
    transition: all 0.2s ease !important;
}

.send-button:hover {
    background: linear-gradient(135deg, #6d28d9 0%, #8b5cf6 100%) !important;
    transform: translateY(-1px) !important;
    box-shadow: 0 4px 8px rgba(91, 33, 182, 0.4) !important;
}

/* File upload area */
.file-upload {
    border: 1px dashed #475569 !important;
    border-radius: 8px !important;
    padding: 12px !important;
    text-align: center !important;
    background: linear-gradient(135deg, #1e293b 0%, #334155 100%) !important;
    transition: all 0.2s ease !important;
    color: #cbd5e1 !important;
}

.file-upload:hover {
    border-color: #8b5cf6 !important;
    background: linear-gradient(135deg, #2d1b69 0%, #3730a3 100%) !important;
}

/* Dropdown and input field styling */
.gradio-dropdown, .gradio-textbox, .gradio-slider {
    background-color: #1e293b !important;
    color: #e2e8f0 !important;
    border: 1px solid #475569 !important;
    border-radius: 6px !important;
}

.gradio-dropdown select, .gradio-textbox input, .gradio-textbox textarea {
    background-color: #1e293b !important;
    color: #e2e8f0 !important;
    border: none !important;
    font-size: 12px !important;
}

.gradio-dropdown select:focus, .gradio-textbox input:focus, .gradio-textbox textarea:focus {
    box-shadow: 0 0 0 2px rgba(139, 92, 246, 0.2) !important;
    outline: none !important;
}

/* Button variants */
.gradio-button {
    border-radius: 6px !important;
    font-size: 12px !important;
    font-weight: 500 !important;
    padding: 8px 12px !important;
    transition: all 0.2s ease !important;
}

.gradio-button.secondary {
    background: linear-gradient(135deg, #374151 0%, #4b5563 100%) !important;
    color: #e2e8f0 !important;
    border: 1px solid #6b7280 !important;
}

.gradio-button.secondary:hover {
    background: linear-gradient(135deg, #4b5563 0%, #6b7280 100%) !important;
}

.gradio-button.stop {
    background: linear-gradient(135deg, #dc2626 0%, #ef4444 100%) !important;
    color: #f8fafc !important;
    border: none !important;
}

.gradio-button.stop:hover {
    background: linear-gradient(135deg, #b91c1c 0%, #dc2626 100%) !important;
}

/* Accordion styling */
.gradio-accordion {
    background-color: #1e293b !important;
    border: 1px solid #475569 !important;
    border-radius: 6px !important;
}

.gradio-accordion .accordion-header {
    background-color: #334155 !important;
    color: #cbd5e1 !important;
    font-size: 12px !important;
}

/* Image thumbnails in chat */
.chat-thumbnail {
    border-radius: 8px !important;
    border: 1px solid #475569 !important;
    transition: all 0.2s ease !important;
    cursor: pointer !important;
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.3) !important;
}

.chat-thumbnail:hover {
    transform: scale(1.02) !important;
    border-color: #8b5cf6 !important;
    box-shadow: 0 4px 12px rgba(139, 92, 246, 0.3) !important;
}

/* Message with image container */
.message-with-image {
    display: flex !important;
    flex-direction: column !important;
    gap: 8px !important;
}

.message-text {
    font-size: 13px !important;
    line-height: 1.5 !important;
}

/* Markdown content styling */
.markdown-content h1, .markdown-content h2, .markdown-content h3 {
    color: #a78bfa !important;
    margin: 12px 0 6px 0 !important;
    font-size: 14px !important;
}

.markdown-content h1 { font-size: 16px !important; }
.markdown-content h2 { font-size: 15px !important; }
.markdown-content h3 { font-size: 14px !important; }

.markdown-content code {
    background-color: #1e293b !important;
    color: #e879f9 !important;
    padding: 2px 4px !important;
    border-radius: 4px !important;
    font-family: 'JetBrains Mono', 'Consolas', monospace !important;
    font-size: 12px !important;
    border: 1px solid #475569 !important;
}

.markdown-content pre {
    background-color: #0f172a !important;
    color: #e2e8f0 !important;
    padding: 12px !important;
    border-radius: 8px !important;
    overflow-x: auto !important;
    border: 1px solid #334155 !important;
    font-size: 12px !important;
}

.markdown-content pre code {
    background: none !important;
    border: none !important;
    padding: 0 !important;
}

/* Sidebar toggle button */
.sidebar-toggle {
    position: fixed !important;
    top: 16px !important;
    left: 16px !important;
    z-index: 1001 !important;
    background: linear-gradient(135deg, #5b21b6 0%, #7c3aed 100%) !important;
    border: none !important;
    border-radius: 6px !important;
    padding: 8px 10px !important;
    color: #f8fafc !important;
    box-shadow: 0 2px 8px rgba(91, 33, 182, 0.4) !important;
    font-size: 12px !important;
}

/* Scrollbar styling */
.chatbot::-webkit-scrollbar, .sidebar::-webkit-scrollbar {
    width: 6px !important;
}

.chatbot::-webkit-scrollbar-track, .sidebar::-webkit-scrollbar-track {
    background: #1e293b !important;
}

.chatbot::-webkit-scrollbar-thumb, .sidebar::-webkit-scrollbar-thumb {
    background: #475569 !important;
    border-radius: 3px !important;
}

.chatbot::-webkit-scrollbar-thumb:hover, .sidebar::-webkit-scrollbar-thumb:hover {
    background: #6b7280 !important;
}

/* Responsive design */
@media (max-width: 768px) {
    .sidebar {
        width: 100% !important;
        position: fixed !important;
        z-index: 1000 !important;
        height: 100vh !important;
        left: -100% !important;
        transition: left 0.3s ease !important;
    }
    
    .sidebar.visible {
        left: 0 !important;
    }
    
    .chat-container {
        width: 100% !important;
        margin-left: 0 !important;
    }
    
    .message {
        max-width: 95% !important;
    }
    
    .chat-header h1 {
        font-size: 14px !important;
    }
    
    .message-input-container {
        padding: 12px !important;
    }
}

@media (max-width: 640px) {
    .message {
        max-width: 100% !important;
        margin-left: 0 !important;
        margin-right: 0 !important;
    }
}

/* Fix for Gradio specific elements */
.gradio-chatbot .message-wrap {
    background: transparent !important;
}

.gradio-chatbot .message-wrap .message {
    background: transparent !important;
}

/* Avatar styling */
.gradio-chatbot .avatar {
    width: 24px !important;
    height: 24px !important;
    border-radius: 50% !important;
    margin-right: 8px !important;
}

/* Copy button styling */
.copy-button {
    background: rgba(139, 92, 246, 0.1) !important;
    color: #a78bfa !important;
    border: 1px solid rgba(139, 92, 246, 0.3) !important;
    border-radius: 4px !important;
    font-size: 11px !important;
    padding: 4px 8px !important;
}

.copy-button:hover {
    background: rgba(139, 92, 246, 0.2) !important;
    border-color: rgba(139, 92, 246, 0.5) !important;
}
"""

# Maps each usage mode name (the choices offered by the mode dropdown) to the
# system prompt text associated with that mode.
SYSTEM_PROMPTS = {
    "General Assistant": (
        "You are a helpful assistant that can analyze images "
        "and answer questions."
    ),
    "Image Analyzer": (
        "You are an expert image analyst. "
        "Provide detailed descriptions and analysis of images."
    ),
    "OCR Reader": (
        "You are an OCR specialist. "
        "Extract and transcribe all text from images accurately."
    ),
    "Medical Assistant": (
        "You are a medical AI assistant. "
        "Analyze medical images and provide insights "
        "(for educational purposes only)."
    ),
    "Educational Tutor": (
        "You are an educational tutor. "
        "Help explain concepts shown in images and answer related questions."
    ),
}

def inference(
    prompt,
    image_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 32000,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28
):
    """Run one single-turn image + text generation with the module-level model.

    Args:
        prompt: User text sent alongside the image.
        image_path: Image reference placed in the user message's 'image' field.
        system_prompt: Text sent as the system message; a falsy value omits
            the system message entirely.
        max_new_tokens: Upper bound passed to ``model.generate``.
        min_pixels: Lower pixel budget attached to the image entry.
        max_pixels: Upper pixel budget attached to the image entry.

    Returns:
        ``(text, input_height, input_width)`` where ``text`` is the decoded
        reply (prompt tokens removed) and the two sizes are the first image's
        ``image_grid_thw`` rows/columns multiplied by 14.
    """
    # UI widgets may deliver these as floats; generation and resizing expect
    # whole numbers.
    max_new_tokens = int(max_new_tokens)
    min_pixels = int(min_pixels)
    max_pixels = int(max_pixels)

    messages = []
    # Previously the system prompt was accepted but silently dropped, so the
    # caller's choice of prompt had no effect on the model.
    if system_prompt:
        messages.append({'role' : 'system', 'content' : system_prompt})
    messages.append(
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'image',
                    'image' : image_path,
                    'min_pixels' : min_pixels,
                    'max_pixels' : max_pixels,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    )

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )
    print('input:\n', text)

    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # Follow the model's own placement instead of assuming a CUDA device.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens)
    # Drop the echoed prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )

    print('output:\n', output_text[0])

    # NOTE(review): 14 is presumably the vision patch size - confirm.
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14

    return output_text[0], input_height, input_width

# Global storage for image thumbnails and originals.
# Keyed by the MD5 hex digest of the image file's bytes; each value is
# {'original': <source path>, 'thumbnail': <temp JPEG path>}.
image_storage = {}

def create_image_thumbnail(image_path: str, thumbnail_size: tuple = (120, 120)) -> tuple:
    """Create a JPEG thumbnail for *image_path* and register it in image_storage.

    Args:
        image_path: Path of the image file to read.
        thumbnail_size: Maximum (width, height) handed to ``Image.thumbnail``.

    Returns:
        ``(thumbnail_path, image_hash)`` on success. On any failure the error
        is printed and ``(image_path, None)`` is returned, so callers can fall
        back to the original file.
    """
    import hashlib

    thumbnail_path = None
    try:
        # Content hash doubles as the storage key for this image.
        with open(image_path, 'rb') as f:
            image_hash = hashlib.md5(f.read()).hexdigest()

        with Image.open(image_path) as img:
            # JPEG cannot store alpha, palette or other exotic modes; convert
            # everything except the modes that already saved fine.
            if img.mode not in ('RGB', 'L'):
                img = img.convert('RGB')

            img.thumbnail(thumbnail_size, Image.Resampling.LANCZOS)

            # Reserve a temp file name and close the handle right away, so no
            # descriptor leaks and PIL can reopen the path on every platform.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_thumb:
                thumbnail_path = temp_thumb.name
            img.save(thumbnail_path, 'JPEG', quality=85)

        # Store original and thumbnail paths
        image_storage[image_hash] = {
            'original': image_path,
            'thumbnail': thumbnail_path
        }

        return thumbnail_path, image_hash

    except Exception as e:
        # Deliberate best-effort: thumbnail problems must not break the chat.
        print(f"Error creating thumbnail: {e}")
        if thumbnail_path is not None:
            # Do not leave a half-written temp file behind.
            try:
                os.unlink(thumbnail_path)
            except OSError:
                pass
        return image_path, None

def format_message_with_image(message: str, image_path: str, image_key: str) -> str:
    """Wrap *message* in an HTML snippet that shows a thumbnail above the text.

    Args:
        message: User-supplied text; HTML-escaped before being embedded.
        image_path: Thumbnail path used for the ``<img>`` source.
        image_key: Identifier written to the ``data-key`` attribute.

    Returns:
        The HTML snippet when both *image_path* and *image_key* are truthy,
        otherwise *message* unchanged.
    """
    if not (image_path and image_key):
        return message

    # Function-scope import keeps this block self-contained.
    from html import escape

    # The text comes straight from the user, so it must not be able to inject
    # markup or break out of the attribute values.
    safe_src = escape(str(image_path), quote=True)
    safe_key = escape(str(image_key), quote=True)
    safe_text = escape(str(message), quote=False)

    # NOTE(review): browsers usually refuse file:// sources on pages served
    # over HTTP - confirm the thumbnail actually renders in the chat.
    snippet_lines = [
        '<div class="message-with-image">',
        f'<img src="file://{safe_src}" ',
        '     class="chat-thumbnail" ',
        f'     data-key="{safe_key}"',
        '     style="max-width: 120px; max-height: 120px; border-radius: 8px; margin: 4px 0; cursor: pointer;" />',
        f'<div class="message-text">{safe_text}</div>',
        '</div>',
    ]
    return "\n".join(snippet_lines)

def simulate_streaming_response(full_text: str):
    """Replay *full_text* word by word to imitate a streamed reply.

    The text is split on single spaces. After each word the accumulated text
    (with surrounding whitespace stripped) is yielded. Between chunks the
    generator sleeps: longer after sentence-ending punctuation, medium after a
    comma, briefly otherwise. No sleep happens after the final chunk.

    Args:
        full_text: The complete response to replay.

    Yields:
        Progressively longer prefixes of *full_text*.
    """
    shown_words = []
    pause = 0.0

    for word in full_text.split(' '):
        # Sleep for the previous word's pause only when another chunk follows,
        # so the caller is not kept waiting once the whole text is out.
        if pause:
            time.sleep(pause)

        shown_words.append(word)
        yield ' '.join(shown_words).strip()

        # Vary the delay to make it more natural
        if word.endswith(('.', '!', '?')):
            pause = 0.12  # Longer pause after sentences
        elif word.endswith(','):
            pause = 0.06  # Medium pause after commas
        else:
            pause = 0.025  # Quick pace for regular words

def chat_with_history(
    message: str,
    image: Optional[str],
    history: List[Tuple[str, str]],
    usage_mode: str,
    custom_system_prompt: str,
    max_tokens: int,
    min_pixels: int,
    max_pixels: int
):
    """Generator event handler for one chat turn.

    The model call itself is blocking; once the full reply is available it is
    replayed word by word via ``simulate_streaming_response`` so the UI
    appears to stream.

    Args:
        message: Text typed by the user (may be blank when only an image is sent).
        image: Uploaded image path, or None.
        history: Chat history as ``[user, assistant]`` pairs; extended in place.
        usage_mode: Key into ``SYSTEM_PROMPTS``, used when no custom prompt is set.
        custom_system_prompt: Overrides the mode's prompt when non-blank.
        max_tokens: Forwarded to ``inference`` as ``max_new_tokens``.
        min_pixels: Forwarded to ``inference``.
        max_pixels: Forwarded to ``inference``.

    Yields:
        ``(history, "", None)``: the updated history plus values that clear
        the text box and the image input.
    """
    # Components may hand over None instead of an empty value; normalise once
    # so neither the main path nor the error handler trips over it.
    message = message or ""
    custom_system_prompt = custom_system_prompt or ""
    if history is None:
        history = []
    # NOTE(review): some Gradio versions presumably pass a temp-file wrapper
    # exposing `.name` instead of a plain path - plain strings pass through.
    image_path = getattr(image, "name", image)

    try:
        # Determine system prompt
        system_prompt = custom_system_prompt.strip() or SYSTEM_PROMPTS.get(
            usage_mode, SYSTEM_PROMPTS["General Assistant"]
        )

        # Nothing to send: emit the unchanged state. This function is a
        # generator, so a plain `return value` would never reach the UI.
        user_text = message.strip()
        if not user_text and not image_path:
            yield history, "", None
            return

        # Prepare user message
        user_message = user_text or "Please analyze this image."
        user_display_message = user_message

        # Handle image if provided
        if image_path:
            thumbnail_path, image_key = create_image_thumbnail(image_path)
            if image_key:
                user_display_message = format_message_with_image(user_message, thumbnail_path, image_key)

        # Show the user's message immediately, before the slow model call
        history.append([user_display_message, ""])
        yield history, "", None

        # Get AI response
        if image_path:
            response, _, _ = inference(
                prompt=user_message,
                image_path=image_path,
                system_prompt=system_prompt,
                max_new_tokens=max_tokens,
                min_pixels=min_pixels,
                max_pixels=max_pixels
            )
        else:
            response = "Please upload an image to continue the conversation with this vision-language model."

        # Replay the finished response chunk by chunk
        for partial_response in simulate_streaming_response(response):
            history[-1] = [user_display_message, partial_response]
            yield history, "", None

    except Exception as e:
        # Top-level UI boundary: surface the failure in the chat instead of
        # crashing the handler.
        error_msg = f"❌ **Error:** {str(e)}"
        if history and history[-1][1] == "":
            # The pending turn has no reply yet; attach the error to it.
            history[-1] = [history[-1][0], error_msg]
        else:
            history.append([message if message else "Error occurred", error_msg])
        yield history, "", None

def clear_history():
    """Delete all stored thumbnail files and reset the chat inputs.

    Returns:
        ``([], "", None)``: an empty chat history, an empty message box and
        no attached image.
    """
    # image_storage is only mutated, never rebound, so no `global` is needed.
    for entry in image_storage.values():
        thumbnail_path = entry.get('thumbnail')
        if not thumbnail_path:
            continue
        try:
            os.unlink(thumbnail_path)
        except FileNotFoundError:
            pass  # already removed; nothing left to clean up
        except OSError as exc:
            # Best-effort cleanup: report the problem but keep clearing.
            print(f"Could not remove thumbnail {thumbnail_path}: {exc}")
    image_storage.clear()
    return [], "", None

def update_system_prompt(usage_mode: str) -> str:
    """Return the prompt registered for *usage_mode*.

    Unknown modes fall back to the "General Assistant" prompt.
    """
    fallback_prompt = SYSTEM_PROMPTS["General Assistant"]
    return SYSTEM_PROMPTS.get(usage_mode, fallback_prompt)

def toggle_sidebar():
    """Return two updates: the first hides its target, the second shows it.

    Despite the name this never flips state back. It is wired to the
    sidebar's hide button with outputs (sidebar, show button).
    """
    hide_update = gr.update(visible=False)
    reveal_update = gr.update(visible=True)
    return hide_update, reveal_update

def show_sidebar():
    """Return two updates: the first shows its target, the second hides it.

    This is the counterpart of ``toggle_sidebar``, wired to the show button
    with outputs (sidebar, show button).
    """
    reveal_update = gr.update(visible=True)
    hide_update = gr.update(visible=False)
    return reveal_update, hide_update

# Create Gradio interface: a collapsible configuration sidebar next to the
# chat area, followed by the event wiring for every control.
with gr.Blocks(css=custom_css, title="Qwen2.5 VL Chat", theme=gr.themes.Soft()) as demo:

    # State for sidebar visibility
    # NOTE(review): not referenced by any handler below; visibility is driven
    # directly through gr.update(). Kept so the name stays defined.
    sidebar_visible = gr.State(True)

    with gr.Row(elem_classes="main-container"):
        # Sidebar
        with gr.Column(scale=1, elem_classes="sidebar", visible=True) as sidebar:
            gr.Markdown("### ⚙️ Configuration", elem_classes="sidebar-title")

            # Hide sidebar button
            hide_sidebar_btn = gr.Button("← Hide", size="sm", variant="secondary")

            usage_mode = gr.Dropdown(
                choices=list(SYSTEM_PROMPTS.keys()),
                value="General Assistant",
                label="🎯 Mode",
                info="Select AI behavior"
            )

            custom_system_prompt = gr.Textbox(
                label="📝 Custom Prompt",
                placeholder="Override with custom prompt...",
                lines=3
            )

            with gr.Accordion("🔧 Advanced", open=False):
                max_tokens = gr.Slider(
                    minimum=100,
                    maximum=32000,
                    value=2000,
                    step=100,
                    label="Max Tokens"
                )

                min_pixels = gr.Number(
                    value=512 * 28 * 28,
                    label="Min Pixels"
                )

                max_pixels = gr.Number(
                    value=2048 * 28 * 28,
                    label="Max Pixels"
                )

            clear_btn = gr.Button("🗑️ Clear", variant="stop")

        # Show sidebar button (hidden by default)
        show_sidebar_btn = gr.Button("☰", elem_classes="sidebar-toggle", visible=False)

        # Main chat area
        with gr.Column(scale=4, elem_classes="chat-container"):
            # Header
            gr.Markdown("# 🤖 Qwen2.5 VL Assistant", elem_classes="chat-header")

            # Chatbot with markdown support.
            # avatar_images is intentionally not set: the parameter expects
            # image file paths or URLs, and the emoji strings passed before
            # are neither.
            chatbot = gr.Chatbot(
                label="",
                height=550,
                show_copy_button=True,
                elem_classes="chatbot",
                bubble_full_width=False,
                render_markdown=True,
                latex_delimiters=[{"left": "$$", "right": "$$", "display": True}]
            )

            # Message input area
            with gr.Row(elem_classes="message-input-container"):
                with gr.Column(scale=1):
                    # Image attachment
                    image_input = gr.File(
                        file_types=["image"],
                        label="📎 Image",
                        elem_classes="file-upload",
                        height=70
                    )

                with gr.Column(scale=6):
                    msg_input = gr.Textbox(
                        placeholder="Message Qwen2.5 VL...",
                        lines=2,
                        max_lines=5,
                        elem_classes="message-input",
                        show_label=False,
                        container=False
                    )

                with gr.Column(scale=1):
                    send_btn = gr.Button("Send", variant="primary", elem_classes="send-button")

    # Event handlers

    # Auto-update system prompt when usage mode changes
    usage_mode.change(
        fn=update_system_prompt,
        inputs=[usage_mode],
        outputs=[custom_system_prompt]
    )

    # Sidebar toggle functions
    hide_sidebar_btn.click(
        fn=toggle_sidebar,
        outputs=[sidebar, show_sidebar_btn]
    )

    show_sidebar_btn.click(
        fn=show_sidebar,
        outputs=[sidebar, show_sidebar_btn]
    )

    # Chat turn: the Send button and Enter in the text box share one wiring,
    # so the two triggers cannot drift apart. The handler is a generator that
    # replays the finished reply chunk by chunk.
    chat_inputs = [
        msg_input,
        image_input,
        chatbot,
        usage_mode,
        custom_system_prompt,
        max_tokens,
        min_pixels,
        max_pixels
    ]
    chat_outputs = [chatbot, msg_input, image_input]

    send_btn.click(
        fn=chat_with_history,
        inputs=chat_inputs,
        outputs=chat_outputs
    )

    msg_input.submit(
        fn=chat_with_history,
        inputs=chat_inputs,
        outputs=chat_outputs
    )

    clear_btn.click(
        fn=clear_history,
        outputs=[chatbot, msg_input, image_input]
    )

if __name__ == "__main__":
    # Start the Gradio app with debug mode switched on. Host, port and
    # sharing keep Gradio's defaults; pass server_name="0.0.0.0",
    # server_port=7860 or share=False to launch() to override them.
    demo.launch(debug=True)

## v5

In [None]:
import gradio as gr
import torch
import json
import os
import asyncio
import time
import base64
import io
import tempfile

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont
from IPython.display import display
from typing import List, Tuple, Dict, Any, Optional

# Load model and processor
# Preferred accelerator; kept as a module-level name for other code to use.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)
model_path = './00_Model/Qwen2.5-VL-3B-Instruct'
# device_map='auto' already places the quantized weights, so no `.to(device)`
# is chained here: moving a bitsandbytes-quantized / accelerate-dispatched
# model afterwards is redundant at best and is rejected (or warned about) by
# transformers/accelerate depending on the installed versions.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    quantization_config = bnb_config,
    device_map = 'auto',
)
processor = AutoProcessor.from_pretrained(model_path)

# Dark purple theme CSS inspired by Qwen
custom_css = """
/* Base font size and dark theme */
* {
    font-size: 13px !important;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
}

/* Main container */
.gradio-container {
    background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%) !important;
    color: #e2e8f0 !important;
    min-height: 100vh !important;
}

/* Sidebar styling */
.sidebar {
    background: linear-gradient(180deg, #2d1b69 0%, #1a1a2e 100%) !important;
    border-right: 1px solid #4c1d95 !important;
    padding: 16px !important;
    color: #e2e8f0 !important;
    box-shadow: 2px 0 10px rgba(0,0,0,0.3) !important;
}

.sidebar h3 {
    color: #a78bfa !important;
    font-size: 16px !important;
    font-weight: 600 !important;
    margin-bottom: 16px !important;
}

.sidebar label {
    color: #c4b5fd !important;
    font-size: 12px !important;
    font-weight: 500 !important;
}

/* Chat container */
.chat-container {
    background: linear-gradient(135deg, #1e1e3f 0%, #2d1b69 100%) !important;
    min-height: 100vh !important;
}

/* Header */
.chat-header {
    background: linear-gradient(90deg, #5b21b6 0%, #7c3aed 100%) !important;
    color: #f8fafc !important;
    padding: 12px 20px !important;
    border-bottom: 1px solid #4c1d95 !important;
    box-shadow: 0 2px 8px rgba(0,0,0,0.2) !important;
}

.chat-header h1 {
    font-size: 18px !important;
    font-weight: 600 !important;
    margin: 0 !important;
}

/* Chatbot container */
.chatbot {
    background: rgba(30, 30, 63, 0.5) !important;
    border: 1px solid #4c1d95 !important;
    border-radius: 12px !important;
    margin: 16px !important;
    box-shadow: inset 0 2px 10px rgba(0,0,0,0.2) !important;
}

/* Message bubbles */
.message {
    font-size: 13px !important;
    line-height: 1.5 !important;
    margin: 8px 12px !important;
    padding: 12px 16px !important;
    border-radius: 16px !important;
    max-width: 80% !important;
}

/* User messages */
.message.user {
    background: linear-gradient(135deg, #7c3aed 0%, #5b21b6 100%) !important;
    color: #f8fafc !important;
    margin-left: auto !important;
    margin-right: 12px !important;
    box-shadow: 0 2px 8px rgba(124, 58, 237, 0.3) !important;
}

/* Bot messages */
.message.bot {
    background: linear-gradient(135deg, #374151 0%, #4b5563 100%) !important;
    color: #f1f5f9 !important;
    margin-right: auto !important;
    margin-left: 12px !important;
    border-left: 3px solid #a78bfa !important;
    box-shadow: 0 2px 8px rgba(0,0,0,0.2) !important;
}

/* Message input area */
.message-input-container {
    background: rgba(45, 27, 105, 0.8) !important;
    border-top: 1px solid #4c1d95 !important;
    padding: 16px !important;
    backdrop-filter: blur(10px) !important;
}

/* Input styling */
.message-input textarea {
    background: rgba(55, 65, 81, 0.9) !important;
    border: 1px solid #6366f1 !important;
    border-radius: 20px !important;
    padding: 12px 16px !important;
    color: #f1f5f9 !important;
    font-size: 13px !important;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
    transition: all 0.2s ease !important;
}

.message-input textarea:focus {
    border-color: #a78bfa !important;
    box-shadow: 0 0 0 2px rgba(167, 139, 250, 0.2) !important;
    background: rgba(55, 65, 81, 1) !important;
}

.message-input textarea::placeholder {
    color: #9ca3af !important;
}

/* Button styling */
.send-button {
    background: linear-gradient(135deg, #7c3aed 0%, #5b21b6 100%) !important;
    border: none !important;
    border-radius: 16px !important;
    padding: 12px 20px !important;
    color: #f8fafc !important;
    font-size: 13px !important;
    font-weight: 600 !important;
    box-shadow: 0 2px 8px rgba(124, 58, 237, 0.4) !important;
    transition: all 0.2s ease !important;
}

.send-button:hover {
    transform: translateY(-1px) !important;
    box-shadow: 0 4px 12px rgba(124, 58, 237, 0.5) !important;
}

/* File upload area */
.file-upload {
    background: rgba(55, 65, 81, 0.6) !important;
    border: 2px dashed #6366f1 !important;
    border-radius: 12px !important;
    padding: 12px !important;
    color: #c4b5fd !important;
    transition: all 0.2s ease !important;
}

.file-upload:hover {
    border-color: #a78bfa !important;
    background: rgba(55, 65, 81, 0.8) !important;
}

/* Dropdown and input components */
.dropdown select {
    background: rgba(55, 65, 81, 0.9) !important;
    border: 1px solid #6366f1 !important;
    color: #f1f5f9 !important;
    border-radius: 8px !important;
    padding: 8px 12px !important;
    font-size: 12px !important;
}

.textbox input, .textbox textarea {
    background: rgba(55, 65, 81, 0.9) !important;
    border: 1px solid #6366f1 !important;
    color: #f1f5f9 !important;
    border-radius: 8px !important;
    font-size: 12px !important;
}

.textbox input::placeholder, .textbox textarea::placeholder {
    color: #9ca3af !important;
}

/* Number inputs and sliders */
.number input {
    background: rgba(55, 65, 81, 0.9) !important;
    border: 1px solid #6366f1 !important;
    color: #f1f5f9 !important;
    border-radius: 6px !important;
    font-size: 12px !important;
}

.slider {
    background: rgba(55, 65, 81, 0.6) !important;
}

/* Accordion */
.accordion {
    background: rgba(45, 27, 105, 0.3) !important;
    border: 1px solid #4c1d95 !important;
    border-radius: 8px !important;
}

/* Clear button */
.clear-btn {
    background: linear-gradient(135deg, #dc2626 0%, #991b1b 100%) !important;
    border: none !important;
    border-radius: 12px !important;
    padding: 10px 16px !important;
    color: #f8fafc !important;
    font-size: 12px !important;
    margin-top: 16px !important;
}

/* Hide/Show sidebar buttons */
.sidebar-toggle {
    background: linear-gradient(135deg, #5b21b6 0%, #4c1d95 100%) !important;
    border: none !important;
    border-radius: 8px !important;
    padding: 8px 12px !important;
    color: #f8fafc !important;
    font-size: 12px !important;
    position: fixed !important;
    top: 16px !important;
    left: 16px !important;
    z-index: 1000 !important;
    box-shadow: 0 2px 8px rgba(0,0,0,0.3) !important;
}

/* Image preview in chat */
.chat-image {
    border-radius: 12px !important;
    border: 2px solid #6366f1 !important;
    box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important;
    margin: 8px 0 !important;
    max-width: 200px !important;
    max-height: 200px !important;
    object-fit: cover !important;
    cursor: pointer !important;
    transition: all 0.2s ease !important;
}

.chat-image:hover {
    transform: scale(1.02) !important;
    border-color: #a78bfa !important;
}

/* Message with image container */
.message-with-image {
    display: flex !important;
    flex-direction: column !important;
    gap: 8px !important;
}

/* Scrollbar styling */
::-webkit-scrollbar {
    width: 6px !important;
}

::-webkit-scrollbar-track {
    background: rgba(30, 30, 63, 0.5) !important;
}

::-webkit-scrollbar-thumb {
    background: linear-gradient(180deg, #7c3aed 0%, #5b21b6 100%) !important;
    border-radius: 3px !important;
}

::-webkit-scrollbar-thumb:hover {
    background: linear-gradient(180deg, #8b5cf6 0%, #6d28d9 100%) !important;
}

/* Responsive design */
@media (max-width: 768px) {
    .sidebar {
        width: 100% !important;
        position: fixed !important;
        z-index: 999 !important;
        height: 100vh !important;
        left: -100% !important;
        transition: left 0.3s ease !important;
    }
    
    .sidebar.visible {
        left: 0 !important;
    }
    
    .message {
        max-width: 90% !important;
    }
    
    .chat-header h1 {
        font-size: 16px !important;
    }
}

/* Code blocks */
pre {
    background: rgba(17, 24, 39, 0.8) !important;
    border: 1px solid #4c1d95 !important;
    border-radius: 8px !important;
    padding: 12px !important;
    color: #f1f5f9 !important;
    font-size: 12px !important;
    overflow-x: auto !important;
}

code {
    background: rgba(55, 65, 81, 0.6) !important;
    color: #c4b5fd !important;
    padding: 2px 6px !important;
    border-radius: 4px !important;
    font-size: 12px !important;
}
"""

# Maps each selectable usage mode (the dropdown label) to the system prompt
# associated with that mode. "General Assistant" doubles as the fallback entry
# used by the lookups elsewhere in this file.
SYSTEM_PROMPTS = {
    "General Assistant": (
        "You are a helpful assistant that can analyze images "
        "and answer questions."
    ),
    "Image Analyzer": (
        "You are an expert image analyst. "
        "Provide detailed descriptions and analysis of images."
    ),
    "OCR Reader": (
        "You are an OCR specialist. "
        "Extract and transcribe all text from images accurately."
    ),
    "Medical Assistant": (
        "You are a medical AI assistant. "
        "Analyze medical images and provide insights "
        "(for educational purposes only)."
    ),
    "Educational Tutor": (
        "You are an educational tutor. "
        "Help explain concepts shown in images and answer related questions."
    ),
}

def inference_with_image(
    prompt,
    image_path,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 32000,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28
):
    """Run a single image + text turn through the model.

    Args:
        prompt: User text placed after the image in the user message.
        image_path: Image reference stored under the 'image' key of the user
            message and resolved by ``process_vision_info``.
        system_prompt: Text of the system message. When empty or None no
            system message is added and the chat template decides what to use.
        max_new_tokens: Upper bound forwarded to ``model.generate``.
        min_pixels: Stored alongside the image entry of the message
            (presumably the lower resize bound — confirm against qwen_vl_utils).
        max_pixels: Stored alongside the image entry of the message
            (presumably the upper resize bound — confirm against qwen_vl_utils).

    Returns:
        A tuple ``(text, input_height, input_width)`` where ``text`` is the
        decoded reply for the single prompt and the two sizes are derived from
        ``inputs['image_grid_thw']`` (returned as tensor scalars, not ints).
    """
    messages = [
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'image',
                    'image' : image_path,
                    'min_pixels' : min_pixels,
                    'max_pixels' : max_pixels,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    ]
    # The system prompt used to be accepted but silently dropped, so the
    # selected usage mode never reached the model.
    if system_prompt:
        messages.insert(0, {'role' : 'system', 'content' : system_prompt})

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )

    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # Follow the model's own placement instead of assuming a CUDA device.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens)
    # generate() returns prompt + completion; strip the prompt tokens so only
    # the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )

    # NOTE(review): 14 is presumably the vision patch size in pixels and
    # image_grid_thw is presumably (t, h, w) in patches — confirm.
    input_height = inputs['image_grid_thw'][0][1] * 14
    input_width = inputs['image_grid_thw'][0][2] * 14

    return output_text[0], input_height, input_width

def inference_text_only(
    prompt,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 32000,
):
    """Run a single text-only turn through the model and return its reply.

    Args:
        prompt: User text for the turn.
        system_prompt: Text of the system message. When empty or None no
            system message is added and the chat template decides what to use.
        max_new_tokens: Upper bound forwarded to ``model.generate``.

    Returns:
        The decoded reply for the single prompt, as a string.
    """
    messages = [
        {
            'role' : 'user',
            'content' : [
                {'type' : 'text', 'text' : prompt},
            ],
        }
    ]
    # The system prompt used to be accepted but silently dropped, so the
    # selected usage mode never reached the model.
    if system_prompt:
        messages.insert(0, {'role' : 'system', 'content' : system_prompt})

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )

    inputs = processor(
        text = [text],
        padding = True,
        return_tensors = 'pt',
    )
    # Follow the model's own placement instead of assuming a CUDA device.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens)
    # generate() returns prompt + completion; strip the prompt tokens so only
    # the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )

    return output_text[0]

# Maps the MD5 hex digest of an attached image's bytes to the path it was
# loaded from. Filled by create_image_display().
image_storage = {}

def create_image_display(image_path: str) -> tuple:
    """Register an image and build the <img> markup used to show it in chat.

    The file is hashed (MD5 of its bytes) to obtain a storage key, the path is
    recorded in ``image_storage`` under that key, and an ``<img>`` tag pointing
    at ``file://<image_path>`` is produced.

    Args:
        image_path: Filesystem path of the image to display.

    Returns:
        ``(image_html, image_hash)`` on success, or ``("", None)`` when the
        file cannot be processed (the error is printed, not raised).
    """
    import hashlib
    import html

    try:
        # Hash in chunks so a large image is never fully loaded into memory.
        digest = hashlib.md5()
        with open(image_path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                digest.update(chunk)
        image_hash = digest.hexdigest()

        # Store image path
        image_storage[image_hash] = image_path

        # Escape the path: it ends up inside a double-quoted HTML attribute,
        # so a quote or '&' in the filename would otherwise break the markup.
        safe_src = html.escape(f"file://{image_path}", quote=True)
        image_html = f'<img src="{safe_src}" class="chat-image" style="max-width: 200px; max-height: 200px; border-radius: 12px; border: 2px solid #6366f1; margin: 8px 0;" />'

        return image_html, image_hash

    except Exception as e:
        # Deliberately best-effort: a broken preview must not abort the chat
        # turn, so report the problem and let the caller fall back to text.
        print(f"Error creating image display: {e}")
        return "", None

def simulate_streaming_response(full_text: str):
    """Yield progressively longer prefixes of a finished reply.

    The text is split on single spaces and re-emitted one piece at a time,
    sleeping briefly after each piece so the UI appears to stream. Non-string
    input is converted with ``str()``; blank input yields a single
    "No response generated." placeholder.
    """
    text = full_text if isinstance(full_text, str) else str(full_text)

    # Nothing but whitespace: emit the placeholder and stop.
    if text.strip() == "":
        yield "No response generated."
        return

    shown = ""
    for piece in text.split(' '):
        shown = f"{shown}{piece} "

        # Pause longer after sentence endings, a little after commas.
        if piece.endswith(('.', '!', '?')):
            pause = 0.1
        elif piece.endswith(','):
            pause = 0.05
        else:
            pause = 0.02

        yield shown.strip()
        time.sleep(pause)

def chat_with_history(
    message: str,
    image: Optional[str],
    history: List[Tuple[str, str]],
    usage_mode: str,
    custom_system_prompt: str,
    max_tokens: int,
    min_pixels: int,
    max_pixels: int
):
    """Handle one chat turn, yielding UI updates as the reply is revealed.

    This is a generator: every yielded value is a
    ``(history, message_box_value, image_value)`` triple. The user turn is
    yielded first with an empty reply, then the full reply is generated and
    replayed through ``simulate_streaming_response`` (the streaming is
    simulated; generation itself is not incremental).

    Args:
        message: Text typed by the user; may be empty when an image is attached.
        image: Path of the attached image, or None/empty for a text-only turn.
        history: Chat history as ``[user, assistant]`` pairs; extended in place.
        usage_mode: Key into ``SYSTEM_PROMPTS`` used when no custom prompt is set.
        custom_system_prompt: Overrides the usage-mode prompt when non-blank.
        max_tokens: Generation limit forwarded as ``max_new_tokens``.
        min_pixels: Forwarded to ``inference_with_image``.
        max_pixels: Forwarded to ``inference_with_image``.

    Yields:
        ``(history, "", None)`` after each update. Errors are not raised; they
        are written into the history as the assistant's reply.
    """
    # Tolerate None coming from the UI for optional inputs.
    message = (message or "").strip()
    if history is None:
        history = []

    # Nothing to send. This must be a yield: a `return value` inside a
    # generator is never delivered to the caller as an output.
    if not message and not image:
        yield history, "", None
        return

    try:
        # Determine system prompt
        custom_prompt = (custom_system_prompt or "").strip()
        if custom_prompt:
            system_prompt = custom_prompt
        else:
            system_prompt = SYSTEM_PROMPTS.get(usage_mode, SYSTEM_PROMPTS["General Assistant"])

        # Prepare user message
        user_message = message or "Please analyze this image."
        user_display_message = user_message

        # Handle image if provided
        if image:
            image_html, _ = create_image_display(image)
            if image_html:
                user_display_message = f"{user_message}\n\n{image_html}"

        # Add user message to history immediately
        history.append([user_display_message, ""])
        yield history, "", None

        # Get AI response. UI numeric widgets may deliver floats, so the
        # numeric settings are cast to int before being forwarded.
        if image:
            # inference_with_image returns (text, height, width); only the
            # text belongs in the chat.
            response, _, _ = inference_with_image(
                prompt=user_message,
                image_path=image,
                system_prompt=system_prompt,
                max_new_tokens=int(max_tokens),
                min_pixels=int(min_pixels),
                max_pixels=int(max_pixels)
            )
        else:
            # Text-only conversation
            response = inference_text_only(
                prompt=user_message,
                system_prompt=system_prompt,
                max_new_tokens=int(max_tokens)
            )

        # Replay the finished reply piece by piece
        for partial_response in simulate_streaming_response(response):
            history[-1] = [user_display_message, partial_response]
            yield history, "", None

    except Exception as e:
        # Top-level UI boundary: surface the failure in the chat instead of
        # letting the event handler crash.
        error_msg = f"‚ùå Error: {str(e)}"
        if history and history[-1][1] == "":
            # Fill the pending (still empty) assistant slot for this turn.
            history[-1] = [history[-1][0], error_msg]
        else:
            history.append([message if message else "Error occurred", error_msg])
        yield history, "", None

def clear_history():
    """Reset the conversation.

    Empties ``image_storage`` in place and returns the values that blank the
    three outputs this handler is wired to: an empty chat history, an empty
    message box and no attached image.
    """
    # In-place clear keeps the same dict object that other functions reference.
    image_storage.clear()

    empty_history, empty_message, no_image = [], "", None
    return empty_history, empty_message, no_image

def update_system_prompt(usage_mode: str) -> str:
    """Return the system prompt for ``usage_mode``.

    Unknown modes fall back to the "General Assistant" prompt.
    """
    fallback_prompt = SYSTEM_PROMPTS["General Assistant"]
    if usage_mode in SYSTEM_PROMPTS:
        return SYSTEM_PROMPTS[usage_mode]
    return fallback_prompt

def toggle_sidebar():
    """Return updates that hide the first output and show the second.

    Wired to the "hide panel" button with outputs (sidebar, show-sidebar
    button), so the sidebar disappears and the reopen button appears.
    """
    hide_first = gr.update(visible=False)
    show_second = gr.update(visible=True)
    return hide_first, show_second

def show_sidebar():
    """Return updates that show the first output and hide the second.

    Wired to the reopen button with outputs (sidebar, show-sidebar button),
    so the sidebar reappears and the reopen button is hidden again.
    """
    show_first = gr.update(visible=True)
    hide_second = gr.update(visible=False)
    return show_first, hide_second

# Build the Gradio app: a collapsible configuration sidebar on the left and
# the chat area (header, chatbot, input row) on the right, followed by the
# event wiring. Components are created in layout order.
with gr.Blocks(css=custom_css, title="Qwen2.5 VL Chat", theme=gr.themes.Base()) as demo:

    with gr.Row(elem_classes="main-container"):
        # ---- Sidebar: configuration panel ----
        with gr.Column(scale=1, elem_classes="sidebar", visible=True) as sidebar:
            gr.Markdown("### ‚öôÔ∏è Configuration")

            # Collapses the panel; its counterpart is show_sidebar_btn below.
            hide_sidebar_btn = gr.Button("‚Üê Hide Panel", size="sm")

            # One entry per key of SYSTEM_PROMPTS.
            usage_mode = gr.Dropdown(
                choices=list(SYSTEM_PROMPTS),
                value="General Assistant",
                label="üéØ Usage Mode",
                info="Select AI behavior",
            )

            # A non-blank value here takes precedence over the usage mode.
            custom_system_prompt = gr.Textbox(
                label="üìù Custom System Prompt",
                placeholder="Override with custom prompt...",
                lines=3,
            )

            with gr.Accordion("üîß Advanced Settings", open=False):
                max_tokens = gr.Slider(
                    minimum=100, maximum=32000, value=2000, step=100,
                    label="Max Tokens",
                )
                min_pixels = gr.Number(value=512 * 28 * 28, label="Min Pixels")
                max_pixels = gr.Number(value=2048 * 28 * 28, label="Max Pixels")

            clear_btn = gr.Button("üóëÔ∏è Clear Chat", elem_classes="clear-btn")

        # Reopen button for the sidebar; starts hidden because the sidebar
        # starts visible.
        show_sidebar_btn = gr.Button("‚ò∞", elem_classes="sidebar-toggle", visible=False)

        # ---- Main chat area ----
        with gr.Column(scale=4, elem_classes="chat-container"):
            gr.Markdown("# ü§ñ Qwen2.5 VL Chat Assistant", elem_classes="chat-header")

            # NOTE(review): avatar_images receives two short strings rather
            # than image paths/URLs — confirm the installed Gradio accepts this.
            chatbot = gr.Chatbot(
                label="",
                height=600,
                show_copy_button=True,
                elem_classes="chatbot",
                avatar_images=("üë§", "ü§ñ"),
                bubble_full_width=False,
                render_markdown=True,
            )

            # Input row: attachment | text box | send button
            with gr.Row(elem_classes="message-input-container"):
                with gr.Column(scale=1):
                    image_input = gr.File(
                        file_types=["image"],
                        label="üìé Attach Image (Optional)",
                        elem_classes="file-upload",
                    )

                with gr.Column(scale=5):
                    msg_input = gr.Textbox(
                        placeholder="üí¨ Type your message here... (Image is optional)",
                        lines=2,
                        max_lines=6,
                        elem_classes="message-input",
                        show_label=False,
                        container=False,
                    )

                with gr.Column(scale=1):
                    send_btn = gr.Button("Send", variant="primary", elem_classes="send-button")

    # ---- Event wiring ----

    # Selecting a usage mode copies that mode's prompt into the custom box.
    usage_mode.change(fn=update_system_prompt, inputs=[usage_mode], outputs=[custom_system_prompt])

    # Sidebar collapse / reopen: both handlers update the same two components.
    hide_sidebar_btn.click(fn=toggle_sidebar, outputs=[sidebar, show_sidebar_btn])
    show_sidebar_btn.click(fn=show_sidebar, outputs=[sidebar, show_sidebar_btn])

    # The Send button and pressing Enter in the text box run the same chat
    # handler with identical inputs and outputs.
    _chat_inputs = [
        msg_input,
        image_input,
        chatbot,
        usage_mode,
        custom_system_prompt,
        max_tokens,
        min_pixels,
        max_pixels,
    ]
    _chat_outputs = [chatbot, msg_input, image_input]

    send_btn.click(fn=chat_with_history, inputs=_chat_inputs, outputs=_chat_outputs)
    msg_input.submit(fn=chat_with_history, inputs=_chat_inputs, outputs=_chat_outputs)

    # Clearing resets the chat, the text box and the attachment.
    clear_btn.click(fn=clear_history, outputs=[chatbot, msg_input, image_input])

if __name__ == "__main__":
    # Only debug=True is passed; server_name, server_port and share are left
    # at Gradio's defaults (e.g. server_name="0.0.0.0", server_port=7860,
    # share=False could be supplied here if needed).
    launch_options = {"debug": True}
    demo.launch(**launch_options)