In [None]:
%%capture
!pip install gradio
!pip install qwen-vl-utils
!sudo apt-get update
!sudo apt-get install poppler-utils

## v1 - ok

In [None]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET
import gradio as gr
import os
import tempfile # For creating temporary files

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
# process_vision_info comes from the qwen-vl-utils package installed in the first cell
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont, ImageColor

# Checkpoint identifier handed to both from_pretrained calls below.
model_path = 'Qwen/Qwen2.5-VL-3B-Instruct'

# Load the vision-language model; device placement is left to device_map='auto'.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, device_map = 'auto')

# The matching processor is loaded from the same checkpoint.
processor = AutoProcessor.from_pretrained(model_path)

def plot_text_bounding_boxes(image: Image.Image, bounding_boxes, input_width, input_height):
    """
    Draw OCR bounding boxes and their recognised text onto a PIL image.

    Args:
        image: The PIL Image object to draw on; it is modified in place.
        bounding_boxes: Raw model response (optionally wrapped in a ```json
          fence) describing a list of dicts. Each dict holds "bbox_2d" as
          [x1, y1, x2, y2] expressed in the input_width x input_height
          coordinate space, and optionally "text_content".
        input_width: Width of the coordinate space the boxes are expressed in.
        input_height: Height of the coordinate space the boxes are expressed in.
    Returns:
        PIL.Image.Image: The same image object with bounding boxes drawn. If
        the response cannot be decoded into a list, the image is returned
        without annotations.
    """
    width, height = image.size
    draw = ImageDraw.Draw(image)

    # Strip the markdown fencing, then decode. Strict JSON is tried first so
    # that true/false/null are accepted; ast.literal_eval covers responses
    # written as Python literals (e.g. single-quoted strings).
    parsed_json_str = parse_json(bounding_boxes)
    try:
        boxes = json.loads(parsed_json_str)
    except ValueError:
        try:
            boxes = ast.literal_eval(parsed_json_str)
        except (ValueError, SyntaxError) as e:
            print(f"Error evaluating bounding boxes string: {e}")
            return image
    if not isinstance(boxes, list):
        print(f"Warning: Expected a list of bounding boxes, but got {type(boxes)}.")
        return image

    # Fall back to PIL's built-in font when the CJK font file cannot be loaded.
    try:
        # Scale the font with the image so labels stay legible.
        font_size = max(10, min(int(height / 30), int(width / 60)))
        font = ImageFont.truetype('./00_Dataset/NotoSansCJK-Regular.ttc', size = font_size)
    except IOError:
        print("Warning: NotoSansCJK-Regular.ttc not found. Using default font.")
        font = ImageFont.load_default()

    color = 'green'
    for bounding_box in boxes:
        coords = bounding_box.get("bbox_2d") if isinstance(bounding_box, dict) else None
        if not isinstance(coords, (list, tuple)) or len(coords) != 4:
            print(f"Skipping malformed bounding box entry: {bounding_box}")
            continue

        # Rescale from the model-input coordinate space to this image's pixels.
        abs_x1 = int(coords[0] / input_width * width)
        abs_y1 = int(coords[1] / input_height * height)
        abs_x2 = int(coords[2] / input_width * width)
        abs_y2 = int(coords[3] / input_height * height)

        # PIL requires the first corner to be top-left of the second.
        abs_x1, abs_x2 = min(abs_x1, abs_x2), max(abs_x1, abs_x2)
        abs_y1, abs_y2 = min(abs_y1, abs_y2), max(abs_y1, abs_y2)

        draw.rectangle(((abs_x1, abs_y1), (abs_x2, abs_y2)), outline = color, width = 2)

        if 'text_content' in bounding_box:
            # The label is anchored at the bottom-left corner of the box.
            draw.text((abs_x1, abs_y2), str(bounding_box['text_content']), fill = color, font = font)

    return image

# @title Parsing JSON output
def parse_json(json_output):
    """
    Extract the payload of a ```json fenced block from model output.

    Args:
        json_output: Raw text returned by the model.
    Returns:
        str: The text between the first ```json fence line and the next ```
        marker, with surrounding whitespace removed. If no ```json fence line
        is present, the input is returned unchanged.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        # Compare the stripped line so an indented fence or one followed by
        # trailing spaces is still recognised.
        if line.strip() == "```json":
            remainder = "\n".join(lines[i + 1:])  # everything after the opening fence
            return remainder.split("```")[0].strip()  # cut at the closing fence
    return json_output

def inference(
    prompt,
    image_path: str, # Now accepts a file path (string)
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 4096,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28,
):
    """
    Run the vision-language model on one image and return its text response.

    Args:
        prompt: Instruction text sent together with the image.
        image_path: Path of the image file to process.
        system_prompt: Accepted for interface compatibility; this function
          does not insert it into the messages.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_pixels: Forwarded in the image entry of the message.
        max_pixels: Forwarded in the image entry of the message.
    Returns:
        tuple: (response text, input_width, input_height). The two sizes are
        plain ints derived from the processor's image_grid_thw entry.
    """
    messages = [
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'image',
                    'image' : image_path, # Pass the file path directly
                    'min_pixels' : min_pixels,
                    'max_pixels' : max_pixels,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    ]

    # Preparation for inference
    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )
    print('input:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # The model is loaded with device_map='auto', so follow its actual device
    # instead of assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    streamer = TextStreamer(processor.tokenizer, skip_special_tokens = True, skip_prompt = True)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens, streamer = streamer)
    # Drop the prompt tokens so that only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )

    print('output:\n', output_text[0])

    # Grid entries are indexed as [_, rows, cols]; multiplying by 14 yields the
    # size in pixels (14 is presumably the vision patch size -- TODO confirm).
    # Converted to ints so callers receive plain numbers rather than tensors.
    grid = inputs['image_grid_thw'][0]
    input_height = int(grid[1]) * 14
    input_width = int(grid[2]) * 14

    return output_text[0], input_width, input_height

def ocr_and_display(image: Image.Image):
    """
    Gradio callback: run OCR on an uploaded image.

    Args:
        image: The uploaded PIL image, or None when nothing was uploaded.
    Returns:
        tuple: (image with bounding boxes or None, extracted text or message).
        Always exactly two values, matching the two output components wired
        to the button's click handler.
    """
    if image is None:
        return None, "Please upload an image."

    # Reserve a temporary file name and close the handle before PIL writes to
    # the path, so the file is not opened twice at the same time.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
        image_path = tmp_file.name

    try:
        image.save(image_path)

        prompt = 'Spotting all the text in the image with line-level, and output in JSON format.'
        ocr_response, input_width, input_height = inference(prompt, image_path)

        # Work on a copy so the uploaded image is left untouched. The copy is
        # still returned if decoding or drawing fails below.
        image_with_boxes = image.copy()
        try:
            image_with_boxes = plot_text_bounding_boxes(image_with_boxes, ocr_response, input_width, input_height)

            # Extract plain text from the OCR response
            parsed_boxes = ast.literal_eval(parse_json(ocr_response))
            extracted_text = "\n".join([item.get('text_content', '') for item in parsed_boxes if 'text_content' in item])
        except Exception as e:
            extracted_text = f"Error parsing OCR response: {e}\nRaw response:\n{ocr_response}"

        return image_with_boxes, extracted_text

    finally:
        # Clean up the temporary file
        os.remove(image_path)

# Build the Gradio interface: an image upload with a button on the left, the
# annotated result on the right, and the extracted text in a row underneath.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen2.5-VL OCR Interface
        Upload an image to perform OCR and visualize the detected text with bounding boxes.
        """
    )
    with gr.Row():
        with gr.Column():
            # type="pil" makes Gradio hand the upload to the callback as a PIL image.
            image_input = gr.Image(type="pil", label="Upload Image")
            ocr_button = gr.Button("Perform OCR")
        with gr.Column():
            # Removed the separate "Original Image" display as the input already serves this purpose
            image_ocr = gr.Image(label="OCR with Bounding Boxes")
    with gr.Row():
        ocr_text_output = gr.Textbox(label="Extracted Text", lines=10)

    # Clicking the button calls ocr_and_display with the uploaded image; its
    # return values are routed, in order, to the two output components.
    ocr_button.click(
        ocr_and_display,
        inputs=[image_input],
        outputs=[image_ocr, ocr_text_output] # Updated outputs
    )

# Start the app.
demo.launch()

## v2 - ok

In [None]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET
import gradio as gr
import os
import tempfile
from pdf2image import convert_from_path # Import pdf2image

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont, ImageColor

# Checkpoint identifier handed to both from_pretrained calls below.
model_path = 'Qwen/Qwen2.5-VL-3B-Instruct'

# Load the vision-language model; device placement is left to device_map='auto'.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, device_map = 'auto')

# The matching processor is loaded from the same checkpoint.
processor = AutoProcessor.from_pretrained(model_path)

def plot_text_bounding_boxes(image: Image.Image, bounding_boxes, input_width, input_height):
    """
    Draw OCR bounding boxes and their text onto a PIL image.

    Args:
        image: The PIL Image object; it is drawn on in place.
        bounding_boxes: Raw model response (optionally ```json fenced) that
          decodes to a list of dicts. Each dict holds "bbox_2d" as a 4-item
          list read as [x1, y1, x2, y2] in the input_width x input_height
          coordinate space, and optionally "text_content".
        input_width: Width of the coordinate space of the boxes.
        input_height: Height of the coordinate space of the boxes.
    Returns:
        PIL.Image.Image: The image with bounding boxes drawn.
    """
    img_w, img_h = image.size
    canvas = ImageDraw.Draw(image)

    # Remove the markdown fencing before decoding.
    parsed_json_str = parse_json(bounding_boxes)

    # Use the CJK font when it can be loaded, otherwise PIL's default font.
    try:
        font_size = max(10, min(int(img_h / 30), int(img_w / 60)))
        font = ImageFont.truetype('./00_Dataset/NotoSansCJK-Regular.ttc', size = font_size)
    except IOError:
        print("Warning: NotoSansCJK-Regular.ttc not found. Using default font.")
        font = ImageFont.load_default()

    # Decode the payload; anything that is not a list is treated as "no boxes".
    try:
        bounding_boxes_list = ast.literal_eval(parsed_json_str)
    except (ValueError, SyntaxError) as e:
        print(f"Error evaluating bounding boxes string: {e}")
        print(f"Problematic string for bounding boxes: {parsed_json_str[:500]}...")
        bounding_boxes_list = []
    else:
        if not isinstance(bounding_boxes_list, list):
            print(f"Warning: Expected a list from ast.literal_eval, but got {type(bounding_boxes_list)}. Raw parsed JSON: {parsed_json_str}")
            bounding_boxes_list = []

    outline = 'green'
    for bounding_box in bounding_boxes_list:
        well_formed = (
            isinstance(bounding_box, dict)
            and 'bbox_2d' in bounding_box
            and isinstance(bounding_box['bbox_2d'], list)
            and len(bounding_box['bbox_2d']) == 4
        )
        if not well_formed:
            print(f"Skipping malformed bounding box entry: {bounding_box}")
            continue

        coords = bounding_box["bbox_2d"]
        # Rescale from the model-input coordinate space to image pixels.
        top = int(coords[1] / input_height * img_h)
        left = int(coords[0] / input_width * img_w)
        bottom = int(coords[3] / input_height * img_h)
        right = int(coords[2] / input_width * img_w)

        # Order each axis so the first corner is the top-left one.
        left, right = sorted((left, right))
        top, bottom = sorted((top, bottom))

        canvas.rectangle(((left, top), (right, bottom)), outline = outline, width = 2)

        if 'text_content' in bounding_box:
            canvas.text((left, bottom), bounding_box['text_content'], fill = outline, font = font)

    return image

def parse_json(json_output):
    """
    Extract the payload of a ```json fenced block from model output.

    Args:
        json_output: Raw text returned by the model.
    Returns:
        str: The text between the first ```json fence line and the next ```
        marker, with surrounding whitespace removed. If no ```json fence line
        is present, the input is returned unchanged.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        # Compare the stripped line so an indented fence or one followed by
        # trailing spaces is still recognised.
        if line.strip() == "```json":
            remainder = "\n".join(lines[i + 1:])  # everything after the opening fence
            return remainder.split("```")[0].strip()  # cut at the closing fence
    return json_output

def inference(
    prompt,
    image_path: str,
    system_prompt = 'You are a helpful assistant',
    max_new_tokens = 4096,
    min_pixels = 512 * 28 * 28,
    max_pixels = 2048 * 28 * 28,
):
    """
    Run the vision-language model on one image and return its text response.

    Args:
        prompt: Instruction text sent together with the image.
        image_path: Path of the image file to process.
        system_prompt: Accepted for interface compatibility; this function
          does not insert it into the messages.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_pixels: Forwarded in the image entry of the message.
        max_pixels: Forwarded in the image entry of the message.
    Returns:
        tuple: (response text, input_width, input_height). The two sizes are
        plain ints derived from the processor's image_grid_thw entry.
    """
    messages = [
        {
            'role' : 'user',
            'content' : [
                {
                    'type' : 'image',
                    'image' : image_path,
                    'min_pixels' : min_pixels,
                    'max_pixels' : max_pixels,
                },
                {'type' : 'text', 'text' : prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize = False, add_generation_prompt = True
    )
    print('Input to model:\n', text) # Changed for clarity
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text = [text],
        images = image_inputs,
        videos = video_inputs,
        padding = True,
        return_tensors = 'pt',
    )
    # The model is loaded with device_map='auto', so follow its actual device
    # instead of assuming a CUDA device is present.
    inputs = inputs.to(model.device)

    streamer = TextStreamer(processor.tokenizer, skip_special_tokens = True, skip_prompt = True)

    generated_ids = model.generate(**inputs, max_new_tokens = max_new_tokens, streamer = streamer)
    # Drop the prompt tokens so that only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens = True, clean_up_tokenization_spaces = False
    )

    print('Output from model:\n', output_text[0]) # Changed for clarity

    # Grid entries are indexed as [_, rows, cols]; multiplying by 14 yields the
    # size in pixels (14 is presumably the vision patch size -- TODO confirm).
    # Converted to ints so callers receive plain numbers rather than tensors.
    grid = inputs['image_grid_thw'][0]
    input_height = int(grid[1]) * 14
    input_width = int(grid[2]) * 14

    return output_text[0], input_width, input_height

def process_pdf_for_ocr(file):
    """
    Gradio callback: run OCR on every page of an uploaded PDF.

    Args:
        file: The uploaded file object (its path is read from `.name`), or
          None when nothing was uploaded.
    Returns:
        tuple: (original page images, page images with bounding boxes, text of
        all pages joined by blank lines). On a fatal error the two lists are
        empty and the third element carries the error message.
    """
    if file is None:
        return [], [], "Please upload a PDF file."

    pdf_path = file.name
    prompt = 'Spotting all the text in the image with line-level, and output in JSON format.'

    with tempfile.TemporaryDirectory() as scratch_dir:
        page_texts, annotated_pages, source_pages = [], [], []

        try:
            # pdf2image needs poppler; pass poppler_path to convert_from_path
            # if the binaries are not on the system PATH.
            for i, page_image in enumerate(convert_from_path(pdf_path)):
                # Keep the untouched page for the "original" gallery.
                source_pages.append(page_image)

                # inference() takes a path, so write the page to the scratch directory.
                page_png = os.path.join(scratch_dir, f"page_{i}.png")
                page_image.save(page_png, format="PNG")

                ocr_response, input_width, input_height = inference(prompt, page_png)

                page_extracted_text = f"--- Page {i+1} ---\n"
                annotated = page_image.copy()  # draw on a copy, never on the original

                try:
                    parsed_json_str = parse_json(ocr_response)
                    decoded = ast.literal_eval(parsed_json_str)

                    if not isinstance(decoded, list):
                        # Anything other than a list is an unexpected response format.
                        page_extracted_text += f"OCR response was not a list of text objects. Raw:\n{parsed_json_str}"
                        print(f"Warning: OCR response for page {i+1} was not a list: {parsed_json_str}")
                    else:
                        # Expected format: a list of dicts, some with 'text_content'.
                        page_extracted_text += "\n".join([
                            entry['text_content']
                            for entry in decoded
                            if isinstance(entry, dict) and 'text_content' in entry
                        ])

                        # Boxes are drawn only once the response decoded to a list.
                        annotated = plot_text_bounding_boxes(
                            annotated, ocr_response, input_width, input_height
                        )

                except (ValueError, SyntaxError) as e:
                    # Raised by ast.literal_eval when the payload is not a valid literal.
                    page_extracted_text += f"Error parsing OCR JSON for page {i+1}: {e}\nRaw response (after fence removal):\n{parsed_json_str}"
                    print(f"Error parsing OCR JSON for page {i+1}: {e}\nRaw response (after fence removal):\n{parsed_json_str}")
                except Exception as e:
                    # Any other failure while extracting text for this page.
                    page_extracted_text += f"An unexpected error occurred during OCR text extraction for page {i+1}: {e}"
                    print(f"An unexpected error occurred during OCR text extraction for page {i+1}: {e}")

                page_texts.append(page_extracted_text)
                annotated_pages.append(annotated)

        except Exception as e:
            # Conversion, saving or inference failed outright.
            print(f"Fatal Error processing PDF: {e}")
            return [], [], f"Fatal Error processing PDF: {e}"

        return source_pages, annotated_pages, "\n\n".join(page_texts)

# Build the Gradio interface: a PDF upload with a button on the left, two
# galleries (original pages and annotated pages) on the right, and the
# extracted text in a row underneath.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen2.5-VL PDF OCR Interface
        Upload a PDF file to perform OCR on each page and visualize the detected text with bounding boxes.
        """
    )
    with gr.Row():
        with gr.Column():
            # Only .pdf files are offered by the upload widget.
            pdf_input = gr.File(label="Upload PDF File", file_types=[".pdf"])
            ocr_button = gr.Button("Perform OCR")
        with gr.Column():
            original_pages_gallery = gr.Gallery(
                label="Original PDF Pages",
                show_label=True,
                elem_id="gallery",
                columns=2,
                rows=2,
                object_fit="contain",
                height="auto",
            )
            ocr_pages_gallery = gr.Gallery(
                label="OCR with Bounding Boxes (Each Page)",
                show_label=True,
                elem_id="gallery_ocr",
                columns=2,
                rows=2,
                object_fit="contain",
                height="auto",
            )
    with gr.Row():
        ocr_text_output = gr.Textbox(label="Extracted Text (All Pages)", lines=20)

    # Clicking the button calls process_pdf_for_ocr with the uploaded file; its
    # three return values are routed, in order, to the three output components.
    ocr_button.click(
        process_pdf_for_ocr,
        inputs=[pdf_input],
        outputs=[original_pages_gallery, ocr_pages_gallery, ocr_text_output]
    )

# Start the app.
demo.launch()

## v3 - ok

In [None]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET
import gradio as gr
import os
import tempfile
from pdf2image import convert_from_path # Import pdf2image
import gc # Import garbage collection module

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info
from PIL import Image, ImageDraw, ImageFont, ImageColor

# --- Model Loading (Moved to top level) ---
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

try:
    # A failure in either call is handled below and leaves both names as None.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        'Qwen/Qwen2.5-VL-3B-Instruct', device_map='auto'
    )
    processor = AutoProcessor.from_pretrained('Qwen/Qwen2.5-VL-3B-Instruct')
except Exception as e:
    print(f"Error loading model or processor: {e}")
    print("Please ensure 'Qwen/Qwen2.5-VL-3B-Instruct' is accessible and properly configured.")
    # Downstream code checks for None before running inference.
    model = None
    processor = None
else:
    print("Model and Processor loaded successfully.")

# --- Helper Functions ---
def plot_text_bounding_boxes(image: Image.Image, bounding_boxes_str: str, input_width, input_height):
    """
    Draw OCR bounding boxes and their text onto a PIL image.

    Args:
        image: The PIL Image to draw on (modified in place), or None.
        bounding_boxes_str: Raw model response (optionally ```json fenced)
          that decodes to a list of dicts. Each dict holds "bbox_2d" as
          [x1, y1, x2, y2] in the input_width x input_height coordinate space,
          and optionally "text_content".
        input_width: Width of the coordinate space of the boxes.
        input_height: Height of the coordinate space of the boxes.
    Returns:
        The same image object with annotations drawn, or None when image is
        None. The image is returned unannotated when the input size is
        missing/zero or the response cannot be decoded into a list.
    """
    if image is None:
        return None

    # inference() reports 0 for both sizes when it could not determine them;
    # scaling by zero would raise ZeroDivisionError, so skip drawing instead.
    if not input_width or not input_height:
        print("Warning: Model input size is unknown; skipping bounding box drawing.")
        return image

    width, height = image.size
    draw = ImageDraw.Draw(image)

    parsed_json_str = parse_json(bounding_boxes_str)

    # Use the CJK font when the file exists and loads, else PIL's default font.
    font_path = './00_Dataset/NotoSansCJK-Regular.ttc'
    try:
        if os.path.exists(font_path):
            font_size = max(10, min(int(height / 30), int(width / 60)))
            font = ImageFont.truetype(font_path, size=font_size)
        else:
            print(f"Warning: Font file '{font_path}' not found. Using default font.")
            font = ImageFont.load_default()
    except IOError:
        print(f"Warning: Error loading font from '{font_path}'. Using default font.")
        font = ImageFont.load_default()

    # Decode the payload; anything that is not a list is treated as "no boxes".
    try:
        bounding_boxes_list = ast.literal_eval(parsed_json_str)
        if not isinstance(bounding_boxes_list, list):
            print(f"Warning: Expected a list from ast.literal_eval, but got {type(bounding_boxes_list)}. Raw parsed JSON: {parsed_json_str[:200]}...")
            bounding_boxes_list = []
    except (ValueError, SyntaxError) as e:
        print(f"Error evaluating bounding boxes string: {e}")
        print(f"Problematic string for bounding boxes: {parsed_json_str[:500]}...")
        bounding_boxes_list = []

    color = 'green'
    for bounding_box in bounding_boxes_list:
        coords = bounding_box.get('bbox_2d') if isinstance(bounding_box, dict) else None
        if not isinstance(coords, (list, tuple)) or len(coords) != 4:
            print(f"Skipping malformed bounding box entry: {bounding_box}")
            continue

        # Rescale from the model-input coordinate space to image pixels.
        abs_y1 = int(coords[1] / input_height * height)
        abs_x1 = int(coords[0] / input_width * width)
        abs_y2 = int(coords[3] / input_height * height)
        abs_x2 = int(coords[2] / input_width * width)

        # Order each axis so the first corner is the top-left one.
        abs_x1, abs_x2 = min(abs_x1, abs_x2), max(abs_x1, abs_x2)
        abs_y1, abs_y2 = min(abs_y1, abs_y2), max(abs_y1, abs_y2)

        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=2
        )

        if 'text_content' in bounding_box:
            # str() keeps drawing from failing on non-string values such as numbers.
            draw.text((abs_x1, abs_y2), str(bounding_box['text_content']), fill=color, font=font)

    return image

def parse_json(json_output: str) -> str:
    """
    Return the payload of the first ```json fenced block in the text.

    The payload runs from the line after the fence up to the next ``` marker
    and is stripped of surrounding whitespace. Text without a ```json fence
    line is returned unchanged.
    """
    rows = json_output.splitlines()
    fence_at = next(
        (idx for idx, row in enumerate(rows) if row.strip() == "```json"),
        None,
    )
    if fence_at is None:
        return json_output

    after_fence = "\n".join(rows[fence_at + 1:])
    payload, _, _ = after_fence.partition("```")
    return payload.strip()

def inference(
    prompt: str,
    image_path: str,
    system_prompt: str = 'You are a helpful assistant', # Kept as requested
    max_new_tokens: int = 4096,
    min_pixels: int = 512 * 28 * 28,
    max_pixels: int = 2048 * 28 * 28,
):
    """
    Run the vision-language model on one image.

    Returns a tuple (response text, input_width, input_height). When the
    model or processor failed to load, the text is an error message and both
    sizes are 0. The sizes also stay 0 when image_grid_thw is missing or not
    in the expected form. system_prompt is accepted but not placed in the
    messages by this function.
    """
    if model is None or processor is None:
        return "Error: Model not loaded.", 0, 0

    image_entry = {
        'type': 'image',
        'image': image_path,
        'min_pixels': min_pixels,
        'max_pixels': max_pixels,
    }
    text_entry = {'type': 'text', 'text': prompt}
    messages = [{'role': 'user', 'content': [image_entry, text_entry]}]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print('Input to model:\n', text)

    image_inputs, video_inputs = process_vision_info(messages)
    encoded = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors='pt',
    )
    # Move every entry to the selected device, collecting them in a plain dict.
    inputs = {}
    for key, value in encoded.items():
        inputs[key] = value.to(device)

    streamer = TextStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)

    # Keep only the tokens produced after each prompt.
    generated_ids_trimmed = []
    for in_ids, out_ids in zip(inputs['input_ids'], generated_ids):
        generated_ids_trimmed.append(out_ids[len(in_ids) :])

    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    print('Output from model:\n', output_text[0])

    # Derive the size from the first grid entry: index 1 -> height, index 2 -> width.
    input_height = input_width = 0
    if 'image_grid_thw' in inputs and len(inputs['image_grid_thw']) > 0:
        first_grid = inputs['image_grid_thw'][0]
        usable = (
            isinstance(first_grid, torch.Tensor)
            and first_grid.dim() == 1
            and first_grid.shape[0] >= 3
        )
        if usable:
            input_height = first_grid[1].item() * 14
            input_width = first_grid[2].item() * 14
        else:
            print(f"Warning: Unexpected format for image_grid_thw: {inputs['image_grid_thw']}")

    return output_text[0], input_width, input_height

def process_document(file, resolution: int):
    """
    Handles PDF or Image input: processes it for OCR and returns results.

    Args:
        file: The uploaded file, either an object exposing its path as `.name`
          or a plain path string; None when nothing was uploaded.
        resolution: DPI passed to convert_from_path for PDFs. Not used for
          direct image uploads.
    Returns:
        tuple: (original page images, page images with bounding boxes,
        plain text of all pages, raw OCR output of all pages, status message).
        On a fatal error both lists are empty and the three strings all carry
        the error message.
    """
    if file is None:
        return [], [], "", "", "Please upload a file (PDF or Image)."

    # Accept both upload objects (path stored in .name) and bare path strings.
    input_file_path = getattr(file, "name", file)
    file_extension = os.path.splitext(input_file_path)[1].lower()

    all_ocr_texts_plain = []
    all_ocr_texts_full = []
    all_ocr_images = []
    original_images = []
    status_message = "Processing complete."

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            if file_extension == '.pdf':
                print(f"Processing PDF with DPI: {resolution}")
                images_from_doc = convert_from_path(input_file_path, dpi=resolution)
            elif file_extension in ['.png', '.jpg', '.jpeg']:
                print(f"Processing Image: {input_file_path}")
                # convert() returns an independent image, so the file handle
                # opened by Image.open can be closed right away.
                with Image.open(input_file_path) as source_image:
                    images_from_doc = [source_image.convert("RGB")] # Ensure RGB for consistency
            else:
                return [], [], "", "", "Unsupported file type. Please upload a PDF or an image (PNG, JPG)."

            if not images_from_doc:
                return [], [], "No content found in document.", "No content found in document.", "No content found in document."

            for i, pil_image_orig in enumerate(images_from_doc):
                original_images.append(pil_image_orig)

                # inference() takes a path, so write the page to the temp directory.
                temp_image_path = os.path.join(temp_dir, f"page_{i}.png")
                pil_image_orig.save(temp_image_path, format="PNG")

                prompt = 'Spotting all the text in the image with line-level, and output in JSON format.'
                ocr_response, input_width, input_height = inference(prompt, temp_image_path)

                page_label = f"--- Page {i+1}" if file_extension == '.pdf' else "--- Image"
                page_extracted_text_plain = f"{page_label} (Plain Text) ---\n"
                page_extracted_text_full = f"{page_label} (Raw OCR Output) ---\n"
                current_page_image_with_boxes = pil_image_orig.copy()  # draw on a copy

                page_extracted_text_full += ocr_response + "\n\n"

                try:
                    parsed_json_str = parse_json(ocr_response)
                    parsed_boxes = ast.literal_eval(parsed_json_str)

                    if isinstance(parsed_boxes, list):
                        texts_on_page = []
                        for item in parsed_boxes:
                            if isinstance(item, dict) and 'text_content' in item:
                                texts_on_page.append(item['text_content'])
                        page_extracted_text_plain += "\n".join(texts_on_page)

                        # Boxes are drawn only once the response decoded to a list.
                        current_page_image_with_boxes = plot_text_bounding_boxes(
                            current_page_image_with_boxes, ocr_response, input_width, input_height
                        )
                    else:
                        page_extracted_text_plain += f"OCR response was not a list of text objects. Raw:\n{parsed_json_str}"
                        print(f"Warning: OCR response for {page_label} was not a list: {parsed_json_str}")

                except (ValueError, SyntaxError) as e:
                    # Raised by ast.literal_eval when the payload is not a valid literal.
                    page_extracted_text_plain += f"Error parsing OCR JSON for {page_label}: {e}\nRaw response (after fence removal):\n{parsed_json_str}"
                    print(f"Error parsing OCR JSON for {page_label}: {e}\nRaw response (after fence removal):\n{parsed_json_str}")
                except Exception as e:
                    page_extracted_text_plain += f"An unexpected error occurred during OCR text extraction for {page_label}: {e}"
                    print(f"An unexpected error occurred during OCR text extraction for {page_label}: {e}")

                all_ocr_texts_plain.append(page_extracted_text_plain)
                all_ocr_texts_full.append(page_extracted_text_full)
                all_ocr_images.append(current_page_image_with_boxes)

        except Exception as e:
            error_msg = f"Fatal Error processing file: {e}"
            print(error_msg)
            return [], [], error_msg, error_msg, error_msg

    return original_images, all_ocr_images, "\n\n".join(all_ocr_texts_plain), "\n\n".join(all_ocr_texts_full), status_message

def clear_vram():
    """
    Release unused GPU memory held by PyTorch.

    Returns:
        str: "VRAM Cleared!" when a CUDA device is available, otherwise
        "No CUDA device found to clear VRAM.".
    """
    if torch.cuda.is_available():
        # Collect garbage first so that tensors only kept alive by reference
        # cycles are freed, then release the cached blocks they occupied.
        gc.collect()
        torch.cuda.empty_cache()
        print("CUDA VRAM cache emptied and garbage collected.")
        return "VRAM Cleared!"
    else:
        return "No CUDA device found to clear VRAM."

# --- Gradio Interface ---
# Layout: result tabs on top (image galleries, plain text, raw model output),
# upload/settings controls and the VRAM reset row underneath.
with gr.Blocks(title="Qwen2.5-VL PDF/Image OCR Interface") as demo:
    gr.Markdown(
        """
        # Qwen2.5-VL PDF/Image OCR Interface
        Upload a PDF file or an image to perform OCR and visualize the detected text with bounding boxes.
        Adjust the PDF conversion resolution for better OCR results.
        """
    )

    # --- Tabs at the very top ---
    with gr.Tabs():
        with gr.TabItem("Preview Images"):
            gr.Markdown("### Original vs. OCR'd Pages")
            with gr.Row():
                original_pages_gallery = gr.Gallery(
                    label="Original Document Pages",
                    show_label=True,
                    elem_id="gallery_original",
                    columns=2,
                    rows=2,
                    object_fit="contain",
                    height="auto",
                )
                ocr_pages_gallery = gr.Gallery(
                    label="OCR with Bounding Boxes",
                    show_label=True,
                    elem_id="gallery_ocr",
                    columns=2,
                    rows=2,
                    object_fit="contain",
                    height="auto",
                )
        with gr.TabItem("Extracted Plain Text"):
            gr.Markdown("### Plain Text Output")
            ocr_text_plain_output = gr.Textbox(
                label="Extracted Text (All Pages/Image)", lines=20, interactive=False
            )
        with gr.TabItem("Complete OCR Output (JSON/Raw)"):
            gr.Markdown("### Raw Model Output with Bounding Box Data")
            ocr_text_full_output = gr.Textbox(
                label="Complete OCR Output (Includes Bounding Box JSON)",
                lines=20,
                interactive=False,
            )

    # --- Upload and Settings at the bottom ---
    with gr.Row():
        with gr.Column():
            with gr.Group():
                gr.Markdown("### 1. Upload Document")
                # Updated file_types to accept images as well
                pdf_image_input = gr.File(label="Upload PDF or Image File", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
                gr.Markdown("### 2. OCR Settings")
                # The slider value is passed to process_document as `resolution`.
                resolution_slider = gr.Slider(
                    minimum=72, maximum=600, value=200, step=10,
                    label="PDF to Image Resolution (DPI)",
                    info="Higher DPI can improve OCR accuracy but uses more memory/time. (Ignored for direct image uploads)"
                )
                ocr_button = gr.Button("Perform OCR", variant="primary")
            gr.Markdown("---")
            with gr.Row():
                vram_reset_button = gr.Button("Reset VRAM", variant="secondary")
                # NOTE(review): this box also receives process_document's status
                # message (see the click wiring below), not only VRAM messages.
                vram_status_output = gr.Textbox(label="VRAM Status", interactive=False, max_lines=1)

    # Event Listener for processing: the five return values of process_document
    # are routed, in order, to the five output components listed here.
    ocr_button.click(
        process_document, # Changed to the new function name
        inputs=[pdf_image_input, resolution_slider],
        outputs=[original_pages_gallery, ocr_pages_gallery, ocr_text_plain_output, ocr_text_full_output, vram_status_output]
    )

    # Event Listener for VRAM reset
    vram_reset_button.click(
        clear_vram,
        inputs=[],
        outputs=[vram_status_output]
    )

# Start the app.
demo.launch()

## v4 - ok

In [None]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET
import gradio as gr
import os
import tempfile
from pdf2image import convert_from_path # Used for PDF to image conversion
import gc # Import garbage collection module

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info # Assuming this file is available
from PIL import Image, ImageDraw, ImageFont, ImageColor

# --- Model Loading (module level) ---
# Single checkpoint identifier shared by the model and its processor.
MODEL_ID = 'Qwen/Qwen2.5-VL-3B-Instruct'

# Torch device selected from CUDA availability.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

try:
    # IMPORTANT: the checkpoint named by MODEL_ID must be accessible from this environment.
    # On Kaggle, this typically means adding it as a dataset.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map='auto')
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    print("Model and Processor loaded successfully.")
except Exception as load_error:
    # Best-effort start-up: report the failure and leave None sentinels behind;
    # inference() checks for them and returns an error string instead of crashing.
    print(f"Error loading model or processor: {load_error}")
    print(f"Please ensure '{MODEL_ID}' is accessible and properly configured.")
    model = None
    processor = None

# --- Helper Functions ---
def plot_text_bounding_boxes(image: Image.Image, bounding_boxes_str: str, input_width, input_height):
    """
    Draw the OCR boxes described by a model response onto an image.

    Args:
        image: PIL image to annotate. It is drawn on in place and also returned.
        bounding_boxes_str: Raw model response. Markdown fencing is removed with
            parse_json(); the remainder is expected to be a list of dicts, each with a
            4-element 'bbox_2d' ([x1, y1, x2, y2]) and optionally 'text_content'.
        input_width: Width of the image as seen by the model. Box x values are
            rescaled from this width to the real image width.
        input_height: Height counterpart of input_width.

    Returns:
        The annotated image, or None when no image was supplied. The image is returned
        without boxes when the model input size is missing/non-positive or the response
        cannot be parsed into a list.
    """
    if not image:
        return None

    # Coordinates are divided by the model input size below; inference() reports 0 when
    # it cannot determine that size, which would otherwise raise ZeroDivisionError.
    if not input_width or not input_height or input_width <= 0 or input_height <= 0:
        print(f"Warning: Invalid model input size ({input_width}x{input_height}). Skipping bounding boxes.")
        return image

    width, height = image.size
    draw = ImageDraw.Draw(image)
    color = 'green'

    parsed_json_str = parse_json(bounding_boxes_str)

    # This assumes NotoSansCJK-Regular.ttc is available at ./00_Dataset/ (e.g. a mounted
    # Kaggle dataset); otherwise PIL's default font is used.
    font_path = './00_Dataset/NotoSansCJK-Regular.ttc'
    font = None
    if os.path.exists(font_path):
        try:
            # Scale the font with the image so labels stay readable, but never below 10.
            font_size = max(10, min(int(height / 30), int(width / 60)))
            font = ImageFont.truetype(font_path, size=font_size)
        except IOError:
            print(f"Warning: Error loading font from '{font_path}'. Using default font.")
    else:
        print(f"Warning: Font file '{font_path}' not found. Using default font.")
    if font is None:
        font = ImageFont.load_default()

    try:
        try:
            # The model is asked for JSON, so parse it as JSON first: this accepts the
            # literals true/false/null that ast.literal_eval rejects.
            bounding_boxes_list = json.loads(parsed_json_str)
        except ValueError:
            # Fall back for Python-literal style output (e.g. single-quoted strings).
            bounding_boxes_list = ast.literal_eval(parsed_json_str)
        if not isinstance(bounding_boxes_list, list):
            print(f"Warning: Expected a list from ast.literal_eval, but got {type(bounding_boxes_list)}. Raw parsed JSON: {parsed_json_str[:200]}...")
            bounding_boxes_list = []
    except (ValueError, SyntaxError) as e:
        print(f"Error evaluating bounding boxes string: {e}")
        print(f"Problematic string for bounding boxes: {parsed_json_str[:500]}...")
        bounding_boxes_list = []

    for bounding_box in bounding_boxes_list:
        bbox = bounding_box.get('bbox_2d') if isinstance(bounding_box, dict) else None
        is_valid_bbox = (
            isinstance(bbox, list)
            and len(bbox) == 4
            # bool is a subclass of int; reject it explicitly so True/False are not coordinates.
            and all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in bbox)
        )
        if not is_valid_bbox:
            print(f"Skipping malformed bounding box entry: {bounding_box}")
            continue

        x1, y1, x2, y2 = bbox
        try:
            # Rescale from model-input pixels to the real image size.
            abs_x1 = int(x1 / input_width * width)
            abs_y1 = int(y1 / input_height * height)
            abs_x2 = int(x2 / input_width * width)
            abs_y2 = int(y2 / input_height * height)
        except (ValueError, OverflowError):
            # NaN / infinite coordinates cannot be converted to pixel positions.
            print(f"Skipping malformed bounding box entry: {bounding_box}")
            continue

        # Ensure coordinates are in correct order
        abs_x1, abs_x2 = min(abs_x1, abs_x2), max(abs_x1, abs_x2)
        abs_y1, abs_y2 = min(abs_y1, abs_y2), max(abs_y1, abs_y2)

        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=2
        )

        text_content = bounding_box.get('text_content')
        if text_content is not None:
            # The label is anchored at the bottom-left corner of the box.
            draw.text((abs_x1, abs_y2), str(text_content), fill=color, font=font)

    return image

def parse_json(json_output: str) -> str:
    """
    Remove markdown code fencing from a model response.

    The text after the first line consisting of a ```json fence (matched
    case-insensitively) is returned, cut at the next ``` and stripped. When there is no
    such line but the first non-blank line is a bare ``` fence, that fence is treated
    the same way. Input without an opening fence is returned unchanged.

    Args:
        json_output: Raw model output, possibly wrapped in a fenced code block.

    Returns:
        The fenced payload, or the original string when no opening fence is found.
    """
    lines = json_output.splitlines()
    stripped = [line.strip() for line in lines]

    # Prefer an explicit ```json fence wherever it appears.
    fence_index = next(
        (i for i, content in enumerate(stripped) if content.lower() == "```json"), None
    )
    if fence_index is None:
        # A bare fence only counts as an opening fence when nothing precedes it;
        # otherwise it is most likely a closing fence of unfenced output.
        first_content = next((i for i, content in enumerate(stripped) if content), None)
        if first_content is not None and stripped[first_content] == "```":
            fence_index = first_content

    if fence_index is None:
        return json_output

    payload = "\n".join(lines[fence_index + 1:])
    return payload.split("```")[0].strip()

def inference(
    prompt: str,
    image_path: str,
    system_prompt = 'You are a helpful assistant', # Kept as requested
    max_new_tokens: int = 4096,
    min_pixels: int = 512 * 28 * 28,
    max_pixels: int = 2048 * 28 * 28,
):
    """
    Run the vision-language model on one image with one text prompt.

    Args:
        prompt: Text instruction sent together with the image.
        image_path: Image reference placed in the message's 'image' field.
        system_prompt: Accepted for interface compatibility; this function does not
            add it to the messages it builds.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_pixels: 'min_pixels' value attached to the image entry.
        max_pixels: 'max_pixels' value attached to the image entry.

    Returns:
        A tuple (output_text, input_width, input_height). The two sizes are derived from
        the processor's 'image_grid_thw' entry and stay 0 when it is missing or has an
        unexpected layout. When the model or processor failed to load, returns
        ("Error: Model not loaded.", 0, 0).
    """
    if model is None or processor is None:
        return "Error: Model not loaded.", 0, 0

    messages = [
        {
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': image_path,
                    'min_pixels': min_pixels,
                    'max_pixels': max_pixels,
                },
                {'type': 'text', 'text': prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    print('Input to model:\n', text)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors='pt',
    )
    # The model is loaded with device_map='auto', so follow the device it was actually
    # placed on rather than a separately computed global device.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # Streams the generated text to stdout while generation is running.
    streamer = TextStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)
    # Drop the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    print('Output from model:\n', output_text[0])

    # Grid cells are converted to pixels with a factor of 14
    # (presumably the vision patch size -- TODO confirm against the processor config).
    pixels_per_grid_cell = 14
    input_height = 0
    input_width = 0
    grid_thw = inputs.get('image_grid_thw')
    if grid_thw is not None and len(grid_thw) > 0:
        first_grid = grid_thw[0]
        if isinstance(first_grid, torch.Tensor) and first_grid.dim() == 1 and first_grid.shape[0] >= 3:
            # Index 1 is used as the height and index 2 as the width of the first image.
            input_height = first_grid[1].item() * pixels_per_grid_cell
            input_width = first_grid[2].item() * pixels_per_grid_cell
        else:
            print(f"Warning: Unexpected format for image_grid_thw: {grid_thw}")

    return output_text[0], input_width, input_height


def process_file_for_ocr(file, resolution: int):
    """
    Run OCR on an uploaded PDF or image and collect the per-page results.

    PDFs are rendered to one image per page at the given DPI; image files are treated
    as a single page. Every page is saved to a temporary PNG, passed to inference(),
    and the response is parsed into plain text and drawn as bounding boxes.

    Args:
        file: The upload, either a path string or an object exposing the path as
            `.name`. None means nothing was uploaded.
        resolution: DPI used when rendering PDF pages (ignored for image files).

    Returns:
        A 5-tuple (original_images, ocr_images, plain_text, full_text, status_message).
        On a fatal error both image lists are empty and the three strings carry the
        error message.
    """
    if file is None:
        return [], [], "", "", "Please upload a PDF or image file."

    # Accept both a plain path string and an object carrying the path in `.name`.
    input_file_path = getattr(file, "name", file)
    file_extension = os.path.splitext(input_file_path)[1].lower()

    all_ocr_texts_plain = []
    all_ocr_texts_full = [] # To store the complete JSON/raw output
    all_ocr_images = []
    original_images = []
    status_message = "Processing complete."

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            images_to_process = []
            if file_extension == '.pdf':
                print(f"Processing PDF with DPI: {resolution}")
                images_to_process = convert_from_path(input_file_path, dpi=resolution)
            elif file_extension in ['.png', '.jpg', '.jpeg', '.webp', '.bmp', '.tiff', '.tif']: # Common image formats
                print(f"Processing image file: {input_file_path}")
                # For images, we just have one "page"
                try:
                    # Close the source file deterministically; convert() returns an
                    # independent RGB copy that stays usable afterwards.
                    with Image.open(input_file_path) as source_image:
                        pil_image = source_image.convert("RGB")
                    images_to_process = [pil_image]
                except Exception as img_e:
                    raise ValueError(f"Could not open image file. Is it a valid image? Error: {img_e}")
            else:
                return [], [], "", "", f"Unsupported file type: {file_extension}. Please upload a PDF or a common image format (.png, .jpg, .jpeg, etc.)."

            if not images_to_process:
                return [], [], "No content found in file.", "No content found in file.", "No content found in file."

            for i, pil_image_orig in enumerate(images_to_process):
                original_images.append(pil_image_orig)

                # inference() takes an image path, so each page goes through a temp PNG.
                temp_image_path = os.path.join(temp_dir, f"page_{i}.png")
                pil_image_orig.save(temp_image_path, format="PNG")

                prompt = 'Spotting all the text in the image with line-level, and output in JSON format.'
                ocr_response, input_width, input_height = inference(prompt, temp_image_path)

                # --- Robust Parsing for OCR Text and Bounding Boxes ---
                page_extracted_text_plain = f"--- Page {i+1} (Plain Text) ---\n"
                page_extracted_text_full = f"--- Page {i+1} (Raw OCR Output) ---\n"
                current_page_image_with_boxes = pil_image_orig.copy() # Make a copy for drawing

                page_extracted_text_full += ocr_response + "\n\n" # Store raw output

                # Pre-set so the error branches below can always report it.
                parsed_json_str = ""
                try:
                    parsed_json_str = parse_json(ocr_response)
                    try:
                        # Parse as JSON first: it accepts true/false/null, which
                        # ast.literal_eval rejects.
                        parsed_boxes = json.loads(parsed_json_str)
                    except ValueError:
                        # Fall back for Python-literal style output.
                        parsed_boxes = ast.literal_eval(parsed_json_str)

                    if isinstance(parsed_boxes, list):
                        # This is the expected format: a list of dictionaries
                        texts_on_page = [
                            # str() keeps the join working if the model emits a non-string value.
                            str(item['text_content'])
                            for item in parsed_boxes
                            if isinstance(item, dict) and 'text_content' in item
                        ]
                        page_extracted_text_plain += "\n".join(texts_on_page)

                        # Plot bounding boxes only if parsing was successful and we have a list
                        current_page_image_with_boxes = plot_text_bounding_boxes(
                            current_page_image_with_boxes, ocr_response, input_width, input_height
                        )
                    else:
                        # If it's not a list, it's an unexpected format from the model
                        page_extracted_text_plain += f"OCR response was not a list of text objects. Raw:\n{parsed_json_str}"
                        print(f"Warning: OCR response for page {i+1} was not a list: {parsed_json_str}")

                except (ValueError, SyntaxError) as e:
                    # Neither parser accepted the response.
                    page_extracted_text_plain += f"Error parsing OCR JSON for page {i+1}: {e}\nRaw response (after fence removal):\n{parsed_json_str}"
                    print(f"Error parsing OCR JSON for page {i+1}: {e}\nRaw response (after fence removal):\n{parsed_json_str}")
                except Exception as e:
                    # Catch any other unexpected errors during text extraction
                    page_extracted_text_plain += f"An unexpected error occurred during OCR text extraction for page {i+1}: {e}"
                    print(f"An unexpected error occurred during OCR text extraction for page {i+1}: {e}")

                all_ocr_texts_plain.append(page_extracted_text_plain)
                all_ocr_texts_full.append(page_extracted_text_full)
                all_ocr_images.append(current_page_image_with_boxes)

        except Exception as e:
            error_msg = f"Fatal Error processing file: {e}" # Changed "PDF" to "file"
            print(error_msg)
            return [], [], error_msg, error_msg, error_msg

        return original_images, all_ocr_images, "\n\n".join(all_ocr_texts_plain), "\n\n".join(all_ocr_texts_full), status_message

def clear_vram():
    """
    Release cached CUDA memory held by PyTorch.

    Returns:
        A status string: "VRAM Cleared!" when a CUDA device is available, otherwise
        "No CUDA device found to clear VRAM.".
    """
    if torch.cuda.is_available():
        # Collect garbage first: tensors that are only kept alive by unreachable Python
        # objects must be freed before emptying the cache can release their memory.
        gc.collect()
        torch.cuda.empty_cache()
        print("CUDA VRAM cache emptied and garbage collected.")
        return "VRAM Cleared!"
    else:
        return "No CUDA device found to clear VRAM."

# --- Gradio Interface ---
# Builds the UI: result tabs first, then a row with upload, OCR settings and VRAM
# controls, then the click handlers. Components are laid out in creation order.
with gr.Blocks(title="Qwen2.5-VL PDF OCR Interface") as demo:
    gr.Markdown(
        """
        # Qwen2.5-VL PDF OCR Interface
        Upload a PDF or image file to perform OCR and visualize the detected text with bounding boxes.
        Adjust the PDF conversion resolution for better OCR results.
        """
    )

    # Output tabs (image previews, plain text, raw model output) are created first.
    with gr.Tabs():
        with gr.TabItem("Preview Images"):
            gr.Markdown("### Original vs. OCR'd Pages")
            with gr.Row():
                # Receives the first value returned by process_file_for_ocr (the input pages).
                original_pages_gallery = gr.Gallery(
                    label="Original File Pages/Images",
                    show_label=True,
                    elem_id="gallery_original",
                    columns=2,
                    rows=2,
                    object_fit="contain",
                    height="auto",
                )
                # Receives the second value (page copies with bounding boxes drawn).
                ocr_pages_gallery = gr.Gallery(
                    label="OCR with Bounding Boxes",
                    show_label=True,
                    elem_id="gallery_ocr",
                    columns=2,
                    rows=2,
                    object_fit="contain",
                    height="auto",
                )
        with gr.TabItem("Extracted Plain Text"):
            gr.Markdown("### Plain Text Output")
            # Receives the joined per-page plain text.
            ocr_text_plain_output = gr.Textbox(
                label="Extracted Text (All Pages)", lines=20, interactive=False
            )
        with gr.TabItem("Complete OCR Output (JSON/Raw)"):
            gr.Markdown("### Raw Model Output with Bounding Box Data")
            # Receives the joined per-page raw model responses.
            ocr_text_full_output = gr.Textbox(
                label="Complete OCR Output (Includes Bounding Box JSON)",
                lines=20,
                interactive=False,
            )

    # Upload, settings and VRAM controls, created after the output tabs.
    with gr.Row():
        # Column 1: Upload File
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 1. Upload File")
                # The extension list matches the formats process_file_for_ocr accepts.
                pdf_image_input = gr.File(
                    label="Upload PDF or Image File",
                    file_types=[".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff", ".tif"]
                )
    
        # Column 2: OCR Settings
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 2. OCR Settings")
                # DPI passed to process_file_for_ocr as `resolution`.
                resolution_slider = gr.Slider(
                    minimum=72, maximum=600, value=200, step=10,
                    label="PDF to Image Resolution (DPI)",
                    info="Applies only to PDFs. Higher DPI can improve OCR accuracy but uses more memory/time.",
                )
                ocr_button = gr.Button("Perform OCR", variant="primary")
    
        # Column 3: VRAM Reset & Status
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 3. VRAM Control")
                vram_reset_button = gr.Button("Reset VRAM", variant="secondary")
                # Doubles as the general status line: it is also the last output of the OCR handler.
                vram_status_output = gr.Textbox(
                    label="VRAM Status",
                    interactive=False,
                    max_lines=1
                )

    # Event Listeners
    # The five outputs are filled, in order, from the 5-tuple returned by process_file_for_ocr.
    ocr_button.click(
        process_file_for_ocr, # Handles both PDF and image uploads
        inputs=[pdf_image_input, resolution_slider],
        outputs=[original_pages_gallery, ocr_pages_gallery, ocr_text_plain_output, ocr_text_full_output, vram_status_output]
    )

    # clear_vram takes no inputs and returns the status string shown in the status box.
    vram_reset_button.click(
        clear_vram,
        inputs=[],
        outputs=[vram_status_output]
    )

# Start the Gradio app.
demo.launch()

## v5 - ok

In [1]:
import torch
import json
import random
import io
import ast
import xml.etree.ElementTree as ET
import gradio as gr
import os
import tempfile
from pdf2image import convert_from_path # Used for PDF to image conversion
import gc # Import garbage collection module
import time # Import the time module

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoTokenizer,
    AutoProcessor,
    BitsAndBytesConfig,
    TextStreamer
)
from qwen_vl_utils import process_vision_info # Assuming this file is available
from PIL import Image, ImageDraw, ImageFont, ImageColor

# --- Model Loading (module level) ---
# Single checkpoint identifier shared by the model and its processor.
MODEL_ID = 'Qwen/Qwen2.5-VL-3B-Instruct'

# Torch device selected from CUDA availability.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

try:
    # IMPORTANT: the checkpoint named by MODEL_ID must be accessible from this environment.
    # On Kaggle, this typically means adding it as a dataset.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, device_map='auto')
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    print("Model and Processor loaded successfully.")
except Exception as load_error:
    # Best-effort start-up: report the failure and leave None sentinels behind;
    # inference() checks for them and returns an error string instead of crashing.
    print(f"Error loading model or processor: {load_error}")
    print(f"Please ensure '{MODEL_ID}' is accessible and properly configured.")
    model = None
    processor = None

# --- Helper Functions ---
def plot_text_bounding_boxes(image: Image.Image, bounding_boxes_str: str, input_width, input_height):
    """
    Draw the OCR boxes described by a model response onto an image.

    Args:
        image: PIL image to annotate. It is drawn on in place and also returned.
        bounding_boxes_str: Raw model response. Markdown fencing is removed with
            parse_json(); the remainder is expected to be a list of dicts, each with a
            4-element 'bbox_2d' ([x1, y1, x2, y2]) and optionally 'text_content'.
        input_width: Width of the image as seen by the model. Box x values are
            rescaled from this width to the real image width.
        input_height: Height counterpart of input_width.

    Returns:
        The annotated image, or None when no image was supplied. The image is returned
        without boxes when the model input size is missing/non-positive or the response
        cannot be parsed into a list.
    """
    if not image:
        return None

    # Coordinates are divided by the model input size below; inference() reports 0 when
    # it cannot determine that size, which would otherwise raise ZeroDivisionError.
    if not input_width or not input_height or input_width <= 0 or input_height <= 0:
        print(f"Warning: Invalid model input size ({input_width}x{input_height}). Skipping bounding boxes.")
        return image

    width, height = image.size
    draw = ImageDraw.Draw(image)
    color = 'green'

    parsed_json_str = parse_json(bounding_boxes_str)

    # This assumes NotoSansCJK-Regular.ttc is available at ./00_Dataset/ (e.g. a mounted
    # Kaggle dataset); otherwise PIL's default font is used.
    font_path = './00_Dataset/NotoSansCJK-Regular.ttc'
    font = None
    if os.path.exists(font_path):
        try:
            # Scale the font with the image so labels stay readable, but never below 10.
            font_size = max(10, min(int(height / 30), int(width / 60)))
            font = ImageFont.truetype(font_path, size=font_size)
        except IOError:
            print(f"Warning: Error loading font from '{font_path}'. Using default font.")
    else:
        print(f"Warning: Font file '{font_path}' not found. Using default font.")
    if font is None:
        font = ImageFont.load_default()

    try:
        try:
            # The model is asked for JSON, so parse it as JSON first: this accepts the
            # literals true/false/null that ast.literal_eval rejects.
            bounding_boxes_list = json.loads(parsed_json_str)
        except ValueError:
            # Fall back for Python-literal style output (e.g. single-quoted strings).
            bounding_boxes_list = ast.literal_eval(parsed_json_str)
        if not isinstance(bounding_boxes_list, list):
            print(f"Warning: Expected a list from ast.literal_eval, but got {type(bounding_boxes_list)}. Raw parsed JSON: {parsed_json_str[:200]}...")
            bounding_boxes_list = []
    except (ValueError, SyntaxError) as e:
        print(f"Error evaluating bounding boxes string: {e}")
        print(f"Problematic string for bounding boxes: {parsed_json_str[:500]}...")
        bounding_boxes_list = []

    for bounding_box in bounding_boxes_list:
        bbox = bounding_box.get('bbox_2d') if isinstance(bounding_box, dict) else None
        is_valid_bbox = (
            isinstance(bbox, list)
            and len(bbox) == 4
            # bool is a subclass of int; reject it explicitly so True/False are not coordinates.
            and all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in bbox)
        )
        if not is_valid_bbox:
            print(f"Skipping malformed bounding box entry: {bounding_box}")
            continue

        x1, y1, x2, y2 = bbox
        try:
            # Rescale from model-input pixels to the real image size.
            abs_x1 = int(x1 / input_width * width)
            abs_y1 = int(y1 / input_height * height)
            abs_x2 = int(x2 / input_width * width)
            abs_y2 = int(y2 / input_height * height)
        except (ValueError, OverflowError):
            # NaN / infinite coordinates cannot be converted to pixel positions.
            print(f"Skipping malformed bounding box entry: {bounding_box}")
            continue

        # Ensure coordinates are in correct order
        abs_x1, abs_x2 = min(abs_x1, abs_x2), max(abs_x1, abs_x2)
        abs_y1, abs_y2 = min(abs_y1, abs_y2), max(abs_y1, abs_y2)

        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=2
        )

        text_content = bounding_box.get('text_content')
        if text_content is not None:
            # The label is anchored at the bottom-left corner of the box.
            draw.text((abs_x1, abs_y2), str(text_content), fill=color, font=font)

    return image

def parse_json(json_output: str) -> str:
    """
    Remove markdown code fencing from a model response.

    The text after the first line consisting of a ```json fence (matched
    case-insensitively) is returned, cut at the next ``` and stripped. When there is no
    such line but the first non-blank line is a bare ``` fence, that fence is treated
    the same way. Input without an opening fence is returned unchanged.

    Args:
        json_output: Raw model output, possibly wrapped in a fenced code block.

    Returns:
        The fenced payload, or the original string when no opening fence is found.
    """
    lines = json_output.splitlines()
    stripped = [line.strip() for line in lines]

    # Prefer an explicit ```json fence wherever it appears.
    fence_index = next(
        (i for i, content in enumerate(stripped) if content.lower() == "```json"), None
    )
    if fence_index is None:
        # A bare fence only counts as an opening fence when nothing precedes it;
        # otherwise it is most likely a closing fence of unfenced output.
        first_content = next((i for i, content in enumerate(stripped) if content), None)
        if first_content is not None and stripped[first_content] == "```":
            fence_index = first_content

    if fence_index is None:
        return json_output

    payload = "\n".join(lines[fence_index + 1:])
    return payload.split("```")[0].strip()

def inference(
    prompt: str,
    image_path: str,
    system_prompt = 'You are a helpful assistant', # Kept as requested
    max_new_tokens: int = 4096,
    min_pixels: int = 512 * 28 * 28,
    max_pixels: int = 2048 * 28 * 28,
):
    """
    Run the vision-language model on one image with one text prompt.

    Args:
        prompt: Text instruction sent together with the image.
        image_path: Image reference placed in the message's 'image' field.
        system_prompt: Accepted for interface compatibility; this function does not
            add it to the messages it builds.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_pixels: 'min_pixels' value attached to the image entry.
        max_pixels: 'max_pixels' value attached to the image entry.

    Returns:
        A tuple (output_text, input_width, input_height). The two sizes are derived from
        the processor's 'image_grid_thw' entry and stay 0 when it is missing or has an
        unexpected layout. When the model or processor failed to load, returns
        ("Error: Model not loaded.", 0, 0).
    """
    if model is None or processor is None:
        return "Error: Model not loaded.", 0, 0

    messages = [
        {
            'role': 'user',
            'content': [
                {
                    'type': 'image',
                    'image': image_path,
                    'min_pixels': min_pixels,
                    'max_pixels': max_pixels,
                },
                {'type': 'text', 'text': prompt},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors='pt',
    )
    # The model is loaded with device_map='auto', so follow the device it was actually
    # placed on rather than a separately computed global device.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # Streams the generated text to stdout while generation is running.
    streamer = TextStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)
    # Drop the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # Grid cells are converted to pixels with a factor of 14
    # (presumably the vision patch size -- TODO confirm against the processor config).
    pixels_per_grid_cell = 14
    input_height = 0
    input_width = 0
    grid_thw = inputs.get('image_grid_thw')
    if grid_thw is not None and len(grid_thw) > 0:
        first_grid = grid_thw[0]
        if isinstance(first_grid, torch.Tensor) and first_grid.dim() == 1 and first_grid.shape[0] >= 3:
            # Index 1 is used as the height and index 2 as the width of the first image.
            input_height = first_grid[1].item() * pixels_per_grid_cell
            input_width = first_grid[2].item() * pixels_per_grid_cell
        else:
            print(f"Warning: Unexpected format for image_grid_thw: {grid_thw}")

    return output_text[0], input_width, input_height


def process_file_for_ocr(file, resolution: int):
    """
    Run OCR on an uploaded PDF or image and collect the per-page results with timing.

    PDFs are rendered to one image per page at the given DPI; image files are treated
    as a single page. Every page is saved to a temporary PNG, passed to inference(),
    and the response is parsed into plain text and drawn as bounding boxes.

    Args:
        file: The upload, either a path string or an object exposing the path as
            `.name`. None means nothing was uploaded.
        resolution: DPI used when rendering PDF pages (ignored for image files).

    Returns:
        A 6-tuple (original_images, ocr_images, plain_text, full_text, status_message,
        time_taken). time_taken is "Time: <seconds> s" after processing (including a
        fatal error) and "" for the early exits (no file, unsupported type, no content).
    """
    if file is None:
        return [], [], "", "", "Please upload a PDF or image file.", ""

    start_time = time.time() # Start time measurement

    # Accept both a plain path string and an object carrying the path in `.name`.
    input_file_path = getattr(file, "name", file)
    file_extension = os.path.splitext(input_file_path)[1].lower()

    all_ocr_texts_plain = []
    all_ocr_texts_full = [] # To store the complete JSON/raw output
    all_ocr_images = []
    original_images = []
    status_message = "Processing complete."
    time_taken = ""

    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            images_to_process = []
            if file_extension == '.pdf':
                print(f"Processing PDF with DPI: {resolution}")
                images_to_process = convert_from_path(input_file_path, dpi=resolution)
            elif file_extension in ['.png', '.jpg', '.jpeg', '.webp', '.bmp', '.tiff', '.tif']: # Common image formats
                print(f"Processing image file: {input_file_path}")
                try:
                    # Close the source file deterministically; convert() returns an
                    # independent RGB copy that stays usable afterwards.
                    with Image.open(input_file_path) as source_image:
                        pil_image = source_image.convert("RGB")
                    images_to_process = [pil_image]
                except Exception as img_e:
                    raise ValueError(f"Could not open image file. Is it a valid image? Error: {img_e}")
            else:
                return [], [], "", "", f"Unsupported file type: {file_extension}. Please upload a PDF or a common image format (.png, .jpg, .jpeg, etc.).", ""

            if not images_to_process:
                return [], [], "No content found in file.", "No content found in file.", "No content found in file.", ""

            for i, pil_image_orig in enumerate(images_to_process):
                original_images.append(pil_image_orig)

                # inference() takes an image path, so each page goes through a temp PNG.
                temp_image_path = os.path.join(temp_dir, f"page_{i}.png")
                pil_image_orig.save(temp_image_path, format="PNG")

                prompt = 'Spotting all the text in the image with line-level, and output in JSON format.'
                ocr_response, input_width, input_height = inference(prompt, temp_image_path)

                # --- Robust Parsing for OCR Text and Bounding Boxes ---
                page_extracted_text_plain = f"--- Page {i+1} (Plain Text) ---\n"
                page_extracted_text_full = f"--- Page {i+1} (Raw OCR Output) ---\n"
                current_page_image_with_boxes = pil_image_orig.copy() # Make a copy for drawing

                page_extracted_text_full += ocr_response + "\n\n" # Store raw output

                # Pre-set so the error branches below can always report it.
                parsed_json_str = ""
                try:
                    parsed_json_str = parse_json(ocr_response)
                    try:
                        # Parse as JSON first: it accepts true/false/null, which
                        # ast.literal_eval rejects.
                        parsed_boxes = json.loads(parsed_json_str)
                    except ValueError:
                        # Fall back for Python-literal style output.
                        parsed_boxes = ast.literal_eval(parsed_json_str)

                    if isinstance(parsed_boxes, list):
                        texts_on_page = [
                            # str() keeps the join working if the model emits a non-string value.
                            str(item['text_content'])
                            for item in parsed_boxes
                            if isinstance(item, dict) and 'text_content' in item
                        ]
                        page_extracted_text_plain += "\n".join(texts_on_page)

                        current_page_image_with_boxes = plot_text_bounding_boxes(
                            current_page_image_with_boxes, ocr_response, input_width, input_height
                        )
                    else:
                        page_extracted_text_plain += f"OCR response was not a list of text objects. Raw:\n{parsed_json_str}"
                        print(f"Warning: OCR response for page {i+1} was not a list: {parsed_json_str}")

                except (ValueError, SyntaxError) as e:
                    # Neither parser accepted the response.
                    page_extracted_text_plain += f"Error parsing OCR JSON for page {i+1}: {e}\nRaw response (after fence removal):\n{parsed_json_str}"
                    print(f"Error parsing OCR JSON for page {i+1}: {e}\nRaw response (after fence removal):\n{parsed_json_str}")
                except Exception as e:
                    page_extracted_text_plain += f"An unexpected error occurred during OCR text extraction for page {i+1}: {e}"
                    print(f"An unexpected error occurred during OCR text extraction for page {i+1}: {e}")

                all_ocr_texts_plain.append(page_extracted_text_plain)
                all_ocr_texts_full.append(page_extracted_text_full)
                all_ocr_images.append(current_page_image_with_boxes)

        except Exception as e:
            error_msg = f"Fatal Error processing file: {e}"
            print(error_msg)
            # Ensure time_taken is still returned even on error
            end_time = time.time()
            time_taken = f"Time: {end_time - start_time:.2f} s"
            return [], [], error_msg, error_msg, error_msg, time_taken

    end_time = time.time()
    time_taken = f"Time: {end_time - start_time:.2f} s"
    return original_images, all_ocr_images, "\n\n".join(all_ocr_texts_plain), "\n\n".join(all_ocr_texts_full), status_message, time_taken

def clear_vram():
    """Release unused CUDA memory held by PyTorch's caching allocator.

    Garbage collection runs *before* the cache is emptied. Tensors that are
    only kept alive by reference cycles are freed by ``gc.collect()``; their
    memory is then returned to PyTorch's cache, and only a subsequent
    ``torch.cuda.empty_cache()`` can hand it back to the driver. Emptying the
    cache first would leave that memory reserved until the next call.

    Returns:
        tuple[str, str]: ``(status_message, "")``. The UI wiring in this file
        routes the first item to the VRAM status textbox and uses the empty
        string to reset the processing-time textbox.
    """
    if not torch.cuda.is_available():
        return "No CUDA device found to clear VRAM.", ""

    # Order matters: collect unreachable objects first so the blocks they
    # owned are already sitting in the cache when it is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    print("CUDA VRAM cache emptied and garbage collected.")
    return "VRAM Cleared!", ""  # Also clear the time counter on VRAM reset

# --- Gradio Interface ---
def _make_page_gallery(label, elem_id):
    """Build a 2x2 gallery using the settings shared by both page previews.

    Must be called inside the layout context the gallery should belong to.
    """
    return gr.Gallery(
        label=label,
        show_label=True,
        elem_id=elem_id,
        columns=2,
        rows=2,
        object_fit="contain",
        height="auto",
    )


# File extensions the upload widget accepts.
_ACCEPTED_FILE_TYPES = [".pdf", ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff", ".tif"]

with gr.Blocks(title="Qwen2.5-VL PDF OCR Interface") as demo:
    gr.Markdown(
        """
        # Qwen2.5-VL PDF OCR Interface
        Upload a PDF or image file to perform OCR and visualize the detected text with bounding boxes.
        Adjust the PDF conversion resolution for better OCR results.
        """
    )

    # ---- Results: three tabs rendered above the controls ----
    with gr.Tabs():
        with gr.TabItem("Preview Images"):
            gr.Markdown("### Original vs. OCR'd Pages")
            with gr.Row():
                # Left: untouched pages. Right: same pages with boxes drawn.
                original_pages_gallery = _make_page_gallery(
                    "Original File Pages/Images", "gallery_original"
                )
                ocr_pages_gallery = _make_page_gallery(
                    "OCR with Bounding Boxes", "gallery_ocr"
                )

        with gr.TabItem("Extracted Plain Text"):
            gr.Markdown("### Plain Text Output")
            ocr_text_plain_output = gr.Textbox(
                label="Extracted Text (All Pages)",
                lines=20,
                interactive=False,
            )

        with gr.TabItem("Complete OCR Output (JSON/Raw)"):
            gr.Markdown("### Raw Model Output with Bounding Box Data")
            ocr_text_full_output = gr.Textbox(
                label="Complete OCR Output (Includes Bounding Box JSON)",
                lines=20,
                interactive=False,
            )

    # ---- Controls: three equal-width columns below the results ----
    with gr.Row():
        # Column 1: file upload
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 1. Upload File")
                pdf_image_input = gr.File(
                    label="Upload PDF or Image File",
                    file_types=_ACCEPTED_FILE_TYPES,
                )

        # Column 2: OCR settings and the trigger button
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 2. OCR Settings")
                resolution_slider = gr.Slider(
                    minimum=72,
                    maximum=600,
                    value=200,
                    step=10,
                    label="PDF to Image Resolution (DPI)",
                    info="Applies only to PDFs. Higher DPI can improve OCR accuracy but uses more memory/time.",
                )
                ocr_button = gr.Button("Perform OCR", variant="primary")

        # Column 3: VRAM reset, status message and elapsed time
        with gr.Column(scale=1):
            gr.Markdown("---")  # Visual separator
            with gr.Row():
                vram_reset_button = gr.Button("Reset VRAM", variant="secondary")
                vram_status_output = gr.Textbox(
                    label="VRAM Status", interactive=False, max_lines=1
                )
                time_counter_output = gr.Textbox(
                    label="Processing Time", interactive=False, max_lines=1
                )

    # ---- Event wiring ----
    # The outputs list must stay in the same order as the six values returned
    # by process_file_for_ocr: original images, annotated images, plain text,
    # full text, status message, elapsed time.
    ocr_button.click(
        fn=process_file_for_ocr,
        inputs=[pdf_image_input, resolution_slider],
        outputs=[
            original_pages_gallery,
            ocr_pages_gallery,
            ocr_text_plain_output,
            ocr_text_full_output,
            vram_status_output,
            time_counter_output,
        ],
    )

    # clear_vram returns (status, ""); the empty string blanks the time box.
    vram_reset_button.click(
        fn=clear_vram,
        inputs=[],
        outputs=[vram_status_output, time_counter_output],
    )

demo.launch()

2025-07-28 07:32:59.707093: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753687979.729307    1428 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753687979.736075    1428 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Model and Processor loaded successfully.
* Running on local URL:  http://127.0.0.1:7860
It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://4da5878345e73d4893.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Processing PDF with DPI: 300
```json
{
    "line_1": "Sreejesh/Aditya",
    "line_2": "",
    "line_3": "THE LOOK (MACAO COMMERCIAL OFFSHORE) COM",
    "line_4": "PANYI LIMITED",
    "line_5": "UNIT 1703-1704 17/F",
    "line_6": "AIA TOWER",
    "line_7": "AVENIDA COMERCIAL DE MACAU NOS 251A - 301",
    "line_8": "MACAU 999999",
    "line_9": "MACAO",
    "line_10": "Phone: 853-85920822",
    "line_11": "",
    "line_12": "For Account Of",
    "line_13": "adidas Emerging Markets FZE",
    "line_14": "c/o adidas International Trading AG",
    "line_15": "Platz 1b",
    "line_16": "Root D4 6039",
    "line_17": "SWITZERLAND",
    "line_18": "Phone: +41 41 450 04 00",
    "line_19": "",
    "line_20": "Actual Manufacturer",
    "line_21": "PT NIKOMAS GEMILANG",
    "line_22": "KEL TAMBAK",
    "line_23": "KEC KIBIN",
    "line_24": "JALAN RAYA SERANG KM.71",
    "line_25": "Serang,BT.00000",
    "line_26": "INDONESIA",
    "line_27": "",
    "line_28": "Consignee",
    "line_29": "adidas