In [2]:
# %pip installs into the environment of the running kernel (a bare !pip may target a different Python).
# Pinned to the gradio release recorded in this cell's output so a fresh run resolves the same version.
%pip install gradio==5.37.0

Collecting gradio
  Downloading gradio-5.37.0-py3-none-any.whl.metadata (16 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.10.4 (from gradio)
  Downloading gradio_client-1.10.4-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting huggingface-hub>=0.28.1 (from gradio)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.18-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (41 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.12.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Col

In [None]:
import json
import re
import io
import gradio as gr
from PIL import Image
import traceback # Added for detailed error reporting

from google import genai
from google.genai import types
from pydantic import BaseModel # Import BaseModel for BoundingBox

# Helper class to represent a bounding box (from user's reference)
class BoundingBox(BaseModel):
    """
    Pydantic model describing one labelled bounding box.

    get_bounding_boxes_gradio uses this model twice: as the structured-output
    schema (``list[BoundingBox]``) sent to Gemini, and to validate every item
    of the JSON reply.

    Attributes:
        box_2d (list[int]): Box coordinates. The detection prompt requests the
            order [y_min, x_min, y_max, x_max], normalized to the 0-1000 range;
            this model enforces neither the length nor the range of the list.
        label (str): Descriptive label of the object or text region inside the box.
    """
    box_2d: list[int]
    label: str


# --- Modified Functions for Gradio Compatibility ---

def _infer_mime_type_and_convert_to_bytes(image_pil: Image.Image):
    """
    Infers the MIME type and converts a PIL Image object to bytes.
    Returns (bytes, mime_type).
    """
    img_byte_arr = io.BytesIO()
    mime_type = "image/jpeg" # Default
    format = "JPEG"

    # Check if image has an alpha channel (RGBA) or is a PNG-like format
    if image_pil.mode in ('RGBA', 'LA') or (image_pil.mode == 'P' and 'transparency' in image_pil.info):
        mime_type = "image/png"
        format = "PNG"
    
    try:
        # Save the image to the BytesIO object in the determined format
        image_pil.save(img_byte_arr, format=format)
        img_byte_arr.seek(0) # Rewind to the beginning
        return img_byte_arr.getvalue(), mime_type
    except Exception as e:
        print(f"Error converting PIL image to bytes: {e}")
        return None, None # Return None if conversion fails


def analyze_image_text_readability_gradio(image_pil: Image.Image, project_id: str, location: str) -> dict:
    """
    Asks Gemini (through Vertex AI) whether the text visible in an image is legible.

    Args:
        image_pil: Image to inspect; ``None`` short-circuits with an error dict.
        project_id: Google Cloud project passed to ``genai.Client``.
        location: Vertex AI location passed to ``genai.Client``.

    Returns:
        On success, the parsed model reply containing 'is_readable' and 'reason'.
        Otherwise a dict with an 'error' message, accompanied by 'api_response'
        (reply parsed but a field is missing) or 'raw_response_text' (reply is
        not valid JSON). Exceptions raised during client setup or the API call
        are caught, printed with a traceback, and reported as an 'error' dict.
    """
    if image_pil is None:
        return {"error": "No image provided for readability check."}

    payload, payload_mime = _infer_mime_type_and_convert_to_bytes(image_pil)
    if payload is None:
        return {"error": "Failed to convert image for API call."}

    print(f"DEBUG: Readability - Image bytes length: {len(payload)}, MIME type: {payload_mime}")

    try:
        gemini = genai.Client(vertexai=True, project=project_id, location=location)

        # Part construction has its own handler so its failure is reported distinctly.
        try:
            picture = types.Part.from_bytes(data=payload, mime_type=payload_mime)
        except Exception as part_err:
            print(f"ERROR: types.Part.from_bytes failed: {part_err}")
            traceback.print_exc()
            return {"error": f"Failed to create image part for API: {part_err}"}

        instruction = (
            "Analyze the text within this image. Provide a JSON object with two fields: "
            "'is_readable' (boolean, true if the text is clearly legible and easily understandable, false otherwise) "
            "and 'reason' (string, explaining why the text is or isn't readable, including specific details like "
            "blurriness, font size, lighting, or obstruction if applicable). Focus solely on the text's readability."
        )
        request = [types.Content(role="user", parts=[{"text": instruction}, picture])]

        # Harm categories whose filtering is switched off for this request.
        relaxed_categories = (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        )
        reply_schema = {
            "type": "OBJECT",
            "properties": {
                "is_readable": {"type": "BOOLEAN"},
                "reason": {"type": "STRING"}
            },
            "required": ["is_readable", "reason"]
        }
        settings = types.GenerateContentConfig(
            temperature=1,
            top_p=1,
            seed=0,
            max_output_tokens=65535,
            safety_settings=[
                types.SafetySetting(category=name, threshold="OFF")
                for name in relaxed_categories
            ],
            thinking_config=types.ThinkingConfig(thinking_budget=-1),
            response_mime_type="application/json",
            response_schema=reply_schema,
        )

        # The reply is streamed; only chunks that carry text contribute to the result.
        stream = gemini.models.generate_content_stream(
            model="gemini-2.5-flash",
            contents=request,
            config=settings,
        )
        reply_text = "".join(piece.text for piece in stream if piece.text)

        if not reply_text:
            return {"error": "No response text received from Gemini API."}

        try:
            decoded = json.loads(reply_text)
        except json.JSONDecodeError as e:
            return {
                "error": f"Failed to parse JSON response from API: {e}",
                "raw_response_text": reply_text
            }

        if "is_readable" in decoded and "reason" in decoded:
            return decoded
        return {
            "error": "Unexpected JSON structure from API. Missing 'is_readable' or 'reason'.",
            "api_response": decoded
        }

    except Exception as e:
        # Anything unexpected (client setup, network, response handling) ends up here.
        print(f"ERROR: An error occurred during API call or client setup: {e}")
        traceback.print_exc()
        return {"error": f"An error occurred during API call or client setup: {e}"}


def analyze_image_rotation_gradio(image_pil: Image.Image, project_id: str, location: str) -> dict:
    """
    Asks Gemini (through Vertex AI) whether an image must be rotated to be upright.

    Args:
        image_pil: Image to inspect; ``None`` short-circuits with an error dict.
        project_id: Google Cloud project passed to ``genai.Client``.
        location: Vertex AI location passed to ``genai.Client``.

    Returns:
        On success, the parsed model reply with 'should_rotate', 'rotation_angle'
        and 'reason'. The response schema declares 'rotation_angle' as a string
        enum ("0", "90", "180", "270"), and the prompt asks for a clockwise
        angle. Otherwise a dict with an 'error' message, plus 'api_response' or
        'raw_response_text' where applicable. Exceptions raised during client
        setup or the API call are caught and reported as an 'error' dict.
    """
    if image_pil is None:
        return {"error": "No image provided for rotation analysis."}

    payload, payload_mime = _infer_mime_type_and_convert_to_bytes(image_pil)
    if payload is None:
        return {"error": "Failed to convert image for API call (rotation analysis)."}

    print(f"DEBUG: Rotation - Image bytes length: {len(payload)}, MIME type: {payload_mime}")

    try:
        gemini = genai.Client(vertexai=True, project=project_id, location=location)

        # Part construction has its own handler so its failure is reported distinctly.
        try:
            picture = types.Part.from_bytes(data=payload, mime_type=payload_mime)
        except Exception as part_err:
            print(f"ERROR: types.Part.from_bytes failed for rotation analysis: {part_err}")
            traceback.print_exc()
            return {"error": f"Failed to create image part for rotation API: {part_err}"}

        instruction = (
            "Analyze the orientation of this image. If the image appears to be rotated or incorrectly oriented "
            "for typical viewing (e.g., text is sideways, objects are falling), indicate 'should_rotate' as true "
            "and suggest the **clockwise** 'rotation_angle' in degrees (from 0, 90, 180, or 270) needed to make it upright. "
            "If no rotation is needed, set 'should_rotate' to false and 'rotation_angle' to 0. "
            "Provide a brief 'reason' for your assessment.\n"
            "Respond only with a JSON object like: "
            "{'should_rotate': true/false, 'rotation_angle': 0/90/180/270, 'reason': '...'}"
        )
        request = [types.Content(role="user", parts=[{"text": instruction}, picture])]

        # Harm categories whose filtering is switched off for this request.
        relaxed_categories = (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        )
        reply_schema = {
            "type": "OBJECT",
            "properties": {
                "should_rotate": {"type": "BOOLEAN"},
                # The angle is constrained to a string enum, so it arrives as text.
                "rotation_angle": {"type": "STRING", "enum": ["0", "90", "180", "270"]},
                "reason": {"type": "STRING"}
            },
            "required": ["should_rotate", "rotation_angle", "reason"]
        }
        settings = types.GenerateContentConfig(
            temperature=0,
            top_p=1,
            seed=0,
            max_output_tokens=65535,
            safety_settings=[
                types.SafetySetting(category=name, threshold="OFF")
                for name in relaxed_categories
            ],
            thinking_config=types.ThinkingConfig(thinking_budget=-1),
            response_mime_type="application/json",
            response_schema=reply_schema,
        )

        # The reply is streamed; only chunks that carry text contribute to the result.
        stream = gemini.models.generate_content_stream(
            model="gemini-2.5-flash",
            contents=request,
            config=settings,
        )
        reply_text = "".join(piece.text for piece in stream if piece.text)

        if not reply_text:
            return {"error": "No response text received from Gemini API for rotation analysis."}

        try:
            decoded = json.loads(reply_text)
        except json.JSONDecodeError as e:
            return {
                "error": f"Failed to parse JSON response from API for rotation: {e}",
                "raw_response_text": reply_text
            }

        if "should_rotate" in decoded and "rotation_angle" in decoded and "reason" in decoded:
            return decoded
        return {
            "error": "Unexpected JSON structure from API for rotation. Missing 'should_rotate', 'rotation_angle', or 'reason'.",
            "api_response": decoded
        }

    except Exception as e:
        print(f"ERROR: An error occurred during rotation analysis API call: {e}")
        traceback.print_exc()
        return {"error": f"An error occurred during rotation analysis API call: {e}"}


def get_bounding_boxes_gradio(image_pil: Image.Image, project_id: str, location: str) -> list[BoundingBox]:
    """
    Asks Gemini (through Vertex AI) for bounding boxes of text regions / main objects.

    Args:
        image_pil: Image to inspect; ``None`` yields an empty list.
        project_id: Google Cloud project passed to ``genai.Client``.
        location: Vertex AI location passed to ``genai.Client``.

    Returns:
        The validated ``BoundingBox`` objects parsed from the model reply. Every
        failure path (image conversion, API error, empty reply, invalid JSON,
        validation error on any item) prints a diagnostic and returns ``[]``.
    """
    if image_pil is None:
        return []

    payload, payload_mime = _infer_mime_type_and_convert_to_bytes(image_pil)
    if payload is None:
        print("ERROR: Failed to convert image for bounding box detection API call.")
        return []

    print(f"DEBUG: Bounding Box Detection - Image bytes length: {len(payload)}, MIME type: {payload_mime}")

    try:
        gemini = genai.Client(vertexai=True, project=project_id, location=location)

        # Part construction has its own handler so its failure is reported distinctly.
        try:
            picture = types.Part.from_bytes(data=payload, mime_type=payload_mime)
        except Exception as part_err:
            print(f"ERROR: types.Part.from_bytes failed for bounding box detection: {part_err}")
            traceback.print_exc()
            return []

        instruction = (
            "Identify and return bounding boxes for significant text regions or main objects in this image. "
            "Provide the bounding boxes as a JSON array of objects, where each object has 'box_2d' (normalized coordinates [y_min, x_min, y_max, x_max] from 0 to 1000) "
            "and a descriptive 'label' for the object within the box."
            "Limit to 25 objects. If an object is present multiple times, give each object a unique label "
            "according to its distinct characteristics (colors, size, position, etc..)."
        )
        request = [types.Content(role="user", parts=[{"text": instruction}, picture])]

        # Harm categories whose filtering is switched off for this request.
        relaxed_categories = (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        )
        settings = types.GenerateContentConfig(
            temperature=0.5,
            top_p=1,
            seed=0,
            max_output_tokens=65535,
            safety_settings=[
                types.SafetySetting(category=name, threshold="OFF")
                for name in relaxed_categories
            ],
            thinking_config=types.ThinkingConfig(thinking_budget=-1),
            response_mime_type="application/json",
            # The pydantic model doubles as the structured-output schema.
            response_schema=list[BoundingBox],
        )

        # The reply is streamed; only chunks that carry text contribute to the result.
        stream = gemini.models.generate_content_stream(
            model="gemini-2.5-flash",
            contents=request,
            config=settings,
        )
        reply_text = "".join(piece.text for piece in stream if piece.text)

        if not reply_text:
            print("WARNING: No response text received from Gemini API for bounding box detection.")
            return []

        try:
            # Decode with json.loads, then validate item by item with model_validate;
            # a single invalid item discards the whole result.
            boxes = []
            for entry in json.loads(reply_text):
                boxes.append(BoundingBox.model_validate(entry))
            return boxes
        except json.JSONDecodeError as e:
            print(f"ERROR: Failed to parse JSON response from API for bounding boxes: {e}")
            print(f"Raw response text: {reply_text}")
            return []
        except Exception as e:
            # Pydantic validation errors (and any other non-JSON failure) land here.
            print(f"ERROR: Bounding box schema validation failed: {e}")
            traceback.print_exc()
            print(f"Raw response text: {reply_text}")
            return []

    except Exception as e:
        print(f"ERROR: An error occurred during bounding box detection API call: {e}")
        traceback.print_exc()
        return []


def analyze_image_enhancement_gradio(image_pil: Image.Image, project_id: str, location: str) -> dict:
    """
    Asks Gemini (through Vertex AI) which enhancements an image would benefit from.

    The response schema restricts suggestions to 'brighten', 'sharpen',
    'contrast change', 'upscale' and 'noise removal'.

    Args:
        image_pil: Image to inspect; ``None`` short-circuits with an error dict.
        project_id: Google Cloud project passed to ``genai.Client``.
        location: Vertex AI location passed to ``genai.Client``.

    Returns:
        On success, the parsed model reply whose 'suggestions' entry is a list.
        Otherwise a dict with an 'error' message, plus 'api_response' or
        'raw_response_text' where applicable. Exceptions raised during client
        setup or the API call are caught and reported as an 'error' dict.
    """
    if image_pil is None:
        return {"error": "No image provided for enhancement analysis."}

    payload, payload_mime = _infer_mime_type_and_convert_to_bytes(image_pil)
    if payload is None:
        return {"error": "Failed to convert image for API call (enhancement analysis)."}

    print(f"DEBUG: Enhancement - Image bytes length: {len(payload)}, MIME type: {payload_mime}")

    try:
        gemini = genai.Client(vertexai=True, project=project_id, location=location)

        # Part construction has its own handler so its failure is reported distinctly.
        try:
            picture = types.Part.from_bytes(data=payload, mime_type=payload_mime)
        except Exception as part_err:
            print(f"ERROR: types.Part.from_bytes failed for enhancement analysis: {part_err}")
            traceback.print_exc()
            return {"error": f"Failed to create image part for enhancement API: {part_err}"}

        instruction = (
            "Analyze this image and suggest necessary enhancements. "
            "Provide a list of one or more suggestions from: "
            "'brighten', 'sharpen', 'contrast change', 'upscale', 'noise removal'. "
            "If no enhancement is needed, return an empty list. "
            "Respond only with a JSON object containing a 'suggestions' key, which holds the list of strings."
            "Example: {'suggestions': ['brighten', 'noise removal']}"
        )
        request = [types.Content(role="user", parts=[{"text": instruction}, picture])]

        # Harm categories whose filtering is switched off for this request.
        relaxed_categories = (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        )
        # Structured output: an object holding an array of the allowed suggestion names.
        reply_schema = {
            "type": "OBJECT",
            "properties": {
                "suggestions": {
                    "type": "ARRAY",
                    "items": {
                        "type": "STRING",
                        "enum": ["brighten", "sharpen", "contrast change", "upscale", "noise removal"]
                    }
                }
            },
            "required": ["suggestions"]
        }
        settings = types.GenerateContentConfig(
            temperature=0.5,
            top_p=1,
            seed=0,
            max_output_tokens=65535,
            safety_settings=[
                types.SafetySetting(category=name, threshold="OFF")
                for name in relaxed_categories
            ],
            thinking_config=types.ThinkingConfig(thinking_budget=-1),
            response_mime_type="application/json",
            response_schema=reply_schema,
        )

        # The reply is streamed; only chunks that carry text contribute to the result.
        stream = gemini.models.generate_content_stream(
            model="gemini-2.5-flash",
            contents=request,
            config=settings,
        )
        reply_text = "".join(piece.text for piece in stream if piece.text)

        if not reply_text:
            return {"error": "No response text received from Gemini API for enhancement analysis."}

        try:
            decoded = json.loads(reply_text)
        except json.JSONDecodeError as e:
            return {
                "error": f"Failed to parse JSON response from API for enhancement: {e}",
                "raw_response_text": reply_text
            }

        if "suggestions" not in decoded or not isinstance(decoded["suggestions"], list):
            return {
                "error": "Unexpected JSON structure from API for enhancement. Missing 'suggestions' key or not a list.",
                "api_response": decoded
            }
        return decoded

    except Exception as e:
        print(f"ERROR: An error occurred during enhancement analysis API call: {e}")
        traceback.print_exc()
        return {"error": f"An error occurred during enhancement analysis API call: {e}"}


def extract_text_from_image_gradio(image_pil: Image.Image, project_id: str, location: str, keys_to_extract: list) -> dict:
    """
    Extracts the requested text fields from an image using Gemini (through Vertex AI).

    Every requested key is normalized into a schema-safe property name (spaces
    become underscores, remaining characters outside ``[a-zA-Z0-9_]`` are
    dropped, the result is lower-cased) for the structured-output schema. The
    model reply is then mapped back onto the caller's original key names.

    Args:
        image_pil: Image to read; ``None`` short-circuits with an error dict.
        project_id: Google Cloud project passed to ``genai.Client``.
        location: Vertex AI location passed to ``genai.Client``.
        keys_to_extract: Names of the fields to extract (strings). An empty
            list short-circuits with an error dict.

    Returns:
        On success, a dict keyed by the original entries of ``keys_to_extract``;
        fields missing from the reply default to "N/A". Otherwise a dict with an
        'error' message, plus 'api_response' (reply is not a JSON object) or
        'raw_response_text' (reply is not valid JSON) where applicable.
        Exceptions raised during client setup or the API call are caught,
        printed with a traceback, and reported as an 'error' dict.
    """
    if image_pil is None:
        return {"error": "No image provided for text extraction."}
    if not keys_to_extract:
        return {"error": "No keys provided for extraction."}

    img_bytes, mime_type = _infer_mime_type_and_convert_to_bytes(image_pil)
    if img_bytes is None:
        return {"error": "Failed to convert image for API call."}

    print(f"DEBUG: Extraction - Image bytes length: {len(img_bytes)}, MIME type: {mime_type}") # Debug print

    try:
        client = genai.Client(vertexai=True, project=project_id, location=location)
        model_name = "gemini-2.5-flash"

        # Part construction has its own handler so its failure is reported distinctly.
        try:
            image_part = types.Part.from_bytes(data=img_bytes, mime_type=mime_type)
        except Exception as e_part:
            print(f"ERROR: types.Part.from_bytes failed: {e_part}")
            traceback.print_exc() # Print full traceback here for detailed error
            return {"error": f"Failed to create image part for API: {e_part}"}

        prompt_text = (
            f"Extract the following information from this image and provide it as a JSON object: "
            f"{', '.join(keys_to_extract)}. If a piece of information is not found, use 'N/A' as its value. "
            "Ensure the keys in the JSON exactly match the requested information and their casing."
            "\nExample: {'Name': 'John Doe', 'Father\'s Name': 'N/A', 'PAN Number': 'ABCDE1234F', 'Date of birth': '1990-01-01'}"
        )

        contents = [
            types.Content(role="user", parts=[{"text": prompt_text}, image_part])
        ]

        # Original key -> schema property name, computed once and reused for the
        # schema, the 'required' list and the final mapping back to caller keys.
        schema_key_for = {
            key: re.sub(r'[^a-zA-Z0-9_]', '', key.replace(" ", "_")).lower()
            for key in keys_to_extract
        }

        response_schema_properties = {
            schema_key: {"type": "STRING"} for schema_key in schema_key_for.values()
        }

        # Constrain the date format whenever a key normalizes to 'date_of_birth',
        # regardless of how the caller capitalized it.
        if "date_of_birth" in response_schema_properties:
            response_schema_properties["date_of_birth"] = {"type": "STRING", "pattern": "^\\d{4}-\\d{2}-\\d{2}$|^N/A$"}

        generation_config = types.GenerateContentConfig(
            temperature=0.2, # Keep low for factual extraction
            top_p=1,
            seed=0,
            max_output_tokens=65535,
            safety_settings=[
                types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
            ],
            thinking_config=types.ThinkingConfig(thinking_budget=-1),
            response_mime_type="application/json",
            response_schema={
                "type": "OBJECT",
                "properties": response_schema_properties,
                # Taken from the property names so colliding keys are listed only once.
                "required": list(response_schema_properties)
            }
        )

        # The reply is streamed; only chunks that carry text contribute to the result.
        full_response_text = ""
        for chunk in client.models.generate_content_stream(
            model=model_name,
            contents=contents,
            config=generation_config,
        ):
            if chunk.text:
                full_response_text += chunk.text

        if not full_response_text:
            return {"error": "No structured text extracted from the image by Gemini API."}

        try:
            parsed_json = json.loads(full_response_text)
        except json.JSONDecodeError as e:
            return {
                "error": f"Failed to parse JSON response from API: {e}",
                "raw_response_text": full_response_text
            }

        # Valid JSON that is not an object cannot be mapped back to the requested keys.
        if not isinstance(parsed_json, dict):
            return {
                "error": "Unexpected JSON structure from API for extraction. Expected a JSON object.",
                "api_response": parsed_json
            }

        return {
            original_key: parsed_json.get(schema_key, "N/A")
            for original_key, schema_key in schema_key_for.items()
        }

    except Exception as e:
        print(f"ERROR: An error occurred during text extraction API call: {e}")
        traceback.print_exc() # Print full traceback here for detailed error
        return {"error": f"An error occurred during text extraction API call: {e}"}

    
def verify_extracted_data(extracted_text_json: dict, keys_to_verify: list, project_id: str, location: str) -> dict:
    """
    Asks Gemini (through Vertex AI) whether each requested field has a meaningful value.

    The extracted data is serialized into the prompt and the model answers
    'Yes' or 'No' per field. Keys are normalized into schema-safe property
    names (spaces become underscores, remaining characters outside
    ``[a-zA-Z0-9_]`` are dropped, the result is lower-cased) for the
    structured-output schema and mapped back to the caller's names afterwards.

    Args:
        extracted_text_json: Previously extracted data; must be a non-empty dict.
        keys_to_verify: Names of the fields to check (strings); must be non-empty.
        project_id: Google Cloud project passed to ``genai.Client``.
        location: Vertex AI location passed to ``genai.Client``.

    Returns:
        On success, a dict mapping each entry of ``keys_to_verify`` to the
        model's answer; keys absent from the reply default to "No". Otherwise a
        dict with an 'error' message, plus 'api_response' (reply is not a JSON
        object) or 'raw_response_text' (reply is not valid JSON) where
        applicable. Exceptions raised during client setup or the API call are
        caught, printed with a traceback, and reported as an 'error' dict.
    """
    if not extracted_text_json or not isinstance(extracted_text_json, dict):
        return {"error": "Invalid or no extracted JSON data provided for verification."}

    if not keys_to_verify:
        return {"error": "No keys provided for verification."}

    try:
        client = genai.Client(vertexai=True, project=project_id, location=location)
        model_name = "gemini-2.5-flash"

        extracted_json_str = json.dumps(extracted_text_json, indent=2)

        verification_prompt = (
            f"Given the following structured extracted data:\n\n-----{extracted_json_str}\n---\n\n"
            "For each of the following pieces of information, state if its value is meaningfully present "
            "and not 'N/A' or empty. Provide your answer as a JSON object "
            "where keys are the requested information and values are 'Yes' or 'No'.\n\n"
            f"Information to verify: {', '.join(keys_to_verify)}\n\n"
            "Example: {'Name': 'Yes', 'Address': 'No'}"
        )

        contents = [types.Content(role="user", parts=[{"text": verification_prompt}])]

        # Original key -> schema property name, computed once and reused for the
        # schema, the 'required' list and the final mapping back to caller keys.
        schema_key_for = {
            key: re.sub(r'[^a-zA-Z0-9_]', '', key.replace(" ", "_")).lower()
            for key in keys_to_verify
        }

        response_schema_properties = {
            schema_key: {"type": "STRING", "enum": ["Yes", "No"]}
            for schema_key in schema_key_for.values()
        }

        generation_config = types.GenerateContentConfig(
            temperature=0,
            top_p=1,
            seed=0,
            max_output_tokens=65535,
            safety_settings=[
                types.SafetySetting(category="HARM_CATEGORY_HATE_SPEECH", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_DANGEROUS_CONTENT", threshold="OFF"),
                types.SafetySetting(category="HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold="OFF"),
            ],
            thinking_config=types.ThinkingConfig(thinking_budget=-1),
            response_mime_type="application/json",
            response_schema={
                "type": "OBJECT",
                "properties": response_schema_properties,
                # Schema properties are optional unless listed here; without this the
                # model may omit a key, which would silently be reported as "No".
                "required": list(response_schema_properties)
            }
        )

        # The reply is streamed; only chunks that carry text contribute to the result.
        full_response_text = ""
        for chunk in client.models.generate_content_stream(
            model=model_name,
            contents=contents,
            config=generation_config,
        ):
            if chunk.text:
                full_response_text += chunk.text

        if not full_response_text:
            return {"error": "No response text received from Gemini API for verification."}

        try:
            parsed_json = json.loads(full_response_text)
        except json.JSONDecodeError as e:
            return {
                "error": f"Failed to parse JSON response from API during verification: {e}",
                "raw_response_text": full_response_text
            }

        # Valid JSON that is not an object cannot be mapped back to the requested keys.
        if not isinstance(parsed_json, dict):
            return {
                "error": "Unexpected JSON structure from API during verification. Expected a JSON object.",
                "api_response": parsed_json
            }

        return {
            key: parsed_json.get(schema_key, "No")
            for key, schema_key in schema_key_for.items()
        }

    except Exception as e:
        print(f"ERROR: An error occurred during verification API call: {e}")
        traceback.print_exc() # Print full traceback here for detailed error
        return {"error": f"An error occurred during verification API call: {e}"}


# --- Gradio Interface ---

# Google Cloud project passed to every genai.Client call; empty placeholder here.
PROJECT_ID = ""

# Regional Vertex AI endpoint (previously 'global'); 'us-central1' is targeted
# explicitly because it is a common, well-supported Vertex AI region.
LOCATION = "us-central1"

# Fields to extract from the uploaded document image.
KEYS_OF_INTEREST = [
    "Name",
    "Father's Name",
    "PAN Number",
    "Date of birth",
]

def _rotate_clockwise_if_recommended(image_pil, rotation_analysis):
    """Apply the clockwise rotation recommended by the rotation analysis, if any.

    Args:
        image_pil: PIL image to rotate.
        rotation_analysis: Dict produced by the rotation analysis; the keys read
            here are "should_rotate" and "rotation_angle".

    Returns:
        The rotated image, or ``image_pil`` itself when no rotation is applied.
    """
    no_rotation_msg = "DEBUG: Rotation analysis did not recommend rotation or angle was invalid/not found."
    if not rotation_analysis.get("should_rotate", False):
        print(no_rotation_msg)
        return image_pil

    # Accept the angle either as a string ("90") or as a number (90).
    try:
        clockwise_degrees = int(str(rotation_analysis.get("rotation_angle")).strip())
    except ValueError:
        clockwise_degrees = None
    if clockwise_degrees not in (0, 90, 180, 270):
        print(no_rotation_msg)
        return image_pil

    if clockwise_degrees == 0:
        print("DEBUG: Rotation analysis recommended 0-degree rotation, no actual rotation applied.")
        return image_pil

    # PIL's rotate() turns counter-clockwise, so negate the angle for a clockwise
    # turn; expand=True grows the canvas so a 90/270 turn does not clip the image.
    rotated = image_pil.rotate(-clockwise_degrees, expand=True)
    print(f"DEBUG: Original image rotated by {clockwise_degrees} degrees CLOCKWISE.")
    return rotated


def _largest_pixel_box(bounding_boxes, width_px, height_px):
    """Return the largest usable box as ``(left, upper, right, lower)`` pixels, or None.

    Each item is expected to expose ``box_2d`` as ``[y_min, x_min, y_max, x_max]``;
    coordinates are scaled by ``1/1000`` of the image size. Items without a
    four-element ``box_2d`` and boxes that are empty or inverted after clamping
    to the image are skipped, so the result is always safe to pass to ``crop``.
    """
    best_box = None
    best_area = 0
    for bbox in bounding_boxes:
        coords = getattr(bbox, "box_2d", None)
        if coords is None or len(coords) != 4:
            continue
        y_min, x_min, y_max, x_max = coords
        left = min(max(int(x_min / 1000 * width_px), 0), width_px)
        upper = min(max(int(y_min / 1000 * height_px), 0), height_px)
        right = min(max(int(x_max / 1000 * width_px), 0), width_px)
        lower = min(max(int(y_max / 1000 * height_px), 0), height_px)

        box_width = right - left
        box_height = lower - upper
        # Check each side separately: two negative sides would multiply to a
        # positive "area" and select an inverted box.
        if box_width <= 0 or box_height <= 0:
            continue
        area = box_width * box_height
        if area > best_area:
            best_area = area
            best_box = (left, upper, right, lower)
    return best_box


def _field_update(key, extracted, verified, validate_data):
    """Build the textbox update (display value + CSS class) for one extracted field."""
    raw_value = extracted.get(key)
    # The extractor may yield null / non-string values; always display a string.
    value = "N/A" if raw_value is None else str(raw_value)
    is_verified = verified.get(key) == "Yes"

    if is_verified:
        css_class = "valid-field"
    elif validate_data:
        css_class = "invalid-field"
    else:
        css_class = ""

    if validate_data:
        value += " ✔" if is_verified else " ✘"
    return gr.update(value=value, elem_classes=[css_class])


def process_image(image_pil: Image.Image, extract_data: bool, validate_data: bool, analyze_rotation: bool, analyze_enhancement: bool, enable_cropping: bool):
    """
    Run the uploaded image through the processing stages and build the UI outputs.

    Stages, in order: readability check (original image), optional rotation,
    optional crop to the largest detected bounding box, optional enhancement
    analysis, optional field extraction and optional verification of the
    extracted fields.

    Args:
        image_pil: Uploaded image, or None when nothing was uploaded.
        extract_data: Extract the KEYS_OF_INTEREST fields from the processed image.
        validate_data: Verify the extracted fields and mark each one valid/invalid.
        analyze_rotation: Run the rotation analysis and apply the recommended rotation.
        analyze_enhancement: Run the enhancement analysis on the processed image.
        enable_cropping: Crop the (rotated) image to its largest bounding box.

    Returns:
        A 9-tuple: readability JSON string, rotation-analysis JSON string, image
        after rotation, image used for analysis (cropped when a crop was applied),
        enhancement JSON string, then one ``gr.update`` each for the Name,
        Father's Name, PAN Number and Date of birth textboxes.
    """
    if image_pil is None:
        # Nothing to process: report the problem and reset the four textboxes.
        return (
            json.dumps({"error": "Please upload an image."}),
            json.dumps({}),
            None,
            None,
            json.dumps({}),
            gr.update(value="", elem_classes=["unvalidated-field"]),
            gr.update(value="", elem_classes=["unvalidated-field"]),
            gr.update(value="", elem_classes=["unvalidated-field"]),
            gr.update(value="", elem_classes=["unvalidated-field"]),
        )

    original_image_pil = image_pil

    # Stage 1: Readability check (always on the original image)
    readability_response_dict = analyze_image_text_readability_gradio(original_image_pil, PROJECT_ID, LOCATION)
    readability_output = json.dumps(readability_response_dict, indent=2)

    # Stage 2: Rotation analysis on the original image, then apply the rotation
    image_after_rotation = original_image_pil
    if analyze_rotation:
        rotation_analysis_response_dict = analyze_image_rotation_gradio(original_image_pil, PROJECT_ID, LOCATION)
        rotation_analysis_output = json.dumps(rotation_analysis_response_dict, indent=2)
        image_after_rotation = _rotate_clockwise_if_recommended(original_image_pil, rotation_analysis_response_dict)
    else:
        rotation_analysis_output = json.dumps({"status": "Rotation analysis skipped"})

    # Stage 3: Optionally crop the rotated image to its largest bounding box
    image_for_extraction_validation = image_after_rotation
    if not enable_cropping:
        print("DEBUG: Cropping disabled by user. Using full rotated image for extraction/validation/enhancement.")
    elif extract_data or validate_data or analyze_enhancement or analyze_rotation:
        # Detect boxes when any downstream stage OR the rotation analysis is
        # active, so cropping no longer silently depends on rotation being enabled.
        bounding_boxes = get_bounding_boxes_gradio(image_after_rotation, PROJECT_ID, LOCATION)
        if bounding_boxes:
            width_px, height_px = image_after_rotation.size
            largest_bbox = _largest_pixel_box(bounding_boxes, width_px, height_px)
            if largest_bbox:
                image_for_extraction_validation = image_after_rotation.crop(largest_bbox)
                print(f"DEBUG: Rotated image cropped to largest bounding box for extraction/validation: {largest_bbox}")
            else:
                print("DEBUG: No valid bounding boxes found on rotated image for cropping. Using full rotated image for extraction.")
        else:
            print("DEBUG: No bounding boxes detected on rotated image. Using full rotated image for extraction.")

    # Stage 4: Optional enhancement analysis (on the final processed image)
    if analyze_enhancement:
        enhancement_response_dict = analyze_image_enhancement_gradio(image_for_extraction_validation, PROJECT_ID, LOCATION)
        enhancement_suggestions_output = json.dumps(enhancement_response_dict, indent=2)
    else:
        enhancement_suggestions_output = json.dumps({"status": "Enhancement analysis skipped"})

    # Stage 5: Optional text extraction (readability still refers to the original image)
    extracted_data_response_dict = {}
    is_readable = readability_response_dict.get("is_readable")
    if is_readable and extract_data:
        extracted_data_response_dict = extract_text_from_image_gradio(image_for_extraction_validation, PROJECT_ID, LOCATION, KEYS_OF_INTEREST)
    elif extract_data:
        # Extraction requested but the image is not readable: show placeholders.
        extracted_data_response_dict = {key: "N/A" for key in KEYS_OF_INTEREST}

    # Stage 6: Optional verification of the extracted fields
    verified_data_response_dict = {}
    if validate_data:
        extraction_successful = (
            extract_data
            and extracted_data_response_dict
            and not extracted_data_response_dict.get("error")
        )
        if extraction_successful:
            verified_data_response_dict = verify_extracted_data(extracted_data_response_dict, KEYS_OF_INTEREST, PROJECT_ID, LOCATION)
        else:
            # Nothing trustworthy to verify: mark every field as not verified.
            verified_data_response_dict = {key: "No" for key in KEYS_OF_INTEREST}

    # One textbox update per output field, in the order the UI wires them up.
    field_updates = [
        _field_update(key, extracted_data_response_dict, verified_data_response_dict, validate_data)
        for key in ("Name", "Father's Name", "PAN Number", "Date of birth")
    ]

    return (
        readability_output,
        rotation_analysis_output,
        image_after_rotation,
        image_for_extraction_validation,
        enhancement_suggestions_output,
        *field_updates,
    )

# Gradio Interface setup
# Custom CSS for validation styling. process_image assigns these class names to
# the four result textboxes through elem_classes: "valid-field" / "invalid-field"
# after validation, and "unvalidated-field" when no image was uploaded.
custom_css = """
.valid-field {
    border: 2px solid green !important;
    background-color: #e6ffe6 !important; /* Light green background */
}
.invalid-field {
    border: 2px solid red !important;
    background-color: #ffe6e6 !important; /* Light red background */
}
/* Optional: style for textboxes that haven't been validated yet */
.unvalidated-field {
    border: 1px solid #ccc !important;
    background-color: #f0f0f0 !important;
}
"""

# Build the UI: inputs and option checkboxes in the left column, results in the right.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("# Intelligent Document Processor") # Page title
    gr.Markdown("Upload an image to check its readability, analyze its orientation for rotation, get enhancement suggestions, and extract/validate data.")

    with gr.Row():
        with gr.Column():
            # Input image plus one checkbox per optional processing stage (all enabled by default).
            image_input = gr.Image(type="pil", label="Upload Original Image")
            analyze_rotation_checkbox = gr.Checkbox(label="Analyze & Auto-Rotate Image", value=True)
            enable_cropping_checkbox = gr.Checkbox(label="Enable Cropping (after rotation)", value=True) # Toggles the bounding-box crop stage
            analyze_enhancement_checkbox = gr.Checkbox(label="Analyze Image Enhancement Needs", value=True)
            extract_checkbox = gr.Checkbox(label="Extract Data", value=True)
            validate_checkbox = gr.Checkbox(label="Validate Extracted Data", value=True)
            process_button = gr.Button("Analyze Image")
        with gr.Column():
            gr.Markdown("---") # Separator
            gr.Markdown("## Extracted & Verified Data")
            with gr.Row():
                # Read-only textboxes for Name and Father's Name
                name_box = gr.Textbox(label="Name", interactive=False, elem_id="name_box")
                father_name_box = gr.Textbox(label="Father's Name", interactive=False, elem_id="father_name_box")
            with gr.Row():
                # Read-only textboxes for PAN Number and Date of Birth
                pan_box = gr.Textbox(label="PAN Number", interactive=False, elem_id="pan_box")
                dob_box = gr.Textbox(label="Date of Birth", interactive=False, elem_id="dob_box")

            gr.Markdown("---") # Separator
            # Per-stage analysis results (process_image returns JSON strings for these)
            # and the intermediate images.
            readability_output = gr.JSON(label="Readability Analysis (Original Image)")
            rotation_analysis_output = gr.JSON(label="Image Rotation Analysis (on Original Image)")
            enhancement_suggestions_output = gr.JSON(label="Image Enhancement Suggestions (on Final Processed Image)")
            rotated_image_display = gr.Image(type="pil", label="Image After Rotation (if applied)", show_label=True, visible=True)
            cropped_image_display = gr.Image(type="pil", label="Cropped Image for Analysis (after rotation)", show_label=True, visible=True)
            
    # `inputs` are passed to process_image positionally, so their order must match
    # its parameter order; `outputs` must match the order of the 9 values it returns.
    process_button.click(
        process_image,
        inputs=[image_input, extract_checkbox, validate_checkbox, analyze_rotation_checkbox, analyze_enhancement_checkbox, enable_cropping_checkbox],
        outputs=[
            readability_output,
            rotation_analysis_output,
            rotated_image_display,
            cropped_image_display,
            enhancement_suggestions_output,
            name_box,
            father_name_box,
            pan_box,
            dob_box,
        ]
    )

# share=True also serves the app through a temporary public gradio.live URL.
# NOTE(review): this app processes identity-document images — confirm that
# exposing it on a public link is intended.
demo.launch(debug=True, share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://3a37d42fbd334aaee8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


DEBUG: Readability - Image bytes length: 61432, MIME type: image/jpeg
DEBUG: Rotation - Image bytes length: 61432, MIME type: image/jpeg
DEBUG: Original image rotated by 270 degrees CLOCKWISE.
DEBUG: Bounding Box Detection - Image bytes length: 61311, MIME type: image/jpeg
DEBUG: Rotated image cropped to largest bounding box for extraction/validation: (682, 158, 933, 367)
DEBUG: Enhancement - Image bytes length: 10895, MIME type: image/jpeg
DEBUG: Extraction - Image bytes length: 10895, MIME type: image/jpeg
DEBUG: Readability - Image bytes length: 96368, MIME type: image/jpeg
DEBUG: Rotation - Image bytes length: 96368, MIME type: image/jpeg
DEBUG: Rotation analysis did not recommend rotation or angle was invalid/not found.
DEBUG: Bounding Box Detection - Image bytes length: 96368, MIME type: image/jpeg
DEBUG: Rotated image cropped to largest bounding box for extraction/validation: (913, 419, 1183, 699)
DEBUG: Enhancement - Image bytes length: 8421, MIME type: image/jpeg
DEBUG: Extrac