In [27]:
import google.generativeai as genai
import io
import os
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
import base64
from langchain_google_genai import ChatGoogleGenerativeAI
from pathlib import Path
from PIL import Image
from typing import Optional

In [29]:
def extract_text_from_image(
    image_path: str,
    prompt: Optional[str] = "Extract all readable text from this image.",
    api_key: Optional[str] = None
) -> str:
    """
    Extract text from an image by sending it to a Gemini model via LangChain.

    The image is re-encoded to PNG in memory, base64-encoded, and sent as a
    data URL together with the text prompt in a single human message.

    Args:
        image_path (str): Path to the image file.
        prompt (str): Instruction sent alongside the image. When ``None`` or
            blank, a built-in "extract all text, no explanations" instruction
            is used instead.
        api_key (str): Your Google API key with Gemini access.

    Returns:
        str: Extracted text response from Gemini. If the model returns its
        content as a list of parts, the text parts are concatenated.

    Raises:
        ValueError: If ``api_key`` is ``None``.
    """
    # Validate the key first so a missing key fails before any file I/O.
    if api_key is None:
        raise ValueError("You must provide a Google Gemini API key.")

    # Honor the caller's prompt; previously this argument was silently ignored.
    if prompt is None or not prompt.strip():
        prompt = (
            "Extract all the text from this image. "
            "Return only the extracted text, no explanations."
        )

    png_modes = {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}
    with Image.open(image_path) as img:
        # The PNG writer rejects some modes (e.g. CMYK JPEGs); normalize those to RGB.
        png_source = img if img.mode in png_modes else img.convert("RGB")
        buffer = io.BytesIO()
        png_source.save(buffer, format="PNG")
    image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

    message = [
        HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{image_base64}"
                    },
                },
            ]
        )
    ]

    # NOTE(review): this is an experimental, date-stamped model id — confirm it is still served.
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-pro-exp-03-25",
        google_api_key=api_key,
        convert_system_message_to_human=True,
    )

    response = llm.invoke(message)

    content = getattr(response, "content", None)
    if content is None:
        return str(response)
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Multi-part replies: keep plain strings and the "text" field of dict parts.
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "".join(pieces)
    return str(content)


In [None]:
# Configuration: read the API key from the environment (or a local .env file)
# instead of hard-coding it in the notebook.
load_dotenv()

# Build the path from parts so it resolves on Windows and POSIX alike.
IMAGE_PATH = str(Path("src", "gaia_test", "data", "sample_ocr_image.png"))
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# List available models to debug the error
if GOOGLE_API_KEY:
    # The genai client needs the key before it can query the model list.
    genai.configure(api_key=GOOGLE_API_KEY)
    models = genai.list_models()
    print("Available models:")
    for model in models:
        print(model.name)
else:
    print("GOOGLE_API_KEY is not set; skipping model listing.")

In [30]:
# Run OCR on the sample image and show the model's reply.
text = extract_text_from_image(
    image_path=IMAGE_PATH,
    api_key=GOOGLE_API_KEY,
)
print("Extracted Text:\n", text, sep=" ")



Extracted Text:
 Lorem ips's deplace
-Lorem,-ipsum dolor sit as a consectindiarm, consectresum, indisim
incildis me dolore-illant, quis appristenched labor, filsit in labor
magna, amil jouri paratum.
-A em-sim, occasentdis tempar cublling dota, consentables
noisucle disfirycliplor vise, ant Emadisalitais sild befure posscat
in at depececting hlis molcicat, tempers desforentius lean compdntr,
for the magnants thic procedients.
-A at perfermtimg the llaventor ate.
-A perum dutis may minim placere, et culdnt, blals, temporam places
vollurat paceal supper the diffectates, quis,
caperour decinsum on the offpeate maginam.
