In [1]:
import os
import uuid
import fitz  # PyMuPDF

from PIL import Image
from io import BytesIO
from dotenv import load_dotenv
from base64 import b64encode
from typing import Literal, List, Dict, Tuple, NoReturn
from openai import AzureOpenAI

from langchain_community.document_loaders import ImageCaptionLoader
# from langchain_google_genai import GoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage


## Extracting images from PDF

In [2]:

# === 1) Extract images ===
def extract_images_from_pdf(pdf_path: str, output_folder: str) -> Dict[str,str]:
    """ This function will extract the images from user provided PDF assign a unique id (doc_id)
        to the each image and return with image's address

    Args:
        pdf_path (str): Path of the user provided PDF
        output_folder (str): Path of empty folder in which all extracted images will saved.

    Returns:
        Dict[str,str]: A python dictionary that contains the doc_id as key and original image path as value.

    Example:
        ```
        doc_ids = extract_images_from_pdf(
        pdf_path="./content/attention.pdf", 
        output_folder="extracted_images"
        )
        ```
    """
    
    os.makedirs(output_folder, exist_ok=True)

    ## removing existting files
    for dirname, _, filenames in os.walk(output_folder):
        for filename in filenames:
            os.remove(os.path.join(dirname, filename))

    pdf_doc = fitz.open(pdf_path)
    doc_ids: Dict[str, str] = dict()  # To store doc_id mappings

    for page_num in range(len(pdf_doc)):
        page = pdf_doc[page_num]
        images = page.get_images(full=True)

        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = pdf_doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]

            doc_id = str(uuid.uuid4())  # unique ID for this image

            image_filename = f"{doc_id}.{image_ext}"
            image_filepath = os.path.join(output_folder, image_filename)

            with open(image_filepath, "wb") as f:
                f.write(image_bytes)

            doc_ids.update({doc_id:image_filepath})

    pdf_doc.close()
    return doc_ids

In [3]:
doc_ids = extract_images_from_pdf(
    pdf_path="./content/attention.pdf", 
    output_folder="extracted_images"
)

In [4]:
doc_ids

{'b2eb2596-f938-445d-a28e-99951b8ee1a9': 'extracted_images/b2eb2596-f938-445d-a28e-99951b8ee1a9.png',
 '66823934-ae1a-4d86-94e6-7445e45bff91': 'extracted_images/66823934-ae1a-4d86-94e6-7445e45bff91.png',
 '09ed0e42-ef78-4287-8a4a-a43651125b8d': 'extracted_images/09ed0e42-ef78-4287-8a4a-a43651125b8d.png'}

## model setup

In [5]:
def load_model() -> GoogleGenerativeAI:
    """ It will creating model instance and return it

    Returns:
        GoogleGenerativeAI: LLM (GEMINI-1.5-FLASH) instance
    """

    load_dotenv()

    llm = GoogleGenerativeAI(
        name = 'mm-rag-1.0',
        model = "gemini-1.5-flash",
        max_output_tokens = 2048,
        temperature = 0.5,
        verbose = False
    )

    return llm
    

In [6]:
model = load_model()
model.invoke("Hi")

'Hi there! How can I help you today?'

## Azure Setup

In [10]:
def load_azure_client() -> AzureOpenAI:
    """ This function will load all the credentials and setup GPT-4o-mini LLM model
        afterthat return as instance of Azure Open AI class

    Returns:
        AzureOpenAI: Instance of class (main execution) 
    """ 

    load_dotenv()

    deployment = "gpt-4o-mini"
    api_version = "2024-12-01-preview"

    client = AzureOpenAI(
        api_version=api_version
    )

    return client



In [11]:
#                            TESTING 

online_transformer_image_path = "https://media.geeksforgeeks.org/wp-content/uploads/20250529164119039854/Screenshot-2024-05-24-112357-768.webp"

azure_llm = load_azure_client()
response = azure_llm.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system",
            "content": "You are an expert vision assistant. Generate a summary for the given image only."
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in detail."},
                {"type": "image_url", "image_url": {"url": online_transformer_image_path}}
            ]
        }
    ],
    max_tokens=4096,
    temperature=1.0,
    top_p=1.0
)

summary: str = response.choices[0].message.content                  # fetching summary from llm response


In [12]:
print(summary)

The image is a diagram illustrating the architecture of the Transformer model, which is used in natural language processing. It is divided into two main parts: the encoder and the decoder, each containing multiple stacked layers (denoted as Nx).

1. **Encoder Side**:
   - Begins with the "Input Embedding" layer that processes the input data.
   - "Positional Encoding" is added to provide information about the position of tokens in the sequence.
   - Each stack consists of:
     - A "Multi-Head Attention" layer that enables the model to focus on different parts of the input.
     - An "Add & Norm" layer that adds the input and output of the attention layer and normalizes it.
     - A "Feed Forward" layer for further processing of the attention output followed by another "Add & Norm".

2. **Decoder Side**:
   - Similar to the encoder but includes an additional "Masked Multi-Head Attention" layer to prevent looking ahead in the sequence during training.
   - It also processes the output w

## Generating summary of each extracted image (Gemini)

In [None]:

# === 2) Images summary generation ===
def generate_summary_of_each_image_using_gemini(extracted_images_folder_path: str, verbose: bool = True) -> Dict[str, str]:
    """ This function takes the path of folder in which extracted images are stores,
        and generate the summary of each image by the use of LLM, finally returned it.

    Args:
        extracted_images_folder_path (str): Path of folder that contains extracted images from PDF

    Returns:
        Dict[str, str]: Returns a python dictionary that contains the unique doc_id as key as 
                        generated summary of images as value
    """

    # checking provided folder path exists or not
    if not os.path.exists(extracted_images_folder_path):
        raise FileExistsError(f"Folder doesn't exists: {extracted_images_folder_path}. Try to provide full folder path.")

    ## loading model
    model = load_model()

    generated_summary: Dict[str,str] = dict()                               # defining variable which store generated summaries of each image
    images_path: List[str] = os.listdir(extracted_images_folder_path)       # fetching all images paths
    total_images_count = len(images_path)                                   # extracting total number of images

    ## iteration of each image
    for index,image_path in enumerate(images_path):
        doc_id = image_path.split('.')[0]                                   # extracting doc_id from image title
        
        # updating image path with its parent folder
        image_full_path = os.path.join(extracted_images_folder_path, image_path)
                               
        # fetching image data
        with open(image_full_path, "rb") as f:
            image_bytes = f.read()
            image_data = b64encode(image_bytes).decode("utf-8")

        if verbose:
            print(f"Generating summary of image ({index+1}/{total_images_count}).... ")


        ## creating instance of message (HumanMessage)
        message = HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": "Describe this PNG image in detail."
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_data}"
                    }
                }
            ]
        )

        summary = model.invoke([message])                                   # generating summary from LLM
        generated_summary.update({doc_id:summary})                          # updating generated summary 

    return generated_summary



In [60]:
summaries = generate_summary_of_each_image('extracted_images')

Generating summary of image (1/1).... 


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 58
}
].


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 56
}
]

In [56]:
summaries

{'a787b605-36ff-4bae-9083-61680ff5363c': "Please provide me with the image you'd like me to describe. I need the image itself to be able to give you a detailed description."}

## Generating summary of each extracted image (Azure GPT-4o-mini)

In [None]:
# === 2) Images summary generation ===
def generate_summary_of_each_image(extracted_images_folder_path: str, verbose: bool = True) -> Dict[str, str]:
    """ This function takes the path of folder in which extracted images are stores,
        and generate the summary of each image by the use of LLM, finally returned it.

    Args:
        extracted_images_folder_path (str): Path of folder that contains extracted images from PDF

    Returns:
        Dict[str, str]: Returns a python dictionary that contains the unique doc_id as key as 
                        generated summary of images as value
    """

    # checking provided folder path exists or not
    if not os.path.exists(extracted_images_folder_path):
        raise FileNotFoundError(f"Folder doesn't exists: {extracted_images_folder_path}. Try to provide full folder path.")

    ## loading model
    client = load_azure_client()

    generated_summary: Dict[str,str] = dict()                               # defining variable which store generated summaries of each image
    images_path: List[str] = os.listdir(extracted_images_folder_path)       # fetching all images paths
    total_images_count = len(images_path)                                   # extracting total number of images

    ## iteration of each image
    for index,image_path in enumerate(images_path):
        doc_id = image_path.split('.')[0]                                   # extracting doc_id from image title
        
        # updating image path with its parent folder
        image_full_path = os.path.join(extracted_images_folder_path, image_path)
                               
        # fetching image data
        with open(image_full_path, "rb") as f:
            image_b64 = b64encode(f.read()).decode("utf-8")

        if verbose:
            print(f"Generating summary of image ({index+1}/{total_images_count}).... ")


        data_url = f"data:image/png;base64,{image_b64}"                     # creating data url variable

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert vision assistant. Generate a summary for the given image only."
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe this image in detail."},
                        {"type": "image_url", "image_url": {"url": data_url}}
                    ]
                }
            ],
            max_tokens=4096,
            temperature=1.0,
            top_p=1.0
        )

        summary: str = response.choices[0].message.content                  # fetching summary from llm response
        generated_summary.update({doc_id:summary})                          # updating generated summary 

    return generated_summary

In [78]:
summaries = generate_summary_of_each_image('./extracted_images')

Generating summary of image (1/3).... 
Generating summary of image (2/3).... 
Generating summary of image (3/3).... 


In [81]:
summaries['0497f69e-d3c7-4531-bdce-cd54c4800538'].split(' ').__len__()

244

## Vector Store

In [None]:
# === 3) Store in vector store ===
def store_in_vector_store(captions, doc_ids, store_dir="caption_faiss"):
    embeddings_azure_client = load_azure_client()

    # Build LangChain Documents with metadata
    caption_docs = []
    for (doc_id, image_path), caption_doc in zip(doc_ids, captions):
        caption = caption_doc.page_content
        caption_docs.append(
            Document(
                page_content=caption,
                metadata={"doc_id": doc_id, "image_path": image_path}
            )
        )

    # Store embeddings of captions
    vectorstore = FAISS.from_documents(caption_docs, embeddings)
    vectorstore.save_local(store_dir)
    return vectorstore


In [None]:



# === 4) Done! Now use it as retriever ===
def build_retriever(vectorstore_dir="caption_faiss"):
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.load_local(vectorstore_dir, embeddings, allow_dangerous_deserialization=True)
    retriever = vectorstore.as_retriever()
    return retriever

# === Example ===
if __name__ == "__main__":
    pdf_path = "sample.pdf"
    output_folder = "extracted_images"

    # 1) Extract images
    doc_ids = extract_images_from_pdf(pdf_path, output_folder)

    # 2) Caption images
    image_files = [path for _, path in doc_ids]
    captions = caption_images(image_files)

    # 3) Store captions
    store_in_vector_store(captions, doc_ids)

    # 4) Use retriever
    retriever = build_retriever()
    results = retriever.invoke("a cat on a table")
    print("Top result:", results[0])
