# Multimodal RAG development scripts

In [5]:
import fitz
import os

### Image extraction from PDF

In [9]:
# import fitz  # PyMuPDF
# import os

def extract_images_from_pdf(pdf_path: str, output_folder: str, verbose: bool = True) -> None:
    """ This will save the availabel images in the provided PDF after extraction

    Args:
        pdf_path (str): path of PDF
        output_folder (str): path of folder that will contains the extracted images
    """

    # Open the PDF
    pdf_doc = fitz.open(pdf_path)
    if verbose:
        print(f"Total pages: {pdf_doc.page_count}")

    for page_num in range(len(pdf_doc)):
        page = pdf_doc[page_num]
        images = page.get_images(full=True)
        
        if verbose:
            print(f"Page {page_num + 1} has {len(images)} images")

        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = pdf_doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"page{page_num + 1}_img{img_index + 1}.{image_ext}"
            image_filepath = os.path.join(output_folder, image_filename)

            with open(image_filepath, "wb") as f:
                f.write(image_bytes)
                if verbose:
                    print(f"Saved: {image_filepath}")

    pdf_doc.close()



In [10]:

# Example usage
extract_images_from_pdf("./content/attention.pdf", "extracted_images")

Total pages: 11
Page 1 has 0 images
Page 2 has 0 images
Page 3 has 1 images
Saved: extracted_images/page3_img1.png
Page 4 has 2 images
Saved: extracted_images/page4_img1.png
Saved: extracted_images/page4_img2.png
Page 5 has 0 images
Page 6 has 0 images
Page 7 has 0 images
Page 8 has 0 images
Page 9 has 0 images
Page 10 has 0 images
Page 11 has 0 images


In [21]:
from langchain_community.document_loaders import ImageCaptionLoader

# Suppose you saved images in ./output_images
loader = ImageCaptionLoader(images=["/home/archit-elitebook/workarea/whole working/genai/projects/mmrag/extracted_images"])
docs = loader.load()

# print(docs[0].page_content)  # This is the captioned text


ValueError: Could not get image data for /home/archit-elitebook/workarea/whole working/genai/projects/mmrag/extracted_images

In [22]:

# Your images directory
images_dir = "/home/archit-elitebook/workarea/whole working/genai/projects/mmrag/extracted_images"

# Get all image file paths
image_files = [
    os.path.join(images_dir, f)
    for f in os.listdir(images_dir)
    if f.lower().endswith(('.png', '.jpg', '.jpeg', '.webp'))
]

# Pass the list of image files
loader = ImageCaptionLoader(images=image_files)
docs = loader.load()

print(docs[0].page_content)


an image of a diagram of a product [SEP]


In [24]:
docs[0]

Document(metadata={'image_path': '/home/archit-elitebook/workarea/whole working/genai/projects/mmrag/extracted_images/page4_img1.png'}, page_content='an image of a diagram of a product [SEP]')