In [1]:
!pip install docling transformers Pillow pandas matplotlib




In [6]:
pip install docling Pillow

Note: you may need to restart the kernel to use updated packages.


In [16]:
# Import required libraries
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions
import os
import logging
from IPython.display import Image, display

# Configure logging: emit records at INFO level and above, each line formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def find_caption_near_coordinates(doc, bbox, page_no):
    """Return the figure-like text closest to the bottom edge of an image.

    Scans ``doc.texts`` for items whose first provenance entry lies on
    ``page_no`` and whose text mentions "figure" or "fig." (case-insensitive).
    Among those candidates, the one whose bounding-box top (``t``) is
    nearest to the image's bounding-box bottom (``b``) wins; on a tie the
    earliest item in ``doc.texts`` is kept.

    Args:
        doc: Document object exposing a ``texts`` iterable of text items.
        bbox: Bounding box of the image; only its ``b`` attribute is read.
        page_no: Page number the image is located on.

    Returns:
        The stripped text of the nearest candidate, or ``None`` when no
        text item on that page matches.
    """
    caption = None
    min_distance = float('inf')

    for text_item in doc.texts:
        provenance = getattr(text_item, 'prov', None)
        if not provenance:
            continue

        # Only the first provenance entry is used to locate the text.
        text_prov = provenance[0]
        if text_prov.page_no != page_no:
            continue

        # An item can expose a ``text`` attribute that is None; skip it
        # rather than letting ``.lower()`` abort the whole search.
        text = getattr(text_item, 'text', None)
        if not isinstance(text, str):
            continue

        lowered = text.lower()
        if 'figure' not in lowered and 'fig.' not in lowered:
            continue

        # Vertical gap between the image's bottom edge and the text's top edge.
        distance = abs(text_prov.bbox.t - bbox.b)
        if distance < min_distance:
            min_distance = distance
            caption = text.strip()

    return caption

def extract_images_and_captions(pdf_path):
    """Convert a PDF with docling and collect a caption for every picture.

    The document is converted with picture/page image generation enabled,
    then every ``PictureItem`` that has provenance information is paired
    with the nearest figure-like text on the same page (see
    ``find_caption_near_coordinates``).

    Args:
        pdf_path: Path to the PDF file (``pathlib.Path`` or string).

    Returns:
        A list of dicts, one per picture, with the keys ``'number'``
        (1-based running index), ``'page'``, ``'caption'`` (the text found,
        or ``"No caption found"``) and ``'bbox'``. An empty list is returned
        when conversion fails or yields no result; errors are logged, not
        raised.
    """
    # Function-scope imports: only needed to hand the pipeline options to
    # the converter below.
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import PdfFormatOption

    output_dir = "./output/images"
    os.makedirs(output_dir, exist_ok=True)

    # Configure pipeline options
    pipeline_options = PdfPipelineOptions()
    pipeline_options.images_scale = 2.0
    pipeline_options.generate_page_images = True
    pipeline_options.generate_picture_images = True

    # The options must be passed explicitly; a bare DocumentConverter()
    # would never see the settings configured above.
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )

    try:
        # Accept plain strings as well as Path objects (``.name`` is used below).
        pdf_path = Path(pdf_path)
        logging.info(f"Processing {pdf_path.name}...")
        conv_res = doc_converter.convert(pdf_path)

        if not conv_res:
            logging.error("No conversion result found")
            return []

        images_list = []
        image_number = 1

        for item, _level in conv_res.document.iterate_items():
            # Only pictures with provenance can be located on a page.
            if not (isinstance(item, PictureItem) and item.prov):
                continue

            try:
                logging.info(f"Processing PictureItem {image_number}")

                # Get image location from the first provenance entry
                bbox = item.prov[0].bbox
                page_no = item.prov[0].page_no

                caption = (
                    find_caption_near_coordinates(conv_res.document, bbox, page_no)
                    or "No caption found"
                )

                logging.info(f"Image {image_number} on page {page_no}")
                logging.info(f"Location: {bbox.l:.2f}, {bbox.t:.2f}, {bbox.r:.2f}, {bbox.b:.2f}")
                logging.info(f"Caption: {caption}")

                images_list.append({
                    'number': image_number,
                    'page': page_no,
                    'caption': caption,
                    'bbox': bbox,
                })

                # Advance only after a successful append so numbering stays gap-free.
                image_number += 1

            except Exception as e:
                # Best effort: one broken picture must not abort the others.
                logging.error(f"Error processing image {image_number}: {str(e)}")
                continue

        logging.info(f"Found {len(images_list)} images with captions")
        return images_list

    except Exception as e:
        logging.error(f"Error processing PDF: {str(e)}")
        return []

def save_output(images_list):
    """Write one text record per extracted image to ``./output/final_output.txt``.

    Each record has the form::

        <image_N>
        Page P
        {image_N_description: CAPTION}

    followed by a blank line.

    Args:
        images_list: List of dicts with the keys ``'number'``, ``'page'``
            and ``'caption'``.

    Returns:
        None. When ``images_list`` is empty a warning is logged and no file
        is written.
    """
    if not images_list:
        logging.warning("No images to save")
        return

    records = [
        f"<image_{img['number']}>\n"
        f"Page {img['page']}\n"
        f"{{image_{img['number']}_description: {img['caption']}}}\n\n"
        for img in images_list
    ]

    output_file = "./output/final_output.txt"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # Explicit UTF-8: captions may contain non-ASCII characters, which the
    # platform default encoding cannot always represent.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("".join(records))
    logging.info(f"Output saved to {output_file}")

# Entry point of the notebook cell: extract captions from the sample paper
# and write them to disk. Any failure is logged instead of propagating.
pdf_path = Path("data/paper.pdf")
try:
    images_list = extract_images_and_captions(pdf_path)
    save_output(images_list)
except Exception as exc:
    logging.error(f"Error in main execution: {exc!s}")