# 🖼️ Image Extraction Pipeline

This notebook extracts **images and figures** from PDFs using a two-tier approach:

| Tier | Tool | What It Detects |
|------|------|------------------|
| **1** | PyMuPDF | Embedded raster images (JPG/PNG stored in PDF) |
| **2** | PaddleOCR PP-Structure | Vector figures (bar charts, diagrams, flowcharts) |

## How It Works

1. **PyMuPDF** extracts embedded images with exact bounding boxes
2. **PaddleOCR PP-Structure** detects "Figure" regions via ML-based layout detection
3. **Deduplication** removes overlapping detections (IoU > 0.5)
4. **GPT-4o Vision** generates descriptions for each unique image

> **Note:** Text and tables are handled separately by `universal_parser.ipynb`

In [None]:
# Cell 2: Install Dependencies
# ============================

# Core packages
%pip install -q pymupdf pdf2image openai Pillow

# PaddleOCR for vector figure detection (PP-Structure)
%pip install -q paddlepaddle paddleocr

# Install Poppler (required by pdf2image) - Colab only
import subprocess
import sys

# Colab ships the `google.colab` package; if the import fails we are
# running somewhere else (local Jupyter, CI, ...).
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

if not IN_COLAB:
    # Outside Colab the system package cannot be installed from here, so
    # only remind the user how to obtain Poppler.
    print(
        "‚ö†Ô∏è Not in Colab - ensure poppler-utils is installed locally",
        "   macOS: brew install poppler",
        "   Ubuntu: sudo apt-get install poppler-utils",
        sep="\n",
    )
else:
    # check=True aborts the cell if either apt-get step fails.
    subprocess.run(['apt-get', 'update', '-qq'], check=True)
    subprocess.run(['apt-get', 'install', '-y', '-qq', 'poppler-utils'], check=True)
    print("‚úÖ Poppler installed (Colab)")

print("\n‚úÖ All dependencies installed!")

In [None]:
# Cell 3: Configuration
# =====================

import os

# ============================================
# 🔑 SET YOUR OPENAI API KEY HERE
# ============================================
# Paste a key between the quotes, or leave it empty to fall back to the
# OPENAI_API_KEY environment variable.
OPENAI_API_KEY = ""

# Directory that receives the extracted images
OUTPUT_DIR = "image_output"

# Set to False to skip the LLM description step (no API cost while testing)
GENERATE_DESCRIPTIONS = True

# ============================================
# Apply configuration
# ============================================
if OPENAI_API_KEY:
    # A key typed into the notebook is exported to the environment.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
    print("‚úÖ API key set from notebook")
elif os.environ.get("OPENAI_API_KEY"):
    print("‚úÖ API key found in environment")
elif GENERATE_DESCRIPTIONS:
    # No key anywhere, yet descriptions were requested.
    print(
        "‚ö†Ô∏è Warning: No API key set! Descriptions will fail.",
        "   Set OPENAI_API_KEY above or in environment.",
        sep="\n",
    )
else:
    print("‚ÑπÔ∏è No API key (descriptions disabled anyway)")

print(
    f"\nüìÅ Output directory: {OUTPUT_DIR}",
    f"üìù Generate descriptions: {GENERATE_DESCRIPTIONS}",
    sep="\n",
)

In [None]:
# Cell 4: Upload PDF
# ==================

import os

# Detect Colab: the `files` upload helper is only importable there.
try:
    import google.colab
    from google.colab import files
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("üì§ Upload your PDF file:")
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back from the upload widget (presumably the dialog
        # was cancelled); fail with a clear message instead of an IndexError.
        raise RuntimeError("No file was uploaded - re-run this cell and choose a PDF.")
    # The first uploaded file name is the PDF to process.
    PDF_PATH = next(iter(uploaded))
    print(f"\n‚úÖ Uploaded: {PDF_PATH}")
else:
    # Local development - set path manually
    PDF_PATH = "sample_pdfs/your_document.pdf"  # Change this path

    # Or pick the first PDF found in sample_pdfs/
    sample_dir = "sample_pdfs"
    if os.path.isdir(sample_dir):
        # Sorted so the chosen file does not depend on directory listing
        # order; the extension check is case-insensitive (.PDF also matches).
        pdfs = sorted(
            f for f in os.listdir(sample_dir) if f.lower().endswith('.pdf')
        )
        if pdfs:
            print("üìÑ Available PDFs in sample_pdfs/:")
            for i, pdf in enumerate(pdfs):
                print(f"   {i+1}. {pdf}")
            PDF_PATH = os.path.join(sample_dir, pdfs[0])
            print(f"\n‚úÖ Using: {PDF_PATH}")
        else:
            print("‚ö†Ô∏è No PDFs found in sample_pdfs/")
    else:
        print(f"‚ö†Ô∏è Directory not found: {sample_dir}")
        print("   Set PDF_PATH manually above")

In [None]:
# Cell 5: ImageExtractor Class
# ============================

import os
import json
import base64
from datetime import datetime
from typing import List, Dict, Optional, Tuple
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image
from pdf2image import convert_from_path
from openai import OpenAI


class ImageExtractor:
    """
    Extracts images from PDFs using PyMuPDF + PaddleOCR PP-Structure.

    Embedded raster images are pulled out with PyMuPDF; rendered pages are
    passed to PP-Structure layout detection to find "figure" regions.
    PaddleOCR figures that overlap a PyMuPDF image on the same page are
    dropped as duplicates, and every remaining image can optionally be
    described with GPT-4o.
    """

    # Images/detections covering less than this fraction of the page are skipped.
    MIN_AREA_FRACTION = 0.01
    # A PaddleOCR figure is a duplicate when its IoU with a PyMuPDF image on
    # the same page is strictly greater than this value.
    IOU_THRESHOLD = 0.5
    # Resolution (dots per inch) used to rasterise pages for layout detection.
    RENDER_DPI = 150
    # Extensions whose bytes are sent to the vision model unchanged, mapped to
    # the MIME type declared in the data URL. Anything else is re-encoded as PNG.
    _MEDIA_TYPES = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }

    def __init__(
        self,
        output_dir: str = "image_output",
        api_key: Optional[str] = None,
        generate_descriptions: bool = True
    ):
        """
        Initialize the ImageExtractor.

        Creates ``output_dir`` and its ``images`` sub-directory if needed.

        Args:
            output_dir: Directory to save extracted images
            api_key: OpenAI API key (uses OPENAI_API_KEY env var if not provided)
            generate_descriptions: Whether to generate GPT-4o descriptions

        Raises:
            ValueError: If descriptions are requested but no API key is
                given or present in the environment.
        """
        self.output_dir = Path(output_dir)
        self.images_dir = self.output_dir / "images"
        self.generate_descriptions = generate_descriptions

        # Create output directories
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.images_dir.mkdir(parents=True, exist_ok=True)

        # Setup OpenAI client (only needed when descriptions are enabled)
        if generate_descriptions:
            api_key = api_key or os.environ.get("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("OpenAI API key required for descriptions")
            self.client = OpenAI(api_key=api_key)
        else:
            self.client = None

        # The PaddleOCR model is loaded lazily on first use
        self._layout_engine = None
        self._paddle_loaded = False

    def _load_paddle_model(self):
        """Load the PaddleOCR PP-Structure layout model once; later calls are no-ops."""
        if self._paddle_loaded:
            return

        print("üîÑ Loading PaddleOCR PP-Structure model...")
        # Imported here so the class can be defined without paddleocr loaded.
        from paddleocr import PPStructure

        # Layout detection only: table recognition and OCR are switched off.
        self._layout_engine = PPStructure(
            show_log=False,
            layout=True,
            table=False,
            ocr=False
        )
        self._paddle_loaded = True
        print("‚úÖ PaddleOCR model loaded")

    def extract(self, pdf_path: str) -> Dict:
        """
        Main extraction pipeline.

        Runs embedded-image extraction, figure detection, deduplication and
        (optionally) description generation, printing progress along the way.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Dictionary with a "metadata" section (source file, page count,
            timestamp, methods used, stats) and an "images" list. Each image
            dict carries id, page, type, source, bbox, file_path and
            description (None when descriptions are disabled, or an
            "Error: ..." string when describing that image failed).

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")

        rule = '=' * 60
        print(f"\n{rule}")
        print(f"üìÑ Processing: {pdf_path.name}")
        print(f"{rule}")

        # Get page count; the context manager closes the document even on error
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
        print(f"üìë Pages: {page_count}")

        # Step 1: Extract embedded images (PyMuPDF)
        print("\nüîç Step 1: Extracting embedded images (PyMuPDF)...")
        pymupdf_images = self._extract_embedded_images(pdf_path)
        print(f"   Found: {len(pymupdf_images)} embedded images")

        # Step 2: Detect figure regions (PaddleOCR)
        print("\nüîç Step 2: Detecting figure regions (PaddleOCR)...")
        paddle_figures = self._detect_figure_regions(pdf_path)
        print(f"   Found: {len(paddle_figures)} figure regions")

        # Step 3: Deduplicate
        print("\nüîç Step 3: Deduplicating overlapping detections...")
        unique_paddle = self._deduplicate(paddle_figures, pymupdf_images)
        duplicates_removed = len(paddle_figures) - len(unique_paddle)
        print(f"   Duplicates removed: {duplicates_removed}")
        print(f"   Unique PaddleOCR figures: {len(unique_paddle)}")

        # Combine all unique images
        all_images = pymupdf_images + unique_paddle
        print(f"\nüìä Total unique images: {len(all_images)}")

        # Step 4: Generate descriptions
        if self.generate_descriptions and all_images:
            print("\nüîç Step 4: Generating GPT-4o descriptions...")
            total = len(all_images)
            for position, img in enumerate(all_images, start=1):
                print(f"   [{position}/{total}] {img['id']}...", end=" ")
                # One failing image must not abort the whole run, so the
                # error is recorded as that image's description instead.
                try:
                    img['description'] = self._describe_image(img['file_path'])
                    print("‚úÖ")
                except Exception as e:
                    img['description'] = f"Error: {str(e)}"
                    print(f"‚ùå {e}")
        else:
            for img in all_images:
                img['description'] = None
            if not self.generate_descriptions:
                print("\n‚è≠Ô∏è Step 4: Skipped (descriptions disabled)")

        # Build result
        result = {
            "metadata": {
                "source_file": pdf_path.name,
                "page_count": page_count,
                "images_found": len(all_images),
                "extraction_timestamp": datetime.now().isoformat(),
                "methods_used": {
                    "pymupdf": True,
                    "paddleocr": True,
                    "descriptions": self.generate_descriptions
                },
                "stats": {
                    "pymupdf_images": len(pymupdf_images),
                    "paddleocr_figures": len(paddle_figures),
                    "duplicates_removed": duplicates_removed,
                    "unique_images": len(all_images)
                }
            },
            "images": all_images
        }

        print(f"\n{rule}")
        print("‚úÖ Extraction complete!")
        print(f"{rule}")

        return result

    def _extract_embedded_images(self, pdf_path: Path) -> List[Dict]:
        """
        Extract embedded raster images using PyMuPDF.

        Each image is written to ``self.images_dir`` using the extension
        PyMuPDF reports for it. Images whose first placement covers less
        than ``MIN_AREA_FRACTION`` of the page are skipped. A failure on one
        image is printed and does not stop the extraction.

        Returns:
            List of image dictionaries with id, page, type, source, bbox, file_path
            (bbox is [x0, y0, x1, y1] divided by the page width/height).
        """
        images = []

        # Context manager guarantees the document is closed on any exit path
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                page_width = page.rect.width
                page_height = page.rect.height

                # Get all images on this page
                for img_idx, img_info in enumerate(page.get_images(full=True)):
                    xref = img_info[0]

                    try:
                        # Get image rectangle(s); no rectangle means no bbox
                        rects = page.get_image_rects(xref)
                        if not rects:
                            continue

                        # Use the first rectangle
                        rect = rects[0]

                        # Normalize bounding box by the page size
                        bbox = [
                            rect.x0 / page_width,
                            rect.y0 / page_height,
                            rect.x1 / page_width,
                            rect.y1 / page_height
                        ]

                        # Skip tiny images (likely decorative)
                        area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                        if area < self.MIN_AREA_FRACTION:
                            continue

                        # Extract image data
                        base_image = doc.extract_image(xref)

                        # Generate ID and save
                        img_id = f"img_{page_num:03d}_{img_idx+1:02d}"
                        file_name = f"page{page_num}_{img_id}.{base_image['ext']}"
                        file_path = self.images_dir / file_name
                        file_path.write_bytes(base_image["image"])

                        images.append({
                            "id": img_id,
                            "page": page_num,
                            "type": "embedded",
                            "source": "pymupdf",
                            "bbox": bbox,
                            "file_path": str(file_path)
                        })

                    except Exception as e:
                        # Best effort per image: report and move on
                        print(f"      ‚ö†Ô∏è Page {page_num}, image {img_idx}: {e}")
                        continue

        return images

    def _detect_figure_regions(self, pdf_path: Path) -> List[Dict]:
        """
        Detect figure regions using PaddleOCR PP-Structure.

        Every page is rendered at ``RENDER_DPI``, run through layout
        detection, and each region of type 'figure' that covers at least
        ``MIN_AREA_FRACTION`` of the page is cropped and saved as PNG in
        ``self.images_dir``.

        Returns:
            List of figure dictionaries with id, page, type, source, bbox, file_path
            (bbox is [x1, y1, x2, y2] divided by the rendered page size).
        """
        self._load_paddle_model()

        import numpy as np

        figures = []

        # Render PDF pages as images (all pages are held in memory at once)
        print("      Rendering PDF pages...")
        page_images = convert_from_path(pdf_path, dpi=self.RENDER_DPI)

        for page_num, page_img in enumerate(page_images, start=1):
            print(f"      Processing page {page_num}...")

            img_width, img_height = page_img.size

            # Run layout detection on the page as a numpy array
            layout = self._layout_engine(np.array(page_img))

            for idx, item in enumerate(layout, start=1):
                if item['type'] != 'figure':
                    continue

                # Get bounding box [x1, y1, x2, y2] in rendered-page pixels
                x1, y1, x2, y2 = item['bbox']

                # Normalize by the page size; float() keeps the values plain
                # Python floats so the result stays JSON-serialisable
                # whatever numeric type the detector returned.
                bbox = [
                    float(x1) / img_width,
                    float(y1) / img_height,
                    float(x2) / img_width,
                    float(y2) / img_height
                ]

                # Skip tiny detections
                area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                if area < self.MIN_AREA_FRACTION:
                    continue

                # Crop and save
                cropped = page_img.crop((int(x1), int(y1), int(x2), int(y2)))
                fig_id = f"fig_{page_num:03d}_{idx:02d}"
                file_path = self.images_dir / f"page{page_num}_{fig_id}.png"
                cropped.save(file_path, "PNG")

                figures.append({
                    "id": fig_id,
                    "page": page_num,
                    "type": "figure",
                    "source": "paddleocr",
                    "bbox": bbox,
                    "file_path": str(file_path)
                })

        return figures

    def _deduplicate(self, paddle_figures: List[Dict], pymupdf_images: List[Dict]) -> List[Dict]:
        """
        Remove PaddleOCR figures that overlap with PyMuPDF images.

        A figure is a duplicate when its IoU (Intersection over Union) with
        any PyMuPDF image on the same page exceeds ``IOU_THRESHOLD``. The
        cropped file of a duplicate is deleted from disk on a best-effort
        basis.

        Args:
            paddle_figures: List of figures from PaddleOCR
            pymupdf_images: List of images from PyMuPDF

        Returns:
            Deduplicated list of PaddleOCR figures (the input list itself
            when either argument is empty).
        """
        if not paddle_figures or not pymupdf_images:
            return paddle_figures

        # Index PyMuPDF boxes by page once instead of re-filtering per figure
        boxes_by_page: Dict[int, List[List[float]]] = {}
        for img in pymupdf_images:
            boxes_by_page.setdefault(img['page'], []).append(img['bbox'])

        unique_figures = []

        for paddle_fig in paddle_figures:
            is_duplicate = any(
                self._calculate_iou(paddle_fig['bbox'], box) > self.IOU_THRESHOLD
                for box in boxes_by_page.get(paddle_fig['page'], ())
            )

            if not is_duplicate:
                unique_figures.append(paddle_fig)
                continue

            # Clean up the duplicate file; failure to delete must not abort
            # the extraction, but is reported unless the file is already gone.
            try:
                os.remove(paddle_fig['file_path'])
            except FileNotFoundError:
                pass
            except OSError as e:
                print(f"      Could not remove duplicate {paddle_fig['file_path']}: {e}")

        return unique_figures

    @staticmethod
    def _calculate_iou(box1: List[float], box2: List[float]) -> float:
        """
        Calculate IoU between two bounding boxes.

        Boxes are [x_min, y_min, x_max, y_max]. Returns 0.0 when the boxes
        do not overlap (touching edges count as no overlap) or when the
        union area is not positive.
        """
        # Intersection rectangle
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])

        # No overlap
        if x2 <= x1 or y2 <= y1:
            return 0.0

        intersection = (x2 - x1) * (y2 - y1)

        # Union = sum of areas minus the doubly counted intersection
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0.0

    def _describe_image(self, image_path: str) -> str:
        """
        Generate description using GPT-4o Vision.

        Files with an extension listed in ``_MEDIA_TYPES`` are sent as-is.
        Any other format is re-encoded to PNG with Pillow first, so the
        bytes always match the declared media type.

        Args:
            image_path: Path to the image file

        Returns:
            Text description of the image, "Descriptions disabled" when no
            client is configured, or an empty string when the response
            carries no text content.
        """
        if not self.client:
            return "Descriptions disabled"

        # Determine image type from extension
        ext = Path(image_path).suffix.lower()
        media_type = self._MEDIA_TYPES.get(ext)

        if media_type is not None:
            image_bytes = Path(image_path).read_bytes()
        else:
            # Unlisted format: convert to PNG in memory rather than
            # mislabelling the original bytes as image/png.
            import io

            buffer = io.BytesIO()
            with Image.open(image_path) as img:
                if img.mode not in ("RGB", "RGBA", "L"):
                    img = img.convert("RGB")
                img.save(buffer, format="PNG")
            image_bytes = buffer.getvalue()
            media_type = 'image/png'

        base64_image = base64.b64encode(image_bytes).decode('utf-8')

        prompt = """Describe this image concisely. Include:
1. What type of image it is (photo, chart, diagram, etc.)
2. Key content and information shown
3. Any important text, labels, or data

Keep the description under 100 words."""

        response = self.client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{media_type};base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=200,
            temperature=0.3
        )

        # content may be None; avoid calling .strip() on it
        content = response.choices[0].message.content
        return content.strip() if content else ""


print("‚úÖ ImageExtractor class loaded!")

In [None]:
# Cell 6: Run Extraction
# ======================

# Build the extractor from the notebook-level configuration values
extractor = ImageExtractor(
    output_dir=OUTPUT_DIR,
    generate_descriptions=GENERATE_DESCRIPTIONS,
)

# Run the pipeline on the selected PDF
result = extractor.extract(PDF_PATH)

# Summary block: one line per print argument
print(
    "\n" + "="*60,
    "üìä EXTRACTION SUMMARY",
    "="*60,
    f"Source: {result['metadata']['source_file']}",
    f"Pages: {result['metadata']['page_count']}",
    f"Total images: {result['metadata']['images_found']}",
    f"  - PyMuPDF: {result['metadata']['stats']['pymupdf_images']}",
    f"  - PaddleOCR: {result['metadata']['stats']['paddleocr_figures']}",
    f"  - Duplicates removed: {result['metadata']['stats']['duplicates_removed']}",
    "="*60,
    sep="\n",
)

In [None]:
# Cell 7: Preview Extracted Images
# =================================

from PIL import Image
import matplotlib.pyplot as plt

if result['images']:
    # Lay the previews out on a grid that is at most three columns wide
    n_images = len(result['images'])
    cols = min(3, n_images)
    rows = (n_images + cols - 1) // cols  # ceiling division

    # squeeze=False makes subplots() return a 2-D array of Axes for every
    # grid shape, so 1x1 and single-row grids need no special handling.
    fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 5*rows), squeeze=False)
    grid = axes.ravel()  # row-major: index i is cell (i // cols, i % cols)

    for ax, img_info in zip(grid, result['images']):
        # Load and display image; the file handle is closed once the pixels
        # have been handed to matplotlib.
        with Image.open(img_info['file_path']) as img:
            ax.imshow(img)
        ax.set_title(f"{img_info['id']} (p{img_info['page']})\n{img_info['source']}", fontsize=10)
        ax.axis('off')

    # Hide the unused cells of the last row
    for ax in grid[n_images:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    # Print descriptions
    print("\n" + "="*60)
    print("üìù IMAGE DESCRIPTIONS")
    print("="*60)
    for img_info in result['images']:
        print(f"\n[{img_info['id']}] Page {img_info['page']} ({img_info['source']})")
        desc = img_info.get('description')
        if desc:
            print(f"   {desc}")
        else:
            print("   (No description)")
else:
    print("‚ÑπÔ∏è No images found in this PDF")

In [None]:
# Cell 8: Save JSON and Download
# ===============================

import json
from pathlib import Path

# Write the extraction result into the output directory
output_path = Path(OUTPUT_DIR) / "extraction_result.json"
with output_path.open("w") as f:
    json.dump(result, f, indent=2)

print(f"‚úÖ Results saved to: {output_path}")

# Echo the same JSON into the notebook output
print("\n" + "="*60, "üìÑ OUTPUT JSON", "="*60, sep="\n")
print(json.dumps(result, indent=2))

# Folder holding the extracted image files (used by both branches below)
images_dir = Path(OUTPUT_DIR) / "images"

# In Colab, push the files to the browser; elsewhere just report their location
try:
    import google.colab
    from google.colab import files
except ImportError:
    print("\n‚ÑπÔ∏è Not in Colab - files saved locally:")
    print(f"   JSON: {output_path}")
    if images_dir.exists():
        image_files = list(images_dir.iterdir())
        print(f"   Images: {len(image_files)} files in {images_dir}/")
else:
    print("\nüì• Downloading files...")

    # Download JSON
    files.download(str(output_path))

    # Zip the images only when the folder exists and is non-empty
    import shutil
    if images_dir.exists() and any(images_dir.iterdir()):
        zip_path = Path(OUTPUT_DIR) / "extracted_images"
        shutil.make_archive(str(zip_path), 'zip', str(images_dir))
        files.download(str(zip_path) + ".zip")
        print("‚úÖ Downloaded: extraction_result.json, extracted_images.zip")
    else:
        print("‚úÖ Downloaded: extraction_result.json")