## 1. Install Required Libraries

In [1]:
# %pip install pdfplumber pdf2image PyMuPDF Pillow
# %pip install "layoutparser[layoutmodels,tesseract]" torch torchvision

## 2. Import Libraries

In [2]:
import os
import fitz  # PyMuPDF
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
import numpy as np
from pathlib import Path
import json

# Output locations: everything this notebook writes lives under one root folder
OUTPUT_DIR = Path("../outputs")
DIAGRAMS_DIR = OUTPUT_DIR.joinpath("diagrams")
TEXT_DIR = OUTPUT_DIR.joinpath("text_sections")
ANNOTATED_DIR = OUTPUT_DIR.joinpath("annotated_pages")

# parents=True also creates OUTPUT_DIR itself; exist_ok=True makes re-runs harmless
for dir_path in (DIAGRAMS_DIR, TEXT_DIR, ANNOTATED_DIR):
    dir_path.mkdir(parents=True, exist_ok=True)

print("Libraries imported successfully!")

Libraries imported successfully!


## 3. Load PDF File

In [3]:
# Location of the paper to process -- edit this to point at your own copy
PDF_PATH = r"D:\Research Vision\research-vision\data\attention is all you need.pdf"

# Only open the document when the file is really there
if os.path.exists(PDF_PATH):
    print(f"PDF found: {PDF_PATH}")

    # PyMuPDF handle; it is left open because the metadata and summary
    # cells further down call len(pdf_document) to report the page count
    pdf_document = fitz.open(PDF_PATH)
    print(f"Total pages: {len(pdf_document)}")
else:
    print(f"PDF not found at {PDF_PATH}")
    print("Please download the paper and update the PDF_PATH variable.")

PDF found: D:\Research Vision\research-vision\data\attention is all you need.pdf
Total pages: 15


## 4. Method 1: Extract Diagrams Using PyMuPDF

PyMuPDF can detect embedded images and extract them directly:

In [4]:
def extract_images_pymupdf(pdf_path, output_dir):
    """
    Extract all embedded images from a PDF using PyMuPDF and write them to disk.

    Every image reference returned by ``page.get_images(full=True)`` is
    extracted and saved as ``page_<page>_img_<index>.<ext>`` inside
    ``output_dir``. Pages and image indices are numbered from 1. No
    de-duplication is done: an image listed on several pages is written once
    per page that lists it.

    Args:
        pdf_path: Path of the PDF file to open.
        output_dir: Existing directory that receives the image files
            (``str`` or ``pathlib.Path``).

    Returns:
        list[dict]: One record per saved image with the keys ``"page"``,
        ``"index"``, ``"filename"`` and ``"format"``.
    """
    output_dir = Path(output_dir)  # allow plain string paths as well
    image_list = []

    # The context manager closes the document even when extraction or a file
    # write raises part-way through, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first item of each entry is the image's xref
                base_image = doc.extract_image(xref)
                if not base_image:
                    # Defensive: nothing came back for this xref, so there is
                    # nothing to save; skip it instead of failing on a KeyError.
                    continue

                image_ext = base_image["ext"]
                image_filename = f"page_{page_num}_img_{img_index}.{image_ext}"

                with open(output_dir / image_filename, "wb") as image_file:
                    image_file.write(base_image["image"])

                image_list.append({
                    "page": page_num,
                    "index": img_index,
                    "filename": image_filename,
                    "format": image_ext
                })

                print(f"Extracted: {image_filename}")

    return image_list

# Extract images.
# Start from an empty list so that a missing PDF never leaves `extracted_images`
# undefined, or stale from an earlier run, for the cells that read it later.
extracted_images = []
if os.path.exists(PDF_PATH):
    extracted_images = extract_images_pymupdf(PDF_PATH, DIAGRAMS_DIR)
    print(f"\nTotal images extracted: {len(extracted_images)}")

Extracted: page_3_img_1.png
Extracted: page_4_img_1.png
Extracted: page_4_img_2.png

Total images extracted: 3


## 5. Method 2: Detect Figure Regions Using Layout Analysis

Use PyMuPDF's block-level layout information to find image blocks, then render each one (plus a small margin of surrounding context) as a cropped PNG:

In [5]:
def detect_and_extract_figures(pdf_path, output_dir, margin=10, zoom=2):
    """
    Detect image blocks on every page and save each one as a cropped PNG.

    The page layout is read with ``page.get_text("dict")``; every block whose
    ``"type"`` is 1 (an image block) is rendered, together with a margin around
    it, to ``figure_page_<page>_block_<n>.png`` inside ``output_dir``. Pages
    and figures are numbered from 1, figures per page.

    Args:
        pdf_path: Path of the PDF file to open.
        output_dir: Existing directory that receives the PNG files
            (``str`` or ``pathlib.Path``).
        margin: Extra space, in page units, added on every side of the block
            before cropping. The crop is clamped to the page size.
        zoom: Scale factor applied on both axes when rendering (2 = twice the
            default resolution).

    Returns:
        list[dict]: One record per saved figure with the keys ``"page"``,
        ``"block"``, ``"filename"`` and ``"bbox"`` (the block's own bounding
        box, without the margin).
    """
    output_dir = Path(output_dir)  # allow plain string paths as well
    figures = []
    mat = fitz.Matrix(zoom, zoom)  # loop-invariant, so build it once

    # The context manager closes the document even if rendering or saving fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            page_rect = page.rect
            figure_count = 0

            # Only the cropped regions are rendered; a full-page pixmap is not
            # needed here and would be wasted work on every page.
            for block in page.get_text("dict")["blocks"]:
                # Type 1 = image block
                if block["type"] != 1:
                    continue

                bbox = block["bbox"]

                # Grow the box by the margin, but never beyond the page
                clip_bbox = fitz.Rect(
                    max(0, bbox[0] - margin),
                    max(0, bbox[1] - margin),
                    min(page_rect.width, bbox[2] + margin),
                    min(page_rect.height, bbox[3] + margin)
                )
                if clip_bbox.is_empty:
                    # Block lies outside the page area: nothing to render
                    continue

                pix_clip = page.get_pixmap(matrix=mat, clip=clip_bbox)

                figure_count += 1
                filename = f"figure_page_{page_num}_block_{figure_count}.png"
                pix_clip.save(str(output_dir / filename))

                figures.append({
                    "page": page_num,
                    "block": figure_count,
                    "filename": filename,
                    "bbox": [bbox[0], bbox[1], bbox[2], bbox[3]]
                })

                print(f"Extracted figure: {filename}")

    return figures

# Extract figures with layout detection.
# Start from an empty list so that a missing PDF never leaves `detected_figures`
# undefined, or stale from an earlier run, for the cells that read it later.
detected_figures = []
if os.path.exists(PDF_PATH):
    detected_figures = detect_and_extract_figures(PDF_PATH, DIAGRAMS_DIR)
    print(f"\nTotal figures detected: {len(detected_figures)}")

Extracted figure: figure_page_3_block_1.png
Extracted figure: figure_page_4_block_1.png
Extracted figure: figure_page_4_block_2.png

Total figures detected: 3


## 6. Extract Text Content (Excluding Figure Regions)

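Extract the text of every text block on each page into a per-page file. Image blocks are not read for text; they are only counted and reported as figure regions: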

In [6]:
def extract_text_excluding_figures(pdf_path, output_dir):
    """
    Write the text of each page to its own file and count the page's image blocks.

    For every page, the lines of all text blocks (block ``"type"`` 0) are
    collected, whitespace-only lines are dropped, and the result is written to
    ``page_<page>_text.txt`` in ``output_dir`` (UTF-8, one line of the PDF per
    line of the file). Image blocks (``"type"`` 1) contribute no text; their
    bounding boxes are only counted.

    NOTE(review): despite the name, text blocks are not filtered by position,
    so a text block that overlaps an image block is still written out.

    Args:
        pdf_path: Path of the PDF file to open.
        output_dir: Existing directory that receives the text files
            (``str`` or ``pathlib.Path``).

    Returns:
        list[dict]: One record per page with the keys ``"page"`` (1-based),
        ``"filename"``, ``"text_blocks"`` (number of non-blank text *lines*
        written) and ``"figure_regions"`` (number of image blocks).
    """
    output_dir = Path(output_dir)  # allow plain string paths as well
    text_data = []

    # The context manager closes the document even if a file write fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            page_text = []
            figure_regions = []

            for block in page.get_text("dict")["blocks"]:
                if block["type"] == 0:  # Text block
                    for line in block.get("lines", []):
                        # A line is the concatenation of its spans' text
                        line_text = "".join(
                            span.get("text", "") for span in line.get("spans", [])
                        )
                        if line_text.strip():
                            page_text.append(line_text)

                elif block["type"] == 1:  # Image block
                    figure_regions.append(block["bbox"])

            # Save text for this page
            text_filename = f"page_{page_num}_text.txt"
            with open(output_dir / text_filename, "w", encoding="utf-8") as f:
                f.write("\n".join(page_text))

            text_data.append({
                "page": page_num,
                "filename": text_filename,
                "text_blocks": len(page_text),
                "figure_regions": len(figure_regions)
            })

            print(f"Page {page_num}: {len(page_text)} text blocks, {len(figure_regions)} figures")

    return text_data

# Extract text.
# Start from an empty list so that a missing PDF never leaves `text_extraction`
# undefined, or stale from an earlier run, for the cells that read it later.
text_extraction = []
if os.path.exists(PDF_PATH):
    text_extraction = extract_text_excluding_figures(PDF_PATH, TEXT_DIR)
    print(f"\nText extracted from {len(text_extraction)} pages")

Page 1: 56 text blocks, 0 figures
Page 2: 53 text blocks, 0 figures
Page 3: 27 text blocks, 1 figures
Page 4: 43 text blocks, 2 figures
Page 5: 56 text blocks, 0 figures
Page 6: 64 text blocks, 0 figures
Page 7: 51 text blocks, 0 figures
Page 8: 87 text blocks, 0 figures
Page 9: 149 text blocks, 0 figures
Page 10: 76 text blocks, 0 figures
Page 11: 47 text blocks, 0 figures
Page 12: 43 text blocks, 0 figures
Page 13: 72 text blocks, 0 figures
Page 14: 112 text blocks, 0 figures
Page 15: 112 text blocks, 0 figures

Text extracted from 15 pages


## 7. Create Annotated Visualization

Create annotated page images showing where diagrams and text are located:

In [7]:
def create_annotated_pages(pdf_path, output_dir, zoom=2):
    """
    Render every page and outline its text and image blocks.

    Each page is rendered at ``zoom`` times the default resolution, the
    bounding box of every text block (``"type"`` 0) is outlined in blue, every
    image block (``"type"`` 1) in red, and the result is saved as
    ``annotated_page_<page>.png`` (pages numbered from 1) in ``output_dir``.

    Args:
        pdf_path: Path of the PDF file to open.
        output_dir: Existing directory that receives the PNG files
            (``str`` or ``pathlib.Path``).
        zoom: Render scale on both axes. The same factor is used to map block
            coordinates onto the rendered image, so the two cannot drift apart.

    Returns:
        None. The function only writes files and prints progress.
    """
    output_dir = Path(output_dir)  # allow plain string paths as well
    mat = fitz.Matrix(zoom, zoom)  # loop-invariant, so build it once

    # The context manager closes the document even if rendering or saving fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Render without an alpha channel so the buffer is plain RGB,
            # which is what Image.frombytes("RGB", ...) expects below.
            pix = page.get_pixmap(matrix=mat, alpha=False)

            # Convert to PIL Image
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            draw = ImageDraw.Draw(img)

            for block in page.get_text("dict")["blocks"]:
                # Block coordinates are in page units; scale them to pixels
                scaled_bbox = [coord * zoom for coord in block["bbox"]]

                if block["type"] == 0:  # Text
                    draw.rectangle(scaled_bbox, outline="blue", width=2)
                elif block["type"] == 1:  # Image/Figure
                    draw.rectangle(scaled_bbox, outline="red", width=4)

            # Save annotated image
            annotated_filename = f"annotated_page_{page_num}.png"
            img.save(output_dir / annotated_filename)
            print(f"Created: {annotated_filename}")

# Draw the block overlays for every page, but only if the source PDF is present
if os.path.exists(PDF_PATH):
    create_annotated_pages(pdf_path=PDF_PATH, output_dir=ANNOTATED_DIR)
    print("\nAnnotated pages created (Red=Figures, Blue=Text)")

Created: annotated_page_1.png
Created: annotated_page_2.png
Created: annotated_page_3.png
Created: annotated_page_4.png
Created: annotated_page_5.png
Created: annotated_page_6.png
Created: annotated_page_7.png
Created: annotated_page_8.png
Created: annotated_page_9.png
Created: annotated_page_10.png
Created: annotated_page_11.png
Created: annotated_page_12.png
Created: annotated_page_13.png
Created: annotated_page_14.png
Created: annotated_page_15.png

Annotated pages created (Red=Figures, Blue=Text)


## 8. Save Metadata

Save extraction metadata for reference:

In [8]:
# Compile all metadata.
# Results from earlier cells are looked up in globals() with an empty-list
# fallback, so a skipped extraction cell yields [] instead of a NameError.
# The page count additionally requires that `pdf_document` was actually opened:
# the PDF may exist now without the loading cell having been (re)run.
metadata = {
    "pdf_path": PDF_PATH,
    "total_pages": (
        len(pdf_document)
        if os.path.exists(PDF_PATH) and "pdf_document" in globals()
        else 0
    ),
    "extracted_images": globals().get("extracted_images", []),
    "detected_figures": globals().get("detected_figures", []),
    "text_extraction": globals().get("text_extraction", []),
    "output_directories": {
        "diagrams": str(DIAGRAMS_DIR),
        "text": str(TEXT_DIR),
        "annotated": str(ANNOTATED_DIR)
    }
}

# Save metadata
metadata_path = OUTPUT_DIR / "extraction_metadata.json"
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print(f"Metadata saved to: {metadata_path}")

Metadata saved to: ..\outputs\extraction_metadata.json


## 9. Summary and Next Steps

### Output Structure:
- **Diagrams**: `outputs/diagrams/` - All extracted figures and diagrams
- **Text**: `outputs/text_sections/` - Text content per page
- **Annotated**: `outputs/annotated_pages/` - Visual representation of regions

### Key Features:
1. **Embedded Image Extraction**: Direct extraction of images from PDF
2. **Layout-Based Detection**: Identifies figure regions using PyMuPDF blocks
3. **Text Separation**: Writes the text blocks of each page to per-page files and counts image blocks separately
4. **Visual Annotation**: Color-coded visualization of content regions

### Next Steps:
- Fine-tune extraction parameters for better accuracy
- Add OCR for text within diagrams if needed
- Use LayoutParser for more advanced document understanding
- Implement figure caption extraction and linking

In [9]:
# Print a short report of what the previous cells produced
if not os.path.exists(PDF_PATH):
    print("Please set the correct PDF_PATH and run the cells again.")
else:
    # One print call; sep="\n" puts every item on its own line
    print(
        "=" * 60,
        "EXTRACTION SUMMARY",
        "=" * 60,
        f"PDF: {PDF_PATH}",
        f"Total Pages: {len(pdf_document)}",
        f"Embedded Images: {len(extracted_images)}",
        f"Detected Figures: {len(detected_figures)}",
        f"Text Pages: {len(text_extraction)}",
        "=" * 60,
        f"\nOutputs saved to: {OUTPUT_DIR}",
        sep="\n",
    )

EXTRACTION SUMMARY
PDF: D:\Research Vision\research-vision\data\attention is all you need.pdf
Total Pages: 15
Embedded Images: 3
Detected Figures: 3
Text Pages: 15

Outputs saved to: ..\outputs
