# Detectron2 Figure Extractor

**Purpose:** Detect and extract figures from PDFs using Detectron2 + LayoutParser.

**Python:** 3.10 (Colab default)

**Flow:** PDF → Render pages → Detectron2 figure detection → Crop → GPT-4o Vision descriptions

---


## Cell 1: Install Dependencies

⚠️ **This cell takes ~3-5 minutes** (Detectron2 compilation)


In [None]:
# System dependencies
!apt-get update -qq && apt-get install -y -qq poppler-utils

# Python packages
%pip install -q pdf2image Pillow openai

# PyTorch (Colab usually has it, ensure CUDA 11.8 version)
%pip install -q torch torchvision --index-url https://download.pytorch.org/whl/cu118

# Detectron2 from source
!python -m pip install -q 'git+https://github.com/facebookresearch/detectron2.git'

# LayoutParser with Detectron2 support
%pip install -q "layoutparser[layoutmodels]"

print("‚úÖ Dependencies installed - restart runtime if prompted")


## Cell 2: Configuration


In [None]:
import os
from pathlib import Path

# === CONFIGURATION ===
# NOTE: a key typed here is saved together with the notebook; supplying it
# through the OPENAI_API_KEY environment variable avoids that.
OPENAI_API_KEY = ""  # Set your key here or use env var
PDF_PATH = "sample.pdf"  # Will be set after upload
OUTPUT_DIR = "detectron_output"
RENDER_DPI = 150  # Higher = better quality but slower

# Export the key only when one was entered, so a key that is already present
# in the environment is left untouched.
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Create the output tree in one call; parents=True creates OUTPUT_DIR itself
# and also makes a nested OUTPUT_DIR (e.g. "runs/detectron_output") work.
output_path = Path(OUTPUT_DIR)
(output_path / "figures").mkdir(parents=True, exist_ok=True)

print(f"📁 Output directory: {output_path.absolute()}")


## Cell 3: Upload PDF (Colab)


In [None]:
from google.colab import files

print("📤 Upload your PDF:")
uploaded = files.upload()  # dict keyed by the uploaded file names

if uploaded:
    # Prefer a file that actually looks like a PDF; fall back to the first
    # uploaded file so a single upload behaves exactly as before.
    pdf_names = [name for name in uploaded if name.lower().endswith(".pdf")]
    PDF_PATH = pdf_names[0] if pdf_names else next(iter(uploaded))
    print(f"✅ Using: {PDF_PATH}")
else:
    print("⚠️ No file uploaded")


## Cell 4: Initialize Detectron2 Model


In [None]:
from layoutparser.models import Detectron2LayoutModel

# Load PubLayNet model (trained on academic papers)
# Labels: Text, Title, List, Table, Figure
SCORE_THRESHOLD = 0.5  # value passed as MODEL.ROI_HEADS.SCORE_THRESH_TEST
PUBLAYNET_LABELS = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}

print("🔄 Loading Detectron2 model (first run downloads ~350MB)...")

model = Detectron2LayoutModel(
    config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", SCORE_THRESHOLD],
    label_map=PUBLAYNET_LABELS,
)

print("✅ Detectron2 model loaded")


## Cell 5: Detect & Crop Figures


In [None]:
from pdf2image import convert_from_path
from PIL import Image
import numpy as np

def detect_figures(
    pdf_path: str,
    model,
    output_dir: Path,
    dpi: int = 150,
    min_area_ratio: float = 0.01,
) -> list[dict]:
    """Detect figures in PDF using Detectron2, crop and save them.

    Args:
        pdf_path: Path of the PDF to process.
        model: Layout model exposing ``detect(image)``. Each returned block is
            read through ``.type``, ``.score`` and ``.block.x_1/y_1/x_2/y_2``;
            only blocks whose type is ``"Figure"`` are kept.
        output_dir: Base output directory; crops are written to
            ``output_dir / "figures"``.
        dpi: Resolution used to render the PDF pages.
        min_area_ratio: Detections covering less than this fraction of the
            page area are skipped.

    Returns:
        One dict per saved figure with the keys ``id``, ``page`` (1-based),
        ``bbox`` (``[x1, y1, x2, y2]`` divided by page width/height),
        ``confidence``, ``file_path`` and ``description`` (always ``None``
        here; it is filled in later).
    """
    figures = []
    figures_dir = output_dir / "figures"
    # Do not rely on the caller having created the directory beforehand.
    figures_dir.mkdir(parents=True, exist_ok=True)

    # Render PDF pages
    print(f"📄 Rendering PDF at {dpi} DPI...")
    pages = convert_from_path(pdf_path, dpi=dpi)
    print(f"   Found {len(pages)} pages")

    for page_num, page_img in enumerate(pages, start=1):
        print(f"   Processing page {page_num}...")

        width, height = page_img.size

        # Run layout detection
        layout = model.detect(np.array(page_img))

        # Filter for figures only
        figure_blocks = [b for b in layout if b.type == "Figure"]

        for idx, block in enumerate(figure_blocks, start=1):
            # Clamp the box to the page: a box that overshoots the edge would
            # otherwise yield a crop padded beyond the page and a normalised
            # bbox outside [0, 1].
            x1 = min(max(int(block.block.x_1), 0), width)
            y1 = min(max(int(block.block.y_1), 0), height)
            x2 = min(max(int(block.block.x_2), 0), width)
            y2 = min(max(int(block.block.y_2), 0), height)

            # Skip empty or inverted boxes explicitly; with both sides
            # inverted the area product below would come out positive.
            if x2 <= x1 or y2 <= y1:
                continue

            # Skip tiny detections
            area_ratio = ((x2 - x1) * (y2 - y1)) / (width * height)
            if area_ratio < min_area_ratio:
                continue

            # Crop and save
            cropped = page_img.crop((x1, y1, x2, y2))
            fig_id = f"fig_p{page_num:02d}_{idx:02d}"
            file_path = figures_dir / f"{fig_id}.png"
            cropped.save(file_path, "PNG")

            # Plain float keeps the record JSON-serialisable whatever numeric
            # type the model reports for the score.
            score = float(block.score)

            figures.append({
                "id": fig_id,
                "page": page_num,
                "bbox": [x1 / width, y1 / height, x2 / width, y2 / height],
                "confidence": round(score, 3),
                "file_path": str(file_path),
                "description": None,
            })
            print(f"      ✓ {fig_id} (conf: {score:.2f})")

    return figures

# Run detection
# Fail early with a readable message when nothing was uploaded and the default
# PDF_PATH does not exist, instead of a low-level error from the PDF renderer.
if not Path(PDF_PATH).is_file():
    raise FileNotFoundError(
        f"PDF not found: {PDF_PATH!r} - upload a file or set PDF_PATH"
    )

detected_figures = detect_figures(PDF_PATH, model, output_path, dpi=RENDER_DPI)
print(f"\n🎯 Total figures detected: {len(detected_figures)}")


## Cell 6: GPT-4o Vision Descriptions


import base64
from openai import OpenAI

def encode_image_base64(image_path: str) -> str:
    """Return the contents of the file at *image_path* as base64 text."""
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    # Base64 output is pure ASCII, so this yields the same text as UTF-8.
    return encoded.decode("ascii")

def describe_figure(
    client: OpenAI,
    image_path: str,
    *,
    model: str = "gpt-4o",
    max_tokens: int = 150,
) -> str:
    """Get GPT-4o Vision description of a figure.

    Args:
        client: OpenAI client used for the chat completion request.
        image_path: Path of the image file; it is sent inline as a base64
            ``data:image/png`` URL.
        model: Chat model to query (keyword-only).
        max_tokens: Upper bound on the length of the reply (keyword-only).

    Returns:
        The description with surrounding whitespace removed, or ``""`` when
        the reply carries no text content.
    """
    base64_img = encode_image_base64(image_path)

    response = client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this figure concisely in 1-2 sentences. Focus on what it shows (chart type, data, diagram elements)."},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_img}", "detail": "low"}}
            ]
        }],
        max_tokens=max_tokens,
    )
    # The message content may be None (e.g. when the model refuses); calling
    # .strip() on it directly would raise AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""

# Generate descriptions
if detected_figures and os.environ.get("OPENAI_API_KEY"):
    print("🤖 Generating GPT-4o Vision descriptions...\n")
    client = OpenAI()

    for fig in detected_figures:
        # flush so the figure id is visible while the request is in flight
        print(f"   {fig['id']}...", end=" ", flush=True)
        try:
            fig["description"] = describe_figure(client, fig["file_path"])
        except Exception as e:
            # Best effort: one failed request must not abort the remaining
            # figures. The error text is stored in place of the description
            # and also shown, so the failure is visible in the cell output.
            fig["description"] = f"Error: {e}"
            print(f"✗ ({e})")
        else:
            print("✓")

    print("\n✅ Descriptions complete")
else:
    print("⚠️ Skipping descriptions (no API key or no figures)")


## Cell 7: Save Results


In [None]:
import json
from datetime import datetime

# Build output JSON
result = {
    "metadata": {
        "source_file": PDF_PATH,
        "extraction_method": "detectron2_layoutparser",
        "model": "PubLayNet/faster_rcnn_R_50_FPN_3x",
        # astimezone() attaches the local UTC offset, so the timestamp stays
        # unambiguous when the file is read on another machine.
        "timestamp": datetime.now().astimezone().isoformat(),
        "figures_detected": len(detected_figures)
    },
    "figures": detected_figures
}

# Save JSON
# Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII characters in the
# descriptions readable in the file, independent of the platform default.
output_json = output_path / "detectron_figures.json"
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

print(f"💾 Saved: {output_json}")

# Display results
PREVIEW_CHARS = 100
print("\n" + "="*50)
for fig in detected_figures:
    print(f"\n📷 {fig['id']} (page {fig['page']}, conf: {fig['confidence']})")
    description = fig["description"]
    if description:
        # Add the ellipsis only when text was actually cut off.
        if len(description) > PREVIEW_CHARS:
            description = f"{description[:PREVIEW_CHARS]}..."
        print(f"   → {description}")


## Cell 8: Download (Colab)


In [None]:
import shutil
from google.colab import files

# Zip and download
# The archive name follows output_path (the "<output dir>.zip" file is created
# next to the directory), so it stays in sync if OUTPUT_DIR is changed.
zip_path = shutil.make_archive(str(output_path), "zip", root_dir=output_path)
files.download(zip_path)
print("✅ Download started")
