# PDF Text and Image Extraction

This notebook is designed to perform text extraction from all pages of specific PDFs, using PyMuPDF.
It extracts text as shown in the PDFs, with extraneous whitespace removed and ignores .72 inches of border on each page.
Additionally, it extracts images larger than 200 pixels in any dimension or larger than 100kb in size.

The extracted text and images are saved in the same directory as the PDFs.


In [1]:
import fitz  # PyMuPDF
import os


In [2]:
def extract_text_and_images(pdf_path):
    doc = fitz.open(pdf_path)
    text_folder = os.path.splitext(pdf_path)[0] + '_text'
    image_folder = os.path.splitext(pdf_path)[0] + '_images'
    os.makedirs(text_folder, exist_ok=True)
    os.makedirs(image_folder, exist_ok=True)
    
    for page_num, page in enumerate(doc):
        # Extract text, ignoring .72 inches of border
        text = page.get_textbox(rect=fitz.Rect(72 * 0.72, 72 * 0.72, page.rect.width - (72 * 0.72), page.rect.height - (72 * 0.72)))
        text_file_path = os.path.join(text_folder, f'page_{page_num}.txt')
        with open(text_file_path, 'w') as f:
            f.write(text.strip())
        
        # Extract images
        image_list = page.get_images(full=True)
        for image_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image = base_image["image"]
            # Check if image meets size criteria
            if base_image["width"] > 200 or base_image["height"] > 200 or len(image) > 100 * 1024:
                image_path = os.path.join(image_folder, f'page_{page_num}_image_{image_index}.png')
                with open(image_path, 'wb') as img_file:
                    img_file.write(image)


In [3]:
for pdf_path in pdf_paths:
    extract_text_and_images(pdf_path)
    print(f'Completed extraction for: {pdf_path}')


NameError: name 'pdf_paths' is not defined