In [1]:
import os
import pdfplumber
from pdf2image import convert_from_path

In [2]:
def extract_images_from_folder(input_folder, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # List all PDF files in the input folder
    pdf_files = [f for f in os.listdir(input_folder) if f.endswith('.pdf')]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)
        pdf_name = os.path.splitext(pdf_file)[0]

        images_info = extract_images_info(pdf_path)

        # Create a subfolder for each PDF file
        pdf_output_folder = os.path.join(output_folder, pdf_name)
        if not os.path.exists(pdf_output_folder):
            os.makedirs(pdf_output_folder)

        for page_num, image_info in enumerate(images_info, start=1):
            x, y, width, height = image_info['x'], image_info['y'], image_info['width'], image_info['height']

            images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)

            for img_num, img in enumerate(images, start=1):
                img.save(os.path.join(pdf_output_folder, f"page_{page_num}_image_{img_num}.png"), "PNG")

def extract_images_info(pdf_path):
    images_info = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for img in page.images:
                image_info = {
                    'page_number': page_num,
                    'x': img['x'],
                    'y': img['y'],
                    'width': img['width'],
                    'height': img['height'],
                }
                images_info.append(image_info)

    return images_info

In [3]:
# Specify the input and output folders
input_folder = 'PDF'
output_folder = 'Image'

extract_images_from_folder(input_folder, output_folder)

KeyError: 'x'

In [4]:
import os
import pdfplumber
from pdf2image import convert_from_path

def extract_images_from_folder(input_folder, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # List all PDF files in the input folder
    pdf_files = [f for f in os.listdir(input_folder) if f.endswith('.pdf')]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)
        pdf_name = os.path.splitext(pdf_file)[0]

        images_info = extract_images_info(pdf_path)

        # Create a subfolder for each PDF file
        pdf_output_folder = os.path.join(output_folder, pdf_name)
        if not os.path.exists(pdf_output_folder):
            os.makedirs(pdf_output_folder)

        for page_num, image_info in enumerate(images_info, start=1):
            page_image = image_info['image']
            image_path = os.path.join(pdf_output_folder, f"page_{page_num}.png")
            page_image.save(image_path, "PNG")

def extract_images_info(pdf_path):
    images_info = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            images_on_page = page.images
            for img_num, img in enumerate(images_on_page, start=1):
                image_info = {
                    'page_number': page_num,
                    'image': page.to_image(resolution=300),  # Convert the page to an image
                }
                images_info.append(image_info)

    return images_info

# Specify the input and output folders
input_folder = 'PDF'
output_folder = 'Image'

extract_images_from_folder(input_folder, output_folder)


In [5]:
import fitz  # PyMuPDF
import os

def extract_images_from_folder(input_folder, output_folder):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # List all PDF files in the input folder
    pdf_files = [f for f in os.listdir(input_folder) if f.endswith('.pdf')]

    for pdf_file in pdf_files:
        pdf_path = os.path.join(input_folder, pdf_file)
        pdf_name = os.path.splitext(pdf_file)[0]

        extract_images_from_pdf(pdf_path, output_folder, pdf_name)

def extract_images_from_pdf(pdf_file_path, output_folder, pdf_name):
    pdf_document = fitz.open(pdf_file_path)
    for page_num in range(len(pdf_document)):
        page = pdf_document[page_num]
        image_list = page.get_images(full=True)

        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_data = base_image["image"]

            # Save the image as a PNG file
            image_filename = os.path.join(output_folder, f"{pdf_name}_page_{page_num + 1}_img_{img_index + 1}.png")
            with open(image_filename, "wb") as image_file:
                image_file.write(image_data)

# Specify the input PDF folder and the output folder
input_folder = 'PDF'
output_folder = 'IMAGE'

extract_images_from_folder(input_folder, output_folder)
