In [3]:
import fitz  # PyMuPDF
import os

# Folder containing the source PDFs, and the folder that receives the
# rendered page images (a sub-folder of the input folder).
PDF_FOLDER = r"D:\Academic\Major Project\DataMorphAI Code\Dataset For initial testing"
OUTPUT_FOLDER = r"D:\Academic\Major Project\DataMorphAI Code\Dataset For initial testing\Extracted_images"

# Create the output folder up front; exist_ok=True makes re-running the
# cell harmless when the folder is already there.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Maps a PDF file name (resolved relative to PDF_FOLDER) to the page range
# to extract, as (start_page, end_page).
# Both indices are 0-based and the end page is INCLUSIVE, so (2, 8) requests
# the pages a viewer shows as 3 through 9. An end page beyond the document's
# last page is clamped by convert_pdf_pages_to_images, so fewer pages than
# requested may be saved.
pdf_page_ranges = {
    "file1.pdf": (2, 8),
    "file2.pdf": (6, 8),
    "file3.pdf": (5, 9),
    "file4.pdf": (4, 6),
    "file5.pdf": (5, 6),
    "file6.pdf": (8, 9),
    "file7.pdf": (6, 7),
    "file8.pdf": (5, 7),
    "file9.pdf": (4, 5),
    "file10.pdf": (7, 8),
    "file11.pdf": (6, 7),
    "file12.pdf": (6, 8),
    "file13.pdf": (6, 10),
    "file14.pdf": (3, 4),
}

def convert_pdf_pages_to_images(pdf_path, start_page, end_page, output_dir):
    """
    Render an inclusive, 0-indexed page range of a PDF to PNG files.

    Each page is rendered at 200 DPI and written to ``output_dir`` as
    ``<pdf base name>_page_<1-based page number>.png``. Problems are
    reported on stdout instead of being raised, so one bad file or page
    does not abort a batch run.

    Args:
        pdf_path: Path of the PDF to read.
        start_page: First page to render (0-indexed).
        end_page: Last page to render (0-indexed, inclusive). Values past
            the end of the document are clamped to the last page.
        output_dir: Directory that receives the images; created if missing.

    Returns:
        None.
    """
    # Reject malformed ranges before touching the file. PyMuPDF accepts
    # negative page numbers (counting from the end), so a negative start
    # would otherwise render the wrong pages under misleading file names.
    if start_page < 0 or end_page < start_page:
        print(f"Invalid page range ({start_page}, {end_page}) for {pdf_path}. Skipping.")
        return

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Failed to open {pdf_path}: {e}")
        return

    # try/finally guarantees the document is closed on every exit path.
    try:
        total_pages = doc.page_count

        if start_page >= total_pages:
            print(f"Start page {start_page + 1} exceeds total pages in {pdf_path} ({total_pages}). Skipping.")
            return

        # Clamp the end of the range, and say so: otherwise the caller sees
        # fewer images than requested with no explanation.
        last_page = total_pages - 1
        if end_page > last_page:
            print(f"End page {end_page + 1} exceeds total pages in {pdf_path} ({total_pages}). Stopping at page {total_pages}.")
            end_page = last_page

        # Do not rely on the caller having created the output directory.
        try:
            os.makedirs(output_dir, exist_ok=True)
        except OSError as e:
            print(f"Cannot create output directory {output_dir}: {e}")
            return

        # Loop-invariant: the base name depends only on the PDF path.
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]

        for page_num in range(start_page, end_page + 1):
            # A failure on one page is reported and the remaining pages
            # are still attempted.
            try:
                page = doc.load_page(page_num)
                pix = page.get_pixmap(dpi=200)  # 200 DPI for decent image quality

                image_name = f"{base_name}_page_{page_num + 1}.png"
                image_path = os.path.join(output_dir, image_name)

                pix.save(image_path)
                print(f"Saved page {page_num + 1} of {base_name} as {image_name}")
            except Exception as e:
                print(f"Error processing page {page_num + 1} of {pdf_path}: {e}")
    finally:
        doc.close()

def main():
    """
    Walk the configured PDF table and render each file's page range.

    For every entry of ``pdf_page_ranges`` the file is looked up inside
    ``PDF_FOLDER``; entries whose file is absent are reported and skipped,
    the rest are handed to ``convert_pdf_pages_to_images`` with
    ``OUTPUT_FOLDER`` as the destination.
    """
    for name, page_range in pdf_page_ranges.items():
        first, last = page_range
        source = os.path.join(PDF_FOLDER, name)

        if os.path.exists(source):
            # Announce the range using 1-based page numbers.
            print(f"\nProcessing {name}, pages {first + 1} to {last + 1}...")
            convert_pdf_pages_to_images(source, first, last, OUTPUT_FOLDER)
        else:
            print(f"File not found: {source}, skipping.")


# Run the batch only when executed as a script or notebook cell.
if __name__ == "__main__":
    main()



Processing file1.pdf, pages 3 to 9...
Saved page 3 of file1 as file1_page_3.png
Saved page 4 of file1 as file1_page_4.png
Saved page 5 of file1 as file1_page_5.png
Saved page 6 of file1 as file1_page_6.png
Saved page 7 of file1 as file1_page_7.png

Processing file2.pdf, pages 7 to 9...
Saved page 7 of file2 as file2_page_7.png
Saved page 8 of file2 as file2_page_8.png
Saved page 9 of file2 as file2_page_9.png

Processing file3.pdf, pages 6 to 10...
Saved page 6 of file3 as file3_page_6.png
Saved page 7 of file3 as file3_page_7.png
Saved page 8 of file3 as file3_page_8.png
Saved page 9 of file3 as file3_page_9.png
Saved page 10 of file3 as file3_page_10.png

Processing file4.pdf, pages 5 to 7...
Saved page 5 of file4 as file4_page_5.png
Saved page 6 of file4 as file4_page_6.png
Saved page 7 of file4 as file4_page_7.png

Processing file5.pdf, pages 6 to 7...
Saved page 6 of file5 as file5_page_6.png
Saved page 7 of file5 as file5_page_7.png

Processing file6.pdf, pages 9 to 10...
Saved 