In [1]:
import fitz
import io
import os
from PIL import Image

In [2]:
# Raw string so the backslashes in the Windows path are not read as escapes.
file = r'C:\Users\pandl\OneDrive\Desktop\FYP\Sarcoma.pdf'
# Open the PDF with PyMuPDF.
# NOTE(review): this handle is not closed anywhere in the visible code, and
# the name `pdf_file` is rebound by the `with fitz.open(...)` statement in the
# main block further down - confirm whether this cell is still needed.
pdf_file = fitz.open(file)

In [3]:
import os
import io
import glob
import fitz  # PyMuPDF
from PIL import Image, ImageStat

# Define the output folder using forward slashes
output_folder = "C:/Users/pandl/OneDrive/Desktop/FYP/Extracted"

# Make sure the destination exists; only announce it when it was newly created.
try:
    os.makedirs(output_folder)
except FileExistsError:
    # Something already lives at that path - nothing to create, nothing to report.
    pass
else:
    print(f"Created output folder: {output_folder}")

# Clear any existing images in the folder
def clear_existing_images(folder_path):
    """Remove all image files from the specified folder.

    A directory entry is treated as an image when its name ends with one of
    ``.jpg``, ``.jpeg``, ``.png``, ``.tiff``, ``.bmp`` or ``.gif``, compared
    case-insensitively (so ``.JPG`` and ``.Png`` are both matched). Only the
    direct children of ``folder_path`` are examined, and names starting with
    a dot are left alone.

    Problems are reported on stdout rather than raised: a file that cannot be
    deleted is skipped, and a folder that cannot be listed is treated as
    empty. A summary line with the number of removed files is always printed.

    Args:
        folder_path: Path of the directory to clean.
    """
    # Common image extensions to remove (lowercase; matching ignores case)
    image_suffixes = ('.jpg', '.jpeg', '.png', '.tiff', '.bmp', '.gif')
    file_count = 0

    # The directory is listed directly instead of being globbed, so characters
    # such as '[' or '*' in folder_path cannot be misread as wildcards.
    try:
        entry_names = sorted(os.listdir(folder_path))
    except OSError as e:
        print(f"Error listing folder {folder_path}: {e}")
        entry_names = []

    for entry_name in entry_names:
        # Dot-files were never matched by the previous '*.ext' patterns;
        # keep leaving them untouched.
        if entry_name.startswith('.'):
            continue
        if not entry_name.lower().endswith(image_suffixes):
            continue

        file_path = os.path.join(folder_path, entry_name)
        try:
            os.remove(file_path)
        except OSError as e:
            # Best effort: one locked or unremovable entry must not stop the rest.
            print(f"Error removing file {file_path}: {e}")
        else:
            file_count += 1

    print(f"Removed {file_count} existing image files from {folder_path}")

# Function to check if an image is mostly black
def is_black_image(image, threshold=5):
    """Return True if the image is mostly black.

    The image is judged by its average RGB brightness: the mean of the red,
    green and blue band means, on a 0-255 scale.

    Args:
        image: A PIL image. Images that are not already in RGB mode are
            converted to RGB for the measurement; the passed-in image itself
            is not modified.
        threshold: Percentage (0-100) of full brightness. An image whose
            average brightness is below this percentage of 255 is considered
            black.

    Returns:
        True when the average brightness is below the threshold.
    """
    # Average only real colour bands. Without this, non-luminance data would be
    # counted as brightness - e.g. the alpha band of an RGBA image or the
    # palette indices of a P image.
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
    band_means = ImageStat.Stat(rgb_image).mean
    brightness = sum(band_means) / len(band_means)  # Average brightness
    return brightness < (255 * (threshold / 100))  # Convert threshold to pixel range

def extract_images_from_pdf(pdf_file):
    """Extract images from PDF file and save non-black ones to the output folder.

    The module-level ``output_folder`` is first emptied of image files, so it
    only ever holds the result of the latest run. Every image referenced by
    every page is then extracted, decoded with PIL to check whether it is
    mostly black, and - if it is not - written to
    ``image_page<P>_img<I>.<ext>`` inside ``output_folder``, where ``P`` and
    ``I`` are 1-based page and per-page image numbers.

    The bytes written are exactly the bytes embedded in the PDF; the image is
    not re-encoded, so JPEGs are not recompressed.

    A failure on one image is reported and counted but does not stop the run.
    Progress and a final summary are printed to stdout.

    Args:
        pdf_file: An open PyMuPDF document.
    """
    # First, clear existing images - this MUST happen before extracting new images
    clear_existing_images(output_folder)
    print(f"Cleared existing images from {output_folder}")

    # Counters for the final summary
    total_images = 0
    saved_images = 0
    skipped_images = 0
    failed_images = 0

    # Process each page in the PDF (page numbers are reported 1-based)
    for page_number, page in enumerate(pdf_file, start=1):
        image_list = page.get_images()

        if image_list:
            print(f"Found {len(image_list)} images on page {page_number}")

        # Process each image on the page
        for image_index, img in enumerate(image_list, start=1):
            total_images += 1
            try:
                xref = img[0]  # first item of each entry is the image's xref
                base_image = pdf_file.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Decode only to inspect the pixels; the decoded copy is not saved
                with Image.open(io.BytesIO(image_bytes)) as pil_image:
                    mostly_black = is_black_image(pil_image)

                if mostly_black:
                    print(f"⚠️ Skipping black image: Page {page_number}, Image {image_index}")
                    skipped_images += 1
                    continue  # Skip saving black images

                # Write the embedded bytes untouched instead of re-encoding via PIL:
                # this avoids lossy recompression and formats PIL cannot write.
                image_path = os.path.join(output_folder, f"image_page{page_number}_img{image_index}.{image_ext}")
                with open(image_path, "wb") as image_out:
                    image_out.write(image_bytes)
                saved_images += 1
                print(f"✅ Saved: {image_path}")

            except Exception as e:
                # Per-image boundary: report and carry on with the next image
                failed_images += 1
                print(f"❌ Error processing image {image_index} on page {page_number}: {e}")

    # Print summary
    print("\n📊 Summary:")
    print(f"  - Total images found: {total_images}")
    print(f"  - Images saved: {saved_images}")
    print(f"  - Black images skipped: {skipped_images}")
    print(f"  - Images with errors: {failed_images}")
    print(f"\n🎉 Images saved in '{output_folder}' folder.")

# Path to your PDF file
pdf_path = "C:/Users/pandl/OneDrive/Desktop/FYP/Sarcoma.pdf"

# Main execution - this will run when the script is executed
# (skipped when the code is imported as a module, because of the __name__ guard)
if __name__ == "__main__":
    # Open the PDF file and process it. The document is used as a context
    # manager, so it is released when the block exits, even on error.
    # Note: this rebinds the module-level name `pdf_file`.
    with fitz.open(pdf_path) as pdf_file:
        extract_images_from_pdf(pdf_file)

Removed 5 existing image files from C:/Users/pandl/OneDrive/Desktop/FYP/Extracted
Cleared existing images from C:/Users/pandl/OneDrive/Desktop/FYP/Extracted
Found 3 images on page 1
✅ Saved: C:/Users/pandl/OneDrive/Desktop/FYP/Extracted\image_page1_img1.jpeg
✅ Saved: C:/Users/pandl/OneDrive/Desktop/FYP/Extracted\image_page1_img2.jpeg
✅ Saved: C:/Users/pandl/OneDrive/Desktop/FYP/Extracted\image_page1_img3.jpeg
Found 1 images on page 5
✅ Saved: C:/Users/pandl/OneDrive/Desktop/FYP/Extracted\image_page5_img1.jpeg
Found 1 images on page 7
✅ Saved: C:/Users/pandl/OneDrive/Desktop/FYP/Extracted\image_page7_img1.jpeg
Found 11 images on page 8
⚠️ Skipping black image: Page 8, Image 1
⚠️ Skipping black image: Page 8, Image 2
✅ Saved: C:/Users/pandl/OneDrive/Desktop/FYP/Extracted\image_page8_img3.jpeg
⚠️ Skipping black image: Page 8, Image 4
⚠️ Skipping black image: Page 8, Image 5
⚠️ Skipping black image: Page 8, Image 6
⚠️ Skipping black image: Page 8, Image 7
⚠️ Skipping black image: Page 8, I