In [4]:
import fitz  # PyMuPDF
import os

pdf_path = "/Users/bigo/Projects/timbergem/data/Gascoigne_CC_tiny2.pdf"
output_dir = "data"

os.makedirs(output_dir, exist_ok=True)

doc = fitz.open(pdf_path)
page = doc[0]  # first page

# Debug: Check page properties
print(f"Page rotation: {page.rotation}")
print(f"Page rect: {page.rect}")
print(f"Page CropBox: {page.cropbox}")
print(f"Page MediaBox: {page.mediabox}")
print(f"Page transformation matrix: {page.transformation_matrix}")

img_count = 0
for img in page.get_images(full=True):
    xref = img[0]
    print(f"\n--- Image {img_count} ---")
    print(f"Image info: {img}")
    print(f"Image xref: {xref}")
    
    pix = fitz.Pixmap(doc, xref)
    
    # Debug: Check pixmap properties
    print(f"Pixmap size: {pix.width} x {pix.height}")
    print(f"Pixmap n (components): {pix.n}")
    print(f"Pixmap alpha: {pix.alpha}")
    print(f"Pixmap colorspace: {pix.colorspace}")
    print(f"Pixmap stride: {pix.stride}")
    
    # Get image transformation matrix from page
    image_dict = doc.extract_image(xref)
    print(f"Image dictionary keys: {image_dict.keys()}")
    print(f"Image ext: {image_dict.get('ext', 'unknown')}")
    
    img_ext = "png" if pix.alpha else "jpg"
    img_filename = f"{output_dir}/page1_img{img_count}.{img_ext}"
    
    if pix.n < 5:  # GRAY or RGB
        pix.save(img_filename)
    else:  # CMYK: convert to RGB first
        pix1 = fitz.Pixmap(fitz.csRGB, pix)
        pix1.save(img_filename)
        pix1 = None
    pix = None
    img_count += 1

print(f"\nExtracted {img_count} images")
doc.close()

Page rotation: 0
Page rect: Rect(0.0, 0.0, 2592.0, 1728.0)
Page CropBox: Rect(0.0, 0.0, 2592.0, 1728.0)
Page MediaBox: Rect(0.0, 0.0, 2592.0, 1728.0)
Page transformation matrix: Matrix(1.0, 0.0, 0.0, -1.0, 0.0, 1728.0)

--- Image 0 ---
Image info: (7, 0, 908, 294, 8, 'ICCBased', '', 'Im1', 'FlateDecode', 0)
Image xref: 7
Pixmap size: 908 x 294
Pixmap n (components): 3
Pixmap alpha: 0
Pixmap colorspace: Colorspace(CS_RGB) - ICCBased(RGB,sRGB IEC61966-2.1)
Pixmap stride: 2724
Image dictionary keys: dict_keys(['width', 'height', 'ext', 'colorspace', 'xres', 'yres', 'bpc', 'size', 'image', 'smask', 'cs-name'])
Image ext: png

--- Image 1 ---
Image info: (8, 0, 901, 302, 8, 'ICCBased', '', 'Im2', 'FlateDecode', 0)
Image xref: 8
Pixmap size: 901 x 302
Pixmap n (components): 3
Pixmap alpha: 0
Pixmap colorspace: Colorspace(CS_RGB) - ICCBased(RGB,sRGB IEC61966-2.1)
Pixmap stride: 2703
Image dictionary keys: dict_keys(['width', 'height', 'ext', 'colorspace', 'xres', 'yres', 'bpc', 'size', 'image

In [5]:
# Enhanced debugging for image orientation issues
import fitz  # PyMuPDF
import os

pdf_path = "/Users/bigo/Projects/timbergem/data/Gascoigne_CC_tiny2.pdf"
output_dir = "data"

os.makedirs(output_dir, exist_ok=True)

doc = fitz.open(pdf_path)
page = doc[0]  # first page

# Extended debug information
print("=== PAGE ANALYSIS ===")
print(f"Page rotation: {page.rotation} degrees")
print(f"Page rect: {page.rect}")
print(f"Page CropBox: {page.cropbox}")
print(f"Page MediaBox: {page.mediabox}")
print(f"Page transformation matrix: {page.transformation_matrix}")

# Check if page is rotated
if page.rotation != 0:
    print(f"⚠️  Page is rotated by {page.rotation} degrees - this may cause image orientation issues!")

img_count = 0
for img in page.get_images(full=True):
    xref = img[0]
    print(f"\n=== IMAGE {img_count} ANALYSIS ===")
    print(f"Image info tuple: {img}")
    print(f"Image xref: {xref}")
    
    # Get the image rectangle and transformation on the page
    img_rect = page.get_image_rects(xref)[0] if page.get_image_rects(xref) else None
    print(f"Image rectangle on page: {img_rect}")
    
    # Try to get image transformation matrix
    try:
        img_transforms = page.get_image_transforms(xref)
        print(f"Image transforms: {img_transforms}")
        for i, transform in enumerate(img_transforms):
            print(f"  Transform {i}: {transform}")
    except Exception as e:
        print(f"Could not get image transforms: {e}")
    
    pix = fitz.Pixmap(doc, xref)
    
    # Enhanced pixmap debugging
    print(f"Pixmap dimensions: {pix.width} x {pix.height}")
    print(f"Pixmap components: {pix.n}")
    print(f"Pixmap has alpha: {pix.alpha}")
    print(f"Pixmap colorspace: {pix.colorspace}")
    print(f"Pixmap stride: {pix.stride}")
    
    # Get raw image data info
    image_dict = doc.extract_image(xref)
    print(f"Raw image format: {image_dict.get('ext', 'unknown')}")
    print(f"Raw image width: {image_dict.get('width', 'unknown')}")
    print(f"Raw image height: {image_dict.get('height', 'unknown')}")
    
    # Check if raw dimensions match pixmap dimensions
    raw_width = image_dict.get('width', 0)
    raw_height = image_dict.get('height', 0)
    if raw_width and raw_height:
        if raw_width != pix.width or raw_height != pix.height:
            print(f"⚠️  Dimension mismatch! Raw: {raw_width}x{raw_height}, Pixmap: {pix.width}x{pix.height}")
    
    # Save original image
    img_ext = "png" if pix.alpha else "jpg"
    img_filename = f"{output_dir}/page1_img{img_count}_original.{img_ext}"
    
    if pix.n < 5:  # GRAY or RGB
        pix.save(img_filename)
    else:  # CMYK: convert to RGB first
        pix1 = fitz.Pixmap(fitz.csRGB, pix)
        pix1.save(img_filename)
        pix1 = None
    
    print(f"Saved original image as: {img_filename}")
    
    pix = None
    img_count += 1

print(f"\n=== SUMMARY ===")
print(f"Total images extracted: {img_count}")
print(f"Page rotation: {page.rotation} degrees")
doc.close()

=== PAGE ANALYSIS ===
Page rotation: 0 degrees
Page rect: Rect(0.0, 0.0, 2592.0, 1728.0)
Page CropBox: Rect(0.0, 0.0, 2592.0, 1728.0)
Page MediaBox: Rect(0.0, 0.0, 2592.0, 1728.0)
Page transformation matrix: Matrix(1.0, 0.0, 0.0, -1.0, 0.0, 1728.0)

=== IMAGE 0 ANALYSIS ===
Image info tuple: (7, 0, 908, 294, 8, 'ICCBased', '', 'Im1', 'FlateDecode', 0)
Image xref: 7
Image rectangle on page: Rect(193.44000244140625, 1280.6400146484375, 1101.840087890625, 1575.3599853515625)
Could not get image transforms: 'Page' object has no attribute 'get_image_transforms'
Pixmap dimensions: 908 x 294
Pixmap components: 3
Pixmap has alpha: 0
Pixmap colorspace: Colorspace(CS_RGB) - ICCBased(RGB,sRGB IEC61966-2.1)
Pixmap stride: 2724
Raw image format: png
Raw image width: 908
Raw image height: 294
Saved original image as: data/page1_img0_original.jpg

=== IMAGE 1 ANALYSIS ===
Image info tuple: (8, 0, 901, 302, 8, 'ICCBased', '', 'Im2', 'FlateDecode', 0)
Image xref: 8
Image rectangle on page: Rect(1144.07

In [6]:
# Potential fixes for upside-down image extraction
import fitz  # PyMuPDF
import os
from PIL import Image

pdf_path = "/Users/bigo/Projects/timbergem/data/Gascoigne_CC_tiny2.pdf"
output_dir = "data"

os.makedirs(output_dir, exist_ok=True)

doc = fitz.open(pdf_path)
page = doc[0]  # first page

print("=== TESTING ORIENTATION FIXES ===")

img_count = 0
for img in page.get_images(full=True):
    xref = img[0]
    print(f"\n--- Processing Image {img_count} ---")
    
    # Method 1: Extract using PyMuPDF's get_pixmap with page rotation
    pix = fitz.Pixmap(doc, xref)
    
    # Save original first
    img_ext = "png" if pix.alpha else "jpg"
    original_filename = f"{output_dir}/page1_img{img_count}_method1_original.{img_ext}"
    
    if pix.n < 5:  # GRAY or RGB
        pix.save(original_filename)
    else:  # CMYK: convert to RGB first
        pix1 = fitz.Pixmap(fitz.csRGB, pix)
        pix1.save(original_filename)
        pix1 = None
    
    print(f"Method 1 (Original): {original_filename}")
    
    # Method 2: Apply page rotation correction
    if page.rotation != 0:
        print(f"Page rotation detected: {page.rotation} degrees")
        # For upside-down images, we might need to rotate 180 degrees
        rotated_filename = f"{output_dir}/page1_img{img_count}_method2_rotated.{img_ext}"
        
        # Use PIL to rotate the image
        pil_img = Image.open(original_filename)
        
        # Try different rotation angles based on page rotation
        if page.rotation == 180:
            rotated_img = pil_img.rotate(180, expand=True)
        elif page.rotation == 90:
            rotated_img = pil_img.rotate(-90, expand=True)
        elif page.rotation == 270:
            rotated_img = pil_img.rotate(90, expand=True)
        else:
            # If images are upside down, try 180 degree rotation
            rotated_img = pil_img.rotate(180, expand=True)
        
        rotated_img.save(rotated_filename)
        print(f"Method 2 (Rotated): {rotated_filename}")
    
    # Method 3: Extract raw image data directly
    image_dict = doc.extract_image(xref)
    raw_filename = f"{output_dir}/page1_img{img_count}_method3_raw.{image_dict.get('ext', 'png')}"
    
    with open(raw_filename, 'wb') as f:
        f.write(image_dict['image'])
    
    print(f"Method 3 (Raw): {raw_filename}")
    
    pix = None
    img_count += 1

print(f"\n=== COMPARISON COMPLETE ===")
print(f"Check the different versions of extracted images to see which orientation is correct.")
print(f"Methods tested:")
print(f"1. Original PyMuPDF extraction")
print(f"2. Rotation-corrected version")
print(f"3. Raw image data extraction")
doc.close()

=== TESTING ORIENTATION FIXES ===

--- Processing Image 0 ---
Method 1 (Original): data/page1_img0_method1_original.jpg
Method 3 (Raw): data/page1_img0_method3_raw.png

--- Processing Image 1 ---
Method 1 (Original): data/page1_img1_method1_original.jpg
Method 3 (Raw): data/page1_img1_method3_raw.png

--- Processing Image 2 ---
Method 1 (Original): data/page1_img2_method1_original.jpg
Method 3 (Raw): data/page1_img2_method3_raw.png

=== COMPARISON COMPLETE ===
Check the different versions of extracted images to see which orientation is correct.
Methods tested:
1. Original PyMuPDF extraction
2. Rotation-corrected version
3. Raw image data extraction
