In [45]:
import fitz
import pytesseract
from PIL import Image
import io
import os

def rotate_image(image, angle):
    """Return ``image`` rotated by ``angle`` degrees via its PIL ``rotate`` method.

    ``expand=True`` is always passed, asking PIL to size the output so it
    holds the whole rotated picture.
    """
    rotated = image.rotate(angle, expand=True)
    return rotated

def detect_text_rotation(image):
    """Detect the text orientation of ``image`` with Tesseract's OSD mode.

    The OSD report is a series of ``label: value`` lines; the value of the
    "Orientation in degrees" line is returned. The field is located by its
    label rather than by line position, so a leading blank line or a change
    in field order cannot make a different number be picked up silently.

    Parameters:
    - image: Image object accepted by ``pytesseract.image_to_osd``.

    Returns:
    - The reported orientation as an int, or None if detection or parsing
      fails for any reason (a warning is printed in that case).
    """
    try:
        osd = pytesseract.image_to_osd(image)
        for line in osd.splitlines():
            label, _, value = line.partition(":")
            if label.strip().lower() == "orientation in degrees":
                return int(value.strip())
        # No matching field: route through the same best-effort failure path.
        raise ValueError("'Orientation in degrees' not found in OSD output")
    except Exception as e:
        # Deliberately broad: a failed detection must not abort the caller,
        # which treats None as "leave this page untouched".
        print(f"Warning: Could not detect rotation. Error: {e}")
        return None  # Return None if detection fails
    
# def detect_text_rotation(image):
#     """Uses Tesseract OCR to detect text orientation, with DPI forced."""
#     try:
#         # Save image to a TIFF with 800 DPI
#         buffered = io.BytesIO()
#         image.save(buffered, format="TIFF", dpi=(800, 800))
#         buffered.seek(0)
#         img_with_dpi = Image.open(buffered)

#         osd = pytesseract.image_to_osd(img_with_dpi)
#         angle = int(osd.split("\n")[1].split(":")[-1].strip())
#         return angle
#     except Exception as e:
#         print(f"Warning: Could not detect rotation. Error: {e}")
#         return None



def get_largest_image(images):
    """
    Pick the image with the greatest pixel area and return its xref.

    Each entry is indexed as produced by page.get_images(full=True):
    position 0 is the xref, position 2 the width and position 3 the height.

    Parameters:
    - images: Iterable of image entries from page.get_images(full=True)

    Returns:
    - xref of the first entry with the largest positive area, or None when
      there are no entries or no entry has a positive area
    """
    def pixel_area(entry):
        return entry[2] * entry[3]

    # max() keeps the first of several equally large entries.
    biggest = max(images, key=pixel_area, default=None)

    # Only a strictly positive area counts as a candidate.
    if biggest is None or pixel_area(biggest) <= 0:
        return None
    return biggest[0]


def nearest_90_degree_rotation(angle):
    """Snap ``angle`` to the nearest multiple of 90° and normalise it.

    Parameters:
    - angle: Detected angle in degrees (may be negative or exceed 360).

    Returns:
    - One of 0, 90, 180 or 270. Exact midpoints (45, 135, ...) follow the
      built-in ``round``, which rounds halves to the even multiple.
    """
    # `% 360` already folds 360 (and negatives) into 0..359, so no separate
    # "360 means no rotation" case is needed.
    return (round(angle / 90) * 90) % 360

def _rotate_largest_page_image(doc, page, page_num, img_list):
    """Re-orient the largest image on ``page`` in place when OCR says it is rotated."""
    # The largest image is assumed to be the full-page scan.
    xref = get_largest_image(img_list)
    if xref is None:
        # Fail with a clear message instead of an opaque error from extract_image.
        raise ValueError("no image with a positive area on this page")

    base_image = doc.extract_image(xref)
    img = Image.open(io.BytesIO(base_image["image"]))

    # Detect text rotation
    angle = detect_text_rotation(img)
    if angle is None:
        print(f"Page {page_num+1}: Skipping due to failed rotation detection.")
        return

    # Determine correct rotation
    correction = nearest_90_degree_rotation(angle)
    print(f"Page {page_num+1}: Detected {angle}° rotation, correcting by {correction}°.")
    if correction == 0:
        return

    # Re-encode the rotated image in the format it was extracted in.
    img_rotated = rotate_image(img, correction)
    buffer = io.BytesIO()
    img_rotated.save(buffer, format=base_image['ext'], dpi=(300, 300))

    # Swap the image: remove the original, then place the rotated version
    # over the full page rectangle.
    page.delete_image(xref)
    page.insert_image(page.rect, stream=buffer.getvalue())
    print(f"Page {page_num+1}: Image rotation applied.")


def correct_pdf_rotation(input_pdf, output_pdf):
    """Detects rotation from images and corrects page orientation accordingly.

    Every page of ``input_pdf`` is copied into a new document. Pages without
    images are copied unchanged. For pages with images, the largest image is
    OCR-checked for orientation and, if a non-zero correction is needed,
    replaced by a rotated copy before the page is copied.

    A failure while processing one page is reported and that page is copied
    in whatever state it is in; it does not abort the run.

    Parameters:
    - input_pdf: Path of the PDF to read.
    - output_pdf: Path the corrected PDF is written to.

    Returns:
    - None. Progress and the final save are reported with print().
    """
    # Context managers close both documents even if saving fails.
    with fitz.open(input_pdf) as doc, fitz.open() as new_doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            img_list = page.get_images(full=True)

            if not img_list:
                print(f"Page {page_num+1}: No image found, copying text page.")
            else:
                try:
                    _rotate_largest_page_image(doc, page, page_num, img_list)
                except Exception as e:
                    # Best effort: one bad page must not stop the others.
                    print(f"Page {page_num+1}: Error processing page. Skipping. Error: {e}")

            # Single copy point: every page lands in the output exactly once.
            new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)

        # Save the corrected PDF
        new_doc.save(output_pdf)

    print(f"Corrected PDF saved as '{output_pdf}'.")


In [46]:
# Example usage: run the correction on "os.pdf" and write the result to
# "os1.pdf". Both are relative paths.
input_pdf = "os.pdf"
output_pdf = "os1.pdf"
correct_pdf_rotation(input_pdf, output_pdf)

Page 1: Detected 0° rotation, correcting by 0°.
Page 2: Detected 0° rotation, correcting by 0°.
Page 3: Detected 0° rotation, correcting by 0°.
Page 4: Detected 0° rotation, correcting by 0°.
Page 5: Detected 0° rotation, correcting by 0°.
Page 6: Detected 0° rotation, correcting by 0°.
Page 7: Detected 0° rotation, correcting by 0°.
Page 8: Detected 0° rotation, correcting by 0°.
Page 9: Detected 0° rotation, correcting by 0°.
Page 10: Detected 0° rotation, correcting by 0°.
Page 11: Detected 0° rotation, correcting by 0°.
Page 12: Detected 0° rotation, correcting by 0°.
Page 13: Detected 0° rotation, correcting by 0°.
Page 14: Detected 0° rotation, correcting by 0°.
Page 15: Detected 0° rotation, correcting by 0°.
Page 16: Detected 0° rotation, correcting by 0°.
Page 17: Detected 0° rotation, correcting by 0°.
Page 18: Detected 0° rotation, correcting by 0°.
Page 19: Detected 0° rotation, correcting by 0°.
Page 20: Detected 0° rotation, correcting by 0°.
Page 21: Detected 90° rotatio

In [None]:
# import fitz  # PyMuPDF

# def reset_pdf_rotation(input_pdf, output_pdf):
#     # Open the input PDF
#     doc = fitz.open(input_pdf)

#     # Loop through all pages and set rotation to 0 degrees
#     for page_num in range(doc.page_count):
#         page = doc.load_page(page_num)
#         page.set_rotation(0)  # Set rotation to 0 degrees

#     # Save the modified PDF to the output file
#     doc.save(output_pdf)

# # Example usage
# input_pdf = 'econ.pdf'  # Replace with the path to your input PDF
# output_pdf = 'output.pdf'  # Replace with the desired output PDF path
# reset_pdf_rotation(input_pdf, output_pdf)
