In [305]:
import copy
import io
import logging
import zlib

from collections import defaultdict
from importlib import reload

from pikepdf import Pdf, PdfImage, Name
from PIL import Image, ImageDraw, ImageFilter

# Page numbers here start at 1, not 0.
TARGETS = [*range(32, 63), 64, 65, 67, *range(69, 84), *range(86, 91)]
NUM_PAGES = 94

# Default x-coordinates (pixels) of the band that remove_black_center paints white.
DEFAULT_LEFT = 1440
DEFAULT_RIGHT = 1640

# Per-page exceptions to the defaults: page number -> {"left": x} and/or {"right": x}.
# The rules are applied top to bottom, so a later rule overwrites an earlier
# one for the same page and side (e.g. page 37 ends up with left=1460).
OVERRIDES = defaultdict(dict)
for _pages, _side, _x in (
    (range(1, 39), "left", 1520),
    ((37,), "left", 1460),
    ((42,), "left", 1500),
    ((43,), "left", 1460),
    (range(44, 56), "left", 1500),
    (range(56, 60), "left", 1470),
    ((64,), "left", 1490),
    ((73,), "right", 1620),
    (range(77, 80), "right", 1600),
):
    for _page in _pages:
        OVERRIDES[_page][_side] = _x

# The logging module is reloaded before configuring it -- presumably so that
# basicConfig takes effect even when the kernel has already set up logging.
reload(logging)

logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%I:%M:%S',
    format='%(asctime)s %(levelname)s - %(message)s',
)

# The root logger accepts DEBUG, but this notebook's own logger only emits INFO and above.
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)

def get_pdf_image(page, page_number):
    """Return a PdfImage wrapping the page's image entry named ``/Im<page_number>``.

    The lookup is ``page.images[...]``, so a page without an entry under that
    exact name makes the lookup fail.
    """
    image_key = f"/Im{page_number}"
    return PdfImage(page.images[image_key])

# Each of the image-processing functions below returns a PIL image.
def remove_noise(pil_image):
    """Reduce the image to pure black and white.

    The image is converted to grayscale ("L") first and then to bilevel ("1")
    with dithering disabled. The converted image is returned.
    """
    LOG.debug("Removing noise")
    grayscale = pil_image.convert("L")
    return grayscale.convert('1', dither=Image.NONE)

def remove_black_center(pil_image, page_number):
    """Paint a white vertical band over the middle of the page image.

    The band spans the full image height. Its x-range comes from
    OVERRIDES[page_number] when present, otherwise from DEFAULT_LEFT and
    DEFAULT_RIGHT.

    Args:
        pil_image: PIL image to clean. It is drawn on in place.
        page_number: Page number used to look up per-page overrides.

    Returns:
        A new PIL image obtained by encoding the painted image as JPEG and
        decoding it again, fully loaded into memory.
    """
    # .get() avoids inserting an empty entry into the defaultdict for every page looked up.
    page_overrides = OVERRIDES.get(page_number, {})
    left = page_overrides.get("left", DEFAULT_LEFT)
    right = page_overrides.get("right", DEFAULT_RIGHT)
    LOG.debug(f"Removing black center from {left} to {right}")

    # Use the real image height rather than a hard-coded 10000 so that very
    # tall images are covered too; coordinates past the edge are simply clipped.
    ImageDraw.Draw(pil_image).rectangle(((left, 0), (right, pil_image.height)), fill="white")

    # The JPEG round trip is kept from the original implementation, but it now
    # goes through an in-memory buffer instead of the shared file /tmp/fix.jpg.
    # Image.open is lazy, so an image backed by that file could silently pick
    # up the next page's pixels once the file was overwritten.
    buffer = io.BytesIO()
    pil_image.save(buffer, format="jpeg")
    buffer.seek(0)
    reloaded = Image.open(buffer)
    reloaded.load()  # Decode now so the result does not depend on the buffer later.
    return reloaded

def fix_page(pdf_image, page_number, fix_noise=True, fix_black_center=True, post_blur=False):
    """Decode a page image and run the enabled clean-up stages over it.

    Page numbers are 1-based (the first page is 1, not 0).

    Stages, in order: remove_noise (fix_noise), remove_black_center
    (fix_black_center), and a radius-1 Gaussian blur (post_blur). The cleaned
    PIL image is returned; nothing is written back into the PDF object.
    """
    stages = (
        (fix_noise, lambda im: remove_noise(im)),
        (fix_black_center, lambda im: remove_black_center(im, page_number)),
        (post_blur, lambda im: im.filter(filter=ImageFilter.GaussianBlur(1))),
    )

    cleaned = pdf_image.as_pil_image()
    for enabled, stage in stages:
        if enabled:
            cleaned = stage(cleaned)

    LOG.debug(f"Fixed page {page_number}")
    return cleaned

In [325]:
import pytesseract

# Scratch check: run the clean-up on a single page so the result can be inspected.
pdf = Pdf.open("input.pdf")
page_number = 8
page = pdf.pages.p(page_number)
pdf_image = get_pdf_image(page, page_number)
# fix_page decodes the image itself, so no separate as_pil_image() call is
# needed here (the previous one was overwritten immediately).
pil_image = fix_page(pdf_image, page_number)

# Minimum number of zero-valued (black) pixels a scanned line must contain
# before trim_janky treats it as the start/end of the content.
THRESHOLD = 300

from collections import Counter
def split_page(pil_image):
    """Cut the image vertically down the middle and return (left_half, right_half).

    The split column is ``width // 2``; for an odd width the right half is one
    pixel wider.
    """
    width, height = pil_image.size
    middle = width // 2
    left_box = (0, 0, middle, height)
    right_box = (middle, 0, width, height)
    return pil_image.crop(left_box), pil_image.crop(right_box)

def get_left_and_right(pil_image):
    """Return the horizontal extent ``(left, right)`` of the text OCR finds in the image.

    ``left`` is the smallest "left" value and ``right`` the largest "right"
    value among the character boxes reported by pytesseract. Non-integer
    entries are ignored.

    When no usable box is found the sentinels ``(100000, 0)`` are returned, so
    that a caller combining results with min()/max() is unaffected.
    """
    boxes = pytesseract.image_to_boxes(pil_image, output_type=pytesseract.Output.DICT)
    # Use .get() so a result without "left"/"right" keys falls through to the
    # sentinels instead of raising KeyError.
    # NOTE(review): this appears to happen when nothing is recognised on the
    # page -- confirm against the installed pytesseract version.
    lefts = [value for value in boxes.get("left", []) if isinstance(value, int)]
    rights = [value for value in boxes.get("right", []) if isinstance(value, int)]
    left = min(lefts) if lefts else 100000
    right = max(rights) if rights else 0
    return left, right

# left, right = split_page(pil_image)
# left = trim(left)
# right = trim(right)

In [None]:
# Directory that receives the per-image JPEG files. It is joined to the file
# name by plain string concatenation, so it must end with a path separator.
OUTPUT_DIR = "output/"

def maybe_crop(image, left_bound, right_bound, name="unspecified", page_number="unspecified", margin=10):
    """Crop the image horizontally to ``[left_bound, right_bound]`` plus a margin.

    The crop only happens when at least one bound lies strictly inside the
    image; otherwise the image is returned untouched.

    Args:
        image: PIL image to crop.
        left_bound: x-coordinate of the left edge of the content.
        right_bound: x-coordinate of the right edge of the content.
        name: Label used in the log message only.
        page_number: Page number used in the log message only.
        margin: Extra pixels kept on each side of the bounds (default 10).

    Returns:
        The cropped image, or the original image when no crop applies.
    """
    width, height = image.size
    left_inside = 0 < left_bound < width
    right_inside = 0 < right_bound < width
    if not (left_inside or right_inside):
        return image

    # Clamp to the image: a crop box that reaches outside the image is padded
    # with black pixels, which would add dark bars at the page edges.
    crop_left = max(0, left_bound - margin)
    crop_right = min(width, right_bound + margin)
    if crop_right <= crop_left:
        # Inconsistent bounds (e.g. only one of them is usable); leave the image alone.
        return image

    LOG.info(f"Cropping page {page_number} ({name}) to ({left_bound}, {right_bound})")
    return image.crop((crop_left, 0, crop_right, height))

import os

pdf = Pdf.open("input.pdf")
images = []
# Sentinels: any detected bound replaces them through min()/max() in the loop.
leftmost = 1000000
rightmost = 0

# NOTE(review): range(1, NUM_PAGES) stops at page NUM_PAGES - 1 -- confirm that
# leaving out page NUM_PAGES is intended.
pages = list(range(1, NUM_PAGES))

# Debugging shortcuts: uncomment to process a single page with known bounds.
# leftmost, rightmost = (62, 1569)
# pages = [5]

# Step 1: clean every page, split it into its two halves and crop each half to its text.
for page_number in pages:
    page = pdf.pages.p(page_number)
    pdf_image = get_pdf_image(page, page_number)
    pil_image = fix_page(pdf_image, page_number)
    first, second = split_page(pil_image)

    first_left_bound, first_right_bound = get_left_and_right(first)
    second_left_bound, second_right_bound = get_left_and_right(second)
    LOG.debug(f"Page {page_number} (first) left right bounds: ({first_left_bound}, {first_right_bound})")
    LOG.debug(f"Page {page_number} (second) left right bounds: ({second_left_bound}, {second_right_bound})")

    first = maybe_crop(first, first_left_bound, first_right_bound, name="first", page_number=page_number)
    second = maybe_crop(second, second_left_bound, second_right_bound, name="second", page_number=page_number)
    images += [first, second]

    # Track the widest text extent seen so far; it defines the common canvas width.
    leftmost = min(first_left_bound, second_left_bound, leftmost)
    rightmost = max(first_right_bound, second_right_bound, rightmost)

    pct = int(page_number * 100 / NUM_PAGES)
    if pct % 5 == 0:
        LOG.info(f"{pct}% done")

LOG.info(f"Left right bounds: ({leftmost}, {rightmost})")

# Step 2: centre every image on a white canvas of the common width, then blur slightly.
# NOTE(review): an image wider than the canvas gets a negative x offset and is
# clipped on both sides by paste() -- confirm this is acceptable.
new_width = rightmost - leftmost
for i, img in enumerate(images):
    old_width, old_height = img.size
    new_height = old_height
    new = Image.new("L", (new_width, new_height), color=255)
    top_left_corner = ((new_width - old_width) // 2, (new_height - old_height) // 2)
    LOG.info(f"Top left corner coords: {top_left_corner}")
    new.paste(img, top_left_corner)
    images[i] = new.filter(filter=ImageFilter.GaussianBlur(1))

# Step 3: write one JPEG per image.
# images = images[1:]  # Skip empty first page.
os.makedirs(OUTPUT_DIR, exist_ok=True)  # Saving fails if the directory is missing.
image_fnames = []
for i, img in enumerate(images):
    fname = f"{OUTPUT_DIR}{i+1}.jpg"
    img.save(fname)
    LOG.info(f"Saved {fname}")
    image_fnames.append(fname)

# Step 4: write all images into one multi-page PDF in a single call.
# Saving each image with append=True requires the file to exist already and
# keeps piling pages onto the output of earlier runs; save_all rewrites the
# file from scratch every time.
if images:
    fname = "fixed.pdf"
    images[0].save(fname, save_all=True, append_images=images[1:])
    LOG.info(f"Saved {fname}")

# pdf.save("fixed.pdf")
LOG.info("Done!")

07:26:32 INFO - Cropping page 1 (second) to (519, 1163)
07:26:36 INFO - Cropping page 2 (second) to (284, 1295)
07:26:41 INFO - Cropping page 3 (first) to (385, 1309)
07:26:41 INFO - Cropping page 3 (second) to (295, 1262)
07:26:49 INFO - Cropping page 4 (first) to (379, 1294)
07:26:49 INFO - Cropping page 4 (second) to (161, 1372)
07:26:54 INFO - Cropping page 5 (second) to (180, 1387)
07:26:54 INFO - 5% done
07:27:03 INFO - Cropping page 6 (first) to (227, 1436)
07:27:03 INFO - Cropping page 6 (second) to (172, 1375)
07:27:11 INFO - Cropping page 7 (second) to (188, 1401)
07:27:23 INFO - Cropping page 8 (first) to (212, 1421)
07:27:23 INFO - Cropping page 8 (second) to (160, 1366)
07:27:33 INFO - Cropping page 9 (second) to (176, 1385)
07:27:46 INFO - Cropping page 10 (first) to (224, 1436)
07:27:46 INFO - Cropping page 10 (second) to (157, 1366)
07:27:46 INFO - 10% done
07:27:49 INFO - Cropping page 11 (second) to (247, 1379)
07:27:56 INFO - Cropping page 12 (second) to (160, 1578)


In [None]:
# (Unfinished scratch cell: the dangling "for" statement was a SyntaxError and has been removed.)

In [None]:
def trim_janky(pil_image, threshold=None):
    """Crop away the columns left and right of the image's content.

    A column counts as content when it holds at least ``threshold`` pixels
    whose value is below 1 (pure black). The image is cropped to the span from
    the first to the last such column, both included. When no column
    qualifies, the full width is kept.

    Args:
        pil_image: PIL image whose pixel values are single numbers.
            NOTE(review): multi-band images yield tuples from getdata() and
            are not supported -- confirm callers pass "1"/"L" images.
        threshold: Minimum number of black pixels per column. Defaults to the
            notebook-level THRESHOLD.

    Returns:
        The cropped PIL image (full height).
    """
    if threshold is None:
        threshold = THRESHOLD

    width, height = pil_image.size
    pixels = list(pil_image.getdata())  # Flat, row-major: pixel (x, y) is at y * width + x.

    def has_content(x):
        # Stepping by `width` from offset x walks down column x.
        return sum(1 for value in pixels[x::width] if value < 1) >= threshold

    left = next((x for x in range(width) if has_content(x)), 0)
    # The right edge of a crop box is exclusive, hence x + 1 to keep the last content column.
    right = next((x + 1 for x in range(width - 1, -1, -1) if has_content(x)), width)
    return pil_image.crop((left, 0, right, height))