In [2]:
!pip install -q easyocr Pillow PyMuPDF

In [3]:
import easyocr                        # for performing OCR on images with high accuracy
from PIL import Image, ImageDraw      # adds image processing capabilities
import fitz                           # provides a simple interface to the PDF-related tools
import os
import glob

# Tools

In [4]:
def draw_boxes(img, bounds):
  """
  Purpose:
    Outline every detected region on an image with a red rectangle.

  Inputs:
    img: The image to annotate. It is drawn on in place through ImageDraw.Draw.
    bounds: An iterable of detection results. Only element 0 of each result is
      read; it must be a non-empty sequence of (x, y) corner points.

  Outputs:
    The same img object that was passed in, now carrying the rectangles.
  """
  draw = ImageDraw.Draw(img)
  for bound in bounds:
    corners = bound[0]
    xs = [point[0] for point in corners]
    ys = [point[1] for point in corners]
    # Use the axis-aligned envelope of all corners instead of trusting that
    # corner 0 is top-left and corner 2 is bottom-right. For an upright box the
    # result is identical; for a skewed or differently ordered quadrilateral it
    # still encloses the whole region and never produces an inverted rectangle
    # (x1 < x0 or y1 < y0), which newer Pillow releases refuse to draw.
    draw.rectangle((min(xs), min(ys), max(xs), max(ys)), outline="red", width=2)
  return img

In [5]:
def crop_image(img, height_ratio):
  """
  Purpose:
    Keep the top part of an image, at full width.

  Inputs:
    img: An object exposing width, height and crop(box).
    height_ratio: Factor applied to img.height; the product is truncated to an
      int and used as the lower edge of the crop box.

  Outputs:
    Whatever img.crop returns for the box (0, 0, width, int(height * height_ratio)).
  """
  lower_edge = int(img.height * height_ratio)
  return img.crop((0, 0, img.width, lower_edge))

In [6]:
def convert_pdf_to_images(pdf_path, dpi=150):
    """
    Purpose:
        Render every page of a PDF file into an image, in page order.

    Inputs:
        pdf_path: Path of the PDF file; handed to fitz.open.
        dpi (optional): Resolution passed to page.get_pixmap. The default value
            is 150, the value that used to be hard-coded here.

    Outputs:
        all_images: A list with one image per page.
    """
    all_images = []

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)

            # Rasterize the page; a higher dpi gives a larger, sharper image.
            pix = page.get_pixmap(dpi=dpi)

            # The raw sample buffer is interpreted as packed RGB.
            # NOTE(review): this presumes get_pixmap's defaults yield RGB without
            # an alpha channel -- confirm before requesting other pixmap options.
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # A height ratio of 1 keeps the full page height.
            all_images.append(crop_image(image, 1))
    finally:
        # Release the document even when a page fails to load or render.
        pdf_document.close()

    return all_images

# OCR

In [7]:
def detect_img(image, temp_im_path="temp_ocred_image_result.png", langs=["en"], reader=None):

  """
  Purpose:
    This function performs Optical Character Recognition (OCR) on an input image using the easyocr library and returns an annotated copy of the image together with the raw detection results.

  Inputs:
    image: The input image on which OCR is to be performed. It must support image.save(path, "PNG").
    temp_im_path (optional): The path the input image is written to as a PNG file before OCR runs. The default value is "temp_ocred_image_result.png". The file is not removed afterwards.
    langs (optional): A list of languages handed to easyocr.Reader when a new reader has to be created. The default value is ["en"].
    reader (optional): An instance of easyocr.Reader to reuse. It is only reused when its model_lang is "english"; otherwise (or when omitted) a new reader is created from langs.

  Outputs:
    ocred_image: A copy of the image read back from temp_im_path, with a red rectangle drawn around every detection. The input image itself is not drawn on.
    detection_results: The list returned by reader.readtext. Element 0 of each entry is the bounding box (its corner points); per the easyocr documentation the remaining elements are the recognized text and the confidence score.
  """

  # NOTE(review): a supplied reader is discarded whenever its model_lang is not
  # "english", regardless of langs -- confirm this is intended for non-English use.
  if reader is None or reader.model_lang != "english":
    reader = easyocr.Reader(langs)

  image.save(temp_im_path, "PNG")

  # Library defaults of the overridden readtext parameters, for reference:
  #   contrast_ths=0.1, adjust_contrast=0.5, add_margin=0.1, width_ths=0.5,
  #   mag_ratio=1, slope_ths=0.1, canvas_size=2560, batch_size=1,
  #   bbox_min_size=3, min_size=20, decoder='greedy'
  # Knobs left at their defaults: text_threshold=0.7, ycenter_ths=0.5,
  #   height_ths=0.5, low_text=0.4, link_threshold=0.4
  detection_results = reader.readtext(
    temp_im_path,
    contrast_ths=0.05,
    adjust_contrast=2.0,
    add_margin=0.0,                # detected box margin
    width_ths=1.5,                 # represents how close the pixels are to be in the same detection
    mag_ratio=3,
    slope_ths=0.7,
    canvas_size=5000,
    batch_size=100,
    bbox_min_size=0,
    min_size=0,
    decoder='beamsearch'
  )

  # Copy the saved image while the file is open, then let the context manager
  # close the handle: callers may overwrite temp_im_path right after this call.
  with Image.open(temp_im_path) as saved_image:
    canvas = saved_image.copy()
  ocred_image = draw_boxes(canvas, detection_results)

  return ocred_image, detection_results

In [8]:
def detect_pdf(pdf_path, temp_dir_path="./OCR_images", langs=["en"], reader=None):
  """
  Purpose:
    The detect_pdf function performs OCR on every page of a given PDF file and returns the detection results per page. It also saves the annotated page images in temp_dir_path.

  Inputs:
    pdf_path: A string representing the file path of the PDF file to be OCRed.
    temp_dir_path: (Optional) The directory where the page images are stored as "<page index>.png". It is created if missing, and every regular file matched by "*" directly inside it is DELETED before processing. The default value is "./OCR_images".
    langs: (Optional) A list of strings representing the languages handed to easyocr.Reader when a new reader has to be created. The default value is ["en"].
    reader: (Optional) An instance of easyocr.Reader to reuse. It is only reused when its model_lang is "english"; otherwise (or when omitted) a new reader is created from langs.

  Outputs:
    all_ocred_text: A list with one entry per page, in page order; each entry is the detection result list that detect_img returned for that page.
  """

  # A fresh checkout has no output directory yet; saving the first page image
  # would fail without it.
  os.makedirs(temp_dir_path, exist_ok=True)

  # Empty the result directory (regular files only; sub-directories are kept).
  # glob.escape keeps directory names containing [, ] , * or ? from being
  # interpreted as patterns.
  for file in glob.glob(os.path.join(glob.escape(temp_dir_path), '*')):
    if os.path.isfile(file):
      os.remove(file)

  images = convert_pdf_to_images(pdf_path)

  # Build the reader once here so it is shared by all pages.
  # NOTE(review): a supplied reader is discarded whenever its model_lang is not
  # "english", regardless of langs -- confirm this is intended for non-English use.
  if reader is None or reader.model_lang != "english":
    reader = easyocr.Reader(langs)

  all_ocred_text = []
  for i, img in enumerate(images):
    temp_im_path = os.path.join(temp_dir_path, str(i) + '.png')
    ocred_image, detection_results = detect_img(img, temp_im_path=temp_im_path, langs=langs, reader=reader)
    # Replace the plain page image written by detect_img with the annotated one.
    ocred_image.save(temp_im_path)
    all_ocred_text.append(detection_results)

  return all_ocred_text

In [16]:
def get_bboxes(detection_results):
  """
  Purpose:
    Pull the bounding box out of every OCR detection result.

  Inputs:
    detection_results: An iterable of results whose element 0 is the bounding box.

  Outputs:
    A new list holding element 0 of each result, in the original order.
  """
  return [result[0] for result in detection_results]