In [10]:
from pdf2image import convert_from_path
import pytesseract
from fpdf import FPDF
import cv2
import numpy as np
from concurrent.futures import ThreadPoolExecutor

In [11]:
# Step 1: Convert PDF to images
def pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` into page images.

    Thin wrapper around ``pdf2image.convert_from_path`` called with only the
    path, so every rendering option is left at that library's default.

    Args:
        pdf_path: Location of the PDF file to render.

    Returns:
        Whatever ``convert_from_path`` returns for this file (presumably one
        PIL image per page -- verify against the pdf2image documentation).
    """
    page_images = convert_from_path(pdf_path)
    return page_images

In [12]:
# Step 2: Preprocess images for better OCR performance using advanced techniques
def preprocess_image(image):
    """Turn a page image into a denoised, binarised array ready for OCR.

    Pipeline: grayscale conversion -> non-local-means denoising -> adaptive
    Gaussian thresholding.

    Args:
        image: Anything ``np.array`` can turn into either a 2-D array (already
            single channel) or a 3-D array with 3 or 4 channels. Colour data
            is treated as RGB / RGBA, which is the channel order ``np.array``
            yields for a PIL image.

    Returns:
        The 2-D array produced by ``cv2.adaptiveThreshold`` (``THRESH_BINARY``
        with a maximum value of 255).
    """
    # Convert PIL image to a NumPy array; PIL stores colour pixels as RGB(A),
    # not in OpenCV's native BGR order.
    img_cv = np.array(image)

    # Convert to grayscale using the conversion code that matches the actual
    # channel order. Using COLOR_BGR2GRAY here would swap the red and blue
    # luminance weights and distort the contrast of coloured text.
    if img_cv.ndim == 2:
        # Already single channel: cvtColor would reject it, so pass through.
        img_gray = img_cv
    elif img_cv.shape[2] == 4:
        img_gray = cv2.cvtColor(img_cv, cv2.COLOR_RGBA2GRAY)
    else:
        img_gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)

    # Denoise using fast non-local means denoising.
    # NOTE(review): h=30 is a strong filter setting; confirm it does not blur
    # small glyphs on the documents this is used with.
    # assumes 8-bit pixel data -- TODO confirm for non-RGB inputs
    img_denoised = cv2.fastNlMeansDenoising(img_gray, h=30)

    # Apply adaptive thresholding for better contrast: each pixel is compared
    # against a Gaussian-weighted mean of its 11x11 neighbourhood minus 2.
    img_bin = cv2.adaptiveThreshold(
        img_denoised,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # No upscaling / super-resolution is performed here; the binarised page is
    # returned at its input resolution.
    return img_bin

In [13]:
# Step 3: Extract text from images using OCR with parallel processing
def extract_text_from_image(image):
    """OCR a single page image and return the recognised text.

    The image is first passed through ``preprocess_image``; its result is
    handed straight to ``pytesseract.image_to_string`` with default options.

    Args:
        image: The page image, in whatever form ``preprocess_image`` accepts.

    Returns:
        The value returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(preprocess_image(image))

def extract_text_from_images(images):
    """OCR every image concurrently and return the concatenated text.

    Each image is processed by ``extract_text_from_image`` on a
    ``ThreadPoolExecutor`` with its default worker count. ``Executor.map``
    yields results in the order of ``images``, so the per-page texts are
    joined in page order, with no separator inserted between them.

    Args:
        images: Iterable of page images.

    Returns:
        A single string: the per-image OCR results joined back to back
        (the empty string when ``images`` is empty).
    """
    with ThreadPoolExecutor() as pool:
        # Materialise the results while the pool is still open.
        page_texts = list(pool.map(extract_text_from_image, images))

    return "".join(page_texts)

In [15]:
def create_text_pdf(extracted_text, output_pdf_path):
    """Write ``extracted_text`` into a new PDF saved at ``output_pdf_path``.

    The text is split wherever two newlines occur in a row, and each resulting
    chunk is emitted with its own ``multi_cell`` call (width 0, height 10) in
    12-point Arial. Before writing, every chunk is round-tripped through
    Latin-1 with the ``'replace'`` error handler, so characters outside
    Latin-1 come out as ``?``. A chunk whose processing raises is reported
    with ``print`` and skipped; the remaining chunks are still written.

    Args:
        extracted_text: The full text to place in the document.
        output_pdf_path: Destination passed to ``FPDF.output``.
    """
    doc = FPDF()
    doc.set_auto_page_break(auto=True, margin=15)
    doc.add_page()

    # Font and horizontal margins for the whole document.
    doc.set_font("Arial", size=12)
    doc.set_left_margin(10)
    doc.set_right_margin(10)

    for chunk in extracted_text.split("\n\n"):
        try:
            # Round-trip through Latin-1 so unsupported characters become '?'.
            latin1_bytes = chunk.encode('latin-1', 'replace')
            doc.multi_cell(0, 10, latin1_bytes.decode('latin-1'))
        except Exception as e:
            # Best effort: report the failing chunk and move on to the next.
            print(f"Error processing paragraph: {e}")

    doc.output(output_pdf_path)

In [16]:
# Main function to tie everything together
def main(input_pdf_path, output_pdf_path, text_dump_path="corrected_extracted_text.txt"):
    """Run the full pipeline: PDF -> page images -> OCR text -> text PDF.

    Args:
        input_pdf_path: Path of the PDF to OCR.
        output_pdf_path: Path where the generated text PDF is written.
        text_dump_path: Where to save the raw OCR text as UTF-8 for debugging.
            Defaults to ``"corrected_extracted_text.txt"`` in the current
            working directory; pass ``None`` to skip writing the dump.

    Returns:
        None. The results are the files written to disk.
    """
    images = pdf_to_images(input_pdf_path)             # Convert PDF to images
    extracted_text = extract_text_from_images(images)  # Extract text from images

    # Save the extracted text to a file (debugging aid; skipped when the
    # caller passes text_dump_path=None).
    if text_dump_path is not None:
        with open(text_dump_path, "w", encoding="utf-8") as f:
            f.write(extracted_text)

    # Create formatted PDF from the extracted text
    create_text_pdf(extracted_text, output_pdf_path)

# Usage: OCR 'paper4.pdf' and write the recognised text to 'output5.pdf'
input_pdf_path, output_pdf_path = 'paper4.pdf', 'output5.pdf'
main(input_pdf_path, output_pdf_path)