In [1]:
from pdf2image import convert_from_path
import pytesseract
from fpdf import FPDF

# Set the path to the Tesseract OCR executable (if necessary)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


In [2]:
# Step 1: Convert PDF to images
def pdf_to_images(pdf_path):
    """Rasterise the PDF at ``pdf_path`` into images.

    Args:
        pdf_path: Location of the PDF; forwarded unchanged to
            ``convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` returns for ``pdf_path``
        (presumably one image per page -- confirm against pdf2image docs).
    """
    page_images = convert_from_path(pdf_path)
    return page_images

In [3]:
# Step 2: Extract text from images using OCR
def extract_text_from_images(images):
    """OCR every image and return the combined text.

    Args:
        images: Iterable of images accepted by
            ``pytesseract.image_to_string``.

    Returns:
        str: The OCR result of each image concatenated in iteration order,
        with nothing inserted between consecutive results. An empty string
        when ``images`` yields no items.
    """
    page_texts = [pytesseract.image_to_string(page) for page in images]
    return "".join(page_texts)

In [4]:
# Step 3: Create a textual PDF from extracted text
def create_text_pdf(extracted_text, output_pdf_path):
    """Render ``extracted_text`` into a new PDF, one ``multi_cell`` per line.

    The text is split with ``str.splitlines``; each line is written with a
    cell width of 0 and a line height of 10 in 12pt "Arial" on a page with
    automatic page breaks (15 unit bottom margin).

    Args:
        extracted_text: Text to write into the document.
        output_pdf_path: Destination passed to ``FPDF.output``.

    Notes:
        * Characters that cannot be encoded as Latin-1 are replaced by ``?``.
        * Rendering is best-effort: a line that raises is reported with
          ``print`` and skipped; remaining lines are still processed.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add extracted text to PDF
    for line in extracted_text.splitlines():
        try:
            # Always start at the left margin. Under fpdf2 a multi_cell
            # leaves the cursor at the right edge by default, so the next
            # zero-width ("use remaining width") cell would have no room,
            # raise, and be silently skipped by the handler below.
            pdf.set_x(pdf.l_margin)
            latin1_line = line.encode('latin-1', 'replace').decode('latin-1')
            pdf.multi_cell(0, 10, latin1_line)
        except Exception as e:
            print(f"Error processing line: {e}")
            continue

    pdf.output(output_pdf_path)

In [5]:
# Main function to tie everything together
def main(input_pdf_path, output_pdf_path, text_output_path="extracted_text.txt"):
    """Run the full pipeline: PDF -> images -> OCR text -> text PDF.

    Args:
        input_pdf_path: PDF to read; passed to ``pdf_to_images``.
        output_pdf_path: Where ``create_text_pdf`` writes the generated PDF.
        text_output_path: File that receives the raw OCR text (UTF-8,
            overwritten if present). Defaults to ``"extracted_text.txt"``
            in the current working directory. Pass ``None`` to skip
            writing the text dump.
    """
    images = pdf_to_images(input_pdf_path)            # Convert PDF to images
    extracted_text = extract_text_from_images(images)  # Extract text from images

    # Keep a plain-text copy of the OCR result next to the generated PDF.
    if text_output_path is not None:
        with open(text_output_path, "w", encoding="utf-8") as f:
            f.write(extracted_text)

    create_text_pdf(extracted_text, output_pdf_path)  # Generate a textual PDF

# Example run: read 'paper4.pdf' and write the text-based PDF to 'output4.pdf'
input_pdf_path, output_pdf_path = 'paper4.pdf', 'output4.pdf'
main(input_pdf_path=input_pdf_path, output_pdf_path=output_pdf_path)
