In [9]:
from pdf2image import convert_from_path
import pytesseract
from fpdf import FPDF
import cv2
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import os

In [10]:
# Step 1: Convert PDF to images
def pdf_to_images(pdf_path):
    """Rasterize a PDF into page images.

    Thin wrapper around ``pdf2image.convert_from_path`` called with its
    default settings.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        Whatever ``convert_from_path`` returns for ``pdf_path`` (per the
        pdf2image docs, a list with one PIL image per page).
    """
    page_images = convert_from_path(pdf_path)
    return page_images

In [11]:
# Step 2: Preprocess images for better OCR performance using advanced techniques
def preprocess_image(image):
    """Clean up a page image before it is handed to the OCR engine.

    Pipeline: grayscale -> non-local-means denoising -> Gaussian adaptive
    thresholding.

    Args:
        image: A PIL image, or anything ``np.array`` converts to an
            ``H x W``, ``H x W x 3`` or ``H x W x 4`` array. Colour
            channels are taken to be in RGB(A) order, which is the order
            PIL uses.

    Returns:
        numpy.ndarray: Single-channel binarized image (values 0 or 255).
    """
    # Convert PIL image to a NumPy array usable by OpenCV.
    img_cv = np.array(image)

    # PIL arrays are RGB-ordered, so the RGB* conversion codes are required;
    # COLOR_BGR2GRAY would swap the red and blue luminance weights.
    if img_cv.ndim == 2:
        # Already single-channel: cvtColor would reject it, so pass it through.
        img_gray = img_cv
    elif img_cv.shape[2] == 4:
        img_gray = cv2.cvtColor(img_cv, cv2.COLOR_RGBA2GRAY)
    else:
        img_gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)

    # Denoise using fast non-local means (filter strength h=30).
    img_denoised = cv2.fastNlMeansDenoising(img_gray, h=30)

    # Adaptive threshold: 11x11 Gaussian-weighted neighbourhood, constant 2
    # subtracted from the local mean; pixels above it become 255, others 0.
    img_bin = cv2.adaptiveThreshold(
        img_denoised,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )

    # NOTE: no upscaling / super-resolution is performed here.
    return img_bin

In [12]:
# Step 3: Extract text from images using OCR with parallel processing
def extract_text_from_image(image):
    """Run OCR on a single page image.

    The image is first passed through ``preprocess_image``; the result is
    then given to ``pytesseract.image_to_string``.

    Args:
        image: Page image accepted by ``preprocess_image``.

    Returns:
        The value returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(preprocess_image(image))

def extract_text_from_images(images):
    """OCR a sequence of page images and concatenate the text.

    Each image is processed by ``extract_text_from_image`` on a thread pool
    with the default worker count. ``Executor.map`` yields results in input
    order, so the pages are joined in their original order, with no
    separator inserted between them.

    Args:
        images: Iterable of page images.

    Returns:
        str: The per-page OCR strings concatenated back to back ("" when
        ``images`` is empty).
    """
    with ThreadPoolExecutor() as pool:
        per_page_text = pool.map(extract_text_from_image, images)

    # The pool has been shut down (all work finished) by this point; the
    # iterator just hands back the already-computed results.
    return "".join(per_page_text)

In [13]:
def create_text_pdf(extracted_text, output_pdf_path):
    """Write OCR text into a new PDF file.

    The text is split on blank lines (``"\\n\\n"``) and each chunk is emitted
    with its own ``multi_cell`` call, using 12 pt Arial, 10-unit left/right
    margins and automatic page breaks with a 15-unit bottom margin.

    Characters that cannot be represented in latin-1 are replaced with
    ``?`` before being written. A chunk that still fails to render is
    reported on stdout and skipped.

    Args:
        extracted_text: Full text to place in the document.
        output_pdf_path: Destination path passed to ``FPDF.output``.
    """
    doc = FPDF()
    doc.set_auto_page_break(auto=True, margin=15)
    doc.add_page()

    # Font and horizontal margins.
    doc.set_font("Arial", size=12)
    doc.set_left_margin(10)
    doc.set_right_margin(10)

    for chunk in extracted_text.split("\n\n"):
        try:
            # Round-trip through latin-1 so unsupported characters become "?".
            printable = chunk.encode('latin-1', 'replace').decode('latin-1')
            doc.multi_cell(0, 10, printable)
        except Exception as e:
            # Best effort: report the failing chunk and move on to the next.
            print(f"Error processing paragraph: {e}")

    doc.output(output_pdf_path)

In [14]:
def process_single_pdf(input_pdf_path, output_pdf_path):
    """Run the full OCR pipeline for one PDF.

    Steps: rasterize the input with ``pdf_to_images``, OCR the pages with
    ``extract_text_from_images``, write the text with ``create_text_pdf``,
    then print a one-line confirmation.

    Args:
        input_pdf_path: Path of the PDF to read.
        output_pdf_path: Path of the text PDF to write.
    """
    ocr_text = extract_text_from_images(pdf_to_images(input_pdf_path))
    create_text_pdf(ocr_text, output_pdf_path)
    print(f"Processed: {input_pdf_path} -> {output_pdf_path}")


In [15]:
def main(input_folder, output_folder):
    """OCR every PDF in ``input_folder`` into ``output_folder``.

    For each regular file whose name ends in ``.pdf`` (case-insensitive),
    ``process_single_pdf`` is called with the output path
    ``<output_folder>/output_<original stem>.pdf``. Files are handled in
    sorted name order so that repeated runs process (and log) them in the
    same sequence.

    Args:
        input_folder: Directory to scan (non-recursively) for PDFs.
        output_folder: Directory for the results; created if missing.
    """
    # Create output folder if it doesn't exist.
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order and includes directories;
    # sort for reproducibility and keep only real files.
    pdf_files = sorted(
        entry
        for entry in os.listdir(input_folder)
        if entry.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(input_folder, entry))
    )

    for pdf_file in pdf_files:
        stem = os.path.splitext(pdf_file)[0]
        input_path = os.path.join(input_folder, pdf_file)
        output_path = os.path.join(output_folder, f"output_{stem}.pdf")
        process_single_pdf(input_path, output_path)

# Usage: both folder names are relative paths, so they are resolved against
# the notebook's current working directory. Each PDF found in `input_folder`
# is OCR'd and written to `output_folder` as "output_<original name>.pdf".
input_folder = "chemistry_mcqs_input_pdfs"
output_folder = "chemistry_mcqs_output_pdfs"
main(input_folder, output_folder)

Processed: chemistry_mcqs_input_pdfs\9th Class Chemistry 2018 Lahore Board Group 1 English Medium Objective.pdf -> chemistry_mcqs_output_pdfs\output_9th Class Chemistry 2018 Lahore Board Group 1 English Medium Objective.pdf
Processed: chemistry_mcqs_input_pdfs\Past Paper 2023 9th Class Lahore Board Chemistry Objective Group I English Medium.pdf -> chemistry_mcqs_output_pdfs\output_Past Paper 2023 9th Class Lahore Board Chemistry Objective Group I English Medium.pdf
Processed: chemistry_mcqs_input_pdfs\Past Paper 2023 9th Class Lahore Board Chemistry Objective Group II English Medium.pdf -> chemistry_mcqs_output_pdfs\output_Past Paper 2023 9th Class Lahore Board Chemistry Objective Group II English Medium.pdf
Processed: chemistry_mcqs_input_pdfs\Past Paper 2024 Lahore Board Class 9th Chemistry Group I Objective English Medium (1).pdf -> chemistry_mcqs_output_pdfs\output_Past Paper 2024 Lahore Board Class 9th Chemistry Group I Objective English Medium (1).pdf
Processed: chemistry_mcqs_in