In [None]:
''' 

In addition to tabula, this notebook uses pdfplumber together with Tesseract OCR
to extract tables from the specified PDF pages.
The process first attempts to extract tables using pdfplumber, 
which works well for PDFs with structured and selectable text. 
If no tables are detected, the page is converted into an image using pdf2image,
preprocessed with OpenCV for enhanced clarity, and then processed using 
Tesseract OCR to extract text from scanned pages. After performing OCR,
the extracted data is structured into a DataFrame and saved in an Excel file, 
following the format provided in sample.xlsx. 
This approach ensures accurate extraction while preserving the table structure and layout.

'''

In [1]:
pip install pdfplumber pytesseract pdf2image pillow pandas openpyxl opencv-python numpy


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import re
from pathlib import PureWindowsPath

import cv2
import numpy as np
import pandas as pd
import pdfplumber
import pytesseract
from pdf2image import convert_from_path

# Set the correct Tesseract OCR path
# (this is a Windows install location; adjust it on machines where the
# Tesseract executable lives elsewhere).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Define PDFs and pages for OCR
# Maps each PDF path to the single page to process. Page numbers are 1-based:
# extract_tables_from_pdf subtracts 1 before indexing pdf.pages, and
# perform_ocr_on_page passes the value as first_page/last_page.
pdf_files = {
    "D:\\Maiora\\cardio_structured.pdf": 6,
    "D:\\Maiora\\prot_sap_102.pdf": 50,
    "D:\\Maiora\\prot_sap_1.pdf": 14
}

# Destination workbook; every extracted table / OCR result goes to its own sheet.
output_file = "D:\\Maiora\\extracted_tables.xlsx"
# Opened once here and shared by the processing loop further down, which is
# responsible for closing it after all sheets have been written.
writer = pd.ExcelWriter(output_file, engine='openpyxl')

# Function to sanitize sheet names
def sanitize_sheet_name(name, max_length=31):
    """Return ``name`` made safe for use as an Excel worksheet title.

    Characters that are not allowed in sheet titles (``\\ / : * ? " < > | [ ]``)
    are replaced with underscores, and the result is limited to ``max_length``
    characters (Excel's limit is 31).

    When the name is too long the *trailing* characters are kept, because
    callers in this notebook put the distinguishing part (page / table number)
    at the end; cutting the tail instead would make different tables of the
    same PDF collapse onto one sheet name.

    Args:
        name: Proposed sheet title.
        max_length: Maximum number of characters to keep (default 31).

    Returns:
        The sanitized title, at most ``max_length`` characters long.
    """
    # Replace invalid characters with underscores
    cleaned = re.sub(r'[\\/:*?"<>|\[\]]', '_', name)
    if len(cleaned) > max_length:
        # Slice from an explicit start index so max_length == 0 yields "".
        cleaned = cleaned[len(cleaned) - max_length:]
    return cleaned

# Function to extract tables from a PDF using pdfplumber
def extract_tables_from_pdf(pdf_path, page_number):
    """Extract every table pdfplumber detects on one page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: 1-based number of the page to inspect.

    Returns:
        A list with one ``pandas.DataFrame`` per detected table, built
        directly from the rows pdfplumber returns (no header promotion).
        The list is empty when no table is detected on the page.

    Raises:
        IndexError: If ``page_number`` is not between 1 and the number of
            pages in the document.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        # Reject 0 and negatives explicitly: after the "- 1" below they would
        # otherwise be negative indexes and silently select a page counted
        # from the end of the document.
        if not 1 <= page_number <= page_count:
            raise IndexError(
                f"page_number {page_number} is out of range for "
                f"{pdf_path} (document has {page_count} pages)"
            )
        page = pdf.pages[page_number - 1]  # pdf.pages is zero-indexed
        return [pd.DataFrame(table) for table in page.extract_tables()]

# Function to preprocess images for OCR
def preprocess_image(image):
    """Prepare a page image for OCR.

    The image is converted to a NumPy array, reduced to grayscale, binarized
    with a Gaussian adaptive threshold and finally enlarged by a factor of two
    in both directions using cubic interpolation.

    Args:
        image: Page image accepted by ``np.array``; treated as RGB by the
            grayscale conversion.

    Returns:
        The processed single-channel image as returned by ``cv2.resize``.
    """
    rgb_pixels = np.array(image)
    grayscale = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
    # Positional arguments: max value 255, Gaussian method, binary output,
    # 11-pixel neighbourhood, constant 2 subtracted from the weighted mean.
    binarized = cv2.adaptiveThreshold(
        grayscale,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
    # Upscale last, exactly as before, so the output is unchanged.
    return cv2.resize(binarized, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

# Function to perform OCR on a PDF page
def perform_ocr_on_page(pdf_path, page_number):
    """Run Tesseract OCR on a single page of a PDF and return its text.

    The page is rendered with ``convert_from_path`` (``first_page`` and
    ``last_page`` both set to ``page_number``), each rendered image goes
    through ``preprocess_image``, and the text recognised with
    ``--psm 6`` is collected.

    Args:
        pdf_path: Path of the PDF file to render.
        page_number: Page to render, passed as both first and last page.

    Returns:
        The recognised text of all rendered images joined with newlines.
    """
    page_images = convert_from_path(
        pdf_path, first_page=page_number, last_page=page_number
    )
    return "\n".join(
        pytesseract.image_to_string(preprocess_image(page_image), config="--psm 6")
        for page_image in page_images
    )

# Process each PDF file
# Run the whole loop inside the writer's context manager so the workbook is
# saved and its file handle released even if one of the PDFs raises part-way.
with writer:
    for pdf_path, page_number in pdf_files.items():
        print(f"Processing {pdf_path} - Page {page_number}...")

        # Build sheet names from the file's stem, not its full path: the path
        # only adds sanitized directory noise ("D__Maiora_...") and pushes the
        # title past Excel's 31-character sheet-name limit.
        # PureWindowsPath accepts both "\\" and "/" separators.
        pdf_stem = PureWindowsPath(pdf_path).stem

        # Try extracting tables first
        tables = extract_tables_from_pdf(pdf_path, page_number)

        if tables:
            for table_index, table in enumerate(tables, start=1):
                sheet_name = sanitize_sheet_name(
                    f"{pdf_stem}_Page{page_number}_Table{table_index}"
                )
                table.to_excel(writer, sheet_name=sheet_name, index=False)
            continue

        # If no tables are found, use OCR
        ocr_text = perform_ocr_on_page(pdf_path, page_number)
        # One spreadsheet row per recognised line. Wrapping the list of lines
        # in another list (as before) produced a single row with one column
        # per line. Whitespace-only lines carry no content and are skipped.
        ocr_lines = [line for line in ocr_text.splitlines() if line.strip()]
        ocr_frame = pd.DataFrame({"text": ocr_lines})
        sheet_name = sanitize_sheet_name(f"{pdf_stem}_Page{page_number}_OCR")
        ocr_frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Extraction completed! Saved as {output_file}")

Processing D:\Maiora\cardio_structured.pdf - Page 6...




Processing D:\Maiora\prot_sap_102.pdf - Page 50...
Processing D:\Maiora\prot_sap_1.pdf - Page 14...
Extraction completed! Saved as D:\Maiora\extracted_tables.xlsx
