In [18]:
pip install PyMuPDF pytesseract pandas opencv-python



In [19]:
!sudo apt-get install tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


In [20]:
!pip install pytesseract fitz PyMuPDF opencv-python pandas openpyxl




In [21]:
# Upgrade the PDF parsing / OCR / dataframe packages to their latest releases.
!pip install --upgrade pymupdf pdfplumber pytesseract pandas




In [12]:
# Install pdfplumber (imported by the extraction cell for table extraction).
!pip install pdfplumber




In [13]:
import pytesseract
# Tell pytesseract which Tesseract executable to run.
# NOTE(review): /usr/bin/tesseract is presumably where the apt package puts the
# binary on this machine - confirm with `which tesseract`.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'


In [15]:
# Force a clean reinstall of PyMuPDF: remove the installed copy without prompting
# (-y), then install it again while bypassing pip's download cache.
!pip uninstall pymupdf -y
!pip install --no-cache-dir pymupdf


Found existing installation: PyMuPDF 1.25.2
Uninstalling PyMuPDF-1.25.2:
  Successfully uninstalled PyMuPDF-1.25.2
Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m229.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.2


In [24]:

import pdfplumber
import fitz
import pytesseract
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
import io
# Locate the Tesseract executable for whichever platform the notebook runs on.
# A hard-coded Windows path here would make every OCR call fail on a Linux
# runtime (this notebook installs Tesseract with apt-get and reads /content/...).
import os
import shutil

pytesseract.pytesseract.tesseract_cmd = next(
    (
        candidate
        for candidate in (
            shutil.which("tesseract"),                        # whatever is on PATH
            r'/usr/bin/tesseract',                            # apt-installed location
            r'C:\Program Files\Tesseract-OCR\tesseract.exe',  # default Windows installer
        )
        if candidate and os.path.isfile(candidate)
    ),
    "tesseract",  # nothing found: fall back to a plain PATH lookup at call time
)


def extract_table(pdf_path, page_number):
    """Extract every table that pdfplumber detects on one page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: 1-based number of the page to read.

    Returns:
        A list of DataFrames, one per non-empty table found on the page. The
        first row of each table is used as the column header and the remaining
        rows as data. The list is empty when no table is found.

    Raises:
        IndexError: If ``page_number`` is smaller than 1, or larger than the
            number of pages in the document.
    """
    # Reject 0 / negative page numbers before indexing: `pages[page_number - 1]`
    # would otherwise wrap around and silently read a page counted from the end.
    if page_number < 1:
        raise IndexError(f"page_number must be >= 1, got {page_number}")

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[page_number - 1]
        tables = page.extract_tables()

    # `if table` skips empty tables, for which `table[0]` would not exist.
    return [pd.DataFrame(table[1:], columns=table[0]) for table in tables if table]

def extract_text(pdf_path, page_number):
    """OCR every image embedded in one page of a PDF.

    Each image is converted to grayscale, upscaled 2x, contrast-boosted and
    median-filtered before being passed to Tesseract.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: 1-based number of the page to read.

    Returns:
        One string with the per-image results joined by newlines. An image that
        yields no visible text contributes "[No Text Found in Image]"; an image
        that fails to process contributes an "Error processing image ..." line
        instead of aborting the whole page. The string is empty when the page
        has no embedded images.

    Raises:
        IndexError: If ``page_number`` is smaller than 1.
    """
    # Reject 0 / negative page numbers before indexing: `doc[page_number - 1]`
    # would otherwise wrap around and silently read a page counted from the end.
    if page_number < 1:
        raise IndexError(f"page_number must be >= 1, got {page_number}")

    extracted_text = []
    doc = fitz.open(pdf_path)
    try:
        page = doc[page_number - 1]
        images = page.get_images(full=True)

        for img_index, img in enumerate(images):
            try:
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image = Image.open(io.BytesIO(image_bytes))

                image = image.convert("L")  # grayscale
                image = image.resize((image.width * 2, image.height * 2))  # upscale
                image = ImageEnhance.Contrast(image).enhance(2.0)  # increase contrast
                image = image.filter(ImageFilter.MedianFilter())  # reduce noise

                ocr_text = pytesseract.image_to_string(image, config="--psm 6")
                # Whitespace-only output (e.g. a lone newline or form feed) is
                # truthy, so test the stripped text to detect "nothing recognised".
                extracted_text.append(
                    ocr_text if ocr_text.strip() else "[No Text Found in Image]"
                )
            except Exception as e:
                # Deliberate best-effort: one bad image must not lose the others.
                extracted_text.append(f"Error processing image {img_index + 1}: {e}")
    finally:
        # Release the document handle even if page lookup or image listing fails.
        doc.close()

    return "\n".join(extracted_text)

def process_pdfs_excel(pdf_pages, output_excel):
    """Extract tables and OCR text from selected PDF pages.

    Tables go to one Excel workbook, each table on its own sheet named
    ``"<sheet_name>_Table_<n>"``. OCR text goes to a sibling text file whose
    name is ``output_excel`` with its extension replaced by ``"_ocr_text.txt"``.
    The text file is only created when at least one page yields OCR text, and
    it is rewritten from scratch on each call, so re-running does not
    duplicate sections.

    Args:
        pdf_pages: Iterable of ``(pdf_path, page_number, sheet_name)`` tuples;
            ``page_number`` is 1-based.
        output_excel: Path of the ``.xlsx`` workbook to write.
    """
    import os  # local import keeps this block self-contained

    # Derive the text path from the file stem. `str.replace(".xlsx", ...)` leaves
    # the path unchanged when it does not contain ".xlsx", in which case the OCR
    # text would be appended into the workbook file itself.
    ocr_text_path = os.path.splitext(output_excel)[0] + "_ocr_text.txt"
    ocr_file_started = False

    with pd.ExcelWriter(output_excel, engine="openpyxl") as writer:
        for pdf_path, page_number, sheet_name in pdf_pages:
            print(f"Processing {pdf_path} - Page {page_number}")

            tables = extract_table(pdf_path, page_number)
            for idx, df in enumerate(tables):
                df.to_excel(writer, sheet_name=f"{sheet_name}_Table_{idx+1}", index=False)

            ocr_text = extract_text(pdf_path, page_number)
            if ocr_text:
                # Truncate on the first write of this call, append afterwards.
                mode = "a" if ocr_file_started else "w"
                with open(ocr_text_path, mode, encoding="utf-8") as f:
                    f.write(f"\n--- {sheet_name} (Page {page_number}) ---\n")
                    f.write(ocr_text)
                ocr_file_started = True

# Pages to process, one tuple per page: (PDF path, 1-based page number,
# sheet-name prefix used for that page's tables in the workbook).
pdf_files = [
    ("/content/cardio_structured.pdf", 6, "cardio_structured"),
    ("/content/prot_sap_102.pdf", 50, "prot_sap_102"),
    ("/content/prot_sap_1.pdf", 14, "prot_sap_1")
]

# Destination workbook; OCR text, if any, goes to a sibling "_ocr_text.txt" file.
output_file = "/content/extracted_tables.xlsx"
process_pdfs_excel(pdf_files, output_file)

print(f"Extracted tables saved to {output_file}")









Processing /content/cardio_structured.pdf - Page 6
Processing /content/prot_sap_102.pdf - Page 50
Processing /content/prot_sap_1.pdf - Page 14
Extracted tables saved to /content/extracted_tables.xlsx
