In [1]:
import os
from gmft.pdf_bindings import PyPDFium2Document
from gmft.auto import AutoTableDetector, AutoTableFormatter
from gmft.table_function import TATRFormatConfig
from PIL import Image
import pytesseract
import matplotlib.pyplot as plt
import pandas as pd

# Table detector used by ingest_pdf to locate tables on each page.
detector = AutoTableDetector()

# Formatter configuration for complex tables: start from the library
# defaults, then apply the overrides below in order.
config = TATRFormatConfig()
for _option, _value in (
    ("force_large_table_assumption", True),  # Enable for complex/large tables
    ("enable_multi_header", True),           # Enable hierarchical headers
    ("semantic_spanning_cells", True),       # Merge spanning cells
    ("verbosity", 3),                        # Verbose logs for debugging
):
    setattr(config, _option, _value)

# Formatter used by display_tables to turn detected tables into dataframes.
formatter = AutoTableFormatter(config=config)

# Function to preprocess an image for OCR
def preprocess_image(image):
    """
    Prepare an image for OCR.

    The only preprocessing step is a conversion to mode ``'L'``
    (grayscale) through the image's ``convert`` method; the converted
    image is returned.
    """
    grayscale = image.convert("L")
    return grayscale

# Function to extract tables from a PDF
def ingest_pdf(pdf_path):
    """
    Detect tables in a PDF document and return them with the document object.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(tables, doc)`` tuple: the list of tables detected across all
        pages, and the still-open document. The caller is responsible for
        calling ``doc.close()`` once it has finished with the tables.

    Raises:
        Whatever opening the document or running the detector raises. If
        detection fails part-way through, the document is closed before the
        exception propagates, since the caller never gets a handle to close.
    """
    doc = PyPDFium2Document(pdf_path)
    try:
        tables = []
        for page in doc:
            tables.extend(detector.extract(page))
    except BaseException:
        # Cleanup only: release the document, then re-raise unchanged.
        doc.close()
        raise
    return tables, doc

# Function to extract data from an image using OCR
def extract_text_from_image(image, dpi=144):
    """
    Extract text from a table image using OCR.

    Args:
        image: Image to recognise; it is run through ``preprocess_image``
            before being handed to Tesseract.
        dpi: Resolution the image was rendered at. Forwarded to Tesseract
            through its ``--dpi`` option so it does not have to guess the
            resolution. Pass ``None`` (or 0) to omit the hint.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    preprocessed_image = preprocess_image(image)
    # Previously `dpi` was accepted but silently ignored; pass it on as a
    # resolution hint. An empty config string is pytesseract's default.
    ocr_config = f"--dpi {dpi}" if dpi else ""
    return pytesseract.image_to_string(preprocessed_image, config=ocr_config)

# Function to display and process tables
def display_tables(tables, output_dir=None):
    """
    Print each detected table as a dataframe, optionally saving its image.

    For every table an image is rendered at 144 dpi. When ``output_dir`` is
    given, the image is saved there as ``table_<n>.png`` (1-based). The
    table is then run through the module-level ``formatter``; a non-empty
    dataframe is printed, while an empty one triggers an OCR fallback on the
    rendered image.

    Args:
        tables: Iterable of detected tables.
        output_dir: Directory for the table images, or ``None`` to skip
            saving. It is created up front if it does not exist, even when
            ``tables`` turns out to be empty.

    Returns:
        None. Results are reported on stdout.
    """
    # Creating the directory is loop-invariant, so do it once here rather
    # than on every iteration.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i, table in enumerate(tables):
        print(f"Processing Table {i + 1}:")

        # Render the table image and save it if requested
        image = table.image(dpi=144)
        if output_dir:
            image_path = os.path.join(output_dir, f"table_{i + 1}.png")
            image.save(image_path)
            print(f"Table image saved to {image_path}")

        # Attempt to extract table data
        try:
            formatted_table = formatter.extract(table)
            df = formatted_table.df()
            if not df.empty:
                print(df.to_string(index=False))
            else:
                # Fallback to OCR if table is empty
                print("Table is empty. Attempting OCR...")
                text = extract_text_from_image(image)
                print("OCR Extracted Text:")
                print(text)
        except Exception as e:
            # Deliberately best-effort: a failure on one table is reported
            # and must not stop the remaining tables from being processed.
            print(f"Failed to extract data from table {i + 1}: {e}")
        print("-" * 80)

# Main function to process a PDF
def extract_and_display_pdf_tables(pdf_path, output_dir=None):
    """
    Extract and display tables from a PDF.

    Args:
        pdf_path: Path of the PDF file to process.
        output_dir: Optional directory passed through to ``display_tables``
            for saving the table images.

    Returns:
        None. Progress and results are printed to stdout.
    """
    print(f"Processing {pdf_path}...")
    tables, doc = ingest_pdf(pdf_path)
    try:
        print(f"Detected {len(tables)} tables.")
        display_tables(tables, output_dir=output_dir)
    finally:
        # Close the document even if displaying a table raises, so the
        # underlying PDF handle is not leaked.
        doc.close()
    print(f"Finished processing {pdf_path}.")

# Example usage
# Guarded so that the example only runs when this code is executed directly
# (a script or a notebook cell, where __name__ is "__main__"), not on import.
if __name__ == "__main__":
    # Path of the PDF to process. No output directory is passed, so the
    # table images are not saved.
    pdf_path = "/home/tolis/Desktop/tolis/DNN/project/cs_ai_2023_pdfs/2004.14254.pdf"  # Replace with the actual PDF file path
    extract_and_display_pdf_tables(pdf_path)


  from .autonotebook import tqdm as notebook_tqdm


Processing /home/tolis/Desktop/tolis/DNN/project/cs_ai_2023_pdfs/2004.14254.pdf...
Detected 2 tables.
Processing Table 1:
Invoking large table row guess! set TATRFormatConfig.force_large_table_assumption to False to disable this.
        Name ♯ of user goal ♯ of diseases avg. ♯ of im. sym. ♯ of sym.
        MZ-4          1,733             4               5.46       230
       MZ-10          4,116            10               6.60       331
         Dxy            527             5               1.67        41
SymCat-SD-90         30,000            90               2.60       266
--------------------------------------------------------------------------------
Processing Table 2:
Invoking large table row guess! set TATRFormatConfig.force_large_table_assumption to False to disable this.
Group id Success rate Ave intrinsic reward Match rate Activation times
       1        48.6%                0.031     16.74%            0.615
       4        54.6%               -0.150      5.02%           