In [1]:
import os
from tabled.extract import extract_tables
from tabled.fileinput import load_pdfs_images
from tabled.inference.models import load_detection_models, load_recognition_models, load_layout_models

def extract_tables_from_pdf(pdf_path, output_dir, output_format="markdown"):
    """
    Extract tables from a PDF and save them in the specified format.

    Args:
        pdf_path (str): Path to the PDF file.
        output_dir (str): Directory where the output files will be saved.
        output_format (str): Format for the output files (markdown, csv, or html). Default is 'markdown'.

    Returns:
        None
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Load the detection, recognition, and layout models
    det_models = load_detection_models()
    rec_models = load_recognition_models()
    layout_models = load_layout_models()

    # Load PDF pages as images
    images, highres_images, names, text_lines = load_pdfs_images(pdf_path)

    # Extract tables from the PDF pages
    page_results = extract_tables(images, highres_images, text_lines, det_models, layout_models, rec_models)

    # Process and save the tables
    for i, page_result in enumerate(page_results):
        for table_index, table in enumerate(page_result):
            table_data = table["cells"]

            # Save as Markdown, CSV, or HTML
            output_file = os.path.join(output_dir, f"page_{i + 1}_table_{table_index + 1}.{output_format}")
            with open(output_file, "w", encoding="utf-8") as f:
                if output_format == "markdown":
                    f.write("|" + "|".join([cell["text"] for cell in table_data[0]]) + "|\n")
                    f.write("|" + "|".join(["---" for _ in table_data[0]]) + "|\n")
                    for row in table_data:
                        f.write("|" + "|".join([cell["text"] for cell in row]) + "|\n")
                elif output_format == "csv":
                    for row in table_data:
                        f.write(",".join([cell["text"] for cell in row]) + "\n")
                elif output_format == "html":
                    f.write("<table>\n")
                    f.write("<tr>" + "".join([f"<th>{cell['text']}</th>" for cell in table_data[0]]) + "</tr>\n")
                    for row in table_data:
                        f.write("<tr>" + "".join([f"<td>{cell['text']}</td>" for cell in row]) + "</tr>\n")
                    f.write("</table>")

    print(f"Tables extracted and saved to {output_dir}")
import torch
if __name__ == "__main__":
    #clear cuda cache
    torch.cuda.empty_cache()
    pdf_path = "/home/tolis/Desktop/tolis/DNN/project/cs_ai_2023_pdfs/2004.14254.pdf"
    output_dir = "/home/tolis/Desktop/tolis/DNN/project/DeepLearning_2024_2025_DSIT/utils/output"
    output_format = "markdown"

    extract_tables_from_pdf(pdf_path, output_dir, output_format)


  from .autonotebook import tqdm as notebook_tqdm


Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16
Loaded table recognition model vikp/surya_tablerec on device cuda with dtype torch.float16
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded layout model datalab-to/surya_layout on device cuda with dtype torch.float16


Recognizing layout: 100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
Recognizing tables: 100%|██████████| 1/1 [00:00<00:00,  4.18it/s]


TypeError: tuple indices must be integers or slices, not str

: 