### Workflow Diagram

<img src="workflow.png" alt="workflow" width="750"/>

### Implementation

>Importing libraries

In [18]:
import os
from PyPDF2 import PdfMerger
import fitz
import camelot
from pathlib import Path

>Combine PDFs

In [2]:
# Merge every PDF found directly inside `folder` into a single "result.pdf"
# written to the current working directory.
folder = "pdfs"
files_and_folders = os.listdir(folder)
# Keep only *.pdf entries (case-insensitive). os.listdir() returns names in
# arbitrary order, so sort them to get a reproducible merge order.
# NOTE(review): the sort is lexicographic ("10.pdf" comes before "2.pdf");
# confirm that this is the intended page order.
files = sorted(f for f in files_and_folders if f.lower().endswith(".pdf"))
merger = PdfMerger()

try:
    for pdf in files:
        merger.append(os.path.join(folder, pdf))

    merger.write("result.pdf")
finally:
    # Always close the merger, even when an input fails to load or the write
    # fails part-way, so its file handles are not left open.
    merger.close()

The merged file, `result.pdf`, now contains 195 pages.

In [7]:
# Open the merged PDF and request block-level text from every page.
doc = fitz.open("result.pdf")
for page in doc:
    # NOTE(review): `text` is overwritten on each iteration and is not used
    # anywhere in this cell, so only the last page's result survives the loop.
    text = page.get_text("blocks")

In [None]:
# Run camelot over all pages of the merged PDF and write each detected table
# to its own CSV file, numbered in the order camelot returns them.
tables = camelot.read_pdf("result.pdf", pages = "all")
for i, table in enumerate(tables):
    df = table.df
    # index=False keeps the DataFrame's row index out of the CSV file.
    df.to_csv(f"table_{i}.csv", index=False)

In [None]:
# Make sure the output directory exists before any image is written.
os.makedirs("images", exist_ok=True)

doc = fitz.open("result.pdf")

for page_number, page in enumerate(doc):
    # Walk every image entry listed for this page; the first field of each
    # entry is the xref that is handed to extract_image().
    for img_index, img in enumerate(page.get_images(full=True)):
        base_image = doc.extract_image(img[0])
        image_ext = base_image["ext"]
        image_bytes = base_image["image"]

        # Page numbers in the file name are 1-based; image indices are 0-based.
        out_path = f"images/page{page_number+1}_img{img_index}.{image_ext}"
        with open(out_path, "wb") as f:
            f.write(image_bytes)

In [None]:
def _collect_text_blocks(page):
    """Return one "TEXT DATA" entry for each non-empty text block on *page*."""
    entries = []
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # type 0 is text; skip every other block type
            continue
        text_content = " ".join(
            span["text"] for line in block["lines"] for span in line["spans"]
        ).strip()
        if text_content:
            entries.append({
                "type": "TEXT DATA",
                "y": block["bbox"][1],
                "content": text_content,
            })
    return entries


def _collect_table_blocks(pdf_file, page_num, page_height):
    """Return one "TABLE DATA" entry for each lattice table camelot finds on the page."""
    try:
        # lattice flavor to extract tables
        tables = camelot.read_pdf(str(pdf_file), pages=str(page_num), flavor="lattice")
    except Exception as e:
        # Best effort: a page camelot cannot parse contributes no tables
        # instead of aborting the whole extraction.
        print(f"Failed to read tables on page {page_num}: {e}")
        return []

    entries = []
    for table in tables:
        bbox = table._bbox
        # camelot reports the bounding box in PDF space (origin bottom-left,
        # y grows upward) while the text blocks use PyMuPDF coordinates
        # (origin top-left, y grows downward). Convert the table's top edge
        # to a distance from the top of the page so both kinds of entry are
        # sorted on the same axis.
        # NOTE(review): assumes an unrotated page whose visible area matches
        # its media box - confirm for rotated or cropped PDFs.
        top_edge = max(float(bbox[1]), float(bbox[3]))
        entries.append({
            "type": "TABLE DATA",
            "y": page_height - top_edge,
            "content": table.data,
        })
    return entries


def extract_text_and_tables(pdf_path):
    """Extract text blocks and lattice tables from a PDF in top-to-bottom order.

    Args:
        pdf_path: Path to the PDF file (anything accepted by ``pathlib.Path``).

    Returns:
        A flat list of dicts covering the pages in document order. Each dict
        has the keys ``"type"`` (``"TEXT DATA"`` or ``"TABLE DATA"``), ``"y"``
        (vertical position of the entry's top edge, measured from the top of
        the page) and ``"content"`` (the joined text of a block, or the
        table's rows as returned by camelot). Entries of one page are sorted
        by ascending ``"y"``.

    Raises:
        FileNotFoundError: If *pdf_path* is not an existing file or does not
            have a ``.pdf`` suffix.
    """
    pdf_file = Path(pdf_path)
    if not pdf_file.is_file() or pdf_file.suffix.lower() != ".pdf":
        raise FileNotFoundError("Provided file path is not a valid PDF.")

    result = []
    doc = fitz.open(str(pdf_file))
    try:
        # camelot addresses pages starting at 1, hence start=1.
        for page_num, page in enumerate(doc, start=1):
            page_blocks = _collect_text_blocks(page)
            page_blocks.extend(
                _collect_table_blocks(pdf_file, page_num, page.rect.height)
            )

            page_blocks.sort(key=lambda b: b["y"])  # sort contents on current page
            result.extend(page_blocks)
    finally:
        # Close the document even if extraction fails part-way through.
        doc.close()

    return result