# This Python-based tool extracts data from PDF documents containing a mix of text, tables, images, and graphs, and outputs the data in JSON format.

In [1]:
pip install PyMuPDF pdfplumber Pillow

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)

In [2]:
import fitz  # PyMuPDF
import pdfplumber
import json
import os


In [3]:
def extract_text_from_pdf(pdf_path):
    """Collect the text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A list holding the result of ``page.get_text()`` for each page,
        in document order.
    """
    with fitz.open(pdf_path) as document:
        return [pdf_page.get_text() for pdf_page in document]

In [4]:
def extract_images_from_pdf(pdf_path, output_dir=None):
    """Save every image referenced by a PDF's pages to disk.

    Each image is written as ``image_page_<page>_<index>.<ext>``, where
    ``<page>`` and ``<index>`` are 1-based and ``<ext>`` is the extension
    reported by ``doc.extract_image``.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        output_dir: Optional directory to write the image files into. It is
            created if it does not exist. When ``None`` (the default) the
            files are written relative to the current working directory
            using the bare file name, as before.

    Returns:
        A list of ``{"page": int, "image_filename": str}`` dicts, one per
        saved image. ``image_filename`` is the path the file was written to.
    """
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    images_data = []
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            page_images = page.get_images(full=True)
            for img_index, img in enumerate(page_images, start=1):
                # The first entry of each image tuple is the xref used to
                # look the image up in the document.
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_filename = f"image_page_{page_number}_{img_index}.{base_image['ext']}"
                if output_dir is not None:
                    # Keep images from different PDFs apart when the caller
                    # supplies a dedicated directory.
                    image_filename = os.path.join(output_dir, image_filename)
                with open(image_filename, "wb") as img_file:
                    img_file.write(base_image["image"])
                images_data.append({
                    "page": page_number,
                    "image_filename": image_filename
                })
    return images_data


In [5]:
def extract_tables_from_pdf(pdf_path):
    """Gather every table pdfplumber finds in a PDF, tagged with its page.

    Args:
        pdf_path: Path of the PDF file to open with ``pdfplumber``.

    Returns:
        A list of ``{"page": int, "table": table}`` dicts, where ``page`` is
        the 1-based page number and ``table`` is one item returned by
        ``page.extract_tables()``. Entries are ordered by page, then by the
        order in which the page reported its tables.
    """
    with pdfplumber.open(pdf_path) as pdf_doc:
        return [
            {"page": page_no, "table": found_table}
            for page_no, pdf_page in enumerate(pdf_doc.pages, start=1)
            for found_table in pdf_page.extract_tables()
        ]

In [6]:
def extract_pdf_data(pdf_path):
    """Run the text, image and table extractors on one PDF.

    The extractors are called in that order, each with ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file handed to each extractor.

    Returns:
        A dict with the keys ``"text"``, ``"images"`` and ``"tables"``
        holding the respective extractor's return value.
    """
    page_texts = extract_text_from_pdf(pdf_path)
    image_records = extract_images_from_pdf(pdf_path)
    table_records = extract_tables_from_pdf(pdf_path)
    return {
        "text": page_texts,
        "images": image_records,
        "tables": table_records,
    }


In [7]:
def main(pdf_path):
    """Extract a PDF's contents and save them as JSON next to the PDF.

    The output file is named after the input: a trailing ``.pdf`` extension
    (matched case-insensitively) is dropped and ``_extracted_data.json`` is
    appended. For any other input name the suffix is appended to the full
    path, so the output can never be the input file itself.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. Prints a message and returns early if ``pdf_path`` does not
        exist; otherwise prints the path of the JSON file that was written.
    """
    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} does not exist.")
        return

    extracted_data = extract_pdf_data(pdf_path)
    # Serialise fully before opening the output so a serialisation error
    # does not leave a truncated file behind.
    json_output = json.dumps(extracted_data, indent=4)

    # Only the final extension is considered: str.replace('.pdf', ...) would
    # also rewrite '.pdf' inside directory names, and would return the input
    # path unchanged for names such as 'doc.PDF', overwriting the source PDF.
    root, extension = os.path.splitext(pdf_path)
    output_stem = root if extension.lower() == ".pdf" else pdf_path
    output_json_file = f"{output_stem}_extracted_data.json"
    with open(output_json_file, 'w') as json_file:
        json_file.write(json_output)

    print(f"Data extracted and saved to {output_json_file}")

# Entry point when this code is executed directly: run the extraction on a
# placeholder path. Edit the path below to point at a real PDF.
if __name__ == "__main__":
    pdf_file_path = "your_pdf_file.pdf"  # Replace with your PDF file path
    main(pdf_file_path)

File your_pdf_file.pdf does not exist.


In [8]:
# Run the extraction on the sample paper.
pdf_file_path = "attention_paper.pdf"  # Replace with your PDF file path (e.g. /content/attention_paper.pdf)
main(pdf_file_path)

Data extracted and saved to attention_paper_extracted_data.json
