In [15]:
!pip install pdfplumber




[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import pdfplumber
import fitz  # PyMuPDF
import json
import tkinter as tk
from tkinter import filedialog, messagebox

def extract_sections(page):
    """Group a page's characters into text lines with basic font metadata.

    Characters are bucketed by their ``top`` coordinate rounded to one
    decimal place, so glyphs that share (almost) the same vertical position
    end up on the same line. Lines whose text is empty after stripping
    whitespace are dropped.

    Args:
        page: Object exposing ``chars``, an iterable of dicts with ``text``,
            ``top``, ``size`` and ``fontname`` keys (and normally ``x0``).

    Returns:
        A list of dicts, ordered top to bottom, each with:
        ``text`` (stripped line text), ``font_size`` (largest glyph size on
        the line), ``is_bold`` (whether any glyph's font name contains
        "bold", case-insensitively) and ``y0`` (the rounded ``top`` value).
    """
    lines = {}
    for char in page.chars:
        lines.setdefault(round(char['top'], 1), []).append(char)

    line_infos = []
    for y0 in sorted(lines):
        # The source order of chars is not guaranteed to be left-to-right,
        # so order glyphs by horizontal position before joining them. The
        # sort is stable, so chars lacking ``x0`` keep their relative order.
        chars = sorted(lines[y0], key=lambda c: c.get('x0', 0))
        text = ''.join(c['text'] for c in chars).strip()
        if not text:
            continue
        line_infos.append({
            'text': text,
            'font_size': max(c['size'] for c in chars),
            # Case-insensitive so names such as "ARIAL-BOLD" are recognised.
            'is_bold': any('bold' in c['fontname'].lower() for c in chars),
            'y0': y0,
        })
    return line_infos

def classify_sections(line_infos):
    """Label each line as a section heading, sub-section heading or paragraph.

    The largest font size found among the lines marks section headings and
    the second largest (if there is one) marks sub-section headings; in both
    cases the line must also be bold. Every other line is a paragraph.

    Args:
        line_infos: List of dicts with ``text``, ``font_size``, ``is_bold``
            and ``y0`` keys.

    Returns:
        A list of dicts with ``type`` ('section', 'sub_section' or
        'paragraph'), ``text`` and ``y0``, in the same order as the input.
        An empty input yields an empty list.
    """
    if not line_infos:
        return []

    distinct_sizes = sorted(
        {entry['font_size'] for entry in line_infos}, reverse=True
    )
    section_size = distinct_sizes[0]
    sub_section_size = distinct_sizes[1] if len(distinct_sizes) > 1 else None

    def kind_of(entry):
        size = entry['font_size']
        if size == section_size:
            # Largest size: a heading only when bold. It can never equal the
            # sub-section size, because the sizes are distinct.
            return 'section' if entry['is_bold'] else 'paragraph'
        if sub_section_size and size == sub_section_size and entry['is_bold']:
            return 'sub_section'
        return 'paragraph'

    return [
        {'type': kind_of(entry), 'text': entry['text'], 'y0': entry['y0']}
        for entry in line_infos
    ]

def extract_tables(page):
    """Return every table on the page as a list of rows of cleaned strings.

    Args:
        page: Object whose ``extract_tables()`` returns an iterable of
            tables, each a list of rows, each row a list of cells.

    Returns:
        A list of tables. Each cell is stripped of surrounding whitespace;
        falsy cells (``None`` or an empty string) become ``""``.
    """
    return [
        [[cell.strip() if cell else "" for cell in row] for row in raw_table]
        for raw_table in page.extract_tables()
    ]

def extract_charts(doc, page_number):
    """List the images found on a page as placeholder chart records.

    Args:
        doc: Document object providing ``load_page(index)``; the returned
            page must provide ``get_images(full=True)``.
        page_number: 1-based page number; converted to a 0-based index
            before loading the page.

    Returns:
        One dict per image with a fixed ``description`` and ``image_xref``
        set to the first element of the image entry.
    """
    pdf_page = doc.load_page(page_number - 1)
    return [
        {
            "description": "Chart/image detected on page",
            "image_xref": image_entry[0],
        }
        for image_entry in pdf_page.get_images(full=True)
    ]

def _collect_text_blocks(classified):
    """Merge consecutive paragraph lines under their current headings.

    Args:
        classified: List of dicts with ``type`` and ``text`` keys, as
            produced by ``classify_sections``.

    Returns:
        A ``(blocks, last_section)`` tuple. ``blocks`` is a list of
        paragraph dicts (``type``, ``section``, ``sub_section``, ``text``);
        ``last_section`` is the text of the last section heading seen, or
        ``None`` if there was none.
    """
    blocks = []
    section = None
    sub_section = None
    pending = []

    def flush():
        # Emit the buffered lines as one paragraph tagged with the headings
        # that were in effect while the lines were collected.
        if pending:
            blocks.append({
                "type": "paragraph",
                "section": section,
                "sub_section": sub_section,
                "text": ' '.join(pending)
            })
            pending.clear()

    for item in classified:
        if item['type'] == 'section':
            flush()
            section = item['text']
            # A new section ends whatever sub-section was open.
            sub_section = None
        elif item['type'] == 'sub_section':
            flush()
            sub_section = item['text']
        else:
            pending.append(item['text'])
    flush()
    return blocks, section


def parse_pdf(pdf_path):
    """Extract paragraphs, tables and image placeholders from every page.

    Each page is processed independently: section and sub-section tracking
    starts from ``None`` on every page. Tables and charts are tagged with
    the last section heading seen on their page. Chart entries carry only
    the description; the image xref is not included in the output.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        ``{"pages": [...]}`` where each page is a dict with ``page_number``
        (1-based) and ``content``, a list of paragraph, table and chart
        dicts in that order.
    """
    result = {"pages": []}
    # Both handles are context-managed so the PyMuPDF document is closed too,
    # even when a page fails to parse.
    with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as doc:
        for i, page in enumerate(pdf.pages, start=1):
            classified = classify_sections(extract_sections(page))
            page_content, current_section = _collect_text_blocks(classified)

            for table in extract_tables(page):
                page_content.append({
                    "type": "table",
                    "section": current_section,
                    "description": None,
                    "table_data": table
                })

            for chart in extract_charts(doc, i):
                page_content.append({
                    "type": "chart",
                    "section": current_section,
                    "description": chart.get("description"),
                    "table_data": None
                })

            result["pages"].append({
                "page_number": i,
                "content": page_content
            })

    return result

def select_pdf_and_save_json():
    """Ask for a PDF, parse it, and save the result as JSON via GUI dialogs.

    Shows a file-open dialog for the PDF, runs ``parse_pdf`` on it, then
    shows a save dialog and writes the parsed data as UTF-8 JSON. Cancelling
    either dialog, or any failure while parsing or saving, is reported in a
    message box and the function returns without raising.

    Returns:
        None.
    """
    root = tk.Tk()
    # Hide the empty root window; only the dialogs should be visible.
    root.withdraw()

    try:
        pdf_path = filedialog.askopenfilename(
            title="Select PDF file",
            filetypes=[("PDF files", "*.pdf")]
        )
        if not pdf_path:
            messagebox.showinfo("Cancelled", "No PDF file selected. Exiting.")
            return

        # Top-level UI boundary: any parsing failure is shown to the user.
        try:
            data = parse_pdf(pdf_path)
        except Exception as e:
            messagebox.showerror("Error", f"Failed to extract PDF content:\n{e}")
            return

        json_path = filedialog.asksaveasfilename(
            title="Save JSON file",
            defaultextension=".json",
            filetypes=[("JSON files", "*.json")]
        )
        if not json_path:
            messagebox.showinfo("Cancelled", "No save location selected. Exiting.")
            return

        try:
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=4, ensure_ascii=False)
            messagebox.showinfo("Success", f"JSON saved successfully:\n{json_path}")
        except Exception as e:
            messagebox.showerror("Error", f"Failed to save JSON file:\n{e}")
    finally:
        # Tear down the hidden root on every exit path so repeated calls
        # (e.g. from a notebook) do not accumulate Tk interpreters.
        root.destroy()

# Entry point: start the interactive PDF-to-JSON workflow only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    select_pdf_and_save_json()
