In [None]:
from logging import config
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
# from marker.schema import BlockTypes

'''
-- page_range: List of page numbers to process. By default, all pages are processed. eg. [0,1,5-6,...]
-- ignore_TOC: Whether to ignore the page with Table of Contents page if detected. By default, False.
-- use_llm: Whether to use LLM to enhance the parsing of the document. By default, False.
-- ollama_base_url: Base URL for the Ollama LLM service. Default is 'http://localhost:11434'.
-- llm_service: The LLM service class to use. Default is 'marker.services.ollama.OllamaService'.
-- renderer: The format of the output document can be one of the following ('markdown|pageMarkdown|chunks|pageMarkdown+chunks'). Default is 'pageMarkdown+chunks'.
    - for 'markdown', the output is a dict with the key 'markdown', Dict[str, str].
    - for 'pageMarkdown', the output is a dict with the key 'page_renders', containing page numbers as keys and 'markdown' or 'html' as values, Dict[int, Dict[str, str]].
    - for 'chunks', the output is a dict with the key 'chunks', containing 'page_id' as keys and 'html' as values for the text, Dict[str, Dict[str, Any]].
    - for 'pageMarkdown+chunks', the output is a dict with the keys 'page_renders' and 'chunks'.
    Output json format contains: 
    --- 'page_structure': List[Dict[str, Any]]  (List of page-wise structure with text and blocks)
    --- 'page_renders'/'chunks' contain the information of the document.
'''

# Build the PDF converter once; later cells call `converter.build_document(...)`
# and `converter.render_document(...)` on this instance.
converter = PdfConverter(
    # Model artifacts created by marker's `create_model_dict()` helper.
    artifact_dict = create_model_dict(),
    config = {
        # Optional LLM settings, currently disabled (see the option notes above).
        # "use_llm":True,
        # "ollama_base_url":"http://ollama-keda.mobiusdtaas.ai",
        # "llm_service": "marker.services.ollama.OllamaService",
        # Only these page numbers are processed.
        "page_range": [0, 4, 5],
        "ignore_TOC": True,
        # NOTE(review): this value is not among the renderer options listed in
        # the notes above ('markdown|pageMarkdown|chunks|pageMarkdown+chunks')
        # — confirm that the combined form is accepted.
        "renderer": "markdown+chunks+pageMarkdown"
    }
)


In [11]:
# Build the document object from the PDF at this relative path; `document`
# is what the render cell below passes to `converter.render_document`.
document = converter.build_document("./PDFs/ISO_IEC_22237-1_2021(en) (1).pdf")

Recognizing layout: 100%|██████████| 1/1 [00:02<00:00,  2.27s/it]


Resolved PageHeader = SectionHeader issue, height =  42.719914643093944


Running OCR Error Detection: 100%|██████████| 1/1 [00:00<00:00,  4.31it/s]
Detecting bboxes: 0it [00:00, ?it/s]


In [12]:
import json

# Render the previously built document, then pretty-print the complete
# result as JSON (2-space indentation) for inspection.
out_render = converter.render_document(document)
print(json.dumps(out_render, indent=2))

{
  "page_structure": {
    "0": [
      "/page/0/Text/0",
      "/page/0/Picture/1",
      "/page/0/Picture/2",
      "/page/0/Text/3",
      "/page/0/SectionHeader/4",
      "/page/0/SectionHeader/5",
      "/page/0/Text/6",
      "/page/0/Text/7",
      "/page/0/Picture/8",
      "/page/0/Text/9",
      "/page/0/Picture/10",
      "/page/0/Text/11",
      "/page/0/Text/12"
    ],
    "4": [
      "/page/4/PageHeader/1",
      "/page/4/SectionHeader/0",
      "/page/4/SectionHeader/2",
      "/page/4/Text/3",
      "/page/4/Text/4",
      "/page/4/Text/5",
      "/page/4/Text/6",
      "/page/4/Text/7",
      "/page/4/Text/8",
      "/page/4/Text/9",
      "/page/4/Text/10",
      "/page/4/ListGroup/209",
      "/page/4/Text/16",
      "/page/4/Text/17",
      "/page/4/PageFooter/18",
      "/page/4/PageFooter/19"
    ],
    "5": [
      "/page/5/Text/0",
      "/page/5/Text/1",
      "/page/5/SectionHeader/2",
      "/page/5/Text/3",
      "/page/5/Text/4",
      "/page/5/Text/5",
 

In [14]:
import json
from textwrap import indent

# assuming your dictionary is stored in a variable called `doc_data`

def display_document_info(doc_data: dict, max_chunks=None):
    """Print a human-readable overview of a rendered document dictionary.

    Each section is printed only when the corresponding key is present:

    * ``doc_data["chunks"]["/page/0/Text/0"]["html"]`` -> licence information
    * ``doc_data["metadata"]["table_of_contents"]``    -> table of contents
    * ``doc_data["page_structure"]``                   -> block ids per page
    * ``doc_data["chunks"]``                           -> 100-character text previews
    * ``doc_data["metadata"]["page_stats"]``           -> block counts per page

    Args:
        doc_data: Rendered output dictionary (e.g. the value returned by
            ``converter.render_document``).
        max_chunks: Maximum number of chunk previews to print. ``None``
            (the default) prints every chunk, which matches the previous
            behaviour; a negative value is treated as ``0``.

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: If a table-of-contents item lacks ``title``/``page_id`` or a
            page-stats entry lacks ``page_id``/``block_counts``.
    """
    preview_len = 100
    rule = "=" * 80

    print(rule)
    print("📘 Document Overview")
    print(rule)

    # Treat a present-but-None section the same as an empty one.
    has_chunks = "chunks" in doc_data
    chunks = doc_data.get("chunks") or {}
    metadata = doc_data.get("metadata") or {}

    # --- Licensed Info / Metadata ---
    # The first text block of page 0 is shown as the licence notice; it is
    # skipped when that block has no "html" value instead of raising.
    license_info = (chunks.get("/page/0/Text/0") or {}).get("html")
    if license_info is not None:
        print("\n🔒 License Information:")
        print(indent(license_info, "  "))

    # --- Table of Contents ---
    if "table_of_contents" in metadata:
        print("\n📑 Table of Contents:")
        for item in metadata["table_of_contents"] or []:
            print(f"  - {item['title']} (page {item['page_id']})")

    # --- Page Structure ---
    if "page_structure" in doc_data:
        print("\n📄 Page Structure:")
        for page, elements in doc_data["page_structure"].items():
            print(f"\n  Page {page}:")
            for elem in elements:
                print(f"    • {elem}")

    # --- Chunk Text (sample) ---
    if has_chunks:
        print("\n📝 Extracted Chunks (sample):")
        entries = list(chunks.items())
        shown = entries if max_chunks is None else entries[:max(0, max_chunks)]
        for key, value in shown:
            # A chunk whose "html" is missing or None is previewed as empty text.
            text = (value.get("html") or "").strip()
            print(f"  - {key}: {text[:preview_len]}{'...' if len(text) > preview_len else ''}")
        hidden = len(entries) - len(shown)
        if hidden:
            print(f"  ... {hidden} more chunk(s) not shown")

    # --- Page Stats ---
    if "page_stats" in metadata:
        print("\n📊 Page Statistics:")
        for stat in metadata["page_stats"] or []:
            block_counts = stat["block_counts"]
            # Accept either a sequence of (type, count) pairs or a mapping.
            pairs = block_counts.items() if isinstance(block_counts, dict) else block_counts
            counts = ", ".join(f"{typ}={count}" for typ, count in pairs)
            print(f"  Page {stat['page_id']}: {counts}")

    print("\n✅ Display Complete")
    print(rule)


# Print the overview for `out_render`, the value returned by
# `converter.render_document(document)` in the earlier cell.
# NOTE: the printed licence block contains a licensee name and e-mail address;
# clear this cell's output before sharing or committing the notebook.
display_document_info(out_render)


📘 Document Overview

🔒 License Information:
  <p block-type="Text">Licensed to Mobius Networks Pvt Ltd / Kotaru Ashwini (reddy.s@mobiusdtaas.ai) ISO Store Order: OP-871184 license #1/ Downloaded: 2025-02-11 Single user licence only, copying and networking prohibited.</p>

📑 Table of Contents:
  - Information technology — Data centre 
facilities and infrastructures — (page 0)
  - Part 1:
General concepts (page 0)
  - Licensed to Mobius Networks Pvt Ltd / Kotaru Ashwini (reddy.s@mobiusdtaas.ai)
ISO Store Order: OP-871184 license #1/ Downloaded: 2025-02-11
Single user licence only, copying and networking prohibited. (page 4)
  - Foreword (page 4)
  - Introduction (page 5)

📄 Page Structure:

  Page 0:
    • /page/0/Text/0
    • /page/0/Picture/1
    • /page/0/Picture/2
    • /page/0/Text/3
    • /page/0/SectionHeader/4
    • /page/0/SectionHeader/5
    • /page/0/Text/6
    • /page/0/Text/7
    • /page/0/Picture/8
    • /page/0/Text/9
    • /page/0/Picture/10
    • /page/0/Text/11
    • /p