In [1]:
from pathlib import Path
import os

In [2]:
PROJECT_ROOT = Path(os.getcwd()).resolve().parents[0]
paper_folder = os.path.join(PROJECT_ROOT, 'docs')

files = [i for i in os.listdir(paper_folder) if i.endswith('.pdf')]
files

['2412.17149v1.pdf']

In [3]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

In [4]:
# Load PDF and render
pdf_file = os.path.join(paper_folder, files[0])

converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter(pdf_file)  # replace with your PDF path

Recognizing layout: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.55s/it]
Running OCR Error Detection: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 25.17it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing Text: 100%|█████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  5.55it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.50it/s]


In [5]:
from dataclasses import dataclass

@dataclass
class MarkdownChunk:
    content: str
    type: str
    page: int
    metadata: dict


In [6]:
doc = converter.build_document(pdf_file)  # Use .render()

Recognizing layout: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.39s/it]
Running OCR Error Detection: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 24.68it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing Text: 100%|█████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.40it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.61it/s]


chunks = []
current_section = None
for page in doc.pages:
    for block in page.contained_blocks(doc):
        if str(block.block_type) == "SectionHeader":
            current_section = block.raw_text(doc).strip()
    
        if str(block.block_type) == "Text":
            content = block.raw_text(doc)
            bbox = block.polygon.bbox if block.polygon else None
            chunks.append({
                "content": content,
                "section": current_section,
                "page": page.page_id,
                "bbox": bbox 
            })


In [7]:
chunks = []
current_section = None
target_types = {"Text", "Title", "List", "Table", "Caption", "Equation"}
image_types = {"Image", "Figure"}  # Adjust based on your PDF library
fallback_types = {
    "Reference", "TextInlineMath", "ListItem", "TableCell",
     "PageHeader", "PageFooter"
}

# "Span" "Line", 
all_types = []
for page in doc.pages:
    for block in page.contained_blocks(doc):
        block_type = str(block.block_type)
        all_types.append(block_type)
        if block_type == "SectionHeader":
            current_section = block.raw_text(doc).strip()

        if block_type in target_types:
            content = block.raw_text(doc).strip()
            bbox = block.polygon.bbox if block.polygon else None

            chunks.append({
                "content": content,
                "section": current_section,
                "page": page.page_id,
                "bbox": bbox,
                "block_type": block_type
            })
        
            # Extract images (only metadata here; content comes later)
        elif block_type in image_types:
            bbox = block.polygon.bbox if block.polygon else None

            chunks.append({
                "content": '',  # You can add a filename or a base64 if you extract it
                "section": current_section,
                "page": page.page_id,
                "bbox": bbox,
                "block_type": block_type
            })

                # Already useful content
        if block_type in fallback_types:
            content = block.raw_text(doc).strip()
            bbox = block.polygon.bbox if block.polygon else None

            chunks.append({
                "content": content,
                "section": current_section,
                "page": page.page_id,
                "bbox": bbox,
                "block_type": block_type,
                "include_in_index": block_type in target_types  # helpful flag
            })

In [8]:
all_types_set = set(all_types)

In [9]:
all_types_set

{'Caption',
 'Equation',
 'Figure',
 'FigureGroup',
 'Line',
 'ListGroup',
 'ListItem',
 'PageFooter',
 'PageHeader',
 'Reference',
 'SectionHeader',
 'Span',
 'Table',
 'TableCell',
 'Text',
 'TextInlineMath'}

In [10]:
[i for i in all_types_set if i not in {*target_types,*image_types}]

['FigureGroup',
 'Line',
 'Span',
 'PageHeader',
 'ListItem',
 'TableCell',
 'ListGroup',
 'SectionHeader',
 'PageFooter',
 'Reference',
 'TextInlineMath']

In [11]:
len(chunks)

213

In [12]:
chunks[1]

{'content': 'arXiv:2412.17149v1 [cs.CL] 22 Dec 2024',
 'section': None,
 'page': 0,
 'bbox': [18.34000015258789, 249.146484375, 36.5224609375, 609.2599945068359],
 'block_type': 'PageHeader',
 'include_in_index': False}

In [13]:
def get_batches_from_iterator(iterator, batch_size=100):
    batch = []    
    for doc in iterator:
        batch.append(doc)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch  # Yield any remaining documents

In [14]:
from pymongo import MongoClient, ASCENDING, UpdateOne

In [15]:
def get_mongo_db_client():
    client = MongoClient("mongodb://localhost:27017/")
    assert client.admin.command("ping") == {'ok': 1.0}
    return client

In [16]:
mongo_db_client = get_mongo_db_client()
database = mongo_db_client['chatbot_ui_v5']  

In [17]:
database['test'].drop()
collection = database['test']

In [18]:
for batch in get_batches_from_iterator(chunks, batch_size = 32):
    result = collection.insert_many(batch)

In [19]:
collection.count_documents({})

213

In [26]:
import json

In [29]:
help(json.dumps)

Help on function dumps in module json:

dumps(obj, *, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, default=None, sort_keys=False, **kw)
    Serialize ``obj`` to a JSON formatted ``str``.
    
    If ``skipkeys`` is true then ``dict`` keys that are not basic types
    (``str``, ``int``, ``float``, ``bool``, ``None``) will be skipped
    instead of raising a ``TypeError``.
    
    If ``ensure_ascii`` is false, then the return value can contain non-ASCII
    characters if they appear in strings contained in ``obj``. Otherwise, all
    such characters are escaped in JSON strings.
    
    If ``check_circular`` is false, then the circular reference check
    for container types will be skipped and a circular reference will
    result in an ``RecursionError`` (or worse).
    
    If ``allow_nan`` is false, then it will be a ``ValueError`` to
    serialize out of range ``float`` values (``nan``, ``inf``, ``-inf``) in
    stri

In [32]:
for i in collection.find({}, {'_id':0}).limit(8):
    j = json.dumps(i, indent=4)
    print(j)
    print(5*'-')

{
    "content": "",
    "section": null,
    "page": 0,
    "bbox": [
        282.7216796875,
        791.841796875,
        288.5419921875,
        804.998046875
    ],
    "block_type": "PageHeader",
    "include_in_index": false
}
-----
{
    "content": "arXiv:2412.17149v1 [cs.CL] 22 Dec 2024",
    "section": null,
    "page": 0,
    "bbox": [
        18.34000015258789,
        249.146484375,
        36.5224609375,
        609.2599945068359
    ],
    "block_type": "PageHeader",
    "include_in_index": false
}
-----
{
    "content": "Kamer Ali Yuksel and Hassan Sawaf",
    "section": "A Multi-AI Agent System for Autonomous Optimization of Agentic AI\nSolutions via Iterative Refinement and LLM-Driven Feedback Loops",
    "page": 0,
    "bbox": [
        199.0546875,
        135.43719482421875,
        394.56561279296875,
        148.316162109375
    ],
    "block_type": "Text"
}
-----
{
    "content": "aiXplain Inc., San Jose, CA, USA\n{kamer, hassan}@aixplain.com",
    "section": "