In [1]:
from pathlib import Path
import os

In [2]:
# The project root is taken to be the parent of the current working directory;
# the PDFs to process are looked up in its `docs` sub-folder.
PROJECT_ROOT = Path(os.getcwd()).resolve().parents[0]
paper_folder = os.path.join(PROJECT_ROOT, 'docs')

# os.listdir() returns entries in arbitrary order. Sort the names so that
# files[0] (used below to pick the PDF to convert) is the same on every run
# and on every machine, even when the folder holds several PDFs.
files = sorted(name for name in os.listdir(paper_folder) if name.endswith('.pdf'))
files

['2412.17149v1.pdf']

In [3]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

In [4]:
# Load the first PDF listed in `files` and render it with marker.
# Fail with an explicit message instead of a bare IndexError when the docs
# folder contains no PDF at all.
if not files:
    raise FileNotFoundError(f"No .pdf files found in {paper_folder}")
pdf_file = os.path.join(paper_folder, files[0])

# The converter is built once and reused by later cells.
converter = PdfConverter(artifact_dict=create_model_dict())
rendered = converter(pdf_file)

Recognizing layout: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.37s/it]
Running OCR Error Detection: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 24.47it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing Text: 100%|█████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  5.55it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.90it/s]


In [5]:
from dataclasses import dataclass

@dataclass
class MarkdownChunk:
    """Container for a single content chunk with its type, page and metadata.

    NOTE(review): this class is not instantiated in the visible cells; the
    extraction loops build plain dicts instead. Confirm whether it is still
    needed or whether the loops should produce instances of it.
    """
    content: str    # text of the chunk
    type: str       # kind of chunk; presumably a block type name such as "Text" (TODO confirm)
    page: int       # page the chunk belongs to; presumably 0-based like page.page_id (verify)
    metadata: dict  # additional fields; expected keys are not shown in this notebook


In [6]:
# Build the document object whose pages and blocks are walked by the
# extraction loops below.
# NOTE(review): the progress output shows the recognition steps running again
# here, after `converter(pdf_file)` already ran them in the previous cell.
# NOTE(review): the original note read "Use .render()"; its intent is unclear
# from this notebook, confirm before acting on it.
doc = converter.build_document(pdf_file)

Recognizing layout: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.48s/it]
Running OCR Error Detection: 100%|██████████████████████████████████████████████████████| 1/1 [00:00<00:00, 24.51it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing Text: 100%|█████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.63it/s]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing tables: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.80it/s]


# First pass: keep only plain "Text" blocks, tagging each one with the most
# recently seen section header and its position on the page.
chunks = []
current_section = None
for page in doc.pages:
    for block in page.contained_blocks(doc):
        kind = str(block.block_type)

        if kind == "SectionHeader":
            # Remember the heading so the text blocks that follow can cite it.
            current_section = block.raw_text(doc).strip()
        elif kind == "Text":
            content = block.raw_text(doc)
            # Some blocks may carry no polygon; store None in that case.
            bbox = None if not block.polygon else block.polygon.bbox
            chunks.append({
                "content": content,
                "section": current_section,
                "page": page.page_id,
                "bbox": bbox,
            })


In [7]:
# Block types that are stored as chunks. Section headers are not stored on
# their own; they are tracked and attached to each chunk as context.
target_types = {"Text", "Title", "List", "Table", "Caption", "Equation"}
current_section = None
chunks = []

for page in doc.pages:
    for block in page.contained_blocks(doc):
        block_type = str(block.block_type)

        if block_type == "SectionHeader":
            # Update the running section title, then move on: headers are
            # not part of target_types, so nothing is appended for them.
            current_section = block.raw_text(doc).strip()
            continue

        if block_type not in target_types:
            continue

        content = block.raw_text(doc).strip()
        # Some blocks may carry no polygon; store None in that case.
        bbox = None if not block.polygon else block.polygon.bbox
        chunks.append({
            "content": content,
            "section": current_section,
            "page": page.page_id,
            "bbox": bbox,
            "block_type": block_type,
        })


In [8]:
# Sanity check: number of chunks produced by the extraction loop.
len(chunks)

66

In [9]:
# Inspect one record (the second chunk) to confirm the stored fields.
chunks[1]

{'content': 'aiXplain Inc., San Jose, CA, USA\n{kamer, hassan}@aixplain.com',
 'section': 'A Multi-AI Agent System for Autonomous Optimization of Agentic AI\nSolutions via Iterative Refinement and LLM-Driven Feedback Loops',
 'page': 0,
 'bbox': [211.568359375, 149.49334716796875, 386.177734375, 175.245361328125],
 'block_type': 'Text'}

In [10]:
def get_batches_from_iterator(iterator, batch_size=100):
    """Split an iterable into consecutive lists of at most ``batch_size`` items.

    Args:
        iterator: Any iterable (list, generator, ...). It is consumed lazily,
            one batch at a time.
        batch_size: Maximum number of items per batch; must be at least 1.

    Returns:
        A generator of lists. Every list holds ``batch_size`` items except
        possibly the last one, which holds the remainder. An empty input
        yields nothing.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Previously such a
            value silently produced one single batch containing every item.
    """
    # Validate eagerly (outside the generator) so a bad batch_size fails at
    # the call site rather than on the first iteration.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    def _batches():
        batch = []
        for item in iterator:
            batch.append(item)
            if len(batch) >= batch_size:
                yield batch
                # Start a fresh list; the yielded one now belongs to the caller.
                batch = []
        if batch:
            yield batch  # Remaining items that did not fill a whole batch

    return _batches()

In [11]:
from pymongo import MongoClient, ASCENDING, UpdateOne

In [12]:
def get_mongo_db_client(uri="mongodb://localhost:27017/"):
    """Connect to MongoDB and return a client whose server answered a ping.

    Args:
        uri: MongoDB connection string. Defaults to the local instance the
            notebook was written against.

    Returns:
        The connected ``MongoClient``.

    Raises:
        ConnectionError: If the reply to ``ping`` does not report ``ok == 1``.
        Exception: Any error raised by the ping command itself (for example
            when the server is unreachable) is re-raised unchanged after the
            client has been closed.
    """
    client = MongoClient(uri)
    try:
        reply = client.admin.command("ping")
    except Exception:
        # Do not leak the client's resources when the server is unreachable.
        client.close()
        raise

    # Check only the "ok" field. Comparing the whole reply to {'ok': 1.0}
    # fails when the server adds extra fields to command replies (e.g.
    # "$clusterTime" / "operationTime" on replica sets). An explicit raise is
    # used instead of `assert`, which is stripped when Python runs with -O.
    if reply.get("ok") != 1.0:
        client.close()
        raise ConnectionError(f"MongoDB ping failed: {reply!r}")
    return client

In [13]:
# Open the MongoDB connection (the helper pings the server before returning)
# and select the database used by the following cells.
mongo_db_client = get_mongo_db_client()
database = mongo_db_client['chatbot_ui_v5']  

In [14]:
# Take a handle on the 'test' collection and drop whatever it currently
# holds, so the inserts below start from an empty collection.
# WARNING: destructive, every existing document in 'test' is removed.
collection = database['test']
collection.drop()

In [15]:
# insert_many() adds an '_id' key, in place, to every dict it is given.
# Insert shallow copies so `chunks` is left untouched; otherwise re-running
# this cell fails with a duplicate-key error on the stale '_id' values.
# Drop the collection first when re-running, to avoid duplicate documents.
inserted_count = 0
for batch in get_batches_from_iterator(chunks, batch_size=32):
    result = collection.insert_many([dict(chunk) for chunk in batch])
    inserted_count += len(result.inserted_ids)

# Every chunk must have been written exactly once by this cell.
assert inserted_count == len(chunks), (
    f"inserted {inserted_count} documents, expected {len(chunks)}"
)

In [16]:
# Verify the insert: count every document in the collection (the empty
# filter matches all of them).
collection.count_documents({})

66

In [17]:
# Print the first two stored documents to confirm what was written; each
# one now carries the '_id' assigned on insertion.
sample_cursor = collection.find().limit(2)
for stored_document in sample_cursor:
    print(stored_document)

{'_id': ObjectId('6874924d39b4009dc474eefb'), 'content': 'Kamer Ali Yuksel and Hassan Sawaf', 'section': 'A Multi-AI Agent System for Autonomous Optimization of Agentic AI\nSolutions via Iterative Refinement and LLM-Driven Feedback Loops', 'page': 0, 'bbox': [199.0546875, 135.43719482421875, 394.56561279296875, 148.316162109375], 'block_type': 'Text'}
{'_id': ObjectId('6874924d39b4009dc474eefc'), 'content': 'aiXplain Inc., San Jose, CA, USA\n{kamer, hassan}@aixplain.com', 'section': 'A Multi-AI Agent System for Autonomous Optimization of Agentic AI\nSolutions via Iterative Refinement and LLM-Driven Feedback Loops', 'page': 0, 'bbox': [211.568359375, 149.49334716796875, 386.177734375, 175.245361328125], 'block_type': 'Text'}
