In [31]:
import re
from pathlib import Path

from pdf2image import convert_from_path
from bs4 import BeautifulSoup

# Docling
from docling_core.types.doc import ImageRefMode, PictureItem
from docling_core.types.doc.document import DocTagsDocument, DoclingDocument

# MLX-VLM
from mlx_vlm import load, stream_generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

In [32]:
# 기본 설정
MODEL_PATH = "ibm-granite/granite-docling-258M-mlx"   # Docling 모델 경로
PROMPT = "Convert this page to docling."             # 페이지 변환 프롬프트
PDF_PATH = "paper3.pdf"                               # 입력 PDF 파일
OUT_IMG_DIR = Path("figures")                         # 이미지 저장 폴더
OUT_IMG_DIR.mkdir(exist_ok=True)


In [33]:
print("Loading model...")
model, processor = load(MODEL_PATH)   # 모델과 프로세서 로드
config = load_config(MODEL_PATH)      # 설정값 로드

Loading model...


Fetching 13 files: 100%|██████████| 13/13 [00:00<00:00, 196844.59it/s]
Fetching 13 files: 100%|██████████| 13/13 [00:00<00:00, 168289.98it/s]


In [34]:
print(f"Converting PDF: {PDF_PATH}")
pages = convert_from_path(PDF_PATH, dpi=200)   # PDF to Image
print(f"Total pages converted: {len(pages)}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Converting PDF: paper3.pdf
Total pages converted: 5


In [35]:
def clean_doctags(tokens: str) -> str:
    soup = BeautifulSoup(tokens, "html.parser")

    for tag in ["page_header", "page_footer", "footnote"]:
        for node in soup.find_all(tag):
            node.decompose()

    section_headers = soup.find_all("section_header_level_1")

    if section_headers:
        first_header = section_headers[0]
        abstract_header = None
        for node in section_headers[1:]:
            if node.get_text(strip=True).lower() == "abstract":
                abstract_header = node
                break

        if abstract_header:
            current = first_header.find_next_sibling()
            while current and current != abstract_header:
                nxt = current.find_next_sibling()
                current.decompose()
                current = nxt

        for node in section_headers:
            if node.get_text(strip=True).lower() == "references":
                current = node
                while current:
                    nxt = current.find_next_sibling()
                    current.decompose()
                    current = nxt
                break

    return str(soup)

In [36]:
docs = []
all_markdown_pages = []

for i, page in enumerate(pages):
    print(f"\n--- Page {i+1}/{len(pages)} ---")

    # DocTags 추출
    formatted_prompt = apply_chat_template(processor, config, PROMPT, num_images=1)
    output = ""
    for token in stream_generate(
        model, processor, formatted_prompt, [page], max_tokens=4096, verbose=False
    ):
        output += token.text
        if "</doctag>" in token.text:
            break

    # 원본/클린 텍스트 저장
    raw_txt_path = Path(f"./page_{i+1}_raw.txt")
    clean_txt_path = Path(f"./page_{i+1}_cleaned.txt")
    raw_txt_path.write_text(output, encoding="utf-8")
    cleaned_output = clean_doctags(output)
    clean_txt_path.write_text(cleaned_output, encoding="utf-8")

    # DoclingDocument 로드
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], [page])
    doc = DoclingDocument.load_from_doctags(doctags_doc, document_name=f"Page {i+1}")
    docs.append(doc)

    # Figure 추출
    fig_paths = []
    for idx, (element, _) in enumerate(doc.iterate_items(), start=1):
        if isinstance(element, PictureItem):
            out_path = OUT_IMG_DIR / f"page{i+1}_figure{idx}.png"
            element.get_image(doc).save(out_path, format="PNG")
            fig_paths.append(f"./figures/page{i+1}_figure{idx}.png")

    # Markdown 변환 + 이미지 삽입
    md_text = doc.export_to_markdown().replace("<!-- image -->", "")
    output_lines, figure_index = [], 1
    for line in md_text.splitlines():
        # "Figure N:" 또는 "FIGURE N:" 만 매칭
        if re.match(r"^\s*Figure\s+\d+:", line, re.IGNORECASE):
            if figure_index <= len(fig_paths):
                img_path = fig_paths[figure_index - 1]
                output_lines.append(f"![Figure]({img_path})")
                figure_index += 1
        output_lines.append(line)

    all_markdown_pages.append("\n".join(output_lines))

    print(f"Page {i+1} done.")


--- Page 1/5 ---
Page 1 done.

--- Page 2/5 ---
Page 2 done.

--- Page 3/5 ---
Page 3 done.

--- Page 4/5 ---
Page 4 done.

--- Page 5/5 ---
Page 5 done.


In [37]:
# 모든 페이지 마크다운 합치기 (조건: 마지막 문장이 . ! ? 로 안끝나면 이어붙임)
merged_pages = []

for i, page_md in enumerate(all_markdown_pages):
    if merged_pages:
        prev = merged_pages[-1].rstrip()
        # 이전 페이지의 마지막 줄
        last_line = prev.splitlines()[-1].strip() if prev.splitlines() else ""
        if last_line and not last_line.endswith((".", "!", "?")):
            # 끝이 문장 부호가 아니면 다음 페이지와 바로 이어붙임
            merged_pages[-1] = prev + " " + page_md.lstrip()
        else:
            merged_pages.append(page_md)
    else:
        merged_pages.append(page_md)

full_markdown = "\n\n".join(merged_pages)

print(full_markdown)

## LAION-400M: Open Dataset of CLIP-Filtered 400 Million Image-Text Pairs

## Abstract

Multi-modal language-vision models trained on hundreds of millions of image-text pairs (e.g. CLIP, DALL-E) gained a recent surge, showing remarkable capability to perform zero- or few-shot learning and transfer even in absence of per-sample labels on target image data. Despite this trend, to date there has been no publicly available datasets of sufficient scale for training such models from scratch. To address this issue, in a community effort we build and release for public LAION-400M, a dataset with CLIP-filtered 400 million image-text pairs, their CLIP embeddings and kNN indices that allow efficient similarity search. 1

## 1 Introduction

Multi-modal language-vision models demonstrated recently strong transfer capability to novel datasets in absense of per-sample labels [1, 2, 3]. This capability requires sufficiently large model and data scale during pre-training. Increasing data scale alone ca

In [38]:
# 최종 마크다운과 이미지 디렉터리 경로 출력
MD_PATH = Path("./output.md")
with open(MD_PATH, "w", encoding="utf-8") as f:
    f.write(full_markdown)

print(MD_PATH)
print(OUT_IMG_DIR)

output.md
figures
