# OCR with SmolDocling

https://huggingface.co/ds4sd/SmolDocling-256M-preview

In [2]:
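# Prerequisites:
# pip install torch
# pip install docling_core
# pip install transformers
# pip install pillow      # image loading backend for transformers.image_utils.load_image
# pip install flash-attn  # only for the CUDA path, which requests "flash_attention_2"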

from pathlib import Path

import torch
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image

# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Page image to convert. A remote scan can be substituted for the local file,
# e.g. "https://upload.wikimedia.org/wikipedia/commons/7/76/GazettedeFrance.jpg"
image = load_image("images/us_stock_market_news.png")

# The processor and the model are loaded from the same checkpoint.
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")

# "flash_attention_2" is requested only on CUDA; any other device gets "eager".
if DEVICE == "cuda":
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "eager"

model = AutoModelForVision2Seq.from_pretrained(
    "ds4sd/SmolDocling-256M-preview",
    torch_dtype=torch.bfloat16,
    _attn_implementation=attn_implementation,
)
model = model.to(DEVICE)

# Single user turn: an image placeholder followed by the conversion instruction.
image_part = {"type": "image"}
text_part = {"type": "text", "text": "Convert this page to docling."}
messages = [{"role": "user", "content": [image_part, text_part]}]

# Render the chat template into a prompt, then build the model inputs from the
# prompt and the image and move them to the same device as the model.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)

# Generate, then drop the first `prompt_length` positions of every sequence so
# that only what follows the prompt is decoded.
generated_ids = model.generate(**inputs, max_new_tokens=8192)
prompt_length = inputs.input_ids.shape[1]
trimmed_generated_ids = generated_ids[:, prompt_length:]

# Decode with skip_special_tokens=False so special tokens stay in the text,
# take the first (only) sequence and strip leading whitespace.
decoded_sequences = processor.batch_decode(
    trimmed_generated_ids, skip_special_tokens=False
)
doctags = decoded_sequences[0].lstrip()

# Pair the DocTags string with its source image, then show the raw DocTags.
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
print(doctags)

# Build a DoclingDocument and fill it from the parsed DocTags.
doc = DoclingDocument(name="Document")
doc.load_from_doctags(doctags_doc)

# Optional HTML export (disabled):
# Path("Out/").mkdir(parents=True, exist_ok=True)
# output_path_html = Path("Out/") / "example.html"
# doc.save_as_html(output_path_html)

# Markdown export, printed to stdout.
print(doc.export_to_markdown())

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/27.4k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.55M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<doctag><picture><loc_40><loc_14><loc_110><loc_32><logo></picture>
<section_header_level_1><loc_40><loc_61><loc_421><loc_94>US equity bull market remains intact despite fragile sentiment</section_header_level_1>
<section_header_level_1><loc_40><loc_97><loc_176><loc_107>UBS House View - Daily US</section_header_level_1>
<text><loc_40><loc_116><loc_285><loc_147>Ulrike Hoffmann-Burchardi, Head CIO Global Equities, UBS Financial Services Inc. (UBS FS) Solita Marcelli, GWM Chief Investment Officer Americas, UBS Financial Services Inc. (UBS FS) Mark Haeefe, Global Wealth Management Chief Investment Officer, UBS AG David Lefkowitz, CFA, CIO Head of US Equities, UBS Financial Services Inc. (UBS FS) Sundeep Gantori, CFA, CAIA, Equity Strategist, UBS AG Singapore Branch Daisy Teng, Strategist, UBS AG Singapore Branch</text>
<section_header_level_1><loc_40><loc_170><loc_103><loc_176>From the studio:</section_header_level_1>
<text><loc_40><loc_178><loc_300><loc_192>Video: CIO Mark Haeefele's on in