# Parsing PDF with docling

In [1]:
import json
import time
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

input_doc_path = "./data/chapter02-05.pdf"

# Docling Parse without EasyOCR
# -------------------------
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

###########################################################################

start_time = time.time()
conv_result = doc_converter.convert(input_doc_path)
end_time = time.time() - start_time

print(f"Document converted in {end_time:.2f} seconds.")

## Export results
output_dir = Path("scratch-docling-parse")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem

# Export Docling document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
    fp.write(json.dumps(conv_result.document.export_to_dict(), indent=2, ensure_ascii=False))

# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_markdown())

# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_doctags())

2025-10-31 09:11:50,493 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-31 09:11:50,530 - INFO - Going to convert document batch...
2025-10-31 09:11:50,531 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 75463f421d05cb4304e1f714cf00d35d
2025-10-31 09:11:50,546 - INFO - Loading plugin 'docling_defaults'
2025-10-31 09:11:50,548 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-31 09:11:50,562 - INFO - Loading plugin 'docling_defaults'
2025-10-31 09:11:50,566 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-10-31 09:11:50,597 - INFO - Accelerator device: 'cuda:0'
2025-10-31 09:11:53,086 - INFO - Accelerator device: 'cuda:0'
2025-10-31 09:11:54,441 - INFO - Processing document chapter02-05.pdf
2025-10-31 09:13:01,317 - INFO - Finished converting document chapter02-05.pdf in 70.83 sec.


Document converted in 70.83 seconds.


In [2]:
import pickle

# checkpoint
with open('checkpoints/chapter02-04-doclingdoc.pkl', 'wb') as file:
    pickle.dump(conv_result.document, file)

# Normalizing text

In [23]:
import pickle

# read from checkpoitn
with open('checkpoints/chapter02-04-doclingdoc.pkl', 'rb') as file:
    docling_document = pickle.load(file)


In [None]:
!pip install -q shekar

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
import os
# https://github.com/huggingface/transformers/issues/5486:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [25]:
from  shekar import Normalizer
import re

def fix_persian_numbers(s: str) -> str:
    # Match sequences of Persian digits
    pattern = re.compile(r"[۰-۹]+")
    
    def reverse_num(match):
        num = match.group(0)
        return num[::-1]  # reverse characters
    
    return pattern.sub(reverse_num, s)

normalizer = Normalizer()
for text in docling_document.texts:
    text.orig = fix_persian_numbers(normalizer(text.orig))
    text.text = fix_persian_numbers(normalizer(text.text))

# Chunking

In [26]:
from docling.chunking import HybridChunker

chunker = HybridChunker(tokenizer="google/gemma-3-12b-it", max_tokens=250)
chunks = chunker.chunk(docling_document)
chunks = list(chunks)
len(chunks)



755

In [1]:
import pickle

# checkpoint
with open('checkpoints/chapter02-04-chunks.pkl', 'wb') as file:
    pickle.dump(chunks, file)

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints/chapter02-04-chunks.pkl'

# Embedding

In [None]:
# import pickle

# # Now, to read the JSON file back into a Python object
# with open('checkpoints/chapter02-04-chunks.pkl', 'rb') as json_file:
#     chunks = pickle.load(json_file)

In [28]:
import torch
from FlagEmbedding import BGEM3FlagModel
from tqdm.notebook import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True, devices=device)


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [29]:
# https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoTokenizer

from transformers import AutoTokenizer
from huggingface_hub import login
from collections import Counter

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-12b-it")

# chunks_sizes = []
# for chunk in chunks:
#     chunks_sizes.append(len(tokenizer.tokenize(chunk.text)))

In [30]:
for chunk in chunks:
    if len(tokenizer.tokenize(chunk.text)) < 20 and chunk.meta.doc_items[0].label == "text":
        print(chunker.contextualize(chunk))
        print()

In [31]:
# Assuming chunks is a list of dictionaries
CHUNK_SIZE_LIMIT = 20
chunks = [chunk for chunk in chunks if len(tokenizer.tokenize(chunk.text)) > CHUNK_SIZE_LIMIT and chunk.meta.doc_items[0].label == "text"]

In [14]:
len(chunks)

639

In [32]:
# Example: chunks = [...]  # Your list of dicts loaded earlier
texts = [c.text for c in chunks]

In [33]:
print("Embedding chunks...")
import numpy as np

embeddings = model.encode(
    texts,
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True,
    batch_size=16,
    max_length=512,
)

dense_vectors = np.array(embeddings["dense_vecs"], dtype=np.float32)
sparse_vectors = embeddings["lexical_weights"]  # scipy.sparse.csr_matrix

print("Done ✅")


pre tokenize: 100%|██████████| 40/40 [00:00<00:00, 97.28it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 40/40 [00:02<00:00, 19.81it/s]


In [17]:
import pickle

# checkpoint
with open('checkpoints/embeddings-bge-m3.pkl', 'wb') as file:
    pickle.dump(embeddings, file)

# Retreival

In [34]:
import pickle
import numpy as np

# read from checkpoint
# with open('checkpoints/embeddings-bge-m3.pkl', 'rb') as file:
#     embeddings = pickle.load(file)

dense_vectors = np.array(embeddings["dense_vecs"], dtype=np.float32)
sparse_vectors = embeddings["lexical_weights"]  # scipy.sparse.csr_matrix

In [49]:
with open('checkpoints/clean_chunks.pkl', 'wb') as file:
    pickle.dump(chunks, file)

In [19]:
embeddings

{'dense_vecs': array([[ 0.01131  ,  0.02258  ,  0.02101  , ...,  0.003475 , -0.0383   ,
          0.007477 ],
        [-0.00756  ,  0.01842  ,  0.02678  , ..., -0.01678  , -0.01942  ,
         -0.00736  ],
        [-0.0002164, -0.003395 ,  0.000597 , ..., -0.0091   , -0.0675   ,
         -0.0368   ],
        ...,
        [-0.0634   , -0.009    , -0.01765  , ...,  0.007633 , -0.04935  ,
         -0.01394  ],
        [-0.02493  ,  0.01738  , -0.01643  , ..., -0.02248  , -0.07874  ,
         -0.063    ],
        [-0.00525  ,  0.0308   , -0.02425  , ..., -0.00778  , -0.07043  ,
         -0.02719  ]], shape=(639, 1024), dtype=float16),
 'lexical_weights': [defaultdict(int,
              {'54754': np.float16(0.1525),
               '26468': np.float16(0.254),
               '258': np.float16(0.0587),
               '71471': np.float16(0.2417),
               '746': np.float16(0.1243),
               '35934': np.float16(0.265),
               '140': np.float16(0.1904),
               '584': n

In [20]:
dense_vectors.shape

(639, 1024)

In [21]:
sparse_vectors

[defaultdict(int,
             {'54754': np.float16(0.1525),
              '26468': np.float16(0.254),
              '258': np.float16(0.0587),
              '71471': np.float16(0.2417),
              '746': np.float16(0.1243),
              '35934': np.float16(0.265),
              '140': np.float16(0.1904),
              '584': np.float16(0.1322),
              '14969': np.float16(0.265),
              '27263': np.float16(0.1757),
              '65': np.float16(0.0736),
              '11850': np.float16(0.1985),
              '412': np.float16(0.0593),
              '89606': np.float16(0.05844),
              '27284': np.float16(0.1246),
              '96411': np.float16(0.03052),
              '1287': np.float16(0.0869),
              '35117': np.float16(0.09186),
              '4145': np.float16(0.08215),
              '5150': np.float16(0.01976),
              '69029': np.float16(0.1627),
              '8383': np.float16(0.0687),
              '89107': np.float16(0.175),
         

In [35]:
from sklearn.preprocessing import normalize

# Normalize dense vectors for cosine similarity
dense_vectors = normalize(dense_vectors, axis=1)

In [36]:
from scipy import sparse
import numpy as np

def lexical_to_csr(lexical_weights, vocab_size):
    """
    Convert list[defaultdict] → csr_matrix of shape (num_docs, vocab_size)
    """
    data = []
    rows = []
    cols = []

    for i, word_dict in enumerate(lexical_weights):
        for token_id_str, weight in word_dict.items():
            token_id = int(token_id_str)
            data.append(float(weight))         # convert np.float16 → float
            rows.append(i)
            cols.append(token_id)

    matrix = sparse.csr_matrix((data, (rows, cols)), shape=(len(lexical_weights), vocab_size))
    return matrix

In [37]:
vocab_size = model.tokenizer.vocab_size
sparse_vectors = lexical_to_csr(embeddings["lexical_weights"], vocab_size)

In [38]:
def hybrid_search(query, top_k=5, alpha=0.5):
    """
    alpha = weight for dense similarity (0-1)
    """
    q = model.encode([query], return_dense=True, return_sparse=True)

    # ----- Dense -----
    q_dense = np.array(q["dense_vecs"], dtype=np.float32).reshape(1, -1)
    q_dense = normalize(q_dense, axis=1)
    dense_scores = (dense_vectors @ q_dense.T).squeeze()
    
    # ----- Sparse -----
    q_sparse = lexical_to_csr([q["lexical_weights"][0]], vocab_size)
    sparse_scores = (sparse_vectors @ q_sparse.T).toarray().squeeze()

    # ----- Hybrid Score -----
    scores = alpha * dense_scores + (1 - alpha) * sparse_scores

    # ----- Top-K -----
    top_idx = np.argsort(scores)[::-1][:top_k]
    return [(chunks[i], float(scores[i])) for i in top_idx]


In [39]:
from rich import print 

query = "شرایط عمومی احراز صلاحیت افراد امتیازآور"
results = hybrid_search(query, top_k=5, alpha=0.7)

for i, (chunk, score) in enumerate(results, 1):
    print(f"\n[{i}] score={score:.4f}\n{chunk}")


In [40]:
from rich import print 

query = "رشته کامپیوتر"
results = hybrid_search(query, top_k=5, alpha=0.7)

for i, (chunk, score) in enumerate(results, 1):
    print(f"\n[{i}] score={score:.4f}\n{chunker.contextualize(chunk)}")


In [41]:
def get_clean_chunk(chunk):
    items = []
    for doc_item in chunk.meta.doc_items:
        items.append({
            "label": doc_item.label,
            "page_number": doc_item.prov[0].page_no,
            "charspan": doc_item.prov[0].charspan
         })
                
    clean_chunk = {
        "meta": {"filename": chunk.meta.origin.filename, "headings": chunk.meta.headings},
        "text": chunk.text
    }
    if items:
        clean_chunk["meta"]["chunk_items"] = items

    return clean_chunk

In [None]:
from rich import print
from ollama import chat  # or any LLM API you use, e.g., OpenAI, HuggingFace
import json

def augment_prompt(related_chunks):
    # Combine retrieved texts for LLM input
    context_texts = []
    for i, (chunk, score) in enumerate(related_chunks, 1):
        context_texts.append(str(get_clean_chunk(chunk)))

    # ---- Prepare Context ----
    context = "\n\n---\n\n".join(context_texts)

    # ---- LLM Generation ----
    prompt = f"""
    سؤال کاربر:
    {query}

    تو یک دستیار هوشمند برای پاسخ‌دهی به پرسش‌ها بر اساس اسناد بازیابی‌شده هستی.
    فقط و فقط بر اساس متن‌های داده‌شده پاسخ بده.

    اگر پاسخ در متن‌ها وجود نداشت، حتماً فقط بنویس:
    «پاسخی یافت نشد.»

    وظایف:

    محتوای پرسش را بفهم و فقط بر اساس بخش‌های متنی ارائه‌شده پاسخ بده.

    هیچ دانش بیرونی، حدسی یا استنباط فراتر از متن مجاز نیست.

    در پاسخ نهایی، منبع را با این فرمت ذکر کن:
    
    نام heading (اگر وجود دارد)

    شماره صفحه (page_number از JSON)

    متن‌های بازیابی‌شده  قالب json:
    {context}
    """
    return prompt


# ---- Retrieval ----
query = "شرایط عمومی احراز صلاحیت افراد امتیازآور"
related_chunks = hybrid_search(query, top_k=5, alpha=0.7)
prompt = augment_prompt(related_chunks)
print(prompt)

# # Example with Ollama (local model)
response = chat(model="Gemma3:12b", messages=[{"role": "user", "content": prompt}])

# # ---- Display ----
print("\n[bold green]Answer:[/bold green]\n")
print(response['message']['content'])
