# Dependencies

In [2]:
# DOCUMENT EXTRACTION
from docling.document_converter import DocumentConverter
# CHUNKING
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv
from openai import OpenAI

# Single DocumentConverter instance, created once here and reused by the extraction cell.
converter = DocumentConverter()

# Tools initialization

In [3]:
import tiktoken
import logging
from tiktoken import Encoding
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from typing import Dict, List, Tuple, Optional

# Module-level logger; TikTokenWrapper uses it to report encoding fallbacks and invalid inputs.
logger = logging.getLogger(__name__)

class TikTokenWrapper(PreTrainedTokenizerBase):
    """
    Adapter that exposes an OpenAI `tiktoken` encoding through the Hugging Face
    `PreTrainedTokenizerBase` interface.

    This class is a **special-purpose wrapper** primarily designed for
    fast token counting and chunking operations.

    ---
    **⚠️ IMPORTANT LIMITATIONS (By Design):**
    ---
    1.  **"Tokens" are Stringified IDs:** The `tokenize()` method does NOT
        return human-readable tokens (e.g., "Hello"). It returns the
        stringified *integer token IDs* (e.g., "9906").
    2.  **Purpose:** This is a deliberate performance optimization for chunkers
        and counters that only need a list of "things" and their `len()`.
    3.  **Not a Full Tokenizer:** Do NOT use this for tasks that need
        to inspect the actual token string content (e.g., finding "##"
        subwords, aligning tokens to text).
    4.  **Vocab is a Stub:** `get_vocab()` returns a simple integer map,
        not the true BPE vocabulary.
    5.  **Special-token markers are plain text:** strings such as
        "<|endoftext|>" appearing in the input are encoded as ordinary text
        instead of raising, so arbitrary document content can be counted.
    ---

    Args:
        model_name (str):
            The name of the OpenAI model to load the tokenizer for
            (e.g., "gpt-4o", "text-embedding-3-large") OR a direct
            encoding name (e.g., "cl100k_base"). If neither resolves,
            an error is logged and "cl100k_base" is used.
        max_length (int, optional):
            The maximum sequence length for this model, forwarded to the
            base class as `model_max_length`. Defaults to 8191.
        **kwargs:
            Forwarded unchanged to `PreTrainedTokenizerBase.__init__`.
    """

    def __init__(
        self,
        model_name: str = "gpt-4",  # A safer, modern default
        max_length: int = 8191,
        **kwargs,
    ):
        try:
            # First interpretation: `model_name` is an OpenAI model name.
            # tiktoken raises KeyError when it cannot map the model.
            self.tokenizer: Encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            try:
                # Second interpretation: `model_name` is an encoding name.
                self.tokenizer = tiktoken.get_encoding(model_name)
            except (KeyError, ValueError):
                # tiktoken.get_encoding signals an unknown encoding with
                # ValueError (not KeyError); catching only KeyError made this
                # fallback unreachable and let the constructor crash.
                logger.error(
                    f"Invalid model or encoding name: '{model_name}'. "
                    f"Defaulting to 'cl100k_base'."
                )
                self.tokenizer = tiktoken.get_encoding("cl100k_base")
            else:
                logger.warning(
                    f"Could not find model name '{model_name}'. "
                    f"Treating as direct encoding name '{self.tokenizer.name}'."
                )

        # Name of whichever encoding was finally selected.
        self.encoding_name = self.tokenizer.name

        # Vocabulary size is the largest token id plus one.
        self._vocab_size = self.tokenizer.max_token_value + 1

        super().__init__(
            model_max_length=max_length,
            **kwargs,
        )

    def tokenize(self, text: str, **kwargs) -> List[str]:
        """
        Encode `text` and return its token ids as strings.

        Args:
            text: The string to encode.
            **kwargs: Accepted for interface compatibility; ignored.

        Returns:
            One stringified integer id per token. A non-string input is
            logged as a warning and yields an empty list.
        """
        if not isinstance(text, str):
            logger.warning(f"Input to tokenize was not a string, received {type(text)}.")
            return []

        # By default tiktoken raises ValueError when the text contains a
        # special-token marker (e.g. "<|endoftext|>"). Passing an empty
        # `disallowed_special` disables that check so the marker is encoded
        # as ordinary text, which is what a token counter needs.
        token_ids = self.tokenizer.encode(text, disallowed_special=())
        return [str(token_id) for token_id in token_ids]

    def _tokenize(self, text: str) -> List[str]:
        """Delegate to `tokenize`."""
        return self.tokenize(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Parse a stringified id back to an int; logs a warning and returns 0 if it is not an integer."""
        try:
            return int(token)
        except ValueError:
            logger.warning(f"Invalid token '{token}' passed to _convert_token_to_id.")
            return 0

    def _convert_id_to_token(self, index: int) -> str:
        """Return the id rendered as a string (the inverse of `_convert_token_to_id`)."""
        return str(index)

    def get_vocab(self) -> Dict[str, int]:
        """Return a stub vocabulary mapping each stringified id to itself (rebuilt on every call)."""
        return {str(i): i for i in range(self.vocab_size)}

    @property
    def vocab_size(self) -> int:
        """Returns the size of the vocabulary (max token id + 1)."""
        return self._vocab_size

    def __len__(self) -> int:
        """
        Returns the size of the vocabulary.
        This is required by libraries (like docling) that call `len(tokenizer)`.
        """
        return self.vocab_size

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str, ...]:
        """
        No-op: this wrapper has no vocabulary files to write.

        `filename_prefix` is accepted (and ignored) so the method can be
        called with the same arguments as the base-class signature.

        Returns:
            An empty tuple.
        """
        return ()

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str,
        max_length: Optional[int] = None,
        **kwargs,
    ):
        """
        Build a wrapper using `pretrained_model_name_or_path` as the model or
        encoding name.

        Args:
            pretrained_model_name_or_path: Passed to the constructor as `model_name`.
            max_length: Optional override; when None the constructor default applies.
            **kwargs: Forwarded to the constructor.
        """
        init_kwargs = kwargs.copy()
        if max_length is not None:
            init_kwargs["max_length"] = max_length

        return cls(
            model_name=pretrained_model_name_or_path,
            **init_kwargs
        )

# EXTRACTION

In [4]:

# --------------------------------------------------------------
# Basic PDF extraction
# --------------------------------------------------------------

import os  # imported locally: this cell runs before the later cell that imports os

# Source PDF. Defaults to the original local path; set the PDF_PATH
# environment variable to point at the file on another machine.
PDF_PATH = os.getenv(
    "PDF_PATH",
    "/home/youssef/github/Modular_RAG/PDFs/1H2025_Earnings_Release.pdf",
)

# Convert the PDF with the shared `converter`; `result` is reused by the chunking cell.
result = converter.convert(PDF_PATH)

# Keep both the converted document and its Markdown export for inspection.
document = result.document
markdown_output = document.export_to_markdown()

# Prints the entire Markdown export.
print(markdown_output)

2025-11-05 19:56:25,594 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-05 19:56:25,670 - INFO - Going to convert document batch...
2025-11-05 19:56:25,672 - INFO - Initializing pipeline for StandardPdfPipeline with options hash f9730ffaa6e7f8d4fb0c98c8df3f18cb
2025-11-05 19:56:25,691 - INFO - Loading plugin 'docling_defaults'
2025-11-05 19:56:25,696 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-05 19:56:25,715 - INFO - Loading plugin 'docling_defaults'
2025-11-05 19:56:25,732 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-05 19:56:25,811 - INFO - rapidocr cannot be used because onnxruntime is not installed.
2025-11-05 19:56:25,819 - INFO - easyocr cannot be used because it is not installed.
2025-11-05 19:56:26,554 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-05 19:56:26,629 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-11-05 19:56:26,688 [RapidOCR] download_f

<!-- image -->

Beltone Holding - Investor Relations

## 1H2025 Earnings Release

## Beltone

## Earnings Release

Beltone Holding Reports EGP6.3 billion in Operating Revenues, a 115% YoY Surge, with Net Profits Reaching EGP1.3 Billion in 1H2025

Cairo, August 11, 2025: Beltone Holding ('Beltone' or the 'Company'), one of the fastest growing financial institutions, announces its consolidated financial and operational results for the six-month period ending 30 June 2025.

## 1H2025 Key Highlights

<!-- image -->

- Beltone has maintained its strong growth momentum into the first half of 2025, with consolidated operating revenue more than doubling  year-on-year  (YoY)  to  EGP6.3  billion.  The  robust  expansion  in  operating  revenue  translated  directly  to  enhanced profitability, evidenced by a 60% YoY increase in net profit after tax and minority interest, which reached EGP1.3 billion in 1H2025.
- Operating revenue for Beltone Investment Bank grew 137% year-on-year to EGP1.1 bill

# CHUNKING

In [5]:
import os

# Load settings from a local .env file (if any) before reading the API key.
load_dotenv()

# Per-chunk token budget: text-embedding-3-large's maximum context length.
MAX_TOKENS = 8191

# OpenAI client; OPENAI_API_KEY must be present in the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Custom tiktoken-backed tokenizer, built with its default arguments.
tokenizer = TikTokenWrapper()

# --------------------------------------------------------------
# Apply hybrid chunking
# --------------------------------------------------------------

chunker = HybridChunker(tokenizer=tokenizer, max_tokens=MAX_TOKENS, merge_peers=True)

# Collect the chunker's output into a list so it can be indexed and counted.
chunk_iter = chunker.chunk(dl_doc=result.document)
chunks = [*chunk_iter]

# Cell output: number of chunks produced.
len(chunks)

21

In [6]:
# Show one chunk (list index 2): header, text, separator, then its metadata,
# each on its own line.
print(
    f"--- Chunk {2} ---",
    chunks[2].text,
    "================================================",
    chunks[2].meta,
    sep="\n",
)

--- Chunk 2 ---
- Beltone has maintained its strong growth momentum into the first half of 2025, with consolidated operating revenue more than doubling  year-on-year  (YoY)  to  EGP6.3  billion.  The  robust  expansion  in  operating  revenue  translated  directly  to  enhanced profitability, evidenced by a 60% YoY increase in net profit after tax and minority interest, which reached EGP1.3 billion in 1H2025.
- Operating revenue for Beltone Investment Bank grew 137% year-on-year to EGP1.1 billion in 1H2025. This growth was driven by a seven-fold year-on-year increase in Investment Banking revenue, alongside continued growth in the Securities Brokerage division, which saw its market share expand to 4.3%. The Investment Bank's outstanding portfolio recorded remarkable growth of 267% year-on-year, closing the period at EGP5.4 billion.
- On the asset management front, Beltone retained its leadership position as Egypt's largest non -bank affiliated asset management with assets under managem

In [7]:
# Walk every chunk, printing a 1-based header, the chunk text and a separator line.
for ordinal, chunk in enumerate(chunks, start=1):
    print(
        f"--- Chunk {ordinal} ---",
        chunk.text,
        "=====================================",
        sep="\n",
    )

--- Chunk 1 ---
Beltone Holding - Investor Relations
--- Chunk 2 ---
Beltone Holding Reports EGP6.3 billion in Operating Revenues, a 115% YoY Surge, with Net Profits Reaching EGP1.3 Billion in 1H2025
Cairo, August 11, 2025: Beltone Holding ('Beltone' or the 'Company'), one of the fastest growing financial institutions, announces its consolidated financial and operational results for the six-month period ending 30 June 2025.
--- Chunk 3 ---
- Beltone has maintained its strong growth momentum into the first half of 2025, with consolidated operating revenue more than doubling  year-on-year  (YoY)  to  EGP6.3  billion.  The  robust  expansion  in  operating  revenue  translated  directly  to  enhanced profitability, evidenced by a 60% YoY increase in net profit after tax and minority interest, which reached EGP1.3 billion in 1H2025.
- Operating revenue for Beltone Investment Bank grew 137% year-on-year to EGP1.1 billion in 1H2025. This growth was driven by a seven-fold year-on-year increas

In [8]:
# Print the chunk object itself (not only `.text`) to inspect everything its string form exposes.
print(chunks[2])

text="- Beltone has maintained its strong growth momentum into the first half of 2025, with consolidated operating revenue more than doubling  year-on-year  (YoY)  to  EGP6.3  billion.  The  robust  expansion  in  operating  revenue  translated  directly  to  enhanced profitability, evidenced by a 60% YoY increase in net profit after tax and minority interest, which reached EGP1.3 billion in 1H2025.\n- Operating revenue for Beltone Investment Bank grew 137% year-on-year to EGP1.1 billion in 1H2025. This growth was driven by a seven-fold year-on-year increase in Investment Banking revenue, alongside continued growth in the Securities Brokerage division, which saw its market share expand to 4.3%. The Investment Bank's outstanding portfolio recorded remarkable growth of 267% year-on-year, closing the period at EGP5.4 billion.\n- On the asset management front, Beltone retained its leadership position as Egypt's largest non -bank affiliated asset management with assets under management reac