In [2]:
from transformers import AutoTokenizer

# Load the tokenizer for the same model that the chunking function defaults to.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
# encode() turns the string into a sequence of token IDs.
tokens = tokenizer.encode("Hello world!")
# Prints the number of tokens (5), not the IDs themselves; the IDs for this
# input were recorded as [101, 7592, 2088, 999, 102].
print(len(tokens))


5


In [None]:
from typing import List
import nltk
from transformers import AutoTokenizer

# Download the Punkt data that nltk.sent_tokenize relies on for sentence
# splitting. This runs at import/cell-execution time.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead - confirm against the installed NLTK version.
nltk.download('punkt')

def _split_oversized_sentence(sentence: str, budget: int, count_tokens) -> List[tuple]:
    """Break one over-budget sentence into word-aligned pieces.

    Args:
        sentence: A sentence whose token count exceeds ``budget``.
        budget: Maximum number of tokens allowed per piece.
        count_tokens: Callable mapping a string to its token count.

    Returns:
        A list of ``(piece_text, piece_token_count)`` tuples. Words are
        re-joined with single spaces, so runs of whitespace inside the
        sentence are normalised. A single word that alone exceeds ``budget``
        is returned as its own piece because it cannot be split without
        altering the text.
    """
    pieces = []
    words = []
    used = 0
    for word in sentence.split():
        word_tokens = count_tokens(word)
        # Flush before the piece would overflow, but never emit an empty piece.
        if words and used + word_tokens > budget:
            pieces.append((" ".join(words), used))
            words = []
            used = 0
        words.append(word)
        used += word_tokens
    if words:
        pieces.append((" ".join(words), used))
    return pieces


def chunk_text_for_embedding(
    text: str,
    model_name: str = "BAAI/bge-base-en-v1.5",
    max_tokens: int = 512,
    buffer_tokens: int = 20
) -> List[str]:
    """
    Splits input text into chunks that fit within the embedding model's token limit.

    Sentences (as found by ``nltk.sent_tokenize``) are packed greedily into
    chunks of at most ``max_tokens - buffer_tokens`` tokens. A sentence that
    is itself larger than that budget is split on whitespace into smaller
    pieces instead of being emitted as one oversized chunk.

    Args:
        text: The input text.
        model_name: Name of the HuggingFace model to load the tokenizer.
        max_tokens: Max tokens allowed per chunk.
        buffer_tokens: Safety margin for tokenizer variations. Token counts
            here exclude special tokens, so the margin also has to cover any
            the model adds when the chunk is embedded.

    Returns:
        List of non-empty text chunks, each within token limits (the only
        exception being a single whitespace-free word longer than the budget).
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``buffer_tokens`` leaves no room, i.e.
            ``max_tokens - buffer_tokens <= 0``.
    """
    budget = max_tokens - buffer_tokens
    if budget <= 0:
        raise ValueError(
            "max_tokens must be greater than buffer_tokens "
            f"(got max_tokens={max_tokens}, buffer_tokens={buffer_tokens})"
        )

    # Nothing to chunk: skip the (comparatively expensive) tokenizer load.
    if not text or not text.strip():
        return []

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def count_tokens(piece: str) -> int:
        # Special tokens are excluded so per-sentence counts can be summed.
        return len(tokenizer.encode(piece, add_special_tokens=False))

    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in nltk.sent_tokenize(text):
        sentence_tokens = count_tokens(sentence)
        if sentence_tokens <= budget:
            pieces = [(sentence, sentence_tokens)]
        else:
            pieces = _split_oversized_sentence(sentence, budget, count_tokens)

        for piece, piece_tokens in pieces:
            # Close the running chunk only if it has content; this avoids
            # emitting an empty chunk when the very first piece is too large.
            if current_chunk and current_tokens + piece_tokens > budget:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_tokens = 0
            current_chunk.append(piece)
            current_tokens += piece_tokens

    # Append last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
