In [None]:
# print current environment python
import sys
print(sys.executable)

import os

In [2]:
import json
import tiktoken
import math
from transformers import AutoTokenizer
from tqdm import tqdm
import random

# Initialize tokenizers
print("Initializing tokenizers...")

# GPT-4o tokenizer
gpt4_enc = tiktoken.encoding_for_model("gpt-4o")

# HuggingFace tokenizers
gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

print("Tokenizers initialized successfully")



Initializing tokenizers...
Tokenizers initialized successfully


In [3]:
def chunk_single_document(doc_text, max_chunk_tokens=16000):
    """
    Chunk a single document into smaller pieces based on token count.
    Uses multiple tokenizers and takes the maximum token count.
    
    Args:
        doc_text: Document string to chunk
        max_chunk_tokens: Maximum tokens per chunk (default: 16000)
    
    Returns:
        Tuple of (chunks_list, tokens_list) where:
        - chunks_list: List of document chunks
        - tokens_list: List of token counts for each chunk
    """
    # Get token counts from each tokenizer
    gpt4_tokens = gpt4_enc.encode(doc_text)
    gemma_tokens = gemma_tokenizer.encode(doc_text, add_special_tokens=False)
    qwen_tokens = qwen_tokenizer.encode(doc_text, add_special_tokens=False)
    
    # Take the maximum token count
    gpt4_count = len(gpt4_tokens)
    gemma_count = len(gemma_tokens)
    qwen_count = len(qwen_tokens)
    max_token_count = max(gpt4_count, gemma_count, qwen_count)
    
    # Determine which tokenizer gave the maximum
    if max_token_count == gpt4_count:
        tokens = gpt4_tokens
        tokenizer_type = 'gpt4'
    elif max_token_count == gemma_count:
        tokens = gemma_tokens
        tokenizer_type = 'gemma'
    else:
        tokens = qwen_tokens
        tokenizer_type = 'qwen'
    
    # If document is within limit, return as single chunk
    if max_token_count <= max_chunk_tokens:
        return [doc_text], [max_token_count]
    
    # Otherwise, chunk the document
    chunks = []
    chunk_tokens = []
    
    # Calculate number of chunks needed
    num_chunks = math.ceil(max_token_count / max_chunk_tokens)
    
    for i in range(num_chunks):
        start_idx = i * max_chunk_tokens
        end_idx = min((i + 1) * max_chunk_tokens, len(tokens))
        
        chunk_token_ids = tokens[start_idx:end_idx]
        
        # Decode chunk based on tokenizer type
        if tokenizer_type == 'gpt4':
            chunk_text = gpt4_enc.decode(chunk_token_ids)
        elif tokenizer_type == 'gemma':
            chunk_text = gemma_tokenizer.decode(chunk_token_ids, skip_special_tokens=True)
        else:  # qwen
            chunk_text = qwen_tokenizer.decode(chunk_token_ids, skip_special_tokens=True)
        
        # Ensure we don't cut in middle of a word (for last chunk of each split)
        if i < num_chunks - 1 and chunk_text and not chunk_text[-1].isspace():
            # Find the last complete word boundary
            last_boundary = max(
                chunk_text.rfind(' '),
                chunk_text.rfind('\n'),
                chunk_text.rfind('\t'),
                chunk_text.rfind('.'),
                chunk_text.rfind(','),
                chunk_text.rfind('!'),
                chunk_text.rfind('?'),
                chunk_text.rfind(';'),
                chunk_text.rfind(':')
            )
            
            if last_boundary > 0:
                # Adjust the chunk text and recalculate tokens
                chunk_text = chunk_text[:last_boundary + 1]
                
                # Recalculate actual token count for this chunk
                actual_gpt4 = len(gpt4_enc.encode(chunk_text))
                actual_gemma = len(gemma_tokenizer.encode(chunk_text, add_special_tokens=False))
                actual_qwen = len(qwen_tokenizer.encode(chunk_text, add_special_tokens=False))
                actual_tokens = max(actual_gpt4, actual_gemma, actual_qwen)
            else:
                actual_tokens = len(chunk_token_ids)
        else:
            # For the last chunk, calculate actual tokens
            actual_gpt4 = len(gpt4_enc.encode(chunk_text))
            actual_gemma = len(gemma_tokenizer.encode(chunk_text, add_special_tokens=False))
            actual_qwen = len(qwen_tokenizer.encode(chunk_text, add_special_tokens=False))
            actual_tokens = max(actual_gpt4, actual_gemma, actual_qwen)
        
        chunks.append(chunk_text)
        chunk_tokens.append(actual_tokens)
    
    return chunks, chunk_tokens

In [None]:
# Settings
input_file_name = "20_human_eval_cases"  # Update this as needed (without .json extension)
order_by_date = True
max_chunk_tokens = 16000  # Maximum tokens per chunk

# Input path - from data/full_case_data folder
input_path = f"../../../../data/full_case_data/{input_file_name}.json"

# Saving path - to the local data folder
saving_path = f"./data/{input_file_name}.json"

print(f"Input file: {input_path}")
print(f"Order by date: {order_by_date}")
print(f"Max tokens per chunk: {max_chunk_tokens}")
print(f"Output path: {saving_path}")

In [None]:
# Load the data
with open(input_path, "r") as f:
    data = json.load(f)

print(f"Loaded {len(data)} cases from {input_path}")

In [6]:
# Process documents and create chunks
random.seed(42)

keys = []  # List of (case_id, document_name) tuples
chunks = []  # List of list of chunks
chunks_tokens = []  # List of list of token counts

# Process each case
for item in tqdm(data, desc="Processing cases"):
    case_id = item["case_id"]
    case_documents_title = item["case_documents_title"]
    case_documents_date = item["case_documents_date"]
    case_documents_text = item["case_documents_text"]
    
    if order_by_date:
        # Zip and sort by date
        zipped_data = list(zip(case_documents_date, case_documents_title, case_documents_text))
        sorted_data = sorted(zipped_data, key=lambda x: (x[0] is None, x[0]))
        
        # Unzip back to lists
        case_documents_date, case_documents_title, case_documents_text = zip(*sorted_data)
        case_documents_date = list(case_documents_date)
        case_documents_title = list(case_documents_title)
        case_documents_text = list(case_documents_text)
        
        # Move "Docket" entries to the end
        docket_indices = [i for i, title in enumerate(case_documents_title) if "Docket" in title]
        
        docket_items = []
        for idx in reversed(docket_indices):
            docket_items.append((
                case_documents_date.pop(idx),
                case_documents_title.pop(idx),
                case_documents_text.pop(idx)
            ))
        
        for date, title, text in reversed(docket_items):
            case_documents_date.append(date)
            case_documents_title.append(title)
            case_documents_text.append(text)
    else:
        # Random order with seed 42
        indices = list(range(len(case_documents_title)))
        random.shuffle(indices)
        
        case_documents_title = [case_documents_title[i] for i in indices]
        case_documents_text = [case_documents_text[i] for i in indices]
        case_documents_date = [case_documents_date[i] for i in indices]
    
    # Process each document in the case
    for doc_title, doc_text in zip(case_documents_title, case_documents_text):
        # Chunk the document
        doc_chunks, doc_tokens = chunk_single_document(doc_text, max_chunk_tokens)
        
        # Add to lists
        keys.append((case_id, doc_title))
        chunks.append(doc_chunks)
        chunks_tokens.append(doc_tokens)
        
        # Print info for documents that were chunked
        if len(doc_chunks) > 1:
            total_tokens = sum(doc_tokens)
            print(f"  Case {case_id}, Document '{doc_title}': {len(doc_chunks)} chunks, {total_tokens} total tokens")

print(f"\nProcessed {len(keys)} documents total")
print(f"Documents chunked: {sum(1 for c in chunks if len(c) > 1)}")
print(f"Total chunks created: {sum(len(c) for c in chunks)}")

Processing cases:   0%|                                                                                                                                                                                                                                 | 0/20 [00:00<?, ?it/s]

  Case 46507, Document 'Complaint for Declaratory and Injunctive Relief': 6 chunks, 82441 total tokens
  Case 46507, Document 'Order Granting Temporary Restraining Order and Compelling Certain Discovery Production': 4 chunks, 63047 total tokens
  Case 46507, Document 'Order Granting Temporary Restraining Order and Compelling Certain Discovery Production': 3 chunks, 35856 total tokens


Token indices sequence length is longer than the specified maximum sequence length for this model (298491 > 131072). Running this sequence through the model will result in indexing errors


  Case 46507, Document 'Addendum to Emergency Motion for Stay Pending Appeal/Petition for Writ of Mandamus': 20 chunks, 314770 total tokens
  Case 46507, Document 'Plaintiffs' Memorandum in Support of Motion for Preliminary Injunction': 3 chunks, 42629 total tokens
  Case 46507, Document 'Defendants' Opposition to Plaintiffs' Motion for Preliminary Injunction; Memorandum of Points and Authorities': 2 chunks, 25500 total tokens
  Case 46507, Document 'Order Granting Preliminary Injunction': 3 chunks, 43390 total tokens
  Case 46507, Document 'Order': 2 chunks, 20940 total tokens
  Case 46507, Document 'Application to Stay the Order Issued by the United States District Court for the Northern District of California and Request for an Immediate Administrative Stay': 6 chunks, 91362 total tokens


Processing cases:   5%|██████████▊                                                                                                                                                                                                              | 1/20 [00:08<02:35,  8.17s/it]

  Case 46507, Document 'Docket': 4 chunks, 51493 total tokens
  Case 46329, Document 'Complaint for Declaratory and Injunctive Relief': 3 chunks, 39241 total tokens
  Case 46329, Document 'Memorandum of Law in Support of Plaintiffs' Motion for Preliminary Injunction': 3 chunks, 37418 total tokens
  Case 46329, Document 'Memorandum of Points and Authorities in Support of Defendants' Motion to Dismiss Plaintiffs' Unreasonable Delay Claims': 2 chunks, 17212 total tokens
  Case 46329, Document 'Findings of Fact, Rulings of Law, and Order for Partial Separate and Final Judgment': 3 chunks, 32416 total tokens
  Case 46329, Document '': 2 chunks, 17568 total tokens
  Case 46329, Document '': 2 chunks, 20947 total tokens


Processing cases:  10%|█████████████████████▋                                                                                                                                                                                                   | 2/20 [00:10<01:24,  4.69s/it]

  Case 46329, Document 'Docket': 2 chunks, 22972 total tokens
  Case 46758, Document 'Order': 2 chunks, 28187 total tokens


Processing cases:  15%|████████████████████████████████▌                                                                                                                                                                                        | 3/20 [00:11<00:49,  2.89s/it]

  Case 46758, Document 'Docket': 3 chunks, 36625 total tokens
  Case 46666, Document 'Class Action Complaint for Declaratory and Injunctive Relief': 5 chunks, 79604 total tokens
  Case 46666, Document 'Plaintiffs' Motion for Class Certification and Memorandum of Points and Authorities in Support': 2 chunks, 18780 total tokens
  Case 46666, Document 'Order Granting Motion for Preliminary Injunction and Provisional Class Certification': 3 chunks, 40500 total tokens
  Case 46666, Document 'Motion for Partial Stay Pending Appeal Relief Requested by August 4, 2025': 4 chunks, 54468 total tokens
  Case 46666, Document 'Amended Class Action Complaint for Declaratory and Injunctive Relief': 7 chunks, 99430 total tokens
  Case 46666, Document 'Motion for Preliminary Injunction and Provisional Class Certification as to Additional Agency Defendants': 2 chunks, 26672 total tokens


Processing cases:  20%|███████████████████████████████████████████▍                                                                                                                                                                             | 4/20 [00:15<00:53,  3.33s/it]

  Case 46666, Document 'Docket': 2 chunks, 21991 total tokens
  Case 46746, Document 'Omnibus Order': 4 chunks, 52345 total tokens
  Case 46746, Document '': 9 chunks, 141571 total tokens


Processing cases:  25%|██████████████████████████████████████████████████████▎                                                                                                                                                                  | 5/20 [00:17<00:46,  3.08s/it]

  Case 46746, Document 'Motion to Stay Preliminary Injunction Pending Appeal and for Administrative Stay': 2 chunks, 23515 total tokens
  Case 46746, Document 'Docket': 2 chunks, 16786 total tokens
  Case 46678, Document 'Plaintiffs' Ex Parte Motion for a Temporary Restraining Order': 2 chunks, 20154 total tokens
  Case 46678, Document 'Defendants' Corrected Opposition to Plaintiffs' Motion for a Temporary Restraining Order': 2 chunks, 23082 total tokens
  Case 46678, Document 'Order Granting Plaintiffs' Application for Temporary Restraining Order': 2 chunks, 28542 total tokens
  Case 46678, Document 'Plaintiffs' Motion for Preliminary Injunction': 2 chunks, 29552 total tokens
  Case 46678, Document 'Order': 2 chunks, 18560 total tokens


Processing cases:  30%|█████████████████████████████████████████████████████████████████                                                                                                                                                        | 6/20 [00:19<00:36,  2.58s/it]

  Case 46678, Document 'Docket': 2 chunks, 25482 total tokens
  Case 46390, Document 'Order': 2 chunks, 20264 total tokens
  Case 46390, Document 'Emergency Motion for a Stay Pending Appeal': 4 chunks, 53799 total tokens


Processing cases:  35%|███████████████████████████████████████████████████████████████████████████▉                                                                                                                                             | 7/20 [00:20<00:27,  2.12s/it]

  Case 46348, Document 'Memorandum of Points and Authorities Supporting Plaintiff American Foreign Service Association's Motion for Preliminary Injunction': 3 chunks, 35509 total tokens
  Case 46348, Document 'Opinion': 2 chunks, 22021 total tokens


Processing cases:  40%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                                                                  | 8/20 [00:21<00:21,  1.80s/it]

  Case 46348, Document 'Emergency Motion for an Immediate Administrative Stay and Stay Pending Appeal': 3 chunks, 33534 total tokens
  Case 46340, Document 'Complaint for Declaratory and Injunctive Relief': 2 chunks, 29645 total tokens
  Case 46340, Document 'Order Re Preliminary Injunction': 2 chunks, 22843 total tokens


Processing cases:  45%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                                                       | 9/20 [00:22<00:16,  1.49s/it]

  Case 46340, Document 'Docket': 2 chunks, 19109 total tokens
  Case 46482, Document 'Complaint': 2 chunks, 17346 total tokens
  Case 46482, Document 'Plaintiffs’ Memorandum of Law in Support of Their Motion for a Preliminary Injunction': 2 chunks, 20919 total tokens


Processing cases:  50%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                            | 10/20 [00:23<00:13,  1.30s/it]

  Case 46482, Document 'Memorandum in Opposition to Plaintiff's Motion for a Preliminary Injunction': 3 chunks, 32962 total tokens


Processing cases:  55%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                                 | 11/20 [00:24<00:09,  1.10s/it]

  Case 46755, Document 'Memorandum Opinion': 4 chunks, 48721 total tokens
  Case 46341, Document 'Complaint for Declaratory and Injunctive Relief': 3 chunks, 45576 total tokens
  Case 46341, Document 'Amended Complaint': 4 chunks, 51113 total tokens
  Case 46341, Document 'Memorandum of Law in Support of Plaintiffs’ Motion for a Preliminary Injunction': 3 chunks, 35384 total tokens
  Case 46341, Document 'Time Sensistive Motion for Stay Pending Appeal and Immediate Administrative Stay': 15 chunks, 240098 total tokens
  Case 46341, Document '': 2 chunks, 17568 total tokens
  Case 46341, Document '': 2 chunks, 20947 total tokens


Processing cases:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                      | 12/20 [00:29<00:18,  2.32s/it]

  Case 46341, Document 'Docket': 2 chunks, 28828 total tokens


Processing cases:  65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 13/20 [00:29<00:12,  1.75s/it]

  Case 46342, Document 'First Amended Complaint for Declaratory and Injunctive Relief': 2 chunks, 16763 total tokens


Processing cases:  70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                | 14/20 [00:29<00:07,  1.33s/it]

  Case 46651, Document 'Docket': 2 chunks, 17264 total tokens
  Case 46351, Document 'Complaint for Declaratory and Injunctive Relief': 2 chunks, 22505 total tokens


Processing cases:  75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 15/20 [00:30<00:05,  1.14s/it]

  Case 46351, Document 'First Amended Complaint for Declaratory and Injunctive Relief': 2 chunks, 25981 total tokens
  Case 46620, Document 'Memorandum Opinion': 2 chunks, 19704 total tokens
  Case 46620, Document 'Time-Sensitive Motion for Stay Pending Appeal and an Administrative Stay': 4 chunks, 48129 total tokens


Processing cases:  80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 16/20 [00:31<00:04,  1.12s/it]

  Case 46620, Document 'Opposition to Application to Stay the Judgment of the United States District Court for the District of Maryland': 2 chunks, 17706 total tokens
  Case 46625, Document 'Class Action Complaint for Declaratory and Injunctive Relief': 2 chunks, 17654 total tokens
  Case 46625, Document 'Defendants' Motion to Dismiss and Memorandum in Opposition to Plaintiffs' Motion for Preliminary Injunction': 3 chunks, 36302 total tokens
  Case 46625, Document 'Memorandum Opinion': 2 chunks, 22753 total tokens


Processing cases:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 17/20 [00:34<00:04,  1.54s/it]

  Case 46625, Document 'Plaintiffs-Appellants' Emergency Motion for an Injunction Pending Appeal': 8 chunks, 119965 total tokens
  Case 46499, Document 'Notice of Motion and Motion for Temporary Restraining Order; Memorandum of Law in Support of Plaintiffs' Motion for Nationwide Temporary Restraining Order': 2 chunks, 23256 total tokens
  Case 46499, Document 'Order Granting Motions for Preliminary Injunctions and Setting Case Management Conferences': 2 chunks, 17431 total tokens


Processing cases:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 18/20 [00:35<00:02,  1.38s/it]

  Case 46499, Document 'Motion for Amended Preliminary Injunction': 2 chunks, 30984 total tokens
  Case 46602, Document 'Complaint': 2 chunks, 20233 total tokens
  Case 46602, Document 'Amended Complaint for Injunctive and Declaratory Relief': 2 chunks, 23945 total tokens
  Case 46602, Document 'Memorandum in Support of Plaintiffs' Motion for Preliminary Injunction': 2 chunks, 24334 total tokens
  Case 46602, Document 'Brief of Amicus Curiae States Of Oregon, Maryland, Washington, Arizona, Colorado, Connecticut; Delaware, Hawai‘i, Maine, Michigan, Nevada, New Mexico, Rhode Island, Wisconsin, Vermont, And The District Of Columbia in Support of Plaintiffs’ Motion for Preliminary Injunction': 2 chunks, 23380 total tokens


Processing cases:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 19/20 [00:36<00:01,  1.44s/it]

  Case 46602, Document 'Opinion and Order': 4 chunks, 49091 total tokens


Processing cases: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:37<00:00,  1.87s/it]

  Case 46805, Document 'Complaint for Declaratory and Injunctive Relief': 3 chunks, 44731 total tokens

Processed 176 documents total
Documents chunked: 75
Total chunks created: 344





In [7]:
# Verify the data structure
print(f"Number of keys: {len(keys)}")
print(f"Number of chunk lists: {len(chunks)}")
print(f"Number of token lists: {len(chunks_tokens)}")

# Show sample of the data structure
print("\nSample of first 3 entries:")
for i in range(min(3, len(keys))):
    key = keys[i]
    chunk_list = chunks[i]
    token_list = chunks_tokens[i]
    print(f"  Key: {key}")
    print(f"  Number of chunks: {len(chunk_list)}")
    print(f"  Token counts: {token_list}")
    if len(chunk_list) > 0:
        print(f"  First chunk preview (first 100 chars): {chunk_list[0][:100]}...")
    print()

Number of keys: 176
Number of chunk lists: 176
Number of token lists: 176

Sample of first 3 entries:
  Key: ('46507', 'Complaint for Declaratory and Injunctive Relief')
  Number of chunks: 6
  Token counts: [16000, 16000, 16000, 16000, 15997, 2444]
  First chunk preview (first 100 chars): Case 3:25-cv-03698     Document 1     Filed 04/28/25     Page 1 of 115
 
 
1  Stacey M. Leyton (SBN ...

  Key: ('46507', 'Order Granting Temporary Restraining Order and Compelling Certain Discovery Production')
  Number of chunks: 4
  Token counts: [16000, 16000, 16000, 15047]
  First chunk preview (first 100 chars): No. 24A          
 
 
In the Supreme Court of the United States 
 
 
─────────── 
 
 
DONALD J. TRUM...

  Key: ('46507', 'Order Granting Temporary Restraining Order and Compelling Certain Discovery Production')
  Number of chunks: 3
  Token counts: [16000, 16000, 3856]
  First chunk preview (first 100 chars): Case 3:25-cv-03698-SI     Document 85     Filed 05/09/25     Page 1 of 42
 
1

In [8]:
# Save results for batch processing
saving_folder = os.path.dirname(saving_path)

if not os.path.exists(saving_folder):
    os.makedirs(saving_folder)

results_dict = {
    "keys": keys,
    "chunks": chunks,
    "chunks_tokens": chunks_tokens
}

with open(saving_path, "w") as f:
    json.dump(results_dict, f, indent=4)

print(f"Saved chunk data to {saving_path}")
print(f"File contains:")
print(f"  - {len(keys)} document entries")
print(f"  - {sum(len(c) for c in chunks)} total chunks")
print(f"  - Average chunks per document: {sum(len(c) for c in chunks) / len(chunks):.2f}")

Saved chunk data to ../../../../batch_api/chunk_by_chunk_iterative_updating/data/20_human_eval_cases_2.json
File contains:
  - 176 document entries
  - 344 total chunks
  - Average chunks per document: 1.95


In [9]:
# Statistics summary
print("=" * 50)
print("CHUNKING STATISTICS SUMMARY")
print("=" * 50)

total_docs = len(keys)
chunked_docs = sum(1 for c in chunks if len(c) > 1)
single_chunk_docs = total_docs - chunked_docs
total_chunks = sum(len(c) for c in chunks)

print(f"Total documents processed: {total_docs}")
print(f"Documents kept as single chunk: {single_chunk_docs} ({single_chunk_docs/total_docs*100:.1f}%)")
print(f"Documents split into multiple chunks: {chunked_docs} ({chunked_docs/total_docs*100:.1f}%)")
print(f"Total chunks created: {total_chunks}")
print(f"Average chunks per document: {total_chunks/total_docs:.2f}")

if chunked_docs > 0:
    avg_chunks_for_chunked = sum(len(c) for c in chunks if len(c) > 1) / chunked_docs
    print(f"Average chunks for chunked documents: {avg_chunks_for_chunked:.2f}")

# Token statistics
all_tokens = [t for token_list in chunks_tokens for t in token_list]
if all_tokens:
    print(f"\nToken Statistics:")
    print(f"  Min tokens per chunk: {min(all_tokens):,}")
    print(f"  Max tokens per chunk: {max(all_tokens):,}")
    print(f"  Avg tokens per chunk: {sum(all_tokens)/len(all_tokens):,.0f}")
    print(f"  Total tokens: {sum(all_tokens):,}")

CHUNKING STATISTICS SUMMARY
Total documents processed: 176
Documents kept as single chunk: 101 (57.4%)
Documents split into multiple chunks: 75 (42.6%)
Total chunks created: 344
Average chunks per document: 1.95
Average chunks for chunked documents: 3.24

Token Statistics:
  Min tokens per chunk: 81
  Max tokens per chunk: 16,184
  Avg tokens per chunk: 10,989
  Total tokens: 3,780,068
