# CAPM Document Token Size Analysis

This notebook analyzes the token sizes of various components of the CAPM documents in the database.
We'll examine:
1. Catalog descriptions
2. Section summaries
3. Section content
4. Overall token usage across the query pipeline, derived from the three analyses above

The results will help us understand the token consumption in the CAPM subagent pipeline.

In [None]:
# Import required modules
import sys
import os
import pandas as pd
from IPython.display import display, HTML
import tiktoken

# Add the project root to the path if needed
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import directly from the module
import iris.src.initial_setup.db_config as db_config
from iris.src.chat_model.model_settings import ENVIRONMENT

In [None]:
# Token counter based on tiktoken's cl100k_base encoding (an OpenAI encoding).
# NOTE(review): models that use a different tokenizer will produce different
# counts, so treat the numbers as an approximation for non-OpenAI models.
def count_tokens(string: str) -> int:
    """Count the number of tokens in a string using tiktoken.

    Args:
        string: Text to tokenize. ``None`` or an empty string counts as 0.

    Returns:
        The number of cl100k_base tokens in ``string``.
    """
    if not string:
        return 0
    encoding = tiktoken.get_encoding("cl100k_base")
    # By default encode() raises ValueError when the text contains a
    # special-token string such as "<|endoftext|>". Document text is data,
    # not control input, so encode such strings as ordinary text instead.
    return len(encoding.encode(string, disallowed_special=()))

In [None]:
# Open a database connection for the requested environment
def connect_to_db(env=None):
    """Open a database connection through ``db_config.connect_to_db``.

    Args:
        env: Environment name to connect to. When ``None``, the
            ``ENVIRONMENT`` value imported from model_settings is used.

    Returns:
        The connection object returned by ``db_config.connect_to_db`` when it
        is truthy, otherwise ``None``. Progress is reported with ``print``.
    """
    target_env = ENVIRONMENT if env is None else env

    print(f"Connecting to database in '{target_env}' environment...")
    connection = db_config.connect_to_db(target_env)

    # Guard clause: report the failure and bail out early.
    if not connection:
        print("Failed to connect to database.")
        return None

    print("Connected successfully! 🎉\n")
    return connection

# Open the connection shared by the analysis cells below. No env is passed, so
# connect_to_db() falls back to ENVIRONMENT; conn is None if connecting failed.
conn = connect_to_db()

## 1. Analyze CAPM Catalog Descriptions

In [None]:
def analyze_capm_catalog_descriptions(conn):
    """Report the token size of every CAPM document description in the catalog.

    Reads ``id``, ``document_name`` and ``document_description`` for all
    ``apg_catalog`` rows whose ``document_source`` is ``'internal_capm'``,
    counts the tokens of each description with ``count_tokens``, and prints
    summary statistics followed by a bucketed size distribution.

    Args:
        conn: Database connection whose ``cursor()`` is usable as a context
            manager. A falsy value skips the analysis.

    Returns:
        A DataFrame with one row per document and the columns
        ``document_id``, ``document_name``, ``description_tokens`` and
        ``description_first_50_chars``; ``None`` when ``conn`` is falsy or the
        query returns no rows.
    """
    if not conn:
        return None

    print("Analyzing CAPM catalog descriptions...")
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, document_name, document_description
            FROM apg_catalog
            WHERE document_source = 'internal_capm'
            ORDER BY document_name
        """)

        rows = cur.fetchall()
        if not rows:
            print("No CAPM records found in catalog.")
            return None

        entries = []
        for doc_id, name, desc in rows:
            text = desc or ""
            # Short preview; the ellipsis marks descriptions that were cut.
            preview = text[:50]
            if len(text) > 50:
                preview += "..."
            entries.append({
                "document_id": doc_id,
                "document_name": name,
                "description_tokens": count_tokens(text),
                "description_first_50_chars": preview,
            })

        doc_count = len(rows)
        grand_total = sum(entry["description_tokens"] for entry in entries)
        df = pd.DataFrame(entries)
        token_col = df['description_tokens']

        # Summary statistics
        print(f"Total documents: {doc_count}")
        print(f"Total tokens in all descriptions: {grand_total}")
        print(f"Average tokens per description: {grand_total / doc_count:.2f}")
        print(f"Max tokens in a description: {token_col.max()}")
        print(f"Min tokens in a description: {token_col.min()}")
        print(f"Median tokens in a description: {token_col.median()}\n")

        # Inclusive (low, high) buckets; the last one is open-ended.
        unbounded = float('inf')
        buckets = [
            (0, 50), (51, 100), (101, 200),
            (201, 300), (301, 400), (401, 500),
            (501, 1000), (1001, unbounded),
        ]

        print("Distribution of description token sizes:")
        for low, high in buckets:
            in_bucket = token_col.between(low, high).sum()
            share = in_bucket / doc_count * 100
            label = f"{low}+" if high == unbounded else f"{low}-{high}"
            print(f"  {label} tokens: {in_bucket} documents ({share:.1f}%)")

        # Hand the per-document table back for further analysis
        return df

# analyze_capm_catalog_descriptions() returns None when there is no connection
# or no CAPM rows, so only bind and display the result when one was produced.
catalog_result = analyze_capm_catalog_descriptions(conn)
if catalog_result is not None:
    catalog_df = catalog_result
    display(catalog_df.sort_values(by='description_tokens', ascending=False).head(10))

## 2. Analyze CAPM Section Summaries

In [None]:
def analyze_capm_section_summaries(conn):
    """Report the token size of every CAPM section summary.

    Reads the section summaries of all ``apg_content`` rows whose
    ``document_source`` is ``'internal_capm'`` (joined to ``apg_catalog`` on
    document name and source), counts tokens with ``count_tokens``, and prints
    per-section and per-document statistics.

    Args:
        conn: Database connection whose ``cursor()`` is usable as a context
            manager. A falsy value skips the analysis.

    Returns:
        A ``(df, doc_stats)`` tuple, or ``None`` when ``conn`` is falsy or the
        query returns no rows. ``df`` has one row per section
        (``document_name``, ``section_id``, ``section_name``,
        ``summary_tokens``, ``summary_first_50_chars``); ``doc_stats`` has one
        row per document (``document_name``, ``total_summary_tokens``).
    """
    if not conn:
        return None

    print("Analyzing CAPM section summaries...")
    with conn.cursor() as cur:
        cur.execute("""
            SELECT c.document_name, c.section_id, c.section_name, c.section_summary
            FROM apg_content c
            JOIN apg_catalog cat ON c.document_name = cat.document_name AND c.document_source = cat.document_source
            WHERE c.document_source = 'internal_capm'
            ORDER BY c.document_name, c.section_id
        """)

        rows = cur.fetchall()
        if not rows:
            print("No CAPM section summaries found.")
            return None

        section_rows = []
        tokens_per_document = {}
        grand_total = 0

        for doc_name, section_id, section_name, summary in rows:
            text = summary or ""
            n_tokens = count_tokens(text)
            grand_total += n_tokens

            # Running total for the document this section belongs to
            tokens_per_document[doc_name] = tokens_per_document.get(doc_name, 0) + n_tokens

            section_rows.append({
                "document_name": doc_name,
                "section_id": section_id,
                "section_name": section_name,
                "summary_tokens": n_tokens,
                "summary_first_50_chars": text[:50] + ("..." if len(text) > 50 else ""),
            })

        section_count = len(rows)
        df = pd.DataFrame(section_rows)
        per_section = df['summary_tokens']

        # Section-level statistics
        print(f"Total sections: {section_count}")
        print(f"Total tokens in all summaries: {grand_total}")
        print(f"Average tokens per summary: {grand_total / section_count:.2f}")
        print(f"Max tokens in a summary: {per_section.max()}")
        print(f"Min tokens in a summary: {per_section.min()}")
        print(f"Median tokens in a summary: {per_section.median()}\n")

        # Document-level statistics
        doc_stats = pd.DataFrame([
            {"document_name": name, "total_summary_tokens": total}
            for name, total in tokens_per_document.items()
        ])
        per_document = doc_stats['total_summary_tokens']

        print(f"Number of unique documents: {len(doc_stats)}")
        print(f"Average tokens in summaries per document: {per_document.mean():.2f}")
        print(f"Max tokens in summaries per document: {per_document.max()}")
        print(f"Min tokens in summaries per document: {per_document.min()}")
        print(f"Median tokens in summaries per document: {per_document.median()}\n")

        # Hand both tables back for further analysis
        return df, doc_stats

# analyze_capm_section_summaries() returns None when there is no connection or
# no CAPM rows; unpacking None would raise, so check before binding the names.
summary_result = analyze_capm_section_summaries(conn)
if summary_result is not None:
    summary_df, doc_summary_stats = summary_result
    display(summary_df.sort_values(by='summary_tokens', ascending=False).head(10))
    print("\nDocuments with highest total summary tokens:")
    display(doc_summary_stats.sort_values(by='total_summary_tokens', ascending=False).head(10))

## 3. Analyze CAPM Section Content

In [None]:
def _sample_tokens_per_char(cur):
    """Measure the exact tokens-per-character ratio on up to 10 random CAPM sections.

    Prints one line per sampled section that has content. The sample is drawn
    with ``ORDER BY random()``, so the ratio can differ between runs.

    Args:
        cur: Open database cursor used to run the sampling query.

    Returns:
        Total sampled tokens divided by total sampled characters, or ``None``
        when no sampled section had any content.
    """
    cur.execute("""
        SELECT document_name, section_id, section_content
        FROM apg_content
        WHERE document_source = 'internal_capm'
        ORDER BY random()
        LIMIT 10
    """)

    sampled_chars = 0
    sampled_tokens = 0
    for doc_name, section_id, content in cur.fetchall():
        if not content:
            continue
        chars = len(content)
        tokens = count_tokens(content)
        sampled_chars += chars
        sampled_tokens += tokens
        print(f"  Section {doc_name}/{section_id}: {chars} chars, {tokens} tokens (ratio: {tokens/chars:.4f})")

    return sampled_tokens / sampled_chars if sampled_chars else None


def analyze_capm_section_content(conn, default_tokens_per_char=0.25):
    """Estimate the token size of CAPM section content.

    Only the character length of each section is fetched for the full data
    set; token counts are estimated as ``chars * ratio``. The ratio is measured
    exactly on a small random sample of sections and falls back to
    ``default_tokens_per_char`` when the sample yields no content. The same
    final ratio is used for the section-level, document-level and total
    estimates so the returned tables agree with the printed totals.

    Args:
        conn: Database connection whose ``cursor()`` is usable as a context
            manager. A falsy value skips the analysis.
        default_tokens_per_char: Fallback tokens-per-character ratio, also
            printed as the baseline next to the sampled estimate.

    Returns:
        A ``(df, doc_stats)`` tuple, or ``None`` when ``conn`` is falsy or the
        query returns no rows. ``df`` has one row per section
        (``document_name``, ``section_id``, ``section_name``,
        ``content_char_length``, ``content_estimated_tokens``); ``doc_stats``
        has one row per document (``document_name``, ``num_sections``,
        ``total_char_length``, ``total_estimated_tokens``).
    """
    if not conn:
        return None

    print("Analyzing CAPM section content...")
    with conn.cursor() as cur:
        cur.execute("""
            SELECT c.document_name, c.section_id, c.section_name, 
                   length(c.section_content) as content_length
            FROM apg_content c
            JOIN apg_catalog cat ON c.document_name = cat.document_name AND c.document_source = cat.document_source
            WHERE c.document_source = 'internal_capm'
            ORDER BY c.document_name, c.section_id
        """)

        records = cur.fetchall()
        if not records:
            print("No CAPM section content found.")
            return None

        sections = []
        total_char_len = 0
        total_by_doc = {}

        for doc_name, section_id, section_name, char_len in records:
            # length() of a NULL section_content comes back as None; treat such
            # sections as empty instead of failing on None arithmetic.
            char_len = char_len or 0
            total_char_len += char_len

            # Track totals by document
            doc_totals = total_by_doc.setdefault(doc_name, {"char_len": 0, "num_sections": 0})
            doc_totals["char_len"] += char_len
            doc_totals["num_sections"] += 1

            sections.append((doc_name, section_id, section_name, char_len))

        baseline_total_tokens = int(total_char_len * default_tokens_per_char)

        # Sample a few sections to get exact token counts for verification
        print("Sampling a few sections for exact token counts...")
        sampled_ratio = _sample_tokens_per_char(cur)

        if sampled_ratio is None:
            tokens_per_char = default_tokens_per_char
            total_tokens = baseline_total_tokens
        else:
            tokens_per_char = sampled_ratio
            total_tokens = int(total_char_len * sampled_ratio)
            print(f"\nAverage tokens/char ratio from sample: {sampled_ratio:.4f}")
            print(f"Adjusted total token estimate: {total_tokens} (vs. {baseline_total_tokens} using {default_tokens_per_char} ratio)\n")

        # Built after sampling so per-section estimates use the final ratio
        df = pd.DataFrame([
            {
                "document_name": doc_name,
                "section_id": section_id,
                "section_name": section_name,
                "content_char_length": char_len,
                "content_estimated_tokens": int(char_len * tokens_per_char),
            }
            for doc_name, section_id, section_name, char_len in sections
        ])

        # Summary statistics
        print(f"Total sections: {len(records)}")
        print(f"Total characters in all content: {total_char_len:,}")
        print(f"Estimated total tokens in all content: {total_tokens:,}")
        print(f"Average characters per section: {total_char_len / len(records):,.2f}")
        print(f"Average estimated tokens per section: {total_tokens / len(records):,.2f}")
        print(f"Max characters in a section: {df['content_char_length'].max():,}")
        print(f"Min characters in a section: {df['content_char_length'].min():,}")
        print(f"Median characters in a section: {df['content_char_length'].median():,.2f}\n")

        # Document-level statistics
        doc_stats = pd.DataFrame([
            {
                "document_name": doc,
                "num_sections": stats["num_sections"],
                "total_char_length": stats["char_len"],
                "total_estimated_tokens": int(stats["char_len"] * tokens_per_char),
            }
            for doc, stats in total_by_doc.items()
        ])

        print(f"Number of unique documents: {len(doc_stats)}")
        print(f"Average characters per document: {doc_stats['total_char_length'].mean():,.2f}")
        print(f"Average estimated tokens per document: {doc_stats['total_estimated_tokens'].mean():,.2f}")
        print(f"Max characters in a document: {doc_stats['total_char_length'].max():,}")
        print(f"Min characters in a document: {doc_stats['total_char_length'].min():,}")
        print(f"Median characters in a document: {doc_stats['total_char_length'].median():,.2f}\n")

        # Token size distribution: inclusive (low, high) buckets, last one open-ended
        token_ranges = [
            (0, 1000), (1001, 2000), (2001, 5000),
            (5001, 10000), (10001, 20000), (20001, 50000),
            (50001, float('inf')),
        ]

        print("Distribution of document token sizes:")
        doc_tokens = doc_stats['total_estimated_tokens']
        for low, high in token_ranges:
            count = ((doc_tokens >= low) & (doc_tokens <= high)).sum()
            if high == float('inf'):
                print(f"  {low:,}+ tokens: {count} documents ({count/len(doc_stats)*100:.1f}%)")
            else:
                print(f"  {low:,}-{high:,} tokens: {count} documents ({count/len(doc_stats)*100:.1f}%)")

        # Return the DataFrames for further analysis
        return df, doc_stats

# analyze_capm_section_content() returns None when there is no connection or no
# CAPM rows; unpacking None would raise, so check before binding the names.
content_result = analyze_capm_section_content(conn)
if content_result is not None:
    content_df, doc_content_stats = content_result
    display(content_df.sort_values(by='content_estimated_tokens', ascending=False).head(10))
    print("\nDocuments with highest total content tokens:")
    display(doc_content_stats.sort_values(by='total_estimated_tokens', ascending=False).head(10))

## 4. Analyze Overall CAPM Token Usage in Query Pipeline

In [None]:
def summarize_token_usage(catalog_df, summary_df, content_df, doc_summary_stats, doc_content_stats):
    """Print a phase-by-phase report of token usage in the CAPM subagent pipeline.

    Args:
        catalog_df: DataFrame with a ``description_tokens`` column.
        summary_df: Part of the signature but not read by this function.
        content_df: Part of the signature but not read by this function.
        doc_summary_stats: DataFrame with a ``total_summary_tokens`` column.
        doc_content_stats: DataFrame with a ``total_estimated_tokens`` column.

    Returns:
        None. The report is written with ``print``.
    """
    print("\n=== CAPM Subagent Pipeline Token Usage ===\n")

    # Phase 1: every catalog description is counted together
    description_tokens = catalog_df['description_tokens']
    catalog_total = description_tokens.sum()
    print("1. Document Catalog Phase")
    print(f"   Total tokens in all catalog descriptions: {catalog_total:,}")
    print(f"   Average tokens per document description: {description_tokens.mean():.2f}")
    print("   This is the total token count sent to the LLM for document selection.\n")

    # Phase 2: section summaries, aggregated per document
    summary_totals = doc_summary_stats['total_summary_tokens']
    summary_mean = summary_totals.mean()
    summary_median = summary_totals.median()
    summary_max = summary_totals.max()

    print("2. Section Summary Phase")
    print(f"   Average tokens in section summaries per document: {summary_mean:.2f}")
    print(f"   Median tokens in section summaries per document: {summary_median:.2f}")
    print(f"   Maximum tokens in section summaries for a single document: {summary_max}")
    print("   This is the typical token count sent to the LLM for section selection within a document.\n")

    # Phase 3: estimated full-content size per document
    content_totals = doc_content_stats['total_estimated_tokens']
    content_mean = content_totals.mean()
    content_median = content_totals.median()
    content_max = content_totals.max()

    print("3. Document Content Phase")
    print(f"   Average tokens in content per document: {content_mean:,.2f}")
    print(f"   Median tokens in content per document: {content_median:,.2f}")
    print(f"   Maximum tokens in content for a single document: {content_max:,}")
    print("   Based on document size distribution:")

    # How many documents are larger than each candidate token limit
    doc_count = len(doc_content_stats)
    for limit in (8000, 16000, 32000, 64000, 128000):
        over_limit = (content_totals > limit).sum()
        share = over_limit / doc_count * 100
        print(f"   - {over_limit} documents ({share:.1f}%) exceed {limit:,} token limit")

    # Phase 4: combined size of several documents at once
    print("\n4. Realistic Multi-Document Synthesis Scenarios")

    # Largest documents first, so slices give top / middle / bottom groups
    ranked = content_totals.sort_values(ascending=False)

    for top_n in (1, 2, 3, 5, 10):
        if top_n > doc_count:
            continue
        top_total = ranked.iloc[:top_n].sum()
        print(f"   Top {top_n} largest documents: {top_total:,} tokens total")

    # The ten-document scenarios need at least ten documents
    if doc_count >= 10:
        window_start = max(0, doc_count // 2 - 5)
        middle_total = ranked.iloc[window_start:window_start + 10].sum()
        print(f"   10 medium-sized documents (from middle): {middle_total:,} tokens total")

        smallest_total = ranked.iloc[-10:].sum()
        print(f"   10 smallest documents: {smallest_total:,} tokens total")

# Only build the pipeline summary when every analysis above produced a result.
# A name can be missing (its cell was never run) or bound to None (the analysis
# had no connection or found no data); both cases are skipped here.
_summary_inputs = ("catalog_df", "summary_df", "content_df", "doc_summary_stats", "doc_content_stats")
if all(globals().get(_name) is not None for _name in _summary_inputs):
    summarize_token_usage(catalog_df, summary_df, content_df, doc_summary_stats, doc_content_stats)
else:
    print("Skipping pipeline summary: one or more analyses did not produce results.")

In [None]:
# Close the database connection once all analyses are done.
# conn is None when connect_to_db() failed, in which case there is nothing to close.
if conn:
    conn.close()
    print("Database connection closed.")