## Import Libraries

In [1]:
import base64
import requests
from collections import Counter

In [2]:
def get_vocab(url, timeout=30):
    """Download a ``.tiktoken`` vocabulary file and return its tokens as a set.

    Every non-blank line of the file is expected to contain two
    whitespace-separated fields: a base64-encoded token and its rank.
    Only the token is used; the rank is ignored.

    Args:
        url: Location of the vocabulary file.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get`` so a stalled connection cannot hang the
            notebook forever.

    Returns:
        set[str]: The tokens decoded as UTF-8. A token whose bytes are not
        valid UTF-8 is kept as ``str(token_bytes)``, i.e. its ``b'...'``
        representation.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        ValueError: If a line does not have exactly two fields, or its first
            field is not valid base64 (``binascii.Error`` is a ``ValueError``).
    """
    print(f"Downloading from {url}")
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as vocabulary.
    response.raise_for_status()

    tokens = set()
    for line_no, line in enumerate(response.content.splitlines(), start=1):
        fields = line.split()
        if not fields:
            # Blank (or whitespace-only) line: nothing to parse.
            continue
        if len(fields) != 2:
            raise ValueError(
                f"Malformed vocabulary line {line_no}: expected 2 fields, got {len(fields)}"
            )
        token_b64, _rank = fields

        # Decode base64 outside the try block so that a corrupt line raises
        # instead of silently reusing the previous iteration's bytes.
        token_bytes = base64.b64decode(token_b64)
        try:
            tokens.add(token_bytes.decode('utf-8'))
        except UnicodeDecodeError:
            # Partial multi-byte sequences are legitimate vocabulary entries;
            # keep them as the bytes representation.
            tokens.add(str(token_bytes))

    return tokens

In [4]:
def analyze_token_types(tokens):
    """Sort tokens into coarse categories; the first matching rule wins.

    Rules are tried in this order:

    1. ``english_words`` - ASCII-only and entirely alphabetic.
    2. ``numbers``       - ``str.isdigit()`` is true (Unicode-aware, so
       non-ASCII digits land here as well).
    3. ``punctuation``   - not blank, and every character is ASCII and
       neither a letter nor a digit (control characters qualify too).
    4. ``whitespace``    - ``str.isspace()`` is true.
    5. ``non_english``   - contains at least one non-ASCII character.
    6. ``mixed``         - contains both a letter and a digit.
    7. ``other``         - everything else, including the empty string and
       words with surrounding whitespace such as ``' hi'``.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        dict[str, list[str]]: Category name -> tokens in that category, in
        the order they were encountered. All seven keys are always present.
    """
    bucket_names = (
        'english_words',
        'numbers',
        'punctuation',
        'whitespace',
        'non_english',
        'mixed',
        'other',
    )
    buckets = {name: [] for name in bucket_names}

    def pick_bucket(tok):
        # Guard clauses mirror the rule order documented above.
        if tok.isascii() and tok.isalpha():
            return 'english_words'
        if tok.isdigit():
            return 'numbers'
        # Non-blank, and no character is alphanumeric or non-ASCII.
        if tok.strip() and not any(ch.isalnum() or not ch.isascii() for ch in tok):
            return 'punctuation'
        if tok.isspace():
            return 'whitespace'
        if not tok.isascii():
            return 'non_english'
        has_letter = any(ch.isalpha() for ch in tok)
        has_digit = any(ch.isdigit() for ch in tok)
        if has_letter and has_digit:
            return 'mixed'
        return 'other'

    for tok in tokens:
        buckets[pick_bucket(tok)].append(tok)

    return buckets

In [5]:
def _print_category_examples(tokens, max_examples=3):
    """Print, per non-empty category, the token count and a few sorted examples."""
    for category, members in analyze_token_types(tokens).items():
        if members:
            print(f"  {category}: {len(members)} tokens")
            # Show first few examples
            print(f"    Examples: {sorted(members)[:max_examples]}")


def _non_ascii_share(tokens):
    """Return ``(count, percent)`` of tokens containing a non-ASCII character."""
    count = sum(1 for t in tokens if not t.isascii())
    # Guard against an empty vocabulary so the report never divides by zero.
    percent = count / len(tokens) * 100 if tokens else 0.0
    return count, percent


def main():
    """Download the o200k_base and cl100k_base vocabularies and print a comparison.

    The report covers, in order: vocabulary sizes, the overlap between the two
    token sets, categorised examples of the tokens unique to each vocabulary,
    and the share of tokens containing non-ASCII characters (labelled
    "Non-English" in the output).

    Returns:
        None. All results are written to standard output.
    """
    print("🔍 Comparing GPT Tokenizer Vocabularies")
    print("=" * 50)

    # Download vocabularies
    o200k_tokens = get_vocab("https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken")
    cl100k_tokens = get_vocab("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")

    print("\n📊 Vocabulary Sizes:")
    print(f"o200k_base (GPT-4o): {len(o200k_tokens):,} tokens")
    print(f"cl100k_base (GPT-4): {len(cl100k_tokens):,} tokens")

    # Find differences
    only_o200k = o200k_tokens - cl100k_tokens
    only_cl100k = cl100k_tokens - o200k_tokens
    common = o200k_tokens & cl100k_tokens

    print("\n🔄 Token Overlap:")
    print(f"Common tokens: {len(common):,}")
    print(f"Only in o200k: {len(only_o200k):,}")
    print(f"Only in cl100k: {len(only_cl100k):,}")

    # Analyze what's unique to each
    print("\n🆕 New in o200k_base (examples):")
    _print_category_examples(only_o200k)

    print("\n🗑️ Removed from cl100k_base (examples):")
    _print_category_examples(only_cl100k)

    # Key insights
    print("\n💡 Key Differences:")

    # Count non-English tokens (approximated as "contains a non-ASCII character")
    o200k_non_english, o200k_pct = _non_ascii_share(o200k_tokens)
    cl100k_non_english, cl100k_pct = _non_ascii_share(cl100k_tokens)

    print("• Non-English tokens:")
    print(f"  o200k: {o200k_non_english:,} ({o200k_pct:.1f}%)")
    print(f"  cl100k: {cl100k_non_english:,} ({cl100k_pct:.1f}%)")

    print(f"• o200k_base has {len(only_o200k):,} new tokens for better multilingual support")
    print("• Vocabulary doubled in size for improved token efficiency")

In [6]:
# Entry point: run the comparison only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🔍 Comparing GPT Tokenizer Vocabularies
Downloading from https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
Downloading from https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken

📊 Vocabulary Sizes:
o200k_base (GPT-4o): 199,998 tokens
cl100k_base (GPT-4): 100,256 tokens

🔄 Token Overlap:
Common tokens: 85,033
Only in o200k: 114,965
Only in cl100k: 15,223

🆕 New in o200k_base (examples):
  english_words: 15418 tokens
    Examples: ['ABD', 'ABE', 'ABET']
  numbers: 164 tokens
    Examples: ['٠', '١', '٢']
  punctuation: 426 tokens
    Examples: ['\x00\x00', '\n\n\n//', '\n\n//']
  whitespace: 48 tokens
    Examples: ['\n\n\r\n', '\n\n \n', '\n\n \n\n']
  non_english: 65341 tokens
    Examples: [' (§', ' («', ' (‘']
  mixed: 875 tokens
    Examples: ["b' \\xc6'", "b' \\xd2'", "b' \\xd3'"]
  other: 32693 tokens
    Examples: ['\x01E', '\tAccount', '\tArrays']

🗑️ Removed from cl100k_base (examples):
  english_words: 4908 tokens
    Examples: ['ACCE