In [1]:
import time
import tiktoken
from statistics import mean
from src.HinglishBPE import HinglishBPE 

# ---------------------------------------------------------
# 1. CONFIGURATION
# ---------------------------------------------------------
# Path prefix passed to HinglishBPE.load() to restore the locally trained tokenizer.
YOUR_MODEL_PREFIX = "models/hinglish_32k"  

# tiktoken encoding names used as reference tokenizers; each is loaded with
# tiktoken.get_encoding(). "o200k_base" is the one the efficiency ratios are
# computed against.
COMPARISON_MODELS = ["o200k_base", "cl100k_base", "p50k_base"]

# ---------------------------------------------------------
# 2. EXPANDED DATASET (Now includes Pure Hindi)
# ---------------------------------------------------------
# Benchmark corpus: maps a category label to the sample sentences that are
# tokenized for that category. Every category holds five samples. The first
# five categories are romanized Hinglish; "Pure_Hindi" is Devanagari script and
# must stay valid UTF-8 -- if these strings get mis-decoded the benchmark
# measures byte garbage instead of Hindi.
PROMPT_STYLES = {
    "Casual_Chat": [
        "Are bhai tu tension na le, maine usko bol diya hai ki kal tak payment clear kar de.",
        "Kya scene hai aaj raat ka? Netflix and chill ka bhi option hai waise.",
        "Oye sun, wo jo kal notes bheje the na tune, wo incomplete hain. Unit 4 miss ho gaya.",
        "Bhai tera birthday bash kab hai? Party toh banti hai boss!",
        "Ghar pe sab kaise hain? Aunty ko meri taraf se namaste bolna."
    ],
    "Tech_Discussion": [
        "Model train kiya tha but validation loss kam nahi ho raha. Learning rate adjust karna padega.",
        "React ka naya update dekha? Hooks ka usage simplify kar diya hai inhone.",
        "Python virtual environment setup karte waqt error aa raha hai 'pip not found'.",
        "Backend API response time bahut high hai, database indexing check karni padegi.",
        "Deployment ke liye Docker container use kar rahe ho ya direct server pe push karoge?"
    ],
    "Emotional_Rant": [
        "Zindagi mein peace hi nahi bacha hai yaar. Subah utho, office jao, boss ki chik-chik suno.",
        "Mujhe samajh nahi aata log itna fake kyun hote hain. Munh pe kuch aur, peeth peeche kuch.",
        "Sach bataun toh I am exhausted. Kabhi kabhi lagta hai sab chhod ke Himalayas chala jaun.",
        "Dil tootne ka dard wahi samajh sakta hai jisne sacha pyaar kiya ho.",
        "Itna effort daalne ke baad bhi appreciation nahi milta toh frustration hoti hai."
    ],
    "Code_Mixing_Heavy": [
        "Basically, problem ye hai ki jab hum API call karte hain, toh response time high hai.",
        "Actually, mujhe laga tha ki event 5 baje start hoga. But apparently, schedule change ho gaya.",
        "Obviously, agar tum hard work nahi karoge toh success kaise milegi?",
        "Frankly speaking, ye idea practical nahi lag raha implementation ke perspective se.",
        "Technically, ye bug nahi feature hai, but client ko kaise samjhayein?"
    ],
    "Hardcore_Hinglish": [
        "Humne socha tha ki wo aayega, par wo aaya hi nahi.",
        "Tum kahan jaa rahe ho? Ruko, main bhi chalta hoon tumhare saath.",
        "Khana kha liya kya? Aaj maine biryani banayi hai, aa jao.",
        "Ye raaz bhi uske saath hi chala gaya.",
        "Zindagi ek safar hai suhana, yahan kal kya ho kisne jaana."
    ],
    "Pure_Hindi": [
        "भारत एक विशाल देश है और इसकी संस्कृति बहुत विविध है।",
        "विज्ञान और प्रौद्योगिकी के क्षेत्र में हमने बहुत प्रगति की है।",
        "कृपया मुझे पानी का एक गिलास दीजिये।",
        "सत्यमेव जयते।",
        "आज मौसम बहुत सुहावना है, चलो बाहर घूमने चलते हैं।"
    ]
}

def _efficiency_ratio(baseline_tokens, own_tokens):
    """Return ``baseline_tokens / own_tokens``, or ``None`` when undefined.

    The ratio is undefined when no baseline count is available
    (``baseline_tokens is None``) or when ``own_tokens`` is zero, which would
    otherwise raise ``ZeroDivisionError``.
    """
    if baseline_tokens is None or own_tokens == 0:
        return None
    return baseline_tokens / own_tokens


def _ratio_label(ratio, better_above=1.0, worse_below=1.0):
    """Format an efficiency ratio as a short status cell for the tables.

    Ratios above ``better_above`` get a check mark, ratios below
    ``worse_below`` get a down marker, anything in between is shown with
    ``~``, and an undefined ratio (``None``) is shown as ``n/a``.
    """
    if ratio is None:
        return "n/a"
    if ratio > better_above:
        return f"✅ {ratio:.2f}x"
    if ratio < worse_below:
        return f"🔻 {ratio:.2f}x"
    return f"~ {ratio:.2f}x"


def run_comprehensive_benchmark():
    """Print a token-count comparison of HinglishBPE against tiktoken encodings.

    Loads the local tokenizer from ``YOUR_MODEL_PREFIX`` and every encoding in
    ``COMPARISON_MODELS``, tokenizes each sample in ``PROMPT_STYLES`` and
    prints three reports: one row per sample, one row per category, and an
    aggregate summary. Efficiency is ``o200k_base tokens / HinglishBPE
    tokens``, so values above 1 mean the local tokenizer needs fewer tokens.

    Returns:
        None. All results are written to stdout. The function returns early
        (after printing the error) if the local model cannot be loaded.
    """
    # Encoding every ratio is measured against; labelled "GPT-4o" in the output.
    baseline_name = "o200k_base"
    detail_columns = ("o200k_base", "cl100k_base", "p50k_base")

    print(f"\n{'='*95}")
    print("COMPREHENSIVE HINGLISH TOKENIZER BENCHMARK")
    print(f"{'='*95}\n")

    # 1. Load the local model
    print(f"Loading Local Model: {YOUR_MODEL_PREFIX}...")
    my_tok = HinglishBPE()
    try:
        my_tok.load(YOUR_MODEL_PREFIX)
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    # 2. Load comparison models (best effort: a failed encoding is skipped)
    others = {}
    for model_name in COMPARISON_MODELS:
        try:
            others[model_name] = tiktoken.get_encoding(model_name)
        except Exception as exc:
            # A bare ``except:`` here would also swallow KeyboardInterrupt.
            print(f"Warning: Could not load {model_name} ({exc}). Skipping.")

    have_baseline = baseline_name in others
    if not have_baseline:
        # Without the baseline every ratio is undefined; report "n/a" instead
        # of silently treating the missing tokenizer as producing 0 tokens.
        print(f"Warning: baseline {baseline_name} unavailable; ratios will show n/a.")

    print("\nStarting comparison against industry standards...\n")

    # 3. Print detail table header
    header = f"{'CATEGORY':<18} | {'Hinglish':^8} | {'o200k':^7} | {'cl100k':^7} | {'p50k':^7} | {'vs o200k':^10}"
    print("-" * len(header))
    print(header)
    print("-" * len(header))

    total_mine = 0
    total_o200k = 0

    # category -> (HinglishBPE tokens, baseline tokens or None if unavailable)
    category_stats = {}

    # 4. Run benchmark
    for category, texts in PROMPT_STYLES.items():
        cat_mine = 0
        cat_o200k = 0

        for text in texts:
            len_my = len(my_tok.encode(text))
            cat_mine += len_my

            counts = {name: len(tokenizer.encode(text)) for name, tokenizer in others.items()}

            o200k_val = counts.get(baseline_name)
            if o200k_val is not None:
                cat_o200k += o200k_val

            # Per-sample rows use a +/-5% dead band so near-ties show as "~".
            imp_str = _ratio_label(_efficiency_ratio(o200k_val, len_my), 1.05, 0.95)

            cells = [counts.get(name, "n/a") for name in detail_columns]
            print(
                f"{category:<18} | {len_my:^8} | {cells[0]:^7} | "
                f"{cells[1]:^7} | {cells[2]:^7} | {imp_str:^10}"
            )

        total_mine += cat_mine
        total_o200k += cat_o200k
        category_stats[category] = (cat_mine, cat_o200k if have_baseline else None)

    # 5. Category breakdown table
    print("\n" + "="*60)
    print("TASK-WISE PERFORMANCE BREAKDOWN")
    print("="*60)
    cat_header = f"{'TASK CATEGORY':<20} | {'Hinglish':^7} | {'GPT-4o':^7} | {'IMPROVEMENT':^12}"
    print(cat_header)
    print("-" * len(cat_header))

    for cat, (mine, other) in category_stats.items():
        # No dead band here; an exact tie is shown as "~" rather than as a loss.
        status = _ratio_label(_efficiency_ratio(other, mine))
        other_cell = "n/a" if other is None else other
        print(f"{cat:<20} | {mine:^7} | {other_cell:^7} | {status:^12}")

    # 6. Final summary
    avg_efficiency = _efficiency_ratio(total_o200k if have_baseline else None, total_mine)
    print("\n" + "="*60)
    print("FINAL AGGREGATE STATS")
    print("="*60)
    print(f"   Total Tokens (You):    {total_mine}")
    print(f"   Total Tokens (GPT-4o): {total_o200k if have_baseline else 'n/a'}")

    if avg_efficiency is None:
        # Baseline missing, or nothing was tokenized: no meaningful verdict.
        print("   Global Efficiency:     n/a")
        print("\n VERDICT: Not enough data to compare against GPT-4o.")
        return

    print(f"   Global Efficiency:     {avg_efficiency:.3f}x")

    if avg_efficiency > 1.0:
        print(f"\n VERDICT: HinglishBPE is {((avg_efficiency-1)*100):.1f}% more efficient than GPT-4o.")
    else:
        print("\n VERDICT: HinglishBPE is slightly less efficient.")

# Run the benchmark only when this file is executed directly, not on import.
if __name__ == "__main__":
    run_comprehensive_benchmark()


COMPREHENSIVE HINGLISH TOKENIZER BENCHMARK

Loading Local Model: models/hinglish_32k...
Loaded tokenizer from models/hinglish_32k. (vocab size: 32,768)

Starting comparison against industry standards...

------------------------------------------------------------------------
CATEGORY           | Hinglish |  o200k  | cl100k  |  p50k   |  vs o200k 
------------------------------------------------------------------------
Casual_Chat        |    20    |   23    |   24    |   28    |  ✅ 1.15x  
Casual_Chat        |    16    |   20    |   21    |   23    |  ✅ 1.25x  
Casual_Chat        |    25    |   25    |   28    |   28    |  ~ 1.00x  
Casual_Chat        |    14    |   15    |   17    |   20    |  ✅ 1.07x  
Casual_Chat        |    18    |   20    |   22    |   24    |  ✅ 1.11x  
Tech_Discussion    |    19    |   21    |   23    |   25    |  ✅ 1.11x  
Tech_Discussion    |    18    |   18    |   20    |   24    |  ~ 1.00x  
Tech_Discussion    |    19    |   16    |   18    |   2