In [None]:
# Mount Google Drive at /content/drive (Colab-only); the tokenizer
# directories used in main() live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q sentencepiece transformers datasets huggingface_hub

In [None]:
import json
import os
import sys
from pathlib import Path
from typing import List, Dict, Tuple
import pandas as pd
import numpy as np
import sentencepiece as spm
from transformers import AutoTokenizer

class TokenizerEvaluator:
    def __init__(self):
        self.test_sentences = [
            "नेपालमा राजनीतिकस्थिरता नहुँदा विकास रोकिन्छ।",
            "नमस्कार, तपाईं कसो हुनुहुन्छ?",
            "काठमाडौं नेपालको राजधानी हो।"
        ]

        self.results = []

    def count_words(self, sentence: str) -> int:
        import re
        cleaned = re.sub(r'[।॥।,\?\!\.]+', '', sentence)
        words = cleaned.strip().split()
        return len([w for w in words if w.strip()])

    def load_simple_bpe(self, model_path: str):
        try:
            model_file = os.path.join(model_path, 'simple_bpe_model.json')
            if not os.path.exists(model_file):
                return None

            with open(model_file, 'r', encoding='utf-8') as f:
                model_data = json.load(f)

            class SimpleBPETokenizer:
                def __init__(self, vocab, merges):
                    self.vocab = vocab
                    self.merges = merges
                    self.inv_vocab = {v: k for k, v in vocab.items()}

                def tokenize_word(self, word):
                    word = '▁' + word
                    tokens = list(word)

                    for merge_rule in self.merges:
                        left = merge_rule['left']
                        right = merge_rule['right']

                        new_tokens = []
                        i = 0
                        while i < len(tokens):
                            if i < len(tokens) - 1 and tokens[i] == left and tokens[i + 1] == right:
                                merged = left + right
                                new_tokens.append(merged)
                                i += 2
                            else:
                                new_tokens.append(tokens[i])
                                i += 1
                        tokens = new_tokens

                    return tokens

                def tokenize(self, text):
                    words = text.split()
                    all_tokens = []
                    for word in words:
                        word_tokens = self.tokenize_word(word)
                        all_tokens.extend(word_tokens)
                    return all_tokens

            vocab = model_data['vocab']
            merges = model_data['merges']

            return SimpleBPETokenizer(vocab, merges)

        except Exception as e:
            print(f"Error loading SimpleBPE: {e}")
            return None

    def load_picky_bpe(self, model_path: str):
        try:
            tokenizer_file = os.path.join(model_path, 'tokenizer.json')
            if os.path.exists(tokenizer_file):
                added_tokens_file = os.path.join(model_path, 'added_tokens.json')
                if os.path.exists(added_tokens_file):
                    with open(added_tokens_file, 'r', encoding='utf-8') as f:
                        added_tokens = json.load(f)

                    if isinstance(added_tokens, list):
                        with open(added_tokens_file, 'w', encoding='utf-8') as f:
                            json.dump({}, f, indent=2, ensure_ascii=False)
                        print(f"Fixed added_tokens.json format for PickyBPE")

                return AutoTokenizer.from_pretrained(model_path)
        except Exception as e:
            print(f"HuggingFace loading failed: {e}")

        try:
            picky_model_file = os.path.join(model_path, 'picky_bpe_model.json')
            if not os.path.exists(picky_model_file):
                print(f"PickyBPE native model file not found at {picky_model_file}")
                return None

            with open(picky_model_file, 'r', encoding='utf-8') as f:
                model_data = json.load(f)

            class PickyBPETokenizer:
                def __init__(self, model_data):
                    self.vocab = {}
                    self.inv_vocab = {}

                    token_id = 0
                    for token_data in model_data['tokens']:
                        if token_data.get('present', True):
                            token_str = token_data['str']
                            self.vocab[token_str] = token_id
                            self.inv_vocab[token_id] = token_str
                            token_id += 1

                    self.merges = []
                    for merge_data in model_data.get('merges', []):
                        left_token = merge_data['pair'][0]['str']
                        right_token = merge_data['pair'][1]['str']
                        self.merges.append((left_token, right_token))

                def preprocess_text(self, text):
                    import unicodedata
                    text = unicodedata.normalize('NFKC', text)
                    text = text.replace(' ', f' ▁')
                    return text.strip()

                def tokenize_word(self, word):
                    if not word.startswith('▁'):
                        word = '▁' + word

                    tokens = []
                    for char in word:
                        if char in self.vocab:
                            tokens.append(char)
                        else:
                            tokens.append('<unk>')

                    for left, right in self.merges:
                        new_tokens = []
                        i = 0
                        while i < len(tokens):
                            if i < len(tokens) - 1 and tokens[i] == left and tokens[i + 1] == right:
                                merged = left + right
                                if merged in self.vocab:
                                    new_tokens.append(merged)
                                    i += 2
                                else:
                                    new_tokens.append(tokens[i])
                                    i += 1
                            else:
                                new_tokens.append(tokens[i])
                                i += 1
                        tokens = new_tokens

                    return tokens

                def tokenize(self, text):
                    processed_text = self.preprocess_text(text)
                    words = processed_text.split()
                    all_tokens = []
                    for word in words:
                        word_tokens = self.tokenize_word(word)
                        all_tokens.extend(word_tokens)
                    return all_tokens

            return PickyBPETokenizer(model_data)

        except Exception as e:
            print(f"Error loading PickyBPE native format: {e}")
            return None

    def load_grapheme_picky_bpe(self, model_path: str):
        try:
            tokenizer_file = os.path.join(model_path, 'tokenizer.json')
            if os.path.exists(tokenizer_file):
                added_tokens_file = os.path.join(model_path, 'added_tokens.json')
                if os.path.exists(added_tokens_file):
                    with open(added_tokens_file, 'r', encoding='utf-8') as f:
                        content = f.read().strip()
                        if not content or content == '[]':
                            with open(added_tokens_file, 'w', encoding='utf-8') as fw:
                                json.dump([], fw, indent=2, ensure_ascii=False)

                return AutoTokenizer.from_pretrained(model_path)
        except Exception as e:
            print(f"HuggingFace loading failed: {e}")

        try:
            grapheme_model_file = os.path.join(model_path, 'grapheme_picky_bpe_model.json')
            if not os.path.exists(grapheme_model_file):
                print(f"Grapheme PickyBPE model file not found at {grapheme_model_file}")
                return None

            with open(grapheme_model_file, 'r', encoding='utf-8') as f:
                model_data = json.load(f)

            class GraphemePickyBPETokenizer:
                def __init__(self, model_data):
                    self.vocab = {}
                    self.inv_vocab = {}

                    token_id = 0
                    for token_data in model_data['tokens']:
                        if token_data.get('present', True):
                            token_str = token_data['str']
                            self.vocab[token_str] = token_id
                            self.inv_vocab[token_id] = token_str
                            token_id += 1

                    self.merges = []
                    for merge_data in model_data.get('merges', []):
                        left_token = merge_data['pair'][0]['str']
                        right_token = merge_data['pair'][1]['str']
                        self.merges.append((left_token, right_token))

                def preprocess_text(self, text):
                    import unicodedata
                    text = unicodedata.normalize('NFC', text)
                    text = text.replace(' ', f' ▁')
                    return text.strip()

                def tokenize_word(self, word):
                    if not word.startswith('▁'):
                        word = '▁' + word

                    import grapheme
                    graphemes = list(grapheme.graphemes(word))

                    tokens = []
                    for g in graphemes:
                        if g in self.vocab:
                            tokens.append(g)
                        else:
                            tokens.append('<unk>')

                    for left, right in self.merges:
                        new_tokens = []
                        i = 0
                        while i < len(tokens):
                            if i < len(tokens) - 1 and tokens[i] == left and tokens[i + 1] == right:
                                merged = left + right
                                if merged in self.vocab:
                                    new_tokens.append(merged)
                                    i += 2
                                else:
                                    new_tokens.append(tokens[i])
                                    i += 1
                            else:
                                new_tokens.append(tokens[i])
                                i += 1
                        tokens = new_tokens

                    return tokens

                def tokenize(self, text):
                    processed_text = self.preprocess_text(text)
                    words = processed_text.split()
                    all_tokens = []
                    for word in words:
                        word_tokens = self.tokenize_word(word)
                        all_tokens.extend(word_tokens)
                    return all_tokens

            return GraphemePickyBPETokenizer(model_data)

        except Exception as e:
            print(f"Error loading Grapheme PickyBPE native format: {e}")
            return None

    def load_sage_tokenizer(self, model_path: str):
        try:
            model_file = os.path.join(model_path, 'nepali_tokenizer.model')
            if not os.path.exists(model_file):
                print(f"SaGe model file not found at {model_file}")
                return None

            sp = spm.SentencePieceProcessor()
            sp.load(model_file)
            return sp

        except Exception as e:
            print(f"Error loading SaGe tokenizer: {e}")
            return None

    def evaluate_tokenizer(self, tokenizer, tokenizer_name: str, tokenizer_type: str):
        print(f"Evaluating {tokenizer_name}...")

        if tokenizer is None:
            print(f"Skipping {tokenizer_name} - not available")
            return

        sentence_results = []

        for i, sentence in enumerate(self.test_sentences, 1):
            try:
                word_count = self.count_words(sentence)

                if tokenizer_type == 'simple_bpe':
                    tokens = tokenizer.tokenize(sentence)
                    token_count = len(tokens)
                elif tokenizer_type == 'picky_bpe':
                    tokens = tokenizer.tokenize(sentence)
                    token_count = len(tokens)
                elif tokenizer_type == 'grapheme_picky_bpe':
                    tokens = tokenizer.tokenize(sentence)
                    token_count = len(tokens)
                elif tokenizer_type == 'sage':
                    tokens = tokenizer.encode_as_pieces(sentence)
                    token_count = len(tokens)
                else:
                    tokens = []
                    token_count = 0

                fertility = token_count / word_count if word_count > 0 else 0

                sentence_results.append({
                    'sentence_id': i,
                    'sentence': sentence,
                    'tokenizer': tokenizer_name,
                    'word_count': word_count,
                    'token_count': token_count,
                    'fertility': fertility,
                    'tokens': tokens[:10]
                })

            except Exception as e:
                print(f"Error processing sentence {i} with {tokenizer_name}: {e}")
                sentence_results.append({
                    'sentence_id': i,
                    'sentence': sentence,
                    'tokenizer': tokenizer_name,
                    'word_count': self.count_words(sentence),
                    'token_count': 0,
                    'fertility': 0,
                    'tokens': []
                })

        self.results.extend(sentence_results)

    def run_evaluation(self, tokenizer_paths: Dict[str, str]):
        print("Starting Nepali Tokenizer Evaluation")
        print("=" * 50)

        if 'simple_bpe' in tokenizer_paths:
            simple_bpe = self.load_simple_bpe(tokenizer_paths['simple_bpe'])
            self.evaluate_tokenizer(simple_bpe, 'SimpleBPE', 'simple_bpe')

        if 'picky_bpe' in tokenizer_paths:
            picky_bpe = self.load_picky_bpe(tokenizer_paths['picky_bpe'])
            self.evaluate_tokenizer(picky_bpe, 'PickyBPE', 'picky_bpe')

        if 'grapheme_picky_bpe' in tokenizer_paths:
            grapheme_picky_bpe = self.load_grapheme_picky_bpe(tokenizer_paths['grapheme_picky_bpe'])
            self.evaluate_tokenizer(grapheme_picky_bpe, 'GraphemePickyBPE', 'grapheme_picky_bpe')

        if 'sage' in tokenizer_paths:
            sage = self.load_sage_tokenizer(tokenizer_paths['sage'])
            self.evaluate_tokenizer(sage, 'SaGe', 'sage')

    def display_results(self):
        if not self.results:
            print("No results to display")
            return

        df = pd.DataFrame(self.results)

        print("\nDetailed Results by Sentence:")
        print("=" * 80)

        for sentence_id in sorted(df['sentence_id'].unique()):
            sentence_data = df[df['sentence_id'] == sentence_id]
            sentence = sentence_data.iloc[0]['sentence']

            print(f"\nSentence {sentence_id}: {sentence}")
            print(f"Words: {sentence_data.iloc[0]['word_count']}")
            print("-" * 60)

            for _, row in sentence_data.iterrows():
                print(f"{row['tokenizer']:>18}: {row['token_count']:>3} tokens | "
                      f"Fertility: {row['fertility']:>5.2f} | "
                      f"Tokens: {row['tokens']}")

        print(f"\n\nSummary Statistics:")
        print("=" * 50)

        summary = df.groupby('tokenizer').agg({
            'token_count': ['mean', 'std', 'sum'],
            'fertility': ['mean', 'std'],
            'word_count': ['sum']
        }).round(3)

        print(summary)

        print(f"\n\nFertility Comparison (Average across all sentences):")
        print("-" * 40)
        fertility_avg = df.groupby('tokenizer')['fertility'].mean().sort_values()
        for tokenizer, fertility in fertility_avg.items():
            print(f"{tokenizer:>18}: {fertility:.3f}")

        print(f"\n\nTotal Tokens (sum across all sentences):")
        print("-" * 35)
        total_tokens = df.groupby('tokenizer')['token_count'].sum().sort_values()
        for tokenizer, total in total_tokens.items():
            print(f"{tokenizer:>18}: {total:>4} tokens")

        return df

def main():
    """Report which tokenizer directories exist, then evaluate and print results."""
    # Model directories on the mounted Google Drive, keyed by tokenizer type.
    tokenizer_paths = {
        'simple_bpe': "/content/drive/My Drive/Colab Notebooks/LRLs/nepali/tokenizers/standard_bpe",
        'picky_bpe': "/content/drive/My Drive/Colab Notebooks/LRLs/nepali/tokenizers/picky_bpe",
        'grapheme_picky_bpe': "/content/drive/My Drive/Colab Notebooks/LRLs/nepali/tokenizers/grapheme_picky_bpe",
        'sage': "/content/drive/My Drive/Colab Notebooks/LRLs/nepali/tokenizers/sage_gensim",
    }

    print("Checking tokenizer paths...")
    for name, path in tokenizer_paths.items():
        message = (
            f"{name:>18}: Found at {path}"
            if os.path.exists(path)
            else f"{name:>18}: NOT FOUND at {path}"
        )
        print(message)

    # A missing directory is only reported above; the evaluator still gets
    # every path and skips whatever it cannot load.
    evaluator = TokenizerEvaluator()
    evaluator.run_evaluation(tokenizer_paths)
    evaluator.display_results()

# Run the evaluation when this file is executed directly (or as a notebook
# cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()