In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored there
# (such as the evaluation dataset path used later in this notebook) are readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q sentencepiece transformers datasets huggingface_hub

In [None]:
from __future__ import annotations
import os
import json
import time
import random
import logging
import unicodedata
import re
import getpass
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Dict, Union, Tuple, Optional
from transformers import AutoTokenizer

# Configure root logging once for the notebook: INFO level, timestamped lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by every class and function below.
logger = logging.getLogger(__name__)

class TokenizerEvaluator:
    """Compute intrinsic quality metrics for a tokenizer over a set of texts.

    Tokenizers are handled by duck typing, in this order of preference:

    1. raw SentencePiece processors (have ``encode_as_ids`` and a type name
       containing ``SentencePiece``),
    2. objects with an ``encode`` method (e.g. Hugging Face tokenizers),
    3. objects that only expose ``tokenize``.
    """

    def __init__(self, test_texts: List[str]):
        """Store the texts that ``evaluate_tokenizer`` scores against."""
        self.test_texts = test_texts

    @staticmethod
    def _is_sentencepiece(tokenizer) -> bool:
        """Return True when ``tokenizer`` looks like a raw SentencePiece processor."""
        return hasattr(tokenizer, 'encode_as_ids') and 'SentencePiece' in str(type(tokenizer))

    @staticmethod
    def _encode(tokenizer, text: str):
        """Tokenize ``text`` with whichever API the tokenizer exposes.

        Returns token ids for SentencePiece / ``encode`` tokenizers and
        string tokens for ``tokenize``-only tokenizers.
        """
        if TokenizerEvaluator._is_sentencepiece(tokenizer):
            return tokenizer.encode_as_ids(text)
        if hasattr(tokenizer, 'encode'):
            try:
                # Special tokens (BOS/EOS) would inflate the token counts.
                return tokenizer.encode(text, add_special_tokens=False)
            except TypeError:
                # ``encode`` does not accept ``add_special_tokens``.
                return tokenizer.encode(text)
        return tokenizer.tokenize(text)

    @staticmethod
    def _unknown_ids(tokenizer) -> set:
        """Return the set of token ids that represent UNK for an id-based tokenizer.

        Only ids the tokenizer itself declares are used; an empty set means
        the tokenizer declares no UNK token, so nothing is counted as unknown.
        """
        if TokenizerEvaluator._is_sentencepiece(tokenizer):
            unk_id = getattr(tokenizer, 'unk_id', None)
            # SentencePiece reserves id 0 for <unk> by default.
            return {unk_id()} if callable(unk_id) else {0}
        unk_token_id = getattr(tokenizer, 'unk_token_id', None)
        return set() if unk_token_id is None else {unk_token_id}

    def calculate_fertility(self, tokenizer, texts: List[str]) -> float:
        """Return the average number of tokens per whitespace-separated word.

        Returns 0.0 when ``texts`` contains no words.
        """
        total_tokens = 0
        total_words = 0

        for text in texts:
            total_words += len(text.split())
            total_tokens += len(self._encode(tokenizer, text))

        return total_tokens / total_words if total_words > 0 else 0.0

    def calculate_compression_ratio(self, tokenizer, texts: List[str]) -> float:
        """Return the average number of characters per token.

        Returns 0.0 when no tokens are produced.
        """
        total_chars = 0
        total_tokens = 0

        for text in texts:
            total_chars += len(text)
            total_tokens += len(self._encode(tokenizer, text))

        return total_chars / total_tokens if total_tokens > 0 else 0.0

    def calculate_coverage(self, tokenizer, texts: List[str]) -> float:
        """Return vocabulary coverage, i.e. ``1 - (UNK tokens / all tokens)``.

        For id-based tokenizers a token is unknown only when its id matches
        the tokenizer's own declared UNK id. For ``tokenize``-only tokenizers
        a token is unknown when its text contains ``<unk>`` or ``[unk]``
        (case-insensitive). Returns 0.0 when no tokens are produced.
        """
        total_tokens = 0
        unk_tokens = 0

        # Resolve the dispatch once; it does not depend on the text.
        id_based = self._is_sentencepiece(tokenizer) or hasattr(tokenizer, 'encode')
        unk_ids = self._unknown_ids(tokenizer) if id_based else set()

        for text in texts:
            tokens = self._encode(tokenizer, text)
            if id_based:
                unk_tokens += sum(1 for t in tokens if t in unk_ids)
            else:
                unk_tokens += sum(
                    1 for t in tokens
                    if '<unk>' in str(t).lower() or '[unk]' in str(t).lower()
                )
            total_tokens += len(tokens)

        if total_tokens == 0:
            return 0.0
        return max(0.0, 1 - (unk_tokens / total_tokens))

    def calculate_token_length_distribution(self, tokenizer, texts: List[str]) -> Dict[str, float]:
        """Return mean and standard deviation of token lengths in characters.

        Only the first 100 texts are sampled. The word-boundary markers
        ``▁`` and ``Ġ`` are stripped before measuring. Both statistics are
        0 when the tokenizer exposes no string-token API or yields no tokens.
        """
        if hasattr(tokenizer, 'encode_as_pieces') and 'SentencePiece' in str(type(tokenizer)):
            to_pieces = tokenizer.encode_as_pieces
        elif hasattr(tokenizer, 'tokenize'):
            to_pieces = tokenizer.tokenize
        else:
            return {'mean_length': 0, 'std_length': 0}

        token_lengths = [
            len(str(token).replace('▁', '').replace('Ġ', ''))
            for text in texts[:100]
            for token in to_pieces(text)
        ]

        if not token_lengths:
            return {'mean_length': 0, 'std_length': 0}

        return {
            'mean_length': float(np.mean(token_lengths)),
            'std_length': float(np.std(token_lengths)),
        }

    def evaluate_tokenizer(self, tokenizer, name: str) -> Dict[str, float]:
        """Compute all metrics for ``tokenizer`` on at most the first 500 stored texts.

        The result holds ``fertility``, ``compression_ratio``, ``coverage``,
        ``mean_length``, ``std_length`` and ``composite_score``, where the
        composite is ``compression_ratio * coverage / fertility`` (0 when
        fertility is 0). ``name`` is used for logging only.
        """
        logger.info(f"Evaluating tokenizer: {name}")

        eval_texts = self.test_texts[:500]

        metrics = {
            'fertility': self.calculate_fertility(tokenizer, eval_texts),
            'compression_ratio': self.calculate_compression_ratio(tokenizer, eval_texts),
            'coverage': self.calculate_coverage(tokenizer, eval_texts),
        }
        metrics.update(self.calculate_token_length_distribution(tokenizer, eval_texts))

        metrics['composite_score'] = (
            metrics['compression_ratio'] * metrics['coverage'] / metrics['fertility']
            if metrics['fertility'] > 0 else 0
        )

        logger.info(f"Metrics for {name}: {metrics}")
        return metrics

class TamilCorpusProcessor:
    """Load Tamil text corpora and filter out lines that are not usable Tamil."""

    # Frequent Tamil pronouns/conjunctions used as a cheap language signal;
    # matched as substrings of whitespace-separated words.
    COMMON_TAMIL_WORDS = (
        'அது', 'இது', 'நான்', 'நீ', 'அவன்', 'அவள்', 'நாம்', 'நீங்கள்',
        'அவர்கள்', 'என்', 'உன்', 'மற்றும்', 'ஆனால்',
    )

    def __init__(self, max_sentences: int = 5000):
        """Set the default line cap and compile the Tamil character patterns."""
        self.max_sentences = max_sentences
        # Any code point in the Tamil Unicode block.
        self.tamil_pattern = re.compile(r'[\u0B80-\u0BFF]')
        self.tamil_vowels = re.compile(r'[அஆஇஈஉஊஎஏஐஒஓஔ]')
        self.tamil_consonants = re.compile(r'[கஙசஞடணதநபமயரலவழளறன]')

    def load_text_file(self, file_path: str, max_sentences: Optional[int] = None) -> List[str]:
        """Load a UTF-8 text file with one sentence per line.

        At most ``max_sentences`` *lines* are read from the start of the file
        (defaulting to ``self.max_sentences``); blank lines and lines rejected
        by ``is_valid_tamil_text`` are then dropped, so fewer sentences than
        the cap may be returned.

        Raises whatever ``open`` raises when the file cannot be read.
        """
        logger.info(f"Loading text file from: {file_path}")

        if max_sentences is None:
            max_sentences = self.max_sentences

        file_path = Path(file_path)

        sentences = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_sentences:
                    break
                line = line.strip()
                if line:
                    sentences.append(line)

        # No further truncation needed: at most ``max_sentences`` lines were read.
        valid_sentences = [s for s in sentences if self.is_valid_tamil_text(s)]
        logger.info(f"Loaded {len(valid_sentences)} valid sentences from {len(sentences)} total")

        return valid_sentences

    def split_into_sentences(self, text: str) -> List[str]:
        """Split ``text`` on runs of ``.``, ``!``, ``?`` or ``।``.

        Fragments of 10 characters or fewer (after stripping) are discarded.
        """
        sentences = re.split(r'[.!?।]+', text)
        return [s.strip() for s in sentences if len(s.strip()) > 10]

    def is_valid_tamil_text(self, text: str) -> bool:
        """Return True when ``text`` looks like a usable Tamil sentence.

        A text is accepted when it is 10-1000 characters long, at least 60%
        of its script characters (letters plus combining marks) are in the
        Tamil block, and it either contains a common Tamil word or is more
        than 70% Tamil.
        """
        if len(text) < 10 or len(text) > 1000:
            return False

        letters = 0
        script_chars = 0
        tamil_chars = 0
        for ch in text:
            if ch.isalpha():
                letters += 1
            elif not unicodedata.category(ch).startswith('M'):
                # Whitespace, digits, punctuation: not part of the ratio.
                continue
            # Tamil vowel signs and the pulli are combining marks, which
            # ``isalpha`` rejects; count them on both sides of the ratio so
            # it stays within [0, 1].
            script_chars += 1
            if self.tamil_pattern.match(ch):
                tamil_chars += 1

        if letters == 0:
            return False

        tamil_ratio = tamil_chars / script_chars

        if tamil_ratio < 0.6:
            return False

        words = text.split()
        has_common_word = any(
            tw in word for word in words for tw in self.COMMON_TAMIL_WORDS
        )

        return has_common_word or tamil_ratio > 0.7

class TokenizerLoader:
    """Load tokenizers from the Hugging Face Hub."""

    def __init__(self, hf_token: Optional[str] = None, trust_remote_code: bool = True):
        """Remember the access token and remote-code policy used for every load.

        Args:
            hf_token: Hub access token, forwarded as ``token``; ``None`` for
                anonymous access.
            trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``.
                SECURITY NOTE: ``True`` lets the loaded repository execute
                its own Python code; pass ``False`` for untrusted models.
        """
        self.hf_token = hf_token
        self.trust_remote_code = trust_remote_code

    def load_huggingface_tokenizer(self, model_name: str):
        """Load the tokenizer for ``model_name``.

        Returns:
            The tokenizer, or ``None`` when loading fails (the failure is
            logged with its traceback), so one unavailable model does not
            abort a multi-tokenizer run.
        """
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                token=self.hf_token,
                trust_remote_code=self.trust_remote_code,
            )
        except (OSError, ValueError, ImportError):
            logger.exception(f"Failed to load HuggingFace tokenizer: {model_name}")
            return None

        logger.info(f"Loaded HuggingFace tokenizer: {model_name}")
        return tokenizer

def evaluate_tokenizers(test_texts: List[str], tokenizer_configs: List[Dict], hf_token: str = None) -> pd.DataFrame:
    """Evaluate every configured tokenizer and tabulate the metrics.

    Each config must carry ``name``, ``type`` and ``path``. Only the
    ``huggingface`` type is supported; other types are logged and skipped,
    as are tokenizers for which the loader returns ``None``.

    Returns:
        A DataFrame with one row per evaluated tokenizer: ``tokenizer``,
        ``type`` and the metrics returned by the evaluator.
    """
    scorer = TokenizerEvaluator(test_texts)
    hub = TokenizerLoader(hf_token=hf_token)

    rows = []
    for cfg in tokenizer_configs:
        name, kind, path = cfg['name'], cfg['type'], cfg['path']

        logger.info(f"Loading tokenizer: {name}")

        # Guard clause: anything other than a Hub tokenizer is unsupported.
        if kind != 'huggingface':
            logger.error(f"Unknown tokenizer type: {kind}")
            continue

        tok = hub.load_huggingface_tokenizer(path)
        if tok is None:
            logger.warning(f"Skipping {name} - failed to load")
            continue

        row = {"tokenizer": name, "type": kind}
        row.update(scorer.evaluate_tokenizer(tok, name))
        rows.append(row)

    return pd.DataFrame(rows)

def main(dataset_path: str = None, max_sentences: int = 2000, hf_token: str = None):
    """Run the Tamil tokenizer comparison and print a results table.

    Args:
        dataset_path: Text file with one sentence per line. When missing or
            empty an error is logged and the function returns.
        max_sentences: Cap passed to the corpus loader.
        hf_token: Hugging Face access token forwarded to the tokenizer loader.

    Returns:
        None. Results are printed to stdout.
    """
    logger.info("Starting Tamil tokenizer evaluation...")

    # Fixed seeds keep any randomised step reproducible across runs.
    random.seed(42)
    np.random.seed(42)

    # Without a dataset there is nothing to evaluate; bail out explicitly
    # rather than failing later on an unbound ``test_texts``.
    if not dataset_path:
        logger.error("No dataset path provided")
        return

    processor = TamilCorpusProcessor(max_sentences=max_sentences)

    logger.info(f"Using dataset: {dataset_path}")
    test_texts = processor.load_text_file(dataset_path, max_sentences)

    if not test_texts:
        logger.error("No valid test texts loaded")
        return

    logger.info(f"Successfully loaded {len(test_texts)} sentences for evaluation")

    tokenizer_configs = [
        {
            'name': 'Llama-2-7B',
            'type': 'huggingface',
            'path': 'meta-llama/Llama-2-7b-hf',
        },
        {
            'name': 'Gemma-7B',
            'type': 'huggingface',
            'path': 'google/gemma-7b',
        },
    ]

    logger.info(f"Configured {len(tokenizer_configs)} tokenizers for evaluation")

    print("Evaluating tokenizers...")
    results_df = evaluate_tokenizers(test_texts, tokenizer_configs, hf_token=hf_token)

    if results_df.empty:
        print("ERROR: No tokenizers were successfully evaluated")
        return

    print("\n" + "="*100)
    print("TAMIL TOKENIZER EVALUATION RESULTS")
    print("="*100)

    # Round a copy for display only; the best-tokenizer summary below uses
    # the unrounded values.
    display_df = results_df.copy()
    numeric_cols = ['fertility', 'compression_ratio', 'coverage', 'composite_score', 'mean_length']
    for col in numeric_cols:
        if col in display_df.columns:
            display_df[col] = display_df[col].round(4)

    print(display_df[['tokenizer', 'type', 'fertility', 'compression_ratio', 'coverage', 'composite_score']].to_string(index=False))
    print("="*100)

    best_tokenizer = results_df.loc[results_df['composite_score'].idxmax()]
    print(f"\nBest Tokenizer: {best_tokenizer['tokenizer']}")
    print(f"  - Composite Score: {best_tokenizer['composite_score']:.4f}")
    print(f"  - Fertility: {best_tokenizer['fertility']:.4f} (lower is better)")
    print(f"  - Compression Ratio: {best_tokenizer['compression_ratio']:.4f} (higher is better)")
    print(f"  - Coverage: {best_tokenizer['coverage']:.4f} (higher is better)")

def run_evaluation(
    dataset_path: str = '/content/drive/My Drive/Colab Notebooks/LRLs/tamil/dataset/ta_reduced_eval.txt',
    max_sentences: int = 5000,
):
    """Prompt for a Hugging Face token and run the evaluation.

    Args:
        dataset_path: Evaluation text file; defaults to the Colab Drive
            location this notebook was written for.
        max_sentences: Cap passed through to ``main``.
    """
    print("Tamil Tokenizer Evaluation Script")
    print("=" * 50)

    # getpass keeps the token out of the notebook output. An empty entry is
    # passed on as None rather than as an empty-string token.
    hf_token = getpass.getpass("Hugging Face token: ").strip() or None

    print("\nDataset Configuration:")
    print(f"  Path: {dataset_path}")
    print(f"  Max Sentences: {max_sentences}")

    main(dataset_path=dataset_path, max_sentences=max_sentences, hf_token=hf_token)

# Run the interactive evaluation only when executed as the main module,
# not when these definitions are imported from elsewhere.
if __name__ == "__main__":
    run_evaluation()