## Log in to Hugging Face

In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import HfApi

In [2]:
# Load variables from a .env file into the process environment so the
# Hugging Face token does not have to be written into the notebook itself.
load_dotenv()
HF_TOKEN_READ = os.environ.get("HF_TOKEN_READ")

# Build an API client with that token and ask the Hub which account it
# belongs to; printing the account name confirms the credentials work.
api = HfApi(token=HF_TOKEN_READ)
user = api.whoami()

print(user["name"])

abhishekdey


## Function to get an estimate of vocabulary size

In [3]:
from collections import Counter

def estimate_vocab_size(corpus_path, min_n=2, max_n=5, min_ngram_freq=2):
    """
    Estimate a suggested vocabulary size for a BPE tokenizer.

    The corpus is split on whitespace into words. Every character n-gram of
    length ``min_n`` to ``max_n`` (inclusive) found inside a word is counted
    once per occurrence of that word, and n-grams seen at least
    ``min_ngram_freq`` times are considered "frequent". The suggestion is the
    number of distinct words plus the number of frequent n-grams.

    Note:
        This is a rough heuristic. A word whose length lies in
        ``[min_n, max_n]`` and that is frequent enough is counted both as a
        word and as an n-gram, while single characters (when ``min_n > 1``)
        and special tokens are not counted at all.

    Args:
        corpus_path (str): Path to the UTF-8 encoded text corpus file.
        min_n (int): Minimum character n-gram length. Must be at least 1.
        max_n (int): Maximum character n-gram length. If it is smaller than
            ``min_n`` no n-grams are counted.
        min_ngram_freq (int): Minimum frequency of n-gram to be considered.

    Returns:
        dict: ``"unique_words"`` (number of distinct words),
        ``"frequent_ngrams"`` (number of n-grams meeting the frequency
        threshold) and ``"suggested_vocab_size"`` (their sum).

    Raises:
        ValueError: If ``min_n`` is less than 1.
    """
    # n <= 0 would count the empty string (n == 0) or arbitrary negative-index
    # slices (n < 0) as "n-grams", silently inflating the estimate.
    if min_n < 1:
        raise ValueError(f"min_n must be at least 1, got {min_n}")

    # Stream the corpus line by line instead of loading it whole. A newline is
    # whitespace, so splitting per line yields exactly the same words as
    # splitting the full text at once.
    word_counts = Counter()
    with open(corpus_path, "r", encoding="utf-8") as f:
        for line in f:
            word_counts.update(line.split())

    # Count character n-grams (for the subword estimate). Each distinct word is
    # sliced once and its n-grams are weighted by the word's frequency, which
    # gives the same totals as slicing every occurrence separately.
    ngrams = Counter()
    for word, occurrences in word_counts.items():
        for n in range(min_n, max_n + 1):
            for i in range(len(word) - n + 1):
                ngrams[word[i:i + n]] += occurrences

    num_unique_words = len(word_counts)
    num_frequent_ngrams = sum(
        1 for freq in ngrams.values() if freq >= min_ngram_freq
    )

    return {
        "unique_words": num_unique_words,
        "frequent_ngrams": num_frequent_ngrams,
        "suggested_vocab_size": num_unique_words + num_frequent_ngrams
    }


In [4]:
# Run the estimate on the Hindi corpus and report each statistic.
corpus_file = "hindi_corpus.txt"
vocab_stats = estimate_vocab_size(corpus_file)

# (label, key in vocab_stats) pairs, printed in this order.
report_rows = (
    ("Number of unique words:", "unique_words"),
    ("Number of frequent subword n-grams:", "frequent_ngrams"),
    ("Suggested vocabulary size for BPE tokenizer:", "suggested_vocab_size"),
)
for label, stat_key in report_rows:
    print(label, vocab_stats[stat_key])

Number of unique words: 47
Number of frequent subword n-grams: 59
Suggested vocabulary size for BPE tokenizer: 106


## Train a BPE tokenizer

In [5]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

### Initialize a BPE tokenizer

In [6]:
# Create an untrained BPE model and tell it which token stands for unknown
# input. Without `unk_token` the model has nothing to map out-of-vocabulary
# characters to at encode time. The string must match the "<unk>" entry that
# the trainer reserves as a special token so that it exists in the vocabulary.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

### Set pre-tokenizer

In [7]:
# The pre-tokenizer splits raw text into initial chunks before BPE merges are
# applied, so merges never cross a chunk boundary.
# `Whitespace` splits on whitespace and also separates punctuation from words
# (use `WhitespaceSplit` for whitespace-only splitting).
# E.g., "भारत में" → ["भारत", "में"]
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

### Setup trainer

In [8]:
# The trainer controls how the tokenizer learns merges.
# - `vocab_size` is the target size of the final vocabulary; special tokens and
#   the base characters count toward it.
# - `special_tokens` are reserved entries, here for unknown input and padding.
#
# The size is read from the estimate computed earlier rather than copied by
# hand, so it stays in sync if the corpus changes.
trainer = trainers.BpeTrainer(
    vocab_size=vocab_stats["suggested_vocab_size"],  # 106 for the current corpus
    special_tokens=["<unk>", "<pad>"]
)

### Train the tokenizer on the Hindi corpus

In [9]:
# Paths of the text files to learn from; the trainer configured above decides
# the vocabulary size and special tokens.
files = ["hindi_corpus.txt"]
tokenizer.train(files, trainer=trainer)






### Save the custom Hindi tokenizer

In [10]:
# Write the trained tokenizer to a JSON file in the working directory; the
# later cells reload it from this same file name.
tokenizer.save("custom-hindi-tokenizer.json")

## Load the custom BPE Hindi tokenizer

In [11]:
from tokenizers import Tokenizer

# Rebuild the tokenizer from the saved JSON file. This rebinds `tokenizer`,
# replacing the in-memory object that was just trained.
tokenizer = Tokenizer.from_file("custom-hindi-tokenizer.json")

### Test encoding new text

In [12]:
# Sample Hindi sentence used to check the tokenizer.
text = "भारत में शिक्षा का महत्व"
# `output` is the encoding of the sentence; its `.tokens` and `.ids` are
# inspected in the next cells.
output = tokenizer.encode(text)

In [13]:
# Show the subword strings the sentence was split into.
print("Tokens:", output.tokens)

Tokens: ['भारत', 'मे', 'ं', 'श', 'ि', 'क्ष', 'ा', 'का', 'म', 'ह', 'त्', 'व']


In [14]:
# Show the integer vocabulary id of each token, in the same order as the tokens.
print("IDs:", output.ids)

IDs: [74, 68, 4, 31, 37, 65, 36, 87, 26, 34, 54, 30]


## Load with transformers

In [17]:
from transformers import PreTrainedTokenizerFast


# Wrap the saved tokenizer file so it can be used through the transformers
# API. The special tokens are passed explicitly because the wrapper is not
# otherwise told which entries of the vocabulary play the unknown / padding
# roles; without `pad_token`, padding a batch of sequences fails.
# NOTE: this rebinds `tokenizer` to a different type than the
# `tokenizers.Tokenizer` used in the cells above.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="custom-hindi-tokenizer.json",
    unk_token="<unk>",
    pad_token="<pad>",
)

In [18]:
# Same sample sentence as in the earlier encoding test.
text = "भारत में शिक्षा का महत्व"

# Split into subword strings only (no ids); as the cell's last expression the
# resulting list is displayed as the cell output.
tokenizer.tokenize(text)

['भारत', 'मे', 'ं', 'श', 'ि', 'क्ष', 'ा', 'का', 'म', 'ह', 'त्', 'व']

In [19]:
# Encode the sentence and return PyTorch tensors (`return_tensors="pt"`).
# NOTE: despite its name, `input_ids` holds the whole encoding -- the
# 'input_ids', 'token_type_ids' and 'attention_mask' entries -- not just the ids.
input_ids =  tokenizer(text, return_tensors="pt")

In [20]:
# Display the full encoding produced in the previous cell.
input_ids

{'input_ids': tensor([[74, 68,  4, 31, 37, 65, 36, 87, 26, 34, 54, 30]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}