## Log in to Hugging Face

In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import HfApi

In [2]:
# Load environment variables (expected to come from a local .env file).
load_dotenv()

# Read-scoped Hugging Face token; this is None when the variable is not set.
HF_TOKEN_READ = os.environ.get("HF_TOKEN_READ")

# Build an authenticated Hub client and look up the account behind the token.
api = HfApi(token=HF_TOKEN_READ)
user = api.whoami()

# Printing the account name confirms the token was accepted.
print(user["name"])

abhishekdey


## Byte-Pair Encoding (BPE) Tokenization

In [41]:
from transformers import AutoTokenizer

# Load the GPT-2 tokenizer from the Hub, authenticating with the read token
# obtained in the login cell.
bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", token=HF_TOKEN_READ)


In [42]:
# Show the tokenizer's repr to inspect its vocabulary size, maximum model
# length and special tokens.
bpe_tokenizer

GPT2TokenizerFast(name_or_path='openai-community/gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [43]:
# Two words sharing the prefix "cat", to see how BPE splits them.
text = "cat cattle"
# In the recorded output the second word comes back as 'Ġcattle': the 'Ġ'
# prefix is how GPT-2's byte-level BPE marks a preceding space.
bpe_tokenizer.tokenize(text)

['cat', 'Ġcattle']

### BPE token IDs

In [44]:
# Encode `text` (set in the BPE tokenization cell above) as PyTorch tensors.
# NOTE(review): despite its name, `input_ids` holds the whole encoding — the
# recorded output contains both 'input_ids' and 'attention_mask'.
input_ids =  bpe_tokenizer(text, return_tensors="pt")

In [45]:
# Display the encoding: one id per BPE token plus the attention mask.
input_ids

{'input_ids': tensor([[ 9246, 17025]]), 'attention_mask': tensor([[1, 1]])}

## WordPiece Tokenization

In [25]:
from transformers import AutoTokenizer, AutoModel

# Load the cased BERT tokenizer from the Hub, authenticating with the read
# token obtained in the login cell.
wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased",token=HF_TOKEN_READ)



In [26]:
# Show the tokenizer's repr to inspect its vocabulary size, maximum model
# length and the ids assigned to its special tokens.
wp_tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [38]:
# Common English words, including singular/plural and shared-prefix pairs.
# NOTE: this rebinds the notebook-wide `text` variable used by later cells.
text="cat cattle dog dogs elephant"

# In the recorded output every word maps to a single token (no '##' pieces).
wp_tokenizer.tokenize(text)

['cat', 'cattle', 'dog', 'dogs', 'elephant']

### WordPiece token IDs

In [39]:
# Encode `text` (set in the WordPiece tokenization cell above) as PyTorch
# tensors. In the recorded output the ids are framed by 101 and 102, which the
# tokenizer repr lists as [CLS] and [SEP].
# NOTE(review): this overwrites the `input_ids` produced in the BPE section,
# and the value is the whole encoding rather than just the ids.
input_ids =  wp_tokenizer(text, return_tensors="pt")

In [40]:
# Display the encoding: token ids, token type ids and the attention mask.
input_ids

{'input_ids': tensor([[  101,  5855,  6937,  3676,  6363, 15172,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

## Indic Tokenizer

In [21]:
from transformers import AutoTokenizer, AutoModel


# Load the IndicBERT v2 (MLM-only) tokenizer from the Hub; it is exercised on
# Hindi, Bengali and Assamese text in the cells below.
indic_tokenizer =  AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-MLM-only", token=HF_TOKEN_READ)

### Hindi tokenization

In [22]:
# Hindi sample sentence (roughly "I like AI.").
text="मुझे एआई पसंद है।"

# In the recorded output each word and the danda ('।') is its own token.
indic_tokenizer.tokenize(text)

['मुझे', 'एआई', 'पसंद', 'है', '।']

### Bengali tokenization

In [23]:
# Bengali sample sentence (roughly "I sing songs in Bengali.").
text="আমি বাংলায় গান গাই।"

# In the recorded output each word and the danda ('।') is its own token.
indic_tokenizer.tokenize(text)

['আমি', 'বাংলায়', 'গান', 'গাই', '।']

### Assamese tokenization

In [24]:
# Assamese sample sentence (roughly "Hello, how are you?").
text="নমস্কাৰ, আপোনাৰ কি খবৰ ?"

# In the recorded output the first word is split into three pieces, with '##'
# marking the continuation pieces; the remaining words stay whole.
indic_tokenizer.tokenize(text)

['নম', '##স্ক', '##াৰ', ',', 'আপোনাৰ', 'কি', 'খবৰ', '?']