<a href="https://colab.research.google.com/github/blackwithwhitegreen/Tokenization/blob/main/SentencePiece_tokenization_technique.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SentencePiece
- The SentencePiece trainer exposes many parameters, each serving a different purpose.

In [1]:
import sentencepiece as spm

In [2]:
# Create toy.txt, a tiny two-line corpus used to train the tokenizer below.
toy_corpus = (
    "Summary report of the techniques, tutorial code with proper functioning, blog for publication, video demo (optional but recommended)\n"
    "Implementation code and report for HuggingFace models and inference API\n"
)
with open("toy.txt", mode="w", encoding="utf-8") as corpus_file:
    corpus_file.write(toy_corpus)

In [3]:
# Train a SentencePiece BPE model on toy.txt.
# The settings here are (best effort) those used for training Llama 2.
import os

options = dict(
    # input spec
    input="toy.txt",
    input_format="text",
    # output spec
    model_prefix="tok400",  # output filename prefix; tok400.model is loaded in a later cell
    # algorithm spec
    # BPE algorithm
    model_type="bpe",
    vocab_size=400,
    # normalization
    normalization_rule_name="identity",  # turn off normalization
    remove_extra_whitespaces=False,
    input_sentence_size=200000000,
    max_sentence_length=4192,  # max number of bytes per sentence
    seed_sentencepiece_size=1000000,
    shuffle_input_sentence=True,
    # rare word treatment
    character_coverage=0.99995,
    # Llama enables byte_fallback by default: characters that are not in the
    # vocabulary are encoded as byte pieces (<0xNN>) instead of being lost.
    byte_fallback=True,
    # merge rules
    split_digits=True,
    split_by_unicode_script=True,
    split_by_whitespace=True,
    split_by_number=True,
    max_sentencepiece_length=16,
    add_dummy_prefix=True,
    allow_whitespace_only_pieces=True,
    # special tokens
    unk_id=0,  # the UNK token MUST exist
    bos_id=1,  # the others are optional, set to -1 to turn off
    eos_id=2,
    pad_id=-1,
    # systems
    # use ~all system resources; os.cpu_count() may return None when the CPU
    # count cannot be determined, so fall back to a single thread in that case
    num_threads=os.cpu_count() or 1,
)
spm.SentencePieceTrainer.train(**options)

In [4]:
# Load the trained model, then pair every piece with its integer id.
sp = spm.SentencePieceProcessor(model_file='tok400.model')
vocab = [
    [sp.id_to_piece(piece_id), piece_id]
    for piece_id in range(sp.get_piece_size())
]
vocab

[['<unk>', 0],
 ['<s>', 1],
 ['</s>', 2],
 ['<0x00>', 3],
 ['<0x01>', 4],
 ['<0x02>', 5],
 ['<0x03>', 6],
 ['<0x04>', 7],
 ['<0x05>', 8],
 ['<0x06>', 9],
 ['<0x07>', 10],
 ['<0x08>', 11],
 ['<0x09>', 12],
 ['<0x0A>', 13],
 ['<0x0B>', 14],
 ['<0x0C>', 15],
 ['<0x0D>', 16],
 ['<0x0E>', 17],
 ['<0x0F>', 18],
 ['<0x10>', 19],
 ['<0x11>', 20],
 ['<0x12>', 21],
 ['<0x13>', 22],
 ['<0x14>', 23],
 ['<0x15>', 24],
 ['<0x16>', 25],
 ['<0x17>', 26],
 ['<0x18>', 27],
 ['<0x19>', 28],
 ['<0x1A>', 29],
 ['<0x1B>', 30],
 ['<0x1C>', 31],
 ['<0x1D>', 32],
 ['<0x1E>', 33],
 ['<0x1F>', 34],
 ['<0x20>', 35],
 ['<0x21>', 36],
 ['<0x22>', 37],
 ['<0x23>', 38],
 ['<0x24>', 39],
 ['<0x25>', 40],
 ['<0x26>', 41],
 ['<0x27>', 42],
 ['<0x28>', 43],
 ['<0x29>', 44],
 ['<0x2A>', 45],
 ['<0x2B>', 46],
 ['<0x2C>', 47],
 ['<0x2D>', 48],
 ['<0x2E>', 49],
 ['<0x2F>', 50],
 ['<0x30>', 51],
 ['<0x31>', 52],
 ['<0x32>', 53],
 ['<0x33>', 54],
 ['<0x34>', 55],
 ['<0x35>', 56],
 ['<0x36>', 57],
 ['<0x37>', 58],
 ['<0x38>', 5


# Note that the Hindi words get no tokens of their own, because Hindi text never appeared in our training data; instead, since byte_fallback=True, each Hindi character is encoded as the byte tokens of its UTF-8 encoding.


In [5]:
# Encode a mixed English/Hindi sentence; toy.txt contained English text only.
mixed_text = "hello world उन अज्ञात कार्यकर्ताओं का पता लगाएं जो मदद करते हैं"
ids = sp.encode(mixed_text)
print(ids)

[368, 302, 381, 381, 370, 314, 260, 381, 375, 368, 227, 167, 140, 227, 167, 171, 368, 227, 167, 136, 227, 167, 159, 227, 168, 144, 227, 167, 161, 227, 167, 193, 227, 167, 167, 368, 227, 167, 152, 227, 167, 193, 227, 167, 179, 227, 168, 144, 227, 167, 178, 227, 167, 152, 227, 167, 179, 227, 168, 144, 227, 167, 167, 227, 167, 193, 227, 167, 150, 227, 167, 133, 368, 227, 167, 152, 227, 167, 193, 368, 227, 167, 173, 227, 167, 167, 227, 167, 193, 368, 227, 167, 181, 227, 167, 154, 227, 167, 193, 227, 167, 146, 227, 167, 133, 368, 227, 167, 159, 227, 168, 142, 368, 227, 167, 177, 227, 167, 169, 227, 167, 169, 368, 227, 167, 152, 227, 167, 179, 227, 167, 167, 227, 168, 138, 368, 227, 167, 188, 227, 168, 139, 227, 167, 133]


# As we can clearly see, these are the UTF-8 byte tokens that are assigned to the Hindi letters.

In [6]:
# Map every id back to its piece so the byte-fallback tokens become visible.
pieces = [sp.id_to_piece(token_id) for token_id in ids]
print(pieces)

['▁', 'he', 'l', 'l', 'o', '▁w', 'or', 'l', 'd', '▁', '<0xE0>', '<0xA4>', '<0x89>', '<0xE0>', '<0xA4>', '<0xA8>', '▁', '<0xE0>', '<0xA4>', '<0x85>', '<0xE0>', '<0xA4>', '<0x9C>', '<0xE0>', '<0xA5>', '<0x8D>', '<0xE0>', '<0xA4>', '<0x9E>', '<0xE0>', '<0xA4>', '<0xBE>', '<0xE0>', '<0xA4>', '<0xA4>', '▁', '<0xE0>', '<0xA4>', '<0x95>', '<0xE0>', '<0xA4>', '<0xBE>', '<0xE0>', '<0xA4>', '<0xB0>', '<0xE0>', '<0xA5>', '<0x8D>', '<0xE0>', '<0xA4>', '<0xAF>', '<0xE0>', '<0xA4>', '<0x95>', '<0xE0>', '<0xA4>', '<0xB0>', '<0xE0>', '<0xA5>', '<0x8D>', '<0xE0>', '<0xA4>', '<0xA4>', '<0xE0>', '<0xA4>', '<0xBE>', '<0xE0>', '<0xA4>', '<0x93>', '<0xE0>', '<0xA4>', '<0x82>', '▁', '<0xE0>', '<0xA4>', '<0x95>', '<0xE0>', '<0xA4>', '<0xBE>', '▁', '<0xE0>', '<0xA4>', '<0xAA>', '<0xE0>', '<0xA4>', '<0xA4>', '<0xE0>', '<0xA4>', '<0xBE>', '▁', '<0xE0>', '<0xA4>', '<0xB2>', '<0xE0>', '<0xA4>', '<0x97>', '<0xE0>', '<0xA4>', '<0xBE>', '<0xE0>', '<0xA4>', '<0x8F>', '<0xE0>', '<0xA4>', '<0x82>', '▁', '<0xE0>', '<0xA4