<a href="https://colab.research.google.com/github/abdussahid26/Dara-preparation-and-sampling-for-LLMs/blob/main/Tokenizing_Text_and_Converting_Tokens_into_Token_IDs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tiktoken



# **Tokenizing Text**


In [None]:
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.5.1+cu121
tiktoken version: 0.8.0


Downloading the data file "the-verdict.txt"


In [None]:
import os
import urllib.request

url = "https://raw.githubusercontent.com/abdussahid26/Dara-preparation-and-sampling-for-LLMs/main/the-verdict.txt"
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x79140359c070>)

Loading the "the-verdict.txt" file

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of character: ", len(raw_text))
print(raw_text[:99])

Total number of character:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


Applying tokenization to the raw text

In [None]:
import re # Importing regular expression (re) library

preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text) # \s for whitespace
preprocessed = [item.strip() for item in preprocessed if item.strip()] # Strip whitespace from each item and then filter out any empty strings.
print("Total number of token: ", len(preprocessed))
print(preprocessed[:99])

Total number of token:  4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter']


# **Converting tokens into token IDs**


By using 'set' we create a list of all **unique** tokens and sort them alphabetically to determine the vocabulary size.

In [None]:
all_words = list(set(preprocessed))
print("Vocabulary size: ", len(all_words))

Vocabulary size:  1130


Creating a vocabulary

In [None]:
vocab = {token:integer for integer, token in enumerate(all_words)}

for i, item in enumerate(vocab.items()):
  print(item)
  if i >= 99:
    break


('drew', 0)
('weeks', 1)
('lucky', 2)
('clear', 3)
('sitters', 4)
('object', 5)
('everlasting', 6)
('throwing', 7)
('wife', 8)
('tottering', 9)
('fostered', 10)
('corner', 11)
('ones', 12)
('patient', 13)
('companion', 14)
('saw', 15)
('Come', 16)
('destroyed', 17)
('upstairs', 18)
('multiplied', 19)
('wonder', 20)
('distinguished', 21)
('live', 22)
('accustomed', 23)
('?', 24)
('interesting', 25)
('activity', 26)
('You', 27)
('pink', 28)
('Jove', 29)
('fancy', 30)
('brings', 31)
('veins', 32)
('--', 33)
('bric-a-brac', 34)
('chap', 35)
('yellow', 36)
('full', 37)
('extracting', 38)
('hear', 39)
('saying', 40)
('sensation', 41)
('minute', 42)
('Burlington', 43)
('packed', 44)
('says', 45)
('Chicago', 46)
('Gallery', 47)
('long', 48)
('though', 49)
('his', 50)
('genial', 51)
('took', 52)
('Professional', 53)
('breathing', 54)
('voice', 55)
('slightly', 56)
('yourself', 57)
('instructive', 58)
('after', 59)
('kind', 60)
('It', 61)
('vista', 62)
('come', 63)
('finality', 64)
('breaking', 

Let's now modify the vocabulary to include two special tokens, <|unk|> and <|endoftext|>, by adding them to our list of all unique words.

In [None]:
all_tokens = sorted (list(set(preprocessed)))
all_tokens.extend(["<|unk|>", "|endoftext|"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}

for i, item in enumerate(list(vocab.items())[-5:]):
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|unk|>', 1130)
('|endoftext|', 1131)


Let's create a complete text tokenizer class with encoding and decoding capabilities in python



In [None]:
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i:s for s, i in vocab.items()}

  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text) # \s for whitespace
    preprocessed = [item.strip() for item in preprocessed if item.strip()]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) # Removes spaces before the specified puntuation.
    return text

Let's instantiate a new tokenizer object from the ***SimpleTokenizerV1*** class and tokenize the text from the Edith Wharton's short story.

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(raw_text)
print(ids[:99])

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120, 530, 208, 85, 734, 34, 7, 4, 1, 93, 538, 722, 549, 496, 1, 6, 987, 1077, 1089, 988, 1112, 242, 585, 7, 53, 244, 535, 67, 7, 37, 100, 6, 549, 602, 25, 897]


Let's turn these token IDs back into text using the decode method

In [None]:
print(tokenizer.decode(ids[:99]))

I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera.( Though I rather thought it would have been Rome or Florence.)" The height of his glory" -- that was what the women called it. I can hear Mrs. Gideon Thwing -- his last Chicago sitter


# **Adding special context tokens**


In this section, we'll modify the tokenizer to handle unknown words.

In [None]:
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = { i:s for s,i in vocab.items()}
  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    preprocessed = [
    item.strip() for item in preprocessed if item.strip()
    ]
    preprocessed = [item if item in self.str_to_int
    else "<|unk|>" for item in preprocessed]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids
  def decode(self, ids):
#    text = " ".join([self.int_to_str[i] for i in ids])
    text = " ".join([self.int_to_str[i] if i in self.int_to_str else "|unk|" for i in ids])
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text

In [None]:
tokenizer = SimpleTokenizerV2(vocab)
ids = tokenizer.encode(raw_text)
print(ids[:99])

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120, 530, 208, 85, 734, 34, 7, 4, 1, 93, 538, 722, 549, 496, 1, 6, 987, 1077, 1089, 988, 1112, 242, 585, 7, 53, 244, 535, 67, 7, 37, 100, 6, 549, 602, 25, 897]


In [None]:
print(tokenizer.decode([1122]))
print(tokenizer.decode([9113]))

year
|unk|


In [None]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
combined_text = " <|endoftext|> ".join((text1, text2))
print(combined_text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(combined_text))

[1130, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1130, 7]


In [None]:
print(tokenizer.decode(tokenizer.encode(combined_text)))

<|unk|>, do you like tea? <|unk|> In the sunlit terraces of the <|unk|>.
