# Create Token

In [1]:
pip install PyPDF2

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import PyPDF2
import re
# Open in binary mode
with open("The_Verdict.pdf", "rb") as fp:
    reader = PyPDF2.PdfReader(fp)
    text_book = ""
    for page in reader.pages:
        text_book += page.extract_text() or ""  # Avoid None

print("Total number of characters:", len(text_book))
print(text_book[:999])
      

Total number of characters: 21918
1The Verdict
Edith Wharton
1908
Exported from Wikisource on May 20, 20242I HAD always thought Jack Gisburn rather a cheap genius--
though a good fellow enough--so it was no great surprise to
me to hear that, in the height of his glory , he had dropped
his painting, married a rich widow , and established himself
in a villa on the Riviera. (Though I rather thought it would
have been Rome or Florence.)
"The height of his glory"--that was what the women called
it. I can hear Mrs. Gideon Thwing--his last Chicago sitter --
deploring his unaccountable abdication. "Of course it's
going to send the value of my picture 'way up; but I don't
think of that, Mr . Rickham--the loss to Arrt is all I think of."
The word, on Mrs. Thwing's lips, multiplied its _rs_ as
though they were reflected in an endless vista of mirrors.
And it was not only the Mrs. Thwings who mourned. Had
not the exquisite  Hermia Croft, at the last Grafton Gallery
show , stopped me before Gisburn

In [3]:
# 
text1  = "Today is sunny, let's -- go to the beach!"
# split text around empty space
res = re.split(r'\s', text1)
res

['Today', 'is', 'sunny,', "let's", '--', 'go', 'to', 'the', 'beach!']

In [4]:
# split text around empty space, comman and period
res = re.split(r'[,.]|\s', text1)
res

['Today', 'is', 'sunny', '', "let's", '--', 'go', 'to', 'the', 'beach!']

In [5]:
res = re.split(r'([,.:?!",()]|--|\s)', text1)

# Removing white spaces
res = [word for word in res if word.strip() ]
print("res:", res)

# Alternative: Keep words AND punctuation as separate tokens but no empty strings
res1 =  re.findall(r'[,.:?!",()]|--|[^,.:?!",()\s]+', text1) 
print("res1:", res1)



res: ['Today', 'is', 'sunny', ',', "let's", '--', 'go', 'to', 'the', 'beach', '!']
res1: ['Today', 'is', 'sunny', ',', "let's", '--', 'go', 'to', 'the', 'beach', '!']


In [6]:
# Tokenize by removing all space, comma, --, etcc
res = [word.strip() for word in res if word.strip() ]
print("res:",res)

# If you want to keep apostrophes in words like "let's":
res1 = re.findall(r'[^\s,.:?!",()--]+|(?<=\w)\'(?=\w)', text1)
print("res1:", res1)

res: ['Today', 'is', 'sunny', ',', "let's", '--', 'go', 'to', 'the', 'beach', '!']
res1: ['Today', 'is', 'sunny', "let's", 'go', 'to', 'the', 'beach']


  res1 = re.findall(r'[^\s,.:?!",()--]+|(?<=\w)\'(?=\w)', text1)


In [7]:
# preprocessing Case 1:
# Split text on punctuation, symbols, or whitespace, keeping delimiters
preprocess = re.split(r'( [,.:?!",()]|--|\s)', text_book)
# Remove empty strings and whitespace-only elements
preprocess = [word.strip() for word in preprocess if word.strip() ]
print('preprocess:\n',preprocess[:20])

# preprocessing Case 2:
# Alternative approaches Extract words and punctuation as separate tokens
preprocess_1 = re.findall(r'[^\s,.:?!",()--]+|[,.:?!",()]|--', text_book)
# Extract all non-whitespace chunks (includes punctuation attached to words)
preprocess_1 = re.findall(r'[^\s]+', text_book)
print('preprocess_1:\n',preprocess_1[:20])

preprocess:
 ['1The', 'Verdict', 'Edith', 'Wharton', '1908', 'Exported', 'from', 'Wikisource', 'on', 'May', '20,', '20242I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap']
preprocess_1:
 ['1The', 'Verdict', 'Edith', 'Wharton', '1908', 'Exported', 'from', 'Wikisource', 'on', 'May', '20,', '20242I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap']


  preprocess_1 = re.findall(r'[^\s,.:?!",()--]+|[,.:?!",()]|--', text_book)


# get the unique word vocabulary 

In [15]:
# option  1
from itertools import islice
w = sorted(set(preprocess))
vocabulary1 = {key: value for key, value in enumerate(w)}
# Print top 10 items
print("\nvocabulary:_________")
for key, value in islice(vocabulary1.items(), 10):
    print(f"{key}: {value}")

w = sorted(set(preprocess))


vocabulary = {word: key for key, word in enumerate(w)}
# Fix: This should be {word: idx} not {idx: word}
string_to_int = {word: idx for idx, word in enumerate(w)}
print("\nstring_to_int:_____________")
for key, value in islice(string_to_int.items(), 10):
    print(f"{key}: {value}")


vocabulary:_________
0: !
1: "
2: "Ah
3: "Ah,
4: "Begin
5: "By
6: "Destroyed
7: "Don't
8: "Hang
9: "Has

string_to_int:_____________
!: 0
": 1
"Ah: 2
"Ah,: 3
"Begin: 4
"By: 5
"Destroyed: 6
"Don't: 7
"Hang: 8
"Has: 9


# Encode and Decode 
## Endcode: Simple Text -> Token text -> Token ID
1. Simple Text (Raw Input):   "Hello, world!"
2. Tokenized Text (Splitting into Tokens):

               A- Whole words ("Hello", "world")
               B- Subwords ("unhappiness" → "un", "happiness")
               C- Punctuation and symbols (",", "!")
               Tokenized Output: ["Hello", ",", "world", "!"]
3. Token IDs (Numerical Representation):

                "Hello" → 15496
                "," → 11
                "world" → 995
                "!" → 0
                Final Token IDs: [15496, 11, 995, 0]
## Decode: Token ID -> Token Text -> Simple Text
    
    Decode is the reverse: Token id -> Token Text -> Simple Text

In [16]:
# option 1
class Tokenization:
    def __init__(self, vocabulary):
        self.string_to_int = vocabulary
        self.int_to_string = {item: s for s,item in vocabulary.items()}

    def encoding(self, text):
        # split text around empty space, comman and period
        preprocessing = re.split(r'([,.:?!",()]|--|\s)', text)
        # Tokenize by removing all space, comma, --, etcc
        preprocessing = [word.strip() for word in preprocessing if word.strip() ]
        # converting into id number
        id = [self.string_to_int[s] for s in preprocessing ]
        return id 

    def decoding(self, vocabulary ):
        # convert token id to Token text
        tokenID_to_TokenText = " ".join([self.int_to_string[i] for i in id])
        # replace empty space with punctuation
        tokenID_to_TokenText =  re.sub (r'([,.:?!",()]|--|\s)', r'\1', tokenID_to_TokenText)
        return tokenID_to_TokenText
# option 2
class Tokenization_1:
    def __init__(self, vocabulary):
        self.string_to_int = vocabulary  # Should be {word: id}
        self.int_to_string = {id: word for word, id in vocabulary.items()}  # Reverse mapping

    def encoding_1(self, text):
        # Extract tokens directly (no empty strings)
        tokens = re.findall(r'[^\s,.:?!",()--]+|[,.:?!",()]|--', text)
        # Convert to IDs using dictionary lookup
        return [self.string_to_int[token] for token in tokens]

    def decode_1(self, ids):
        # Convert list of token IDs back to their string representations
        tokens = [self.int_to_string[id] for id in ids]
        
        # Reconstruct the original text by joining tokens with spaces
        # This handles most cases where spaces were between words
        text = ' '.join(tokens)
        
        # Fix punctuation spacing by removing spaces before punctuation
        # Matches spaces before any of these punctuation marks: , . : ? ! " , ( )
        text = re.sub(r'\s+([,.:?!",()])', r'\1', text)
        
        # Handle special cases like apostrophes (e.g., "let ' s" → "let's")
        text = re.sub(r"(\w)\s+'\s+(\w)", r"\1'\2", text)
        
        # Handle double dashes (e.g., "word -- word" → "word--word")
        text = re.sub(r'\s+--\s+', '--', text)
        
        return text


In [17]:
# unseen word give us error because of short data
Tok = Tokenization(vocabulary)
inputText = "height of his glory that was what the women called it."
tokenID = Tok.encoding(inputText)
print(tokenID)

[758, 1017, 778, 706, 1347, 1471, 1491, 1351, 1522, 399, 827, 46]


In [18]:
# unseen word give us error because of short data
Tok_1 = Tokenization_1(vocabulary)
inputText_1 = "height of his glory that was what the women called it."
tokenID_1 = Tok_1.encoding_1(inputText_1)
print(tokenID_1)

[758, 1017, 778, 706, 1347, 1471, 1491, 1351, 1522, 399, 827, 46]


# How to deal with unseen token
# Assume preprocessing is your list of tokens from dataset
unseen_token = sorted(list(set(preprocessing)))

# Add special tokens explicitly
unseen_token.extend(["<|endoftext|>", "<|unk|>"])

# Assign each token an integer ID
vocab = {tok: num for num, tok in enumerate(unseen_token)}

# Vocabulary size
print("Vocab size:", len(vocab))



# Explanation of Special Tokens

#### <|endoftext|>:  Marks the end of a text sequence and Used for separating documents/contexts

#### <|unk|> - Represents "unknown" tokens: Handles words not seen during training (out-of-vocabulary)



In [None]:
# Add special tokens to vocabulary
special_tokens = ["<|endoftext|>", "<|unk|>"]
base_vocab = {"hello": 0, "world": 1, "test": 2}  # Your normal vocabulary

# Create extended vocabulary with special tokens
extended_vocab = {}
# Add special tokens first (usually at beginning of ID range)
for i, token in enumerate(special_tokens):
    extended_vocab[token] = i

# Add normal vocabulary with offset
for word, idx in base_vocab.items():
    extended_vocab[word] = idx + len(special_tokens)

print(extended_vocab)
# {'<|endoftext|>': 0, '<|unk|>': 1, 'hello': 2, 'world': 3, 'test': 4}

In [None]:
# When your model encounters a word not in vocab, you map it to <|unk|>:
def get_token_id(token, vocab):
    return vocab.get(token, vocab["<|unk|>"])


In [None]:
!pip3 install tiktoken

In [None]:
import importlib
import tiktoken

In [None]:
text = (
    "Hello, would like coffee? <|endoftext|> In the  morning in your room"
     "of windburry."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)