In [None]:
#!pip install tiktoken

In [None]:
import pandas as pd
import tiktoken


In [None]:
text = pd.read_csv('./data/text.csv')
text = text['0'].to_list()
type(text)

list

### loading the token

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors

#Initializing the Tokenizer
tokenizer = Tokenizer(models.BPE())

#Configure the tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)



### Train the Tokenizer

In [6]:
trainer = trainers.BpeTrainer(vocab_size=50257, min_frequency=2, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(text, trainer=trainer)

## Save the tokenizer

In [7]:
#tokenizer.save("fine_tuned_tokenizer.json")


## Evaluate the tokenizer

There are several ways to evaluate a tokenizer

* Tokenization output encoding on text data
* Evaluate Vocabulary Coverage
* Evaluate Computational Efficiency

In [17]:
from transformers import GPT2Tokenizer
import time
import os

#this is the original gpt2 tokenizer
old_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

#This is the fine tuned one
new_tokenizer_path = os.path.join(os.getcwd(), "fine_tuned_tokenizer.json")
new_tokenizer = Tokenizer.from_file(new_tokenizer_path)

#We will take random 100 lines from the dataset
dataset = pd.read_csv('./data/text.csv')
dataset100 = dataset.sample(n=10)

texts = dataset100['0'].tolist()

In [None]:
def evaluate_tokenization_quality(tokenizer, texts):
    tokenization_results = []
    for text in texts:
        if isinstance(tokenizer, GPT2Tokenizer):
            tokens = tokenizer.tokenize(text)
        else:
            tokens = tokenizer.encode(text).tokens
        tokenization_results.append(tokens)
    return tokenization_results

old_tokenization_results = evaluate_tokenization_quality(old_tokenizer, texts[:100])  
new_tokenization_results = evaluate_tokenization_quality(new_tokenizer, texts[:100])  

for old_tokens, new_tokens in zip(old_tokenization_results, new_tokenization_results):
    print("Old Tokenizer:", old_tokens)
    print("New Tokenizer:", new_tokens)
    print("-" * 40)

#Evaluate vocabulary coverage
def evaluate_vocabulary_coverage(tokenizer, texts):
    unique_tokens = set()
    for text in texts:
        if isinstance(tokenizer, GPT2Tokenizer):
            tokens = tokenizer.tokenize(text)
        else:
            tokens = tokenizer.encode(text).tokens
        unique_tokens.update(tokens)
    return unique_tokens

old_unique_tokens = evaluate_vocabulary_coverage(old_tokenizer, texts)
new_unique_tokens = evaluate_vocabulary_coverage(new_tokenizer, texts)

print("Old Tokenizer Unique Tokens:", len(old_unique_tokens))
print("New Tokenizer Unique Tokens:", len(new_unique_tokens))

#Evaluate computational efficiency
def evaluate_computational_efficiency(tokenizer, texts):
    start_time = time.time()
    for text in texts:
        if isinstance(tokenizer, GPT2Tokenizer):
            tokenizer.tokenize(text)
        else:
            tokenizer.encode(text)
    end_time = time.time()
    return end_time - start_time

old_time = evaluate_computational_efficiency(old_tokenizer, texts[:100])  
new_time = evaluate_computational_efficiency(new_tokenizer, texts[:100])   

print("Old Tokenizer Time:", old_time)
print("New Tokenizer Time:", new_time)


Old Tokenizer: ['"', 'Ul', 'up', 'i', 'Ġanswered', ',', "Ġ'", 'I', 'Ġknow', ',', 'ĠO', 'Ġson', 'Ġof', 'ĠPand', 'u', ',', 'Ġwhy', 'Ġthou', 'Ġwand', 'erest', 'Ġover', 'Ġthe', 'Ġearth', ',', 'Ġand', 'Ġwhy', 'Ġthou', 'Ġhast', 'Ġbeen', 'Ġcommanded', 'Ġto', 'Ġlead', 'Ġthe', 'Ġlife', 'Ġof', 'Ġa', 'ĠBra', 'hm', 'ach', 'arin', 'Ġby', 'Ġthe', 'Ġsuperior', '.', 'ĠEven', 'Ġthis', 'Ġwas', 'Ġthe', 'Ġunderstanding', 'Ġto', 'Ġwhich', 'Ġall', 'Ġof', 'Ġyou', 'Ġhad', 'Ġbeen', 'Ġpledged', ',', 'Ġviz', '.,', 'Ġthat', 'Ġamongst', 'Ġyou', 'Ġall', 'Ġowning', 'ĠD', 'rup', 'ada', "'s", 'Ġdaughter', 'Ġas', 'Ġyour', 'Ġcommon', 'Ġwife', ',', 'Ġhe', 'Ġwho', 'Ġwould', 'Ġfrom', 'Ġignorance', 'Ġenter', 'Ġthe', 'Ġroom', 'Ġwhere', 'Ġone', 'Ġof', 'Ġyou', 'Ġwould', 'Ġbe', 'Ġsitting', 'Ġwith', 'Ġher', ',', 'Ġshould', 'Ġlead', 'Ġthe', 'Ġlife', 'Ġof', 'Ġa', 'ĠBra', 'hm', 'ach', 'arin', 'Ġin', 'Ġthe', 'Ġwoods', 'Ġfor', 'Ġtwelve', 'Ġyears', '.', 'ĠThe', 'Ġexile', 'Ġof', 'Ġany', 'Ġone', 'Ġamongst', 'Ġyou', ',', 'Ġtherefore', ',

## These are the results:

* The new tokenizer appears to be better at handling compound words and proper nouns, which is evident from the tokenization results. For example, in the old tokenizer, compound names like "Pasusakha" and "Brahmacharin" are split into multiple tokens, whereas the new tokenizer correctly tokenizes them as single tokens.

* The new tokenizer also handles contractions and special characters more effectively. For instance, "Ġ'" is correctly tokenized as a single token in the new tokenizer, whereas it is split in the old tokenizer.

* The new tokenizer has a slightly smaller vocabulary size (1166 unique tokens) compared to the old tokenizer (1299 unique tokens). This suggests that the new tokenizer is more efficient in combining tokens that are often used together, leading to a more compact vocabulary.

* The new tokenizer is faster than the old tokenizer. The old tokenizer took approximately 0.0131 seconds, while the new tokenizer took about 0.00998 seconds. This indicates that the new tokenizer is more computationally efficient, which could be beneficial for real-time applications or larger datasets.