<a href="https://colab.research.google.com/github/cathieG/Tokenizer/blob/main/tok.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Project description: this notebook contains three tokenizers: one for English text, one for Python code, and one for French text.

In [2]:
!pip install tokenizers



In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [4]:
# English tokenizer: a BPE model with the Whitespace pre-tokenizer applied first.
# unk_token matches the "<unk>" entry in the trainer's special_tokens, so symbols
# that are missing from the learned vocabulary are encoded as "<unk>" rather than
# being dropped from the output.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

In [5]:
# Trainer configuration for the English BPE tokenizer:
#   - the four special tokens are registered with the trainer,
#   - the vocabulary is capped at 22000 entries,
#   - min_frequency is set to 2 (minimum frequency for tokens to be included).
trainer = BpeTrainer(
    special_tokens=["<pad>", "<unk>", "<s>", "</s>"],
    vocab_size=22000,
    min_frequency=2,
)

In [6]:
# Colab-only step: open the upload dialog and pick the English corpus text file
# from the local machine (the same file is submitted on Gradescope).
# The training cell below reads it as "Huckleberry.txt".
from google.colab import files
upload = files.upload()

Saving Huckleberry.txt to Huckleberry.txt


In [7]:
# Train the English tokenizer on the uploaded corpus using the trainer configured above.
file = ["Huckleberry.txt"]  # train() is given a list of file paths
tokenizer.train(file,trainer)

In [8]:
# Write the trained English tokenizer to huckleberry_tokenizer.json so it can be reloaded later.
tokenizer.save("huckleberry_tokenizer.json")

In [9]:
# Reload the tokenizer from the saved file (this rebinds `tokenizer`) and
# encode a sample sentence as a quick check.
tokenizer = Tokenizer.from_file("huckleberry_tokenizer.json")

sample_text = "This is a sample text to test out the English tokenizer based on Huckleberry!"
encoded = tokenizer.encode(sample_text)
# Print the token strings, then the matching ids.
print("Tokens:", encoded.tokens)
print("IDs:", encoded.ids)

Tokens: ['This', 'is', 'a', 'sa', 'mp', 'le', 'te', 'xt', 'to', 'te', 'st', 'out', 'the', 'E', 'ng', 'li', 'sh', 'to', 'k', 'en', 'i', 'z', 'er', 'b', 'ased', 'on', 'Huckleberry', '!']
IDs: [978, 130, 36, 100, 279, 91, 352, 413, 73, 352, 85, 104, 69, 17, 1458, 123, 235, 73, 46, 89, 44, 61, 78, 37, 1533, 77, 1342, 4]


In [10]:
# French tokenizer: a Unigram model with the Whitespace pre-tokenizer applied first.

from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.processors import TemplateProcessing  # NOTE(review): not used in any visible cell

tokenizer_French = Tokenizer(Unigram())
tokenizer_French.pre_tokenizer = Whitespace()

In [21]:
# Trainer configuration for the French Unigram tokenizer.
# min_frequency is a BpeTrainer option; UnigramTrainer does not list it among its
# parameters, so it is not passed here (it had no effect on training).
trainer_French = UnigramTrainer(
    vocab_size=20000,
    special_tokens=["<pad>", "<unk>", "<s>", "</s>"],
    unk_token="<unk>",  # must be one of the special tokens registered above
)

In [12]:
# Colab-only step: open the upload dialog and pick the French corpus text file
# from the local machine (the same file is submitted on Gradescope).
# The training cell below reads it as "Monte-Cristo.txt".
from google.colab import files
upload_French = files.upload()

Saving Monte-Cristo.txt to Monte-Cristo.txt


In [22]:
# Train the French tokenizer on the uploaded corpus using the Unigram trainer configured above.
file_French = ["Monte-Cristo.txt"]  # train() is given a list of file paths
tokenizer_French.train(file_French,trainer_French)

In [23]:
# Write the trained French tokenizer to French_tokenizer.json so it can be reloaded later.
tokenizer_French.save("French_tokenizer.json")

In [24]:
# Reload the French tokenizer from the saved file (this rebinds `tokenizer_French`)
# and encode a sample sentence as a quick check.
tokenizer_French = Tokenizer.from_file("French_tokenizer.json")

sample_French = "Voici un texte d’exemple pour tester le tokenizer français basé sur Le Comte de Monte-Cristo!"
encoded_French = tokenizer_French.encode(sample_French)
# Print the token strings, then the matching ids.
print("Tokens:", encoded_French.tokens)
print("IDs:", encoded_French.ids)

Tokens: ['Vo', 'ici', 'un', 't', 'ex', 'te', 'd', '’', 'exe', 'mple', 'pour', 't', 'est', 'er', 'le', 'to', 'k', 'en', 'i', 'z', 'er', 'fr', 'ançai', 's', 'bas', 'é', 'sur', 'Le', 'C', 'o', 'm', 'te', 'de', 'Mon', 'te', '-', 'C', 'r', 'isto', '!']
IDs: [478, 198, 28, 8, 657, 149, 13, 1, 1114, 907, 60, 8, 32, 92, 17, 642, 1, 30, 24, 29, 92, 674, 880, 4, 453, 35, 81, 108, 56, 83, 43, 149, 11, 399, 149, 9, 56, 23, 1139, 26]


In [25]:
from tokenizers import Tokenizer, pre_tokenizers, models, trainers
from tokenizers.normalizers import Sequence, Strip
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import Whitespace, ByteLevel
import re

In [26]:
# Custom function to replace indentation with special tokens
def process_indentation(code):
    """Replace leading indentation with explicit <INDENT>/<DEDENT> marker lines.

    The input is split on "\\n". Each non-blank line is emitted with its leading
    whitespace removed; whenever its indentation is deeper than the current
    level a "<INDENT>" line is emitted before it, and one "<DEDENT>" line is
    emitted for every open level that is deeper than the line's indentation.

    Blank and whitespace-only lines carry no indentation information: they are
    emitted as empty lines and never open or close a block. Any blocks still
    open when the input ends are closed with trailing "<DEDENT>" lines, so the
    markers in the result are always balanced.

    Args:
        code: Source text whose lines are separated by "\\n".

    Returns:
        The processed text, with lines joined by "\\n".
    """
    processed_code = []
    indent_levels = [0]  # Stack of the indentation widths of the open blocks
    for line in code.split("\n"):
        stripped_line = line.lstrip()

        if not stripped_line:
            # A blank line inside a block must not be read as "indent 0",
            # otherwise the block is closed and immediately reopened.
            processed_code.append("")
            continue

        indent_size = len(line) - len(stripped_line)

        if indent_size > indent_levels[-1]:  # Deeper than the current block
            processed_code.append("<INDENT>")
            indent_levels.append(indent_size)
        while indent_size < indent_levels[-1]:  # Close every deeper block
            processed_code.append("<DEDENT>")
            indent_levels.pop()

        processed_code.append(stripped_line)  # Line content without indentation

    # Close the blocks that are still open at the end of the input.
    processed_code.extend("<DEDENT>" for _ in indent_levels[1:])

    return "\n".join(processed_code)

In [None]:
# Read the raw Python corpus and rewrite its indentation as <INDENT>/<DEDENT>
# marker lines via process_indentation.
# NOTE(review): no visible cell creates or uploads python_corpus.py; it has to
# exist in the working directory before this cell runs.
with open("python_corpus.py", "r", encoding="utf-8") as f:
    raw_code = f.read()
processed_code = process_indentation(raw_code)

In [None]:
# Write the processed text to processed_python_corpus.py; the training cell
# below reads this file rather than the raw corpus.
with open("processed_python_corpus.py", "w", encoding="utf-8") as f:
    f.write(processed_code)

In [None]:
# Python-code tokenizer: start from an untrained Byte Pair Encoding model
tokenizer_Python = Tokenizer(models.BPE())

# Use the ByteLevel pre-tokenizer for handling spaces and special characters
tokenizer_Python.pre_tokenizer = ByteLevel()


In [None]:
# Define a trainer with special tokens for indentation.
# The tokenizer uses the ByteLevel pre-tokenizer, so the trainer is seeded with
# the full byte-level alphabet: every byte symbol then has a vocabulary entry
# even if it never appears in the training corpus.
trainer_Python = trainers.BpeTrainer(
    vocab_size=10000,
    min_frequency=2,
    special_tokens=["<pad>", "<unk>", "<s>", "</s>", "<INDENT>", "<DEDENT>"],
    initial_alphabet=ByteLevel.alphabet(),
)

In [None]:
# Train the Python tokenizer on the indentation-processed corpus file
# (train() is given a list of file paths).
tokenizer_Python.train(["processed_python_corpus.py"], trainer_Python)

In [None]:
# Write the trained Python tokenizer to Python_tokenizer.json
tokenizer_Python.save("Python_tokenizer.json")

In [None]:
# Reload the Python tokenizer from the saved file (this rebinds `tokenizer_Python`)
tokenizer_Python = Tokenizer.from_file("Python_tokenizer.json")

In [None]:
# Test the tokenizer on an example Python function with two nested indentation levels
sample_Python = """
def greet():
    print("Hello")
    if True:
        print("Indented!")
"""

# Apply the same indentation preprocessing that was applied to the training
# corpus, then encode the processed text
processed_sample = process_indentation(sample_Python)
encoded_Python = tokenizer_Python.encode(processed_sample)

# Print the token strings, then the matching ids
print("Tokens:", encoded_Python.tokens)
print("IDs:", encoded_Python.ids)

fatal: not a git repository (or any of the parent directories): .git
