# Tokenizer

## Installs

In [1]:
%pip install tokenizers

Collecting tokenizers
  Obtaining dependency information for tokenizers from https://files.pythonhosted.org/packages/5e/8c/e32b066f3a2c924235e6a2ecad5b22c3c64b569f192975815c887b4bcfdf/tokenizers-0.15.1-cp312-none-win_amd64.whl.metadata
  Downloading tokenizers-0.15.1-cp312-none-win_amd64.whl.metadata (6.8 kB)
Collecting huggingface_hub<1.0,>=0.16.4 (from tokenizers)
  Obtaining dependency information for huggingface_hub<1.0,>=0.16.4 from https://files.pythonhosted.org/packages/28/03/7d3c7153113ec59cfb31e3b8ee773f5f420a0dd7d26d40442542b96675c3/huggingface_hub-0.20.3-py3-none-any.whl.metadata
  Downloading huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting filelock (from huggingface_hub<1.0,>=0.16.4->tokenizers)
  Obtaining dependency information for filelock from https://files.pythonhosted.org/packages/81/54/84d42a0bee35edba99dee7b59a8d4970eccdd44b99fe728ed912106fc781/filelock-3.13.1-py3-none-any.whl.metadata
  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## Imports

In [2]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from tqdm.auto import tqdm
from pathlib import Path
import os

  from .autonotebook import tqdm as notebook_tqdm


## Directories

In [3]:
# Data directory: every regular file ending in ".txt" under ./data (relative
# to the current working directory) is treated as one training corpus.
# The working directory is not stored in a variable named `dir`, so the
# builtin dir() stays usable for the rest of the kernel session.
data_folder = Path.cwd() / "data"

# Sorted so the training order is the same on every run and every platform
# (directory listing order is otherwise arbitrary). Paths are kept as plain
# strings, matching what the training loop passes to the tokenizer.
files = sorted(
    str(path)
    for path in data_folder.iterdir()
    if path.is_file() and path.name.endswith(".txt")
)

# Directory to save the tokenizer
output_dir = "RoBERTa_Tokenizer"

# Create the directory if it does not exist; exist_ok=True keeps the cell
# safe to re-run and avoids a separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

## Main loop

In [4]:
# Train one byte-level BPE tokenizer per corpus file. Each tokenizer is saved
# to its own sub-directory of `output_dir`, named after the corpus file's stem.
for file in tqdm(files, desc="Training tokenizers"):
    # Extract filename without extension to use as a unique identifier
    filename = Path(file).stem

    # Directory to save this specific tokenizer. It is built from the shared
    # root `output_dir` and stored under a separate name, so the root is never
    # overwritten and re-running this cell does not nest directories.
    tokenizer_dir = os.path.join(output_dir, filename)

    # Create the directory if it does not exist (safe on re-runs)
    os.makedirs(tokenizer_dir, exist_ok=True)

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Train on this single file only, so each corpus gets its own vocabulary.
    tokenizer.train(
        files=[file],
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )

    # Save the tokenizer in its unique directory
    tokenizer.save_model(tokenizer_dir)

    # Post-processing: wrap encoded sequences with the RoBERTa-style
    # <s> ... </s> special tokens and cap encodings at 512 tokens.
    # NOTE(review): this runs after save_model, so these settings exist only
    # on the in-memory tokenizer and are discarded on the next iteration.
    # Confirm whether they should also be persisted.
    tokenizer._tokenizer.post_processor = BertProcessing(
        sep=("</s>", tokenizer.token_to_id("</s>")),
        cls=("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)