In [3]:
# set current directory
os.chdir("/cs/student/projects1/aibh/2024/cbaumgar/MSC_THESIS")

In [4]:
# %% [markdown]
# **TOKENIZER TRAINING (Corrected with config file)**

# %%
from tokenizers import ByteLevelBPETokenizer
import os
import json # <--- Import json library

# --- Configuration ---
TRAIN_FILE_PATH = os.path.join("generalist_data", "train.txt") 
TOKENIZER_SAVE_PATH = "./generalist_tokenizer"
VOCAB_SIZE = 1000

# --- Training ---
tokenizer = ByteLevelBPETokenizer()

print(f"Training new tokenizer from corpus: {TRAIN_FILE_PATH}...")
tokenizer.train(files=[TRAIN_FILE_PATH], vocab_size=VOCAB_SIZE, min_frequency=2,
                special_tokens=["[PAD]", "[UNK]", "[EOS]", "[SOS]"])

os.makedirs(TOKENIZER_SAVE_PATH, exist_ok=True)
tokenizer.save_model(TOKENIZER_SAVE_PATH)
print(f"Tokenizer files (vocab.json, merges.txt) saved to: {TOKENIZER_SAVE_PATH}")

# --- THE FIX: Create the tokenizer_config.json file ---
print("Creating tokenizer_config.json...")

# Define the configuration. The key is to specify the class.
# We also map our custom special tokens to the standard ones.
tokenizer_config = {
    "tokenizer_class": "GPT2Tokenizer",
    "bos_token": "[SOS]",
    "eos_token": "[EOS]",
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "model_max_length": 1024 # This should match the model's context window
}

# Write the configuration to the file
with open(os.path.join(TOKENIZER_SAVE_PATH, 'tokenizer_config.json'), 'w') as f:
    json.dump(tokenizer_config, f, indent=2)

print("tokenizer_config.json created successfully.")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")

# --- Also create a special_tokens_map.json for completeness ---
# This helps the tokenizer know the string representation of its special tokens
special_tokens_map = {
    "bos_token": "[SOS]",
    "eos_token": "[EOS]",
    "unk_token": "[UNK]",
    "pad_token": "[PAD]"
}
with open(os.path.join(TOKENIZER_SAVE_PATH, 'special_tokens_map.json'), 'w') as f:
    json.dump(special_tokens_map, f, indent=2)

print("special_tokens_map.json created successfully.")

Training new tokenizer from corpus: generalist_data/train.txt...



Tokenizer files (vocab.json, merges.txt) saved to: ./generalist_tokenizer
Creating tokenizer_config.json...
tokenizer_config.json created successfully.
Vocabulary size: 1000
special_tokens_map.json created successfully.


In [None]:

# VERIFYING THE NEW TOKENIZER

from transformers import GPT2Tokenizer

# Load the tokenizer we just trained. 
# NOTE: We use GPT2Tokenizer.from_pretrained() because it knows how to handle 
# the files our custom tokenizer saved, including special tokens.
my_custom_tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_SAVE_PATH)

# Let's test it on a sample sentence from our domain
sample_sentence = "MODE Shortest START ab END cd PATH ab NORTH ef"
print(f"Original sentence: \n'{sample_sentence}'")

# Encode the sentence
encoded = my_custom_tokenizer.encode(sample_sentence)
print(f"\nEncoded Token IDs: \n{encoded}")

# Decode back to see the tokens
decoded_tokens = [my_custom_tokenizer.decode([token_id]) for token_id in encoded]
print(f"\nDecoded Tokens (one per ID): \n{decoded_tokens}")


  from .autonotebook import tqdm as notebook_tqdm
  backends.update(_get_backends("networkx.backends"))
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Original sentence: 
'MODE Shortest START ab END cd PATH ab NORTH ef'

Encoded Token IDs: 
[311, 326, 312, 467, 322, 912, 309, 467, 267, 454]

Decoded Tokens (one per ID): 
['MODE', ' Shortest', ' START', ' ab', ' END', ' cd', ' PATH', ' ab', ' NORTH', ' ef']

--- Verification ---
❌ FAILURE: Tokenization did not work as expected.
