In [8]:
from datasets import load_from_disk

In [9]:
# Directory of the saved dataset to inspect -- edit this path to point at
# another dataset.
DATASET_DIR = '../outputs/datasets/random4to4_1k_42/'

# Load the saved dataset from disk into `dataset`, which the later cells use.
dataset = load_from_disk(DATASET_DIR)

In [10]:
# Display the loaded dataset's summary (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'prompt', 'completion'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['text', 'prompt', 'completion'],
        num_rows: 128
    })
})

In [11]:
# Peek at the first example of the train split to see its fields.
dataset['train'][0]

{'text': 'loc(c_6252)=1741,6999',
 'prompt': '<bos>loc(c_6252)=',
 'completion': '1741,6999<eos>'}

In [6]:
# Test tokenizer
import sys

from transformers import AutoTokenizer

# Put the parent directory on the import path (presumably so project-local
# modules resolve -- nothing in this cell uses it directly; TODO confirm).
# Guarded so that re-running the cell does not append duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

# Location of the saved HF tokenizer in the outputs directory -- edit to
# inspect a different tokenizer.
TOKENIZER_DIR = '../outputs/tokenizer/wm1_tokenizer'

# Load HF tokenizer from outputs directory
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

# Quick sanity report: vocabulary size and the configured special tokens.
print(f"Vocab size: {len(tokenizer)}")
print(f"Special tokens: bos={tokenizer.bos_token}, eos={tokenizer.eos_token}, pad={tokenizer.pad_token}")

Vocab size: 44
Special tokens: bos=<bos>, eos=<eos>, pad=<pad>


In [7]:
# Take the first training example and look at its raw text
sample = dataset['train'][0]
sample_text = sample['text']
print(f"Text: {sample_text}")

# Encode the raw text only, asking the tokenizer not to add special tokens
tokens = tokenizer(sample_text, add_special_tokens=False)
token_ids = tokens['input_ids']
print(f"\nToken IDs: {token_ids}")
print(f"Decoded: {tokenizer.decode(token_ids)}")

# Compare the number of characters with the number of tokens produced
n_chars, n_tokens = len(sample_text), len(token_ids)
print(f"\nCharacter count: {n_chars}")
print(f"Token count: {n_tokens}")
print(f"Match: {n_chars == n_tokens}")

Text: loc(c_6252)=1741

Token IDs: [19, 22, 10, 3, 10, 7, 40, 36, 39, 36, 4, 6, 35, 41, 38, 35]
Decoded: loc(c_6252)=1741

Character count: 16
Token count: 16
Match: True


In [7]:
# Encode the prompt and the completion independently of each other,
# without letting the tokenizer add special tokens of its own
prompt_text, completion_text = sample['prompt'], sample['completion']
prompt_tokens = tokenizer(prompt_text, add_special_tokens=False)
completion_tokens = tokenizer(completion_text, add_special_tokens=False)
prompt_ids = prompt_tokens['input_ids']
completion_ids = completion_tokens['input_ids']

print(f"Prompt: {prompt_text}")
print(f"Prompt tokens: {prompt_ids}")
print(f"\nCompletion: {completion_text}")
print(f"Completion tokens: {completion_ids}")

# Join both id sequences into the full sequence and decode it back to text
full_tokens = prompt_ids + completion_ids
print(f"\nFull sequence: {full_tokens}")
print(f"Decoded: {tokenizer.decode(full_tokens)}")

Prompt: <bos>loc(c_1730)=
Prompt tokens: [1, 19, 22, 10, 3, 10, 7, 35, 41, 37, 34, 4, 6]

Completion: 1267,2003<eos>
Completion tokens: [35, 36, 40, 41, 5, 36, 34, 34, 37, 2]

Full sequence: [1, 19, 22, 10, 3, 10, 7, 35, 41, 37, 34, 4, 6, 35, 36, 40, 41, 5, 36, 34, 34, 37, 2]
Decoded: <bos>loc(c_1730)=1267,2003<eos>
