# TokenSmith Showcase

In [1]:
# --- Standard library -------------------------------------------------------
import json
import os
import sys
from pathlib import Path

# Local source checkouts that later cells import from (e.g. `tokensmith.manager`).
# NOTE(review): these are machine-specific absolute paths; adjust them (or
# install the packages into the environment) when running on another host.
LOCAL_REPO_PATHS = [
    "/NS/llm-pretraining/work/afkhan/tokensmith",
    "/NS/llm-pretraining/work/afkhan/USC_Colab/gpt-neox",
]

# Prepend in list order, so the last entry ends up first on sys.path -- the same
# precedence two consecutive `sys.path.insert(0, ...)` calls produce on a fresh
# kernel. The membership check keeps sys.path from accumulating duplicate
# entries every time this cell is re-run.
for repo_path in LOCAL_REPO_PATHS:
    if repo_path not in sys.path:
        sys.path.insert(0, repo_path)

# --- Third-party (kept after the path setup, as in the original cell) -------
import numpy as np
from transformers import AutoTokenizer

# Tokenizer shared by the inspect / sample / edit / search / export cells below.
TOKENIZER_NAME_OR_PATH = "EleutherAI/gpt-neox-20b"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME_OR_PATH, add_eos_token=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Setup

In [2]:
from tokensmith.manager import DatasetManager

[2025-07-04 19:11:19,891] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


  @autocast_custom_fwd
  @autocast_custom_bwd


In [3]:
# Single manager instance shared by every cell below; its `search`, `inspect`,
# `sample`, `edit` and `export` attributes are used after the setup calls in the
# next cell have run.
dataset_manager = DatasetManager()

In [4]:
# Configure the search handler. The recorded output of this cell shows indices
# being written to disk and sorted, so this call does real work on every run.
# NOTE(review): reuse=False presumably forces the index to be rebuilt even if
# the .idx file already exists -- confirm before using on a large corpus.
dataset_manager.setup_search(
    bin_file_path="data_ingested_text_document.bin",
    search_index_save_path="data_ingested_search_index_text_document.idx",
    vocab=2**16,  # presumably an upper bound on token ids (65536) -- TODO confirm
    verbose=True,
    reuse=False,
)

# Configure the edit / inspect / sample / export handlers used by later cells.
# NOTE(review): this uses the `data_tokenized_*` prefix while the search index
# above is built from `data_ingested_*` -- confirm both refer to the same corpus.
# NOTE(review): the train_* / seed / splits values presumably have to match the
# training run whose batches are being inspected -- verify against that config.
dataset_manager.setup_edit_inspect_sample_export(
    dataset_prefix='data_tokenized_text_document',
    batch_info_save_prefix='batch_info',
    train_iters=100,
    train_batch_size=16,
    train_seq_len=2048,
    seed=42,
    splits_string='990,5,5',
    packing_impl='packed',
    allow_chopped=True,
)

Writing indices to disk...
Time elapsed: 23.779177ms
Sorting indices...
Time elapsed: 85.717528ms
    warming up index mmap file...
    reading sizes...
    reading pointers...
    reading document index...


# Inspect

In [5]:
# Look up a single sample by its id, asking for detokenized text plus the
# document details that go with it.
sequence, document_details = dataset_manager.inspect.inspect_sample_by_id(
    sample_id=10,
    return_doc_details=True,
    return_detokenized=True,
    tokenizer=tokenizer,
)

print(f"Sequence: {sequence}")
print(f"Document Details: {document_details}")

Sequence:  day, they find a big club on the grass. It is brown and heavy. "Look, a club!" Lily says. "Let's play with it!" "OK!" Ben says. "What can we do with it?" Lily thinks for a moment. Then she has an idea. "We can spin it!" she says. "Like this!" She holds the club with both hands and turns around fast. The club makes a whoosh sound in the air. "Wow, that looks fun!" Ben says. "Can I try?" "Sure!" Lily says. She gives the club to Ben. Ben tries to spin it, but it is too heavy for him. He falls down and drops the club. "Ouch!" Ben says. "That hurts!" Lily laughs. She helps Ben get up. "You are lazy!" she says. "You need to be strong to spin the club!" Ben makes a face. "I am not lazy!" he says. "I am just small. You are big and strong. That's not fair!" Lily smiles. She hugs Ben. "Don't be sad," she says. "We can play something else. How about we swing on the swings?" "OK!" Ben says. "That sounds fun!" They leave the club on the grass and run to the swings. They swing high and lo

In [6]:
# Fetch every sequence of one batch as detokenized text (no document details).
resp = dataset_manager.inspect.inspect_sample_by_batch(
    batch_id=1,
    batch_size=16,
    return_doc_details=False,
    return_detokenized=True,
    tokenizer=tokenizer,
)

# One line per sequence, numbered by its position inside the batch.
for seq_idx, seq_text in enumerate(resp):
    print(f"Sequence {seq_idx}: {seq_text}")

Sequence 0:  you too," she says. "But you have to promise me that you won't do that again. And you have to help me clean up the mess. And you have to wait for dinner. No more cookies for you today." Lily and Ben agree. They promise to be good. They help Mom clean up the kitchen. They wait for dinner. They don't touch the cookies anymore. They learn their lesson. They also learn that cookies are brown and sweet and crunchy. And that they taste better when they are shared.<|endoftext|>Once upon a time, there was a little girl named Lily who loved to collect shiny things. One day, she found a pretty crystal on the ground and put it in her pocket. She was so happy and showed it to everyone she met. But one day, Lily forgot where she put the crystal. She looked everywhere but couldn't find it. She was troubled and sad. Her friend, Timmy, asked her what was wrong. She told him about the lost crystal. "Don't worry, Lily," Timmy said. "Maybe someone else found it and will take good care of it.

# Sample

In [7]:
def stride_batch_policy(start_batch_id, num_batches, stride=5):
    """
    Select evenly spaced batch IDs.

    The k-th selected ID (k = 0, 1, ...) is ``start_batch_id + k * stride``.

    Args:
        start_batch_id: ID of the first batch to select
        num_batches: How many batch IDs to produce; zero or a negative
            value yields an empty list
        stride: Distance between two consecutive selected IDs

    Returns:
        List with ``num_batches`` batch IDs, in selection order
    """
    selected_ids = []
    for step in range(num_batches):
        # Multiply instead of accumulating so each ID is computed independently.
        selected_ids.append(start_batch_id + step * stride)
    return selected_ids

# Parameters of the strided sampling demo. They are defined once so that the
# request below and the ID arithmetic in the report loop cannot drift apart.
SAMPLE_BATCH_SIZE = 2
SAMPLE_START_BATCH_ID = 5
SAMPLE_NUM_BATCHES = 2
SAMPLE_STRIDE = 3  # stride 3 -> 2 batches are skipped between selected batches

# Get 2 batches with stride 3 and include document details
stride_batches = dataset_manager.sample.get_batches_by_policy(
    policy_fn=stride_batch_policy,
    batch_size=SAMPLE_BATCH_SIZE,
    start_batch_id=SAMPLE_START_BATCH_ID,
    num_batches=SAMPLE_NUM_BATCHES,
    stride=SAMPLE_STRIDE,
    return_doc_details=True,
    return_detokenized=True,
    tokenizer=tokenizer
)

for batch_idx, batch in enumerate(stride_batches):
    # Same formula as stride_batch_policy: ID of the batch_idx-th selected batch.
    batch_id = SAMPLE_START_BATCH_ID + batch_idx * SAMPLE_STRIDE
    print(f"Stride Batch {batch_idx + 1} (Batch ID {batch_id}):")
    for sample_idx, (sample, doc_details) in enumerate(batch):
        # Assumes sample IDs are contiguous with SAMPLE_BATCH_SIZE samples per batch.
        sample_id = batch_id * SAMPLE_BATCH_SIZE + sample_idx
        print(f"  Sample {sample_idx + 1} (ID {sample_id}):")
        print(f"    Document details: {doc_details}")
        print(f"    Text: {sample[:60]}...")  # Show first 60 chars
    print()

Stride Batch 1 (Batch ID 5):
  Sample 1 (ID 10):
    Document details: {'doc_index_f': 7983, 'doc_index_l': 7994, 'offset_f': 16, 'offset_l': 4}
    Text:  day, they find a big club on the grass. It is brown and hea...
  Sample 2 (ID 11):
    Document details: {'doc_index_f': 13066, 'doc_index_l': 13077, 'offset_f': 97, 'offset_l': 30}
    Text:  and dad said it seemed too dangerous, so they said no. The ...

Stride Batch 2 (Batch ID 8):
  Sample 1 (ID 16):
    Document details: {'doc_index_f': 9361, 'doc_index_l': 9371, 'offset_f': 639, 'offset_l': 30}
    Text:  you too," she says. "But you have to promise me that you wo...
  Sample 2 (ID 17):
    Document details: {'doc_index_f': 13023, 'doc_index_l': 13035, 'offset_f': 245, 'offset_l': 4}
    Text:  and Dad's hands. They were alone in the crowd. "Mom! Dad! W...



# Edit

In [8]:
# Seeded generator handed to the edit call below so that a fresh-kernel run
# passes the same random state every time.
RNG = np.random.default_rng(42)

# Inject a short test sentence at location 103 and return the details dict,
# which becomes this cell's displayed output.
# NOTE(review): with dry_run=False the recorded output shows 'original_sample'
# already beginning with the injected tokens, which suggests the edit is really
# applied to the dataset and that re-running this cell is not idempotent --
# confirm, and consider dry_run=True for a pure preview.
dataset_manager.edit.inject_and_preview(
    text = 'This is a test sentence.',
    tokenizer=tokenizer,
    injection_loc=103,
    injection_type="seq_start",
    rng=RNG,
    add_eos_token=True,  # recorded 'injected_tokens' end with token id 0
    dry_run=False,
    return_details=True,
)



{'injection_location': 103,
 'injection_type': 'seq_start',
 'dry_run': False,
 'injected_text': 'This is a test sentence.',
 'injected_tokens': [1552, 310, 247, 1071, 6197, 15, 0],
 'original_sample': {'raw_tokens': [1552,
   310,
   247,
   1071,
   6197,
   15,
   0,
   5412,
   285,
   8872,
   15,
   1500,
   16632,
   703,
   574,
   18029,
   281,
   6270,
   15,
   1500,
   4925,
   253,
   2419,
   285,
   19336,
   327,
   253,
   3369,
   15,
   6270,
   5485,
   352,
   285,
   11373,
   15,
   346,
   12764,
   13,
   32817,
   13,
   28238,
   15,
   1422,
   403,
   1077,
   9685,
   15,
   13516,
   275,
   13,
   309,
   452,
   1633,
   323,
   368,
   937,
   344,
   753,
   15,
   754,
   3977,
   731,
   281,
   253,
   3811,
   2316,
   13,
   835,
   13541,
   574,
   1160,
   247,
   3289,
   275,
   253,
   39696,
   15,
   754,
   3534,
   32817,
   247,
   29861,
   285,
   247,
   23069,
   15,
   754,
   3534,
   28238,
   247,
   5961,
   285,
   247,
   1

# Search

In [9]:
# Encode the phrase without special tokens, then ask the search handler how
# often that token sequence occurs.
count_query_tokens = tokenizer.encode("Once upon a time", add_special_tokens=False)
counts = dataset_manager.search.count(query=count_query_tokens)

print(f"Count of sequences matching 'Once upon a time': {counts}")

Count of sequences matching 'Once upon a time': 13533


In [10]:
# Encode the phrase without special tokens, then ask the search handler for
# the positions at which that token sequence occurs.
positions_query_tokens = tokenizer.encode("Once upon a time", add_special_tokens=False)
positions = dataset_manager.search.positions(query=positions_query_tokens)

print(f"Positions of sequences matching 'Once upon a time': {positions}")

Positions of sequences matching 'Once upon a time': [1871078, 3526259, 739125, 1332842, 1838022, 3434072, 484592, 2798653, 313457, 1297946, 3240107, 1726979, 1186083, 2846250, 3235209, 1948546, 2320144, 3204120, 1755562, 1812768, 2318625, 117421, 200673, 133628, 2417830, 4283389, 746884, 3847189, 600631, 4538351, 4446259, 193978, 3035722, 3267620, 4368474, 267055, 4317187, 4037606, 3059354, 1768128, 3242644, 1946357, 2673169, 1928816, 1602552, 3061564, 1566219, 321040, 512528, 1115990, 1251762, 2166459, 2228325, 3425131, 826887, 1668074, 544951, 3280289, 2675024, 1116959, 2008809, 3424199, 2692171, 1965091, 3523149, 1368589, 2693613, 67375, 906844, 2379143, 71554, 3208440, 2152541, 3051185, 814319, 2231705, 4260153, 2883922, 146061, 3711526, 1198478, 1288497, 2677616, 2488470, 388564, 1928076, 3725804, 2375092, 1283766, 824296, 1857814, 2664215, 2873793, 1910837, 130536, 3750221, 2297546, 1949089, 3286035, 960729, 359501, 183144, 858332, 1536214, 1320451, 2553715, 4088470, 1100597, 305

# Export

In [11]:
# Define sequence indices to export
sequence_indices = [10, 25, 50, 75, 100]
output_path_jsonl = "Export/specific_sequences.jsonl"
PREVIEW_RECORDS = 2  # number of exported records echoed back for verification

# Make sure the target directory exists before exporting. This is a no-op when
# the directory is already there (or when the exporter creates it itself).
Path(output_path_jsonl).parent.mkdir(parents=True, exist_ok=True)

# Export to JSONL format with detokenized text
dataset_manager.export.export_sequences(
    sequence_indices=sequence_indices,
    output_path=output_path_jsonl,  # already a str, no conversion needed
    format_type="jsonl",
    return_detokenized=True,
    tokenizer=tokenizer,
    include_doc_details=False
)

print(f"Exported {len(sequence_indices)} sequences to: {output_path_jsonl}")

# Read and display first few lines to verify
with open(output_path_jsonl, 'r') as f:
    # zip() stops as soon as range() is exhausted, so at most PREVIEW_RECORDS
    # lines are read and parsed; shorter files simply yield fewer records.
    for i, line in zip(range(PREVIEW_RECORDS), f):
        record = json.loads(line)
        print(f"Record {i+1}: Index {record['index']}, Content: {record['content'][:80]}...")

Exported 5 sequences to: Export/specific_sequences.jsonl
Record 1: Index 0, Content:  day, they find a big club on the grass. It is brown and heavy. "Look, a club!" ...
Record 2: Index 1, Content:  that the sun made droplets scatter off of their backs! They felt so refreshed i...
