# Tokenizer Check

In [18]:
from transformers import AutoTokenizer
# Load the tokenizer published with the Qwen/Qwen3-0.6B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

In [19]:
from datasets import load_dataset, get_dataset_config_names
from transformers import AutoTokenizer
# Count the training rows of every individual SmolTalk subset (config).
total_datasets = dict()
dataset = None  # most recently loaded subset; stays None if every load fails
for dataset_name in get_dataset_config_names("HuggingFaceTB/smoltalk"):
    if dataset_name == "all":
        continue  # Skip the 'all' dataset as it is not a specific dataset
    try:
        dataset = load_dataset("HuggingFaceTB/smoltalk", dataset_name, split="train")
        total_datasets[dataset_name] = len(dataset)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining subsets.
        print(f"Failed to load {dataset_name}: {e}")

# Peek at whichever subset was loaded last. The explicit None guard replaces a
# NameError that occurred when no subset could be loaded at all.
if dataset is None:
    print("No SmolTalk subset could be loaded.")
else:
    print("Number of samples in the dataset:", len(dataset))
    print("First sample:", dataset[0])


Number of samples in the dataset: 34133
First sample: {'messages': [{'content': 'Always respond with an emoji at the end', 'role': 'system'}, {'content': 'Can you help me draft a tweet about the beautiful sunset I saw today at the beach? I want to include a sense of tranquility and inspiration in the message.', 'role': 'user'}, {'content': '"Embraced by the tranquil whisper of the waves, I watched as the sky turned into a canvas of warm hues. A beautiful sunset at the beach today, painting a picture of inspiration and serenity. 🌅"', 'role': 'assistant'}, {'content': "That's a beautiful tweet, thank you! Can you now show me how to post it on Twitter?", 'role': 'user'}, {'content': "Sure! Here are the steps:\n\n1. Open the Twitter app or website. \n2. Click on the 'Tweet' button (it's a blue circle with a feather in it).\n3. Copy and paste the tweet I crafted into the text box that appears.\n4. When you're ready, click the 'Tweet' button to post it.\n\nRemember, Twitter has a character l

In [20]:
# Re-bind `tokenizer` for the same checkpoint, this time passing use_fast=True explicitly.
# When the notebook runs top to bottom, this is the instance process_batch picks up.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", use_fast=True)

In [23]:
def process_batch(batch):
    """Split each conversation into prompt and response and measure both.

    For every entry of ``batch['messages']`` the chat template is applied
    twice through the module-level ``tokenizer``: once to all messages except
    the last (the "input") and once to the last message alone (the "output").

    Args:
        batch: Mapping with a ``'messages'`` key holding one list of chat
            messages per example.

    Returns:
        dict of equally long lists: ``input`` / ``output`` (whatever
        ``apply_chat_template`` returns — presumably token ids, since it is
        called with its defaults; confirm), their lengths ``input_len`` /
        ``output_len``, and ``total_len`` as the sum of the two lengths.
    """
    inputs = [tokenizer.apply_chat_template(msgs[:-1]) for msgs in batch['messages']]
    outputs = [tokenizer.apply_chat_template(msgs[-1:]) for msgs in batch['messages']]
    input_lens = [len(i) for i in inputs]
    output_lens = [len(o) for o in outputs]
    return {
        "input": inputs,
        "output": outputs,
        "input_len": input_lens,
        "output_len": output_lens,
        "total_len": [il + ol for il, ol in zip(input_lens, output_lens)],
    }


# Tokenize every subset counted earlier and report its length totals.
# process_batch is defined once above instead of being rebuilt on each iteration.
total_map_datasets = dict()
for dataset_name in total_datasets:
    try:
        dataset = load_dataset("HuggingFaceTB/smoltalk", dataset_name, split="train")
        map_dataset = dataset.map(
            process_batch,
            batched=True,
            num_proc=64,  # adjust to the local machine
        )
        # Keep the mapped subset so later cells can query it without re-tokenizing.
        total_map_datasets[dataset_name] = map_dataset

        total_input_length = sum(map_dataset['input_len'])
        total_output_length = sum(map_dataset['output_len'])
        print(f"{dataset_name} - Total input length: {total_input_length}")
        print(f"{dataset_name} - Total output length: {total_output_length}")

        print(f"{dataset_name} - Total length: {sum(map_dataset['total_len'])}")
    except Exception as e:
        # Best effort: one failing subset must not abort the whole sweep.
        print(f"Failed to process {dataset_name}: {e}")


Map (num_proc=64): 100%|██████████| 409537/409537 [00:32<00:00, 12795.77 examples/s]


smol-magpie-ultra - Total input length: 409951800
smol-magpie-ultra - Total output length: 183492630
smol-magpie-ultra - Total length: 593444430


Map (num_proc=64): 100%|██████████| 34424/34424 [00:01<00:00, 21840.07 examples/s]


smol-constraints - Total input length: 2670828
smol-constraints - Total output length: 4518128
smol-constraints - Total length: 7188956


Map (num_proc=64): 100%|██████████| 53342/53342 [00:01<00:00, 26814.94 examples/s]


smol-rewrite - Total input length: 9819005
smol-rewrite - Total output length: 7651217
smol-rewrite - Total length: 17470222


Map (num_proc=64): 100%|██████████| 96356/96356 [00:03<00:00, 25789.13 examples/s]


smol-summarize - Total input length: 38215806
smol-summarize - Total output length: 9147688
smol-summarize - Total length: 47363494


Map (num_proc=64): 100%|██████████| 83144/83144 [00:03<00:00, 27671.73 examples/s]


apigen-80k - Total input length: 41063414
apigen-80k - Total output length: 4945778
apigen-80k - Total length: 46009192


Map (num_proc=64): 100%|██████████| 2260/2260 [00:00<00:00, 2724.04 examples/s]


everyday-conversations - Total input length: 351451
everyday-conversations - Total output length: 78421
everyday-conversations - Total length: 429872


Map (num_proc=64): 100%|██████████| 30400/30400 [00:00<00:00, 32065.29 examples/s]


explore-instruct-rewriting - Total input length: 1750141
explore-instruct-rewriting - Total output length: 879541
explore-instruct-rewriting - Total length: 2629682


Map (num_proc=64): 100%|██████████| 3547/3547 [00:03<00:00, 983.49 examples/s] 


longalign - Total input length: 36463804
longalign - Total output length: 653041
longalign - Total length: 37116845


Map (num_proc=64): 100%|██████████| 47500/47500 [00:01<00:00, 30488.42 examples/s]


metamathqa-50k - Total input length: 3011574
metamathqa-50k - Total output length: 8623503
metamathqa-50k - Total length: 11635077


Map (num_proc=64): 100%|██████████| 106147/106147 [00:03<00:00, 34699.73 examples/s]


numina-cot-100k - Total input length: 9446620
numina-cot-100k - Total output length: 45826019
numina-cot-100k - Total length: 55272639


Map (num_proc=64): 100%|██████████| 95000/95000 [00:03<00:00, 28535.29 examples/s]


openhermes-100k - Total input length: 14797027
openhermes-100k - Total output length: 20917171
openhermes-100k - Total length: 35714198


Map (num_proc=64): 100%|██████████| 48127/48127 [00:01<00:00, 31994.76 examples/s]


self-oss-instruct - Total input length: 5709826
self-oss-instruct - Total output length: 9207413
self-oss-instruct - Total length: 14917239


Map (num_proc=64): 100%|██████████| 34133/34133 [00:01<00:00, 19297.84 examples/s]


systemchats-30k - Total input length: 13091122
systemchats-30k - Total output length: 7081342
systemchats-30k - Total length: 20172464


In [24]:
# Display the subset-name -> mapped Dataset dictionary filled in by the loop above.
total_map_datasets

{'smol-magpie-ultra': Dataset({
     features: ['messages', 'category', 'difficulty', 'quality', 'reward_model_score', 'conversation_tokens', 'input', 'output', 'input_len', 'output_len', 'total_len'],
     num_rows: 409537
 }),
 'smol-constraints': Dataset({
     features: ['messages', 'input', 'output', 'input_len', 'output_len', 'total_len'],
     num_rows: 34424
 }),
 'smol-rewrite': Dataset({
     features: ['messages', 'input', 'output', 'input_len', 'output_len', 'total_len'],
     num_rows: 53342
 }),
 'smol-summarize': Dataset({
     features: ['messages', 'input', 'output', 'input_len', 'output_len', 'total_len'],
     num_rows: 96356
 }),
 'apigen-80k': Dataset({
     features: ['messages', 'input', 'output', 'input_len', 'output_len', 'total_len'],
     num_rows: 83144
 }),
 'everyday-conversations': Dataset({
     features: ['full_topic', 'messages', 'input', 'output', 'input_len', 'output_len', 'total_len'],
     num_rows: 2260
 }),
 'explore-instruct-rewriting': Dataset(

In [27]:
# Report the largest `total_len` value (input length + output length) per subset.
for dataset_name, map_dataset in total_map_datasets.items():
    try:
        longest = max(map_dataset['total_len'])
    except Exception as e:
        print(f"Failed to process {dataset_name}: {e}")
    else:
        print(f"{dataset_name} - Max length: {longest}")

smol-magpie-ultra - Max length: 8956
smol-constraints - Max length: 1525
smol-rewrite - Max length: 860
smol-summarize - Max length: 3276
apigen-80k - Max length: 2882
everyday-conversations - Max length: 309
explore-instruct-rewriting - Max length: 545
longalign - Max length: 28505
metamathqa-50k - Max length: 2679
numina-cot-100k - Max length: 3739
openhermes-100k - Max length: 5041
self-oss-instruct - Max length: 1894
systemchats-30k - Max length: 3180


Total input length: 586341555
Total output length: 303040887


In [3]:
from datasets import load_dataset

In [None]:
# Load the train split of mncai/foundation_model_smoltalk_ko_translate
# (presumably a Korean translation of SmolTalk, judging by the repo name — confirm).
# Rebinds the notebook-level `dataset` variable.
dataset = load_dataset("mncai/foundation_model_smoltalk_ko_translate", name='default', split="train")

# PrimeIntellect

In [28]:
# Load the train split of PrimeIntellect/SYNTHETIC-2-SFT-verified.
# Rebinds `dataset`, which the tokenization step below maps over.
dataset = load_dataset("PrimeIntellect/SYNTHETIC-2-SFT-verified", name='default', split="train")

In [31]:
def process_batch(batch):
    """Split each conversation into prompt and response and measure both.

    The chat template is applied, through the module-level ``tokenizer``, to
    everything but the last message (the "input") and to the last message on
    its own (the "output").

    Args:
        batch: Mapping with a ``'messages'`` key holding one list of chat
            messages per example.

    Returns:
        dict of equally long lists keyed ``input``, ``output``, ``input_len``,
        ``output_len`` and ``total_len`` (the sum of the two lengths). The
        ``input`` / ``output`` entries are whatever ``apply_chat_template``
        returns — presumably token ids with its default arguments; confirm.
    """
    conversations = batch['messages']
    render = tokenizer.apply_chat_template

    prompts = [render(turns[:-1]) for turns in conversations]
    responses = [render(turns[-1:]) for turns in conversations]

    prompt_sizes = list(map(len, prompts))
    response_sizes = list(map(len, responses))
    combined_sizes = [p + r for p, r in zip(prompt_sizes, response_sizes)]

    return {
        "input": prompts,
        "output": responses,
        "input_len": prompt_sizes,
        "output_len": response_sizes,
        "total_len": combined_sizes,
    }
# Run process_batch over the whole split in parallel batches
# (num_proc should be adjusted to the local machine).
map_dataset = dataset.map(process_batch, batched=True, num_proc=64)

# Sum the per-example lengths on the input and output sides and report both.
total_input_length = sum(map_dataset['input_len'])
total_output_length = sum(map_dataset['output_len'])
print(
    f"Total input length: {total_input_length}",
    f"Total output length: {total_output_length}",
    sep="\n",
)

Map (num_proc=64): 100%|██████████| 104913/104913 [00:06<00:00, 15440.73 examples/s]


Total input length: 36047851
Total output length: 85360359


In [32]:
# `map_dataset` here is the mapped PrimeIntellect split, so label it explicitly.
# The previous version reused `dataset_name`, a leftover loop variable from the
# SmolTalk sweep, which mislabelled this result as "systemchats-30k".
print(f"PrimeIntellect/SYNTHETIC-2-SFT-verified - Max length: {max(map_dataset['total_len'])}")

systemchats-30k - Max length: 12859


In [30]:
# Largest input length, output length and combined length in `map_dataset`.
input_lens = map_dataset['input_len']
output_lens = map_dataset['output_len']
max_input_length = max(input_lens)
max_output_length = max(output_lens)
# Add the two per-row lengths directly. The earlier version launched a second
# 64-process map just to rebuild `total_len` (which process_batch already
# emits) and rebound `map_dataset` in the process; this keeps the cell
# read-only and idempotent, and works whether or not `total_len` exists yet.
max_total_length = max(il + ol for il, ol in zip(input_lens, output_lens))
print(f"Max input length: {max_input_length}")
print(f"Max output length: {max_output_length}")
print(f"Max Total length: {max_total_length}")

Max input length: 9386
Max output length: 12616
Max Total length: 12859


In [14]:
# Row count of the dataset currently bound to `map_dataset`.
len(map_dataset)

104913

# MATH