In [1]:
import datasets
import os
import numpy as np

In [2]:
from transformers import AutoTokenizer

# Model identifier handed to AutoTokenizer.from_pretrained; its tokenizer is
# the one used for the length measurements in this notebook.
model_name = "Qwen/Qwen2.5-32B-Instruct"
# Module-level tokenizer; `tokenize` reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(example):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of ``example``.

    Args:
        example: Mapping that holds the raw text under the ``"text"`` key.

    Returns:
        The object returned by the tokenizer call for that text.
    """
    # Truncation is enabled at the tokenizer's own ``model_max_length``, so
    # any length derived from the result is capped at that value.
    encode_kwargs = {
        "truncation": True,
        "max_length": tokenizer.model_max_length,
    }
    return tokenizer(example["text"], **encode_kwargs)

In [3]:
def load_dataset(dataset_name: str):
    """Load a dataset from a local path if one exists, otherwise by name.

    Args:
        dataset_name: Either a filesystem path that exists (read with
            ``datasets.load_from_disk``) or a name that is forwarded to
            ``datasets.load_dataset``.

    Returns:
        The object returned by whichever ``datasets`` loader was used.
    """
    # Anything that is not an existing path is treated as a dataset name.
    if not os.path.exists(dataset_name):
        return datasets.load_dataset(dataset_name)

    return datasets.load_from_disk(dataset_name)

In [4]:
def get_context_length_stats(dataset_name: str, split: str = "train"):
    """Compute max, mean and standard deviation of tokenized example lengths.

    The dataset is loaded with ``load_dataset``, every example of the chosen
    split is passed through ``tokenize``, and the length of an example is the
    number of entries in its ``input_ids``.

    Args:
        dataset_name: Forwarded unchanged to ``load_dataset``.
        split: Key of the split to measure. Defaults to ``"train"``, which
            matches the previously hard-coded behavior.

    Returns:
        A ``(max, mean, std)`` tuple computed with numpy over the per-example
        token counts.

    Raises:
        ValueError: If the selected split contains no examples.
    """
    dataset = load_dataset(dataset_name)
    split_data = dataset[split]

    # Fail early with a readable message instead of an opaque error from the
    # column lookup or the numpy reductions further down.
    if len(split_data) == 0:
        raise ValueError(
            f"Split {split!r} of dataset {dataset_name!r} has no examples"
        )

    # tokenize the dataset
    tokenized_dataset = split_data.map(
        tokenize,
        desc="Tokenizing dataset",
    )

    # Convert once so the three reductions share a single array instead of
    # each re-converting the Python list.
    context_lengths = np.asarray(
        [len(input_ids) for input_ids in tokenized_dataset["input_ids"]]
    )

    return np.max(context_lengths), np.mean(context_lengths), np.std(context_lengths)

In [5]:
# Displays (max, mean, std) of tokenized lengths for the "train" split.
get_context_length_stats("simplescaling/s1K_tokenized")

Tokenizing dataset: 100%|██████████| 1000/1000 [00:08<00:00, 117.35 examples/s]


(9109, 5658.483, 1484.26871277104)

In [6]:
# Same (max, mean, std) measurement for the s1K-1.1 tokenized dataset.
get_context_length_stats("simplescaling/s1K-1.1_tokenized")

(26969, 10038.302, 4530.923393172301)

In [7]:
# Same measurement; this name is read from disk when it exists as a local
# path, otherwise it is passed to datasets.load_dataset.
get_context_length_stats("2k-deepseek-traces-k30-rerank-k5")

(74641, 12440.810075566751, 6996.738811163135)

In [8]:
# Same measurement; this name is read from disk when it exists as a local
# path, otherwise it is passed to datasets.load_dataset.
get_context_length_stats("2k-deepseek-traces-k5_qwen_summarised_data")

(7544, 1957.4959798994976, 707.9811723740258)