In [1]:
from datasets import load_dataset, concatenate_datasets, Dataset
from jinja2 import Template
from vllm import LLM, SamplingParams

import json

In [2]:
# Run configuration for the generation job.
NUM_OUTPUTS = 10 * 1024  # target number of samples for the batch loop (10,240)
NUM_GPUS = 4  # handed to vLLM as tensor_parallel_size
BATCH_SIZE = 256  # number of prompts submitted per generation call
PUSH_INTERVAL = 8  # upload the dataset to the Hub every this many batches

In [3]:
# System message used by the chat-based generator (`generate_content`).
# Every character here is sent verbatim to the model, so the text must end at
# the period -- no dangling quote character after "research.".
system_prompt = """
Your goal is to create high quality learning content, like Wikipedia articles, that provides information collected through experience and research.
""".strip()

In [4]:
# Model identifier handed to vLLM's `LLM(model=...)` when the engine is created.
model_id = "meta-llama/Llama-3.1-405B-Instruct-FP8"

In [5]:
# Build the vLLM engine for `model_id`, sharded across NUM_GPUS devices.
llm = LLM(
    model=model_id,
    max_model_len=4096,  # sequence-length limit the engine is configured with
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.98,  # fraction of each GPU's memory vLLM may claim
)

INFO 12-26 20:45:32 config.py:478] This model supports multiple tasks: {'classify', 'generate', 'reward', 'score', 'embed'}. Defaulting to 'generate'.
INFO 12-26 20:45:32 config.py:1216] Defaulting to use mp for distributed inference
INFO 12-26 20:45:32 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='meta-llama/Llama-3.1-405B-Instruct-FP8', speculative_config=None, tokenizer='meta-llama/Llama-3.1-405B-Instruct-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fbgemm_fp8, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None

Loading safetensors checkpoint shards:   0% Completed | 0/109 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=52322)[0;0m INFO 12-26 20:47:38 model_runner.py:1097] Loading model weights took 113.4847 GB
INFO 12-26 20:47:38 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=52321)[0;0m INFO 12-26 20:47:38 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=52320)[0;0m INFO 12-26 20:47:38 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=52321)[0;0m [1;36m(VllmWorkerProcess pid=52320)[0;0m INFO 12-26 20:47:42 worker.py:241] Memory profiling takes 3.74 seconds
INFO 12-26 20:47:42 worker.py:241] Memory profiling takes 3.74 seconds
[1;36m(VllmWorkerProcess pid=52321)[0;0m [1;36m(VllmWorkerProcess pid=52320)[0;0m INFO 12-26 20:47:42 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
INFO 12-26 20:47:42 worker.py:241] the current vLLM instance can use total_gpu_memory (139.

In [7]:
def generate_content(n):
    """Sample ``n`` articles through the chat interface of the global ``llm``.

    Each request is the same two-message conversation: the module-level
    ``system_prompt`` followed by a user turn asking for a long article and
    ending in "Title:".

    Args:
        n: Number of identical conversations to submit in a single
            ``llm.chat`` call.

    Returns:
        dict: ``{"synthetic_content": [...]}`` with the whitespace-stripped
        text of the first completion of every request, in the order the
        requests were returned.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "Write a long article.\n\nTitle:"},
    ]
    sampling = SamplingParams(temperature=1, top_p=0.9, max_tokens=3584)

    # The same conversation object is repeated n times; it is never mutated.
    completions = llm.chat([conversation] * n, sampling)

    articles = []
    for request_output in completions:
        articles.append(request_output.outputs[0].text.strip())
    return {"synthetic_content": articles}

In [8]:
def generate_content_base(n):
    """Sample ``n`` articles by plain text completion of a fixed lead-in prompt.

    The prompt describes the desired article and ends in "Title:", so each
    completion is expected to start with the article's title. The prompt text
    is sent verbatim to the model, so it is kept free of spelling and grammar
    slips ("a long ... article", "readability").

    Args:
        n: Number of copies of the prompt to submit in a single
            ``llm.generate`` call.

    Returns:
        dict: ``{"synthetic_content": [...]}`` with the whitespace-stripped
        text of the first completion of every request, in the order the
        requests were returned.
    """
    prompt = (
        "Here is a long high quality learning article, like a Wikipedia article, "
        "that provides information collected through experience and research. "
        "The article is formatted for easy readability.\n\nTitle:"
    )
    # NOTE(review): max_tokens equals the engine's configured max_model_len
    # (4096), so the prompt's own tokens come out of the same budget --
    # confirm the resulting cap on completion length is acceptable.
    sampling = SamplingParams(temperature=0.75, top_p=0.9, max_tokens=4096)
    outputs = llm.generate([prompt] * n, sampling)

    return {"synthetic_content": [output.outputs[0].text.strip() for output in outputs]}

In [9]:
# Generate 4 chat-prompted samples and wrap them in a Dataset.
# NOTE(review): the batch-generation loop later concatenates onto this same
# `ds`, so these 4 rows are part of what gets pushed -- confirm that is intended.
ds = Dataset.from_dict(generate_content(4))

INFO 12-26 20:50:51 chat_utils.py:333] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Processed prompts: 100% 4/4 [00:57<00:00, 14.28s/it, est. speed input: 4.62 toks/s, output: 82.00 toks/s]


In [17]:
# Repository id passed to `ds.push_to_hub` when the dataset is uploaded.
ds_id = "amang1802/synthetic_data_unconditioned_L3.1_405B_Instruct"

In [18]:
# Generate NUM_OUTPUTS completion-style samples in batches of BATCH_SIZE,
# appending each batch to `ds` and checkpointing to the Hub as we go.
# NOTE(review): `ds` already holds the rows from the earlier
# `generate_content(4)` cell, and re-running this cell keeps appending to the
# same `ds` -- confirm both are intended.
num_batches = -(-NUM_OUTPUTS // BATCH_SIZE)  # ceiling division: keep a final partial batch
remaining = NUM_OUTPUTS
for i in range(num_batches):
    # Every batch is BATCH_SIZE except possibly the last, which only tops up
    # to NUM_OUTPUTS when NUM_OUTPUTS is not a multiple of BATCH_SIZE.
    current_batch_size = min(BATCH_SIZE, remaining)
    new_data = Dataset.from_dict(generate_content_base(current_batch_size))
    remaining -= current_batch_size
    ds = concatenate_datasets([ds, new_data])
    print(f"Dataset size: {ds.num_rows}")
    # Upload every PUSH_INTERVAL-th batch, and always after the final batch so
    # trailing batches are not left un-pushed when num_batches is not a
    # multiple of PUSH_INTERVAL.
    is_last_batch = (i + 1) == num_batches
    if (i + 1) % PUSH_INTERVAL == 0 or is_last_batch:
        ds.push_to_hub(ds_id)

Processed prompts:   1% 3/256 [00:54<54:35, 12.95s/it, est. speed input: 1.86 toks/s, output: 27.11 toks/s]  



Processed prompts: 100% 256/256 [03:58<00:00,  1.07it/s, est. speed input: 36.50 toks/s, output: 893.19 toks/s] 


Dataset size: 260


Processed prompts:   3% 8/256 [00:58<08:54,  2.16s/it, est. speed input: 4.67 toks/s, output: 72.40 toks/s]  



Processed prompts: 100% 256/256 [03:35<00:00,  1.19it/s, est. speed input: 40.34 toks/s, output: 998.10 toks/s] 


Dataset size: 516


Processed prompts:   1% 3/256 [00:59<58:32, 13.88s/it, est. speed input: 1.72 toks/s, output: 27.01 toks/s]  



Processed prompts: 100% 256/256 [03:54<00:00,  1.09it/s, est. speed input: 37.15 toks/s, output: 899.04 toks/s] 


Dataset size: 772


Processed prompts:  11% 29/256 [01:08<01:20,  2.81it/s, est. speed input: 14.47 toks/s, output: 241.82 toks/s]



Processed prompts: 100% 256/256 [02:25<00:00,  1.76it/s, est. speed input: 59.75 toks/s, output: 1451.98 toks/s]


Dataset size: 1028


Processed prompts:  12% 32/256 [01:11<01:24,  2.64it/s, est. speed input: 15.19 toks/s, output: 269.25 toks/s]



Processed prompts: 100% 256/256 [02:11<00:00,  1.95it/s, est. speed input: 66.41 toks/s, output: 1586.35 toks/s]


Dataset size: 1284


Processed prompts: 100% 256/256 [03:55<00:00,  1.09it/s, est. speed input: 36.98 toks/s, output: 895.12 toks/s] 


Dataset size: 1540


Processed prompts:   4% 10/256 [00:56<05:22,  1.31s/it, est. speed input: 5.99 toks/s, output: 84.33 toks/s] 



Processed prompts: 100% 256/256 [02:15<00:00,  1.88it/s, est. speed input: 64.03 toks/s, output: 1521.22 toks/s]


Dataset size: 1796


Processed prompts:   5% 12/256 [01:01<03:32,  1.15it/s, est. speed input: 6.61 toks/s, output: 104.89 toks/s]



Processed prompts: 100% 256/256 [01:55<00:00,  2.21it/s, est. speed input: 75.14 toks/s, output: 1773.61 toks/s]


Dataset size: 2052


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   6% 15/256 [01:04<03:51,  1.04it/s, est. speed input: 7.94 toks/s, output: 118.99 toks/s]



Processed prompts: 100% 256/256 [03:56<00:00,  1.08it/s, est. speed input: 36.88 toks/s, output: 894.19 toks/s] 


Dataset size: 2308


Processed prompts:   5% 13/256 [01:06<04:00,  1.01it/s, est. speed input: 6.68 toks/s, output: 112.62 toks/s]



Processed prompts: 100% 256/256 [03:58<00:00,  1.07it/s, est. speed input: 36.51 toks/s, output: 896.70 toks/s] 


Dataset size: 2564


Processed prompts:   9% 24/256 [01:09<01:54,  2.03it/s, est. speed input: 11.81 toks/s, output: 202.47 toks/s]



Processed prompts: 100% 256/256 [02:21<00:00,  1.81it/s, est. speed input: 61.46 toks/s, output: 1475.26 toks/s]


Dataset size: 2820


Processed prompts:  14% 35/256 [01:13<02:08,  1.72it/s, est. speed input: 16.10 toks/s, output: 286.28 toks/s]



Processed prompts: 100% 256/256 [03:57<00:00,  1.08it/s, est. speed input: 36.63 toks/s, output: 904.05 toks/s] 


Dataset size: 3076


Processed prompts: 100% 256/256 [02:08<00:00,  1.99it/s, est. speed input: 67.70 toks/s, output: 1636.06 toks/s]


Dataset size: 3332


Processed prompts:   2% 4/256 [00:55<43:13, 10.29s/it, est. speed input: 2.46 toks/s, output: 28.91 toks/s]  



Processed prompts: 100% 256/256 [02:14<00:00,  1.91it/s, est. speed input: 64.94 toks/s, output: 1529.44 toks/s]


Dataset size: 3588


Processed prompts:   0% 1/256 [00:52<3:42:34, 52.37s/it, est. speed input: 0.65 toks/s, output: 9.30 toks/s]



Processed prompts: 100% 256/256 [04:01<00:00,  1.06it/s, est. speed input: 36.03 toks/s, output: 901.30 toks/s] 


Dataset size: 3844


Processed prompts:   7% 18/256 [01:04<02:55,  1.36it/s, est. speed input: 9.52 toks/s, output: 152.04 toks/s]



Processed prompts: 100% 256/256 [02:22<00:00,  1.79it/s, est. speed input: 60.91 toks/s, output: 1467.10 toks/s]

Dataset size: 4100





Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/287 [00:00<?, ?B/s]

Processed prompts:   3% 8/256 [01:02<10:38,  2.58s/it, est. speed input: 4.36 toks/s, output: 68.88 toks/s]  



Processed prompts: 100% 256/256 [02:11<00:00,  1.95it/s, est. speed input: 66.43 toks/s, output: 1616.81 toks/s]


Dataset size: 4356


Processed prompts:  12% 32/256 [01:09<01:44,  2.15it/s, est. speed input: 15.65 toks/s, output: 261.44 toks/s]



Processed prompts: 100% 256/256 [03:56<00:00,  1.08it/s, est. speed input: 36.79 toks/s, output: 904.53 toks/s] 


Dataset size: 4612


Processed prompts:  14% 37/256 [01:13<01:07,  3.24it/s, est. speed input: 17.17 toks/s, output: 308.29 toks/s]



Processed prompts: 100% 256/256 [03:57<00:00,  1.08it/s, est. speed input: 36.60 toks/s, output: 909.86 toks/s] 


Dataset size: 4868


Processed prompts: 100% 256/256 [02:55<00:00,  1.46it/s, est. speed input: 49.73 toks/s, output: 1190.39 toks/s]


Dataset size: 5124


Processed prompts:   1% 3/256 [00:56<55:37, 13.19s/it, est. speed input: 1.81 toks/s, output: 27.07 toks/s]  



Processed prompts: 100% 256/256 [02:31<00:00,  1.69it/s, est. speed input: 57.37 toks/s, output: 1383.83 toks/s]


Dataset size: 5380


Processed prompts:   4% 11/256 [00:59<04:47,  1.17s/it, est. speed input: 6.26 toks/s, output: 97.95 toks/s] 



Processed prompts: 100% 256/256 [04:01<00:00,  1.06it/s, est. speed input: 36.00 toks/s, output: 897.52 toks/s] 


Dataset size: 5636


Processed prompts:   5% 14/256 [01:02<04:20,  1.08s/it, est. speed input: 7.61 toks/s, output: 122.77 toks/s]



Processed prompts: 100% 256/256 [02:02<00:00,  2.10it/s, est. speed input: 71.34 toks/s, output: 1701.53 toks/s]


Dataset size: 5892


Processed prompts:  13% 34/256 [01:10<01:22,  2.70it/s, est. speed input: 16.33 toks/s, output: 281.81 toks/s]



Processed prompts: 100% 256/256 [02:15<00:00,  1.89it/s, est. speed input: 64.31 toks/s, output: 1533.65 toks/s]

Dataset size: 6148





Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/289 [00:00<?, ?B/s]

Processed prompts: 100% 256/256 [02:05<00:00,  2.04it/s, est. speed input: 69.30 toks/s, output: 1678.36 toks/s]


Dataset size: 6404


Processed prompts:   1% 2/256 [00:56<1:44:35, 24.71s/it, est. speed input: 1.21 toks/s, output: 16.94 toks/s]



Processed prompts: 100% 256/256 [03:54<00:00,  1.09it/s, est. speed input: 37.15 toks/s, output: 900.66 toks/s] 


Dataset size: 6660


Processed prompts:   9% 24/256 [01:05<01:32,  2.50it/s, est. speed input: 12.51 toks/s, output: 202.76 toks/s]



Processed prompts: 100% 256/256 [03:53<00:00,  1.10it/s, est. speed input: 37.28 toks/s, output: 904.40 toks/s] 


Dataset size: 6916


Processed prompts: 100% 256/256 [02:06<00:00,  2.03it/s, est. speed input: 68.90 toks/s, output: 1639.46 toks/s]


Dataset size: 7172


Processed prompts:   2% 4/256 [00:54<32:14,  7.68s/it, est. speed input: 2.50 toks/s, output: 36.38 toks/s]  



Processed prompts: 100% 256/256 [02:11<00:00,  1.95it/s, est. speed input: 66.23 toks/s, output: 1572.58 toks/s]


Dataset size: 7428


Processed prompts:   1% 3/256 [00:58<56:28, 13.39s/it, est. speed input: 1.75 toks/s, output: 27.43 toks/s]  



Processed prompts: 100% 256/256 [01:57<00:00,  2.17it/s, est. speed input: 73.81 toks/s, output: 1757.66 toks/s]


Dataset size: 7684


Processed prompts:   7% 18/256 [01:04<03:43,  1.07it/s, est. speed input: 9.55 toks/s, output: 149.27 toks/s]



Processed prompts: 100% 256/256 [02:13<00:00,  1.92it/s, est. speed input: 65.22 toks/s, output: 1523.77 toks/s]


Dataset size: 7940


Processed prompts:  20% 52/256 [01:17<01:00,  3.35it/s, est. speed input: 22.96 toks/s, output: 416.63 toks/s]



Processed prompts: 100% 256/256 [01:59<00:00,  2.14it/s, est. speed input: 72.74 toks/s, output: 1703.82 toks/s]

Dataset size: 8196





Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]



README.md:   0%|          | 0.00/290 [00:00<?, ?B/s]

Processed prompts:  24% 61/256 [01:22<01:14,  2.63it/s, est. speed input: 24.99 toks/s, output: 470.64 toks/s]



Processed prompts: 100% 256/256 [03:58<00:00,  1.07it/s, est. speed input: 36.51 toks/s, output: 895.37 toks/s] 


Dataset size: 8452


Processed prompts: 100% 256/256 [02:40<00:00,  1.60it/s, est. speed input: 54.24 toks/s, output: 1287.31 toks/s]


Dataset size: 8708


Processed prompts:   2% 5/256 [00:58<25:56,  6.20s/it, est. speed input: 2.93 toks/s, output: 43.27 toks/s]  



Processed prompts: 100% 256/256 [03:55<00:00,  1.09it/s, est. speed input: 36.98 toks/s, output: 881.96 toks/s] 


Dataset size: 8964


Processed prompts:   5% 13/256 [01:02<03:19,  1.22it/s, est. speed input: 7.03 toks/s, output: 113.66 toks/s]



Processed prompts: 100% 256/256 [03:56<00:00,  1.08it/s, est. speed input: 36.73 toks/s, output: 902.87 toks/s] 


Dataset size: 9220


Processed prompts:  12% 30/256 [01:09<01:47,  2.10it/s, est. speed input: 14.71 toks/s, output: 248.89 toks/s]



Processed prompts: 100% 256/256 [02:11<00:00,  1.95it/s, est. speed input: 66.28 toks/s, output: 1582.58 toks/s]


Dataset size: 9476


Processed prompts:   9% 22/256 [01:07<02:00,  1.94it/s, est. speed input: 11.08 toks/s, output: 187.70 toks/s]



Processed prompts: 100% 256/256 [02:00<00:00,  2.13it/s, est. speed input: 72.42 toks/s, output: 1744.89 toks/s]


Dataset size: 9732


Processed prompts:  12% 32/256 [01:11<01:31,  2.44it/s, est. speed input: 15.14 toks/s, output: 269.47 toks/s]



Processed prompts: 100% 256/256 [03:56<00:00,  1.08it/s, est. speed input: 36.78 toks/s, output: 895.79 toks/s] 


Dataset size: 9988


Processed prompts: 100% 256/256 [03:55<00:00,  1.09it/s, est. speed input: 36.99 toks/s, output: 898.59 toks/s] 

Dataset size: 10244





Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/290 [00:00<?, ?B/s]