In [1]:
from datasets import load_dataset, Dataset
from jinja2 import Template
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import json

In [2]:
# Number of Wikipedia titles to sample; also reused as the map batch size below,
# so all prompts are handed to the generation function in a single batch.
NUM_TOPICS = 1024
# Passed to vLLM as tensor_parallel_size.
NUM_GPUS = 4
# Passed to vLLM as max_num_seqs.
BATCH_SIZE = 32

In [3]:
# Open the English Wikipedia snapshot (config 20231101.en, train split) in
# streaming mode; rows are pulled when the stream is iterated.
ds = load_dataset(
    "wikimedia/wikipedia",
    name="20231101.en",
    split="train",
    streaming=True,
)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [4]:
# Sanity check: print the titles of ten shuffled articles.
# NOTE(review): this preview shuffles with a 10_000-row buffer, while the
# generation pipeline further down uses a 1_000_000-row buffer with the same
# seed, so these titles are not necessarily the ones that end up in the sample.
for row in ds.shuffle(seed=1998, buffer_size=10_000).take(10):
    print(row['title'])

Komorica
Glassport Odds
Ciudad Nueva (Hato Rey)
Kamiokite
Roobaka
Wayne Ormond
The Pagans (film)
Alfred A. Gilman
1922 Austin twin tornadoes
Gornji Emovci


In [5]:
# Read the Jinja2 source of the system prompt.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale default (the prompt material contains non-ASCII text).
with open("topic_conditioned.jinja2", encoding="utf-8") as f:
    template_str = f.read()

In [6]:
# Load the JSON data that is passed to the template as `contents`.
# The encoding is pinned to UTF-8 so non-ASCII characters (e.g. "née" in the
# rendered examples) decode identically regardless of the locale default.
with open("topic_content_shots.json", encoding="utf-8") as f:
    content_json = json.load(f)

In [7]:
# Build a Jinja2 Template object from the template source read above.
template = Template(template_str)

In [8]:
# Render the system prompt, exposing the loaded JSON to the template as `contents`.
system_prompt = template.render(contents=content_json)

In [9]:
# Show the rendered system prompt so it can be inspected before generation.
print(system_prompt)

# Instructions

Imagine you're an expert on the topic given by the user. Your goal is to write an article explain the topic in detail.

# Output Instructions

Respond with the content in plain text, with no structure.

# Examples

Topic:
Gordon Ramsay's early career

Content:
Gordon James Ramsay was born in Johnstone, Scotland, on 8 November 1966, the son of Helen (née Cosgrove), a nurse, and Gordon James Sr., who worked as a swimming pool manager, welder, and shopkeeper. He has an older sister, a younger brother, and a younger sister. When he was nine years old, he moved with his family to England and grew up in the Bishopton area of Stratford-upon-Avon. He has described his early life as 'hopelessly itinerant' and said his family moved constantly owing to the aspirations and failures of his father, who was an occasionally violent alcoholic; Ramsay described him as a 'hard-drinking womaniser'. In his autobiography, he revealed that his father abused and neglected the children and his 

In [10]:
# Hugging Face model id; used for both the tokenizer and the vLLM engine below.
model_id = "meta-llama/Llama-3.1-405B-Instruct-FP8"

In [11]:
# Tokenizer for `model_id`; generate_content uses it to apply the chat template.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [12]:
# Create the vLLM engine used for all generation in this notebook.
# Context length and the per-step token budget are both capped at 4096;
# parallelism and concurrency come from the config constants.
llm = LLM(
    model=model_id,
    max_model_len=4096,
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.98,
    enable_chunked_prefill=True,
    max_num_batched_tokens=4096,
    max_num_seqs=BATCH_SIZE,
)

INFO 12-24 05:58:37 config.py:478] This model supports multiple tasks: {'score', 'reward', 'embed', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 12-24 05:58:38 config.py:1216] Defaulting to use mp for distributed inference
INFO 12-24 05:58:38 config.py:1364] Chunked prefill is enabled with max_num_batched_tokens=4096.
INFO 12-24 05:58:38 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='meta-llama/Llama-3.1-405B-Instruct-FP8', speculative_config=None, tokenizer='meta-llama/Llama-3.1-405B-Instruct-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fbgemm_fp8, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided

Loading safetensors checkpoint shards:   0% Completed | 0/109 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=92047)[0;0m INFO 12-24 06:00:58 model_runner.py:1097] Loading model weights took 113.4847 GB
INFO 12-24 06:00:58 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=92048)[0;0m INFO 12-24 06:00:58 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=92049)[0;0m INFO 12-24 06:00:59 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=92048)[0;0m INFO 12-24 06:01:03 worker.py:241] Memory profiling takes 4.55 seconds
[1;36m(VllmWorkerProcess pid=92048)[0;0m INFO 12-24 06:01:03 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
[1;36m(VllmWorkerProcess pid=92048)[0;0m INFO 12-24 06:01:03 worker.py:241] model weights take 113.48GiB; non_torch_memory takes 4.15GiB; PyTorch activation peak memory takes 0.76GiB; the rest of the memory reserved for KV Cache is 18.53GiB.
[1;

In [13]:
def generate_content(topics):
    """Generate one synthetic article per topic using the shared vLLM engine.

    Each topic is wrapped in a two-message chat (the rendered system prompt
    plus a "Topic: ... Content:" user turn), formatted with the tokenizer's
    chat template, and sent to ``llm.generate`` in a single call.

    Args:
        topics: Batch of topic strings (the ``title`` column when this function
            is used through ``map(..., batched=True, input_columns=["title"])``).

    Returns:
        A dict with one key, ``"synthetic_content"``, holding the first
        completion for each prompt with surrounding whitespace stripped.
        Outputs are taken in the order ``llm.generate`` returns them.
    """
    messages = [[{"role": "system", "content": system_prompt},
                {"role": "user", "content": "Topic:\n" + topic + "\n\nContent:"}] for topic in topics]

    # The chat template already writes the special tokens it needs into the
    # rendered text. Passing that text as a plain string lets the engine
    # tokenize it again with special tokens added, which duplicates BOS for
    # templates that emit it. Tokenize here with add_special_tokens=False and
    # hand the engine token ids instead.
    prompts = []
    for chat in messages:
        rendered = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        token_ids = tokenizer.encode(rendered, add_special_tokens=False)
        prompts.append({"prompt_token_ids": token_ids})

    outputs = llm.generate(prompts, SamplingParams(temperature=0.25, top_p=0.9, max_tokens=3072))

    return {"synthetic_content": [output.outputs[0].text.strip() for output in outputs]}

In [14]:
# Define the sampling + generation pipeline: shuffle the stream, keep
# NUM_TOPICS rows, then map the titles through generate_content as one batch.
# The generation progress bar only appears once the stream is consumed in the
# next cell.
syn_ds_stream = (
    ds.shuffle(seed=1998, buffer_size=1_000_000)
    .take(NUM_TOPICS)
    .map(
        generate_content,
        batched=True,
        batch_size=NUM_TOPICS,
        input_columns=["title"],
    )
)

In [15]:
# Consume the stream into a list of rows; this is the step that actually runs
# the shuffle/take/map pipeline, including generation.
syn_ds_list = list(syn_ds_stream)

Processed prompts: 100% 1024/1024 [16:59<00:00,  1.00it/s, est. speed input: 777.05 toks/s, output: 468.71 toks/s]


In [16]:
# Turn the collected rows into a Dataset and upload it to the Hugging Face Hub
# under the given repo id.
# NOTE(review): presumably requires an authenticated Hub session with write
# access to this repo — confirm before re-running.
syn_ds = Dataset.from_list(syn_ds_list)
syn_ds.push_to_hub('amang1802/wiki_topic_conditioned_405B')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/wiki_topic_conditioned_405B/commit/b17e47bcaf7bbe4f3a329b6b3322599c72172036', commit_message='Upload dataset', commit_description='', oid='b17e47bcaf7bbe4f3a329b6b3322599c72172036', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/wiki_topic_conditioned_405B', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/wiki_topic_conditioned_405B'), pr_revision=None, pr_num=None)