In [3]:
from datasets import load_dataset, Dataset
from jinja2 import Template
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import json

In [4]:
# Run configuration.
# Number of articles taken from the shuffled Wikipedia stream; the generation
# cell also passes it as the map batch size.
NUM_TOPICS = 10 * 1024
# Handed to vLLM as tensor_parallel_size.
NUM_GPUS = 4
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
BATCH_SIZE = 256

In [19]:
def get_prefix(text, num_words=50):
    """Return the leading space-separated words of ``text`` as a prefix.

    Splitting is done on the single space character only, so newlines and
    tabs stay inside their neighbouring "word" and the result is an exact
    leading substring of ``text``.

    Args:
        text: Source string to take the prefix from.
        num_words: Maximum number of space-separated pieces to keep.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        A dict of the form ``{"prefix": <str>}``.

    Raises:
        ValueError: If ``num_words`` is negative.
    """
    if num_words < 0:
        raise ValueError(f"num_words must be non-negative, got {num_words}")
    # Bounding the split avoids breaking up a whole article just to keep its
    # first few words; the first ``num_words`` pieces are the same either way.
    leading_words = text.split(" ", num_words)[:num_words]
    return {"prefix": " ".join(leading_words)}

In [5]:
# Open the 20231101.en configuration of wikimedia/wikipedia (train split) in
# streaming mode.
ds = load_dataset(
    "wikimedia/wikipedia",
    name="20231101.en",
    split="train",
    streaming=True,
)

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [24]:
# Sample NUM_TOPICS articles from the shuffled stream and attach a "prefix"
# column computed by get_prefix from each article's "text".
ds_w_prefix = (
    ds.shuffle(seed=1998, buffer_size=1_000_000)
    .take(NUM_TOPICS)
    .map(get_prefix, input_columns=["text"])
)
# Materialise the streamed sample into a Dataset. This rebinds `ds`, so the
# cell expects `ds` to still be the streaming dataset when it runs.
ds = Dataset.from_list(list(ds_w_prefix))

In [13]:
# Read the Jinja2 source of the system prompt. The encoding is stated
# explicitly because open() otherwise uses the platform's locale encoding,
# which makes decoding of non-ASCII characters machine-dependent.
# Assumes the template file is stored as UTF-8.
with open("prefix_conditioned.jinja2", encoding="utf-8") as f:
    template_str = f.read()

In [20]:
# Load the few-shot examples. The file is opened as UTF-8 explicitly: the
# example content contains non-ASCII characters, and open() would otherwise
# decode with the platform's locale encoding.
with open("topic_content_shots.json", encoding="utf-8") as f:
    content_json = json.load(f)

# Give every example a "prefix" field derived from its "content", using the
# same get_prefix helper that is applied to the Wikipedia articles.
for obj in content_json:
    obj['prefix'] = get_prefix(obj['content'])['prefix']

In [21]:
# Build the Jinja2 template object from the source text held in template_str.
template = Template(template_str)

In [22]:
# Render the system prompt; the few-shot examples are exposed to the template
# under the name `contents`.
system_prompt = template.render(contents=content_json)

In [23]:
# Print the rendered system prompt for inspection.
print(system_prompt)

# Instructions

Imagine you're an expert on the topic given by the user. Your goal is to write an article explain the topic in detail.

# Output Instructions

Respond with the content in plain text, with no structure.

# Examples

Topic:
Gordon Ramsay's early career

Prefix:
Gordon James Ramsay was born in Johnstone, Scotland, on 8 November 1966, the son of Helen (née Cosgrove), a nurse, and Gordon James Sr., who worked as a swimming pool manager, welder, and shopkeeper. He has an older sister, a younger brother, and a younger sister. When he was nine

Content:
Gordon James Ramsay was born in Johnstone, Scotland, on 8 November 1966, the son of Helen (née Cosgrove), a nurse, and Gordon James Sr., who worked as a swimming pool manager, welder, and shopkeeper. He has an older sister, a younger brother, and a younger sister. When he was nine years old, he moved with his family to England and grew up in the Bishopton area of Stratford-upon-Avon. He has described his early life as 'hopelessl

In [27]:
# Identifier of the model handed to the vLLM LLM constructor below.
model_id = "meta-llama/Llama-3.3-70B-Instruct"

In [28]:
# Start the vLLM engine with the model split across NUM_GPUS tensor-parallel
# workers.
llm = LLM(
    model=model_id,
    max_model_len=4096,
    tensor_parallel_size=NUM_GPUS,
    # Fraction of each GPU's total memory the vLLM instance may use.
    gpu_memory_utilization=0.98,
)

INFO 12-27 05:07:33 config.py:478] This model supports multiple tasks: {'embed', 'score', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 12-27 05:07:33 config.py:1216] Defaulting to use mp for distributed inference
INFO 12-27 05:07:33 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='meta-llama/Llama-3.3-70B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.3-70B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_

Loading safetensors checkpoint shards:   0% Completed | 0/30 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=66143)[0;0m INFO 12-27 05:07:49 model_runner.py:1097] Loading model weights took 32.8892 GB
INFO 12-27 05:07:50 model_runner.py:1097] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=66141)[0;0m INFO 12-27 05:07:50 model_runner.py:1097] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=66142)[0;0m INFO 12-27 05:07:50 model_runner.py:1097] Loading model weights took 32.8892 GB
[1;36m(VllmWorkerProcess pid=66143)[0;0m INFO 12-27 05:07:53 worker.py:241] Memory profiling takes 2.91 seconds
[1;36m(VllmWorkerProcess pid=66143)[0;0m INFO 12-27 05:07:53 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
[1;36m(VllmWorkerProcess pid=66143)[0;0m INFO 12-27 05:07:53 worker.py:241] model weights take 32.89GiB; non_torch_memory takes 3.70GiB; PyTorch activation peak memory takes 0.40GiB; the rest of the memory reserved for KV Cache is 99.93GiB.
[1;36m(V

In [33]:
def generate_content(topics, texts, temperature=0.25, top_p=0.9, max_tokens=3072):
    """Generate one synthetic article per (topic, text) pair with the vLLM engine.

    Intended for use with a batched ``map``: each positional argument is a
    list holding one column of the batch. Relies on the module-level
    ``system_prompt``, ``llm`` and ``get_prefix``.

    Args:
        topics: Article titles, one per example.
        texts: Article bodies; only the prefix returned by ``get_prefix`` is
            placed in the prompt.
        temperature: Sampling temperature forwarded to ``SamplingParams``.
        top_p: ``top_p`` value forwarded to ``SamplingParams``.
        max_tokens: ``max_tokens`` value forwarded to ``SamplingParams``.

    Returns:
        ``{"synthetic_content": [...]}`` holding one whitespace-stripped
        completion per request, in the order ``llm.chat`` returns them.
    """
    prefixes = [get_prefix(text)['prefix'] for text in texts]
    # One conversation per example: the shared system prompt followed by a
    # user turn laid out like the few-shot examples (Topic / Prefix / Content).
    messages = [
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Topic:\n{topic}\n\nPrefix:\n{prefix}\n\nContent:"},
        ]
        for topic, prefix in zip(topics, prefixes)
    ]
    # Nothing to generate for an empty batch; skip the engine call entirely.
    if not messages:
        return {"synthetic_content": []}

    sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens)
    outputs = llm.chat(messages, sampling_params)

    # Each request output carries a list of completions; keep the first one.
    return {"synthetic_content": [output.outputs[0].text.strip() for output in outputs]}

In [None]:
# Run generation over the sampled dataset, feeding generate_content the
# "title" and "text" columns. The batch size is NUM_TOPICS, the same number
# of rows that was sampled.
# NOTE(review): BATCH_SIZE from the config cell is not used here — confirm
# that one dataset-sized batch is intended.
syn_ds = ds.map(
    generate_content,
    batched=True,
    batch_size=NUM_TOPICS,
    input_columns=["title", "text"],
)

Map:   0%|          | 0/10240 [00:00<?, ? examples/s]


Processed prompts:   0% 0/10240 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [None]:
# Push the dataset with the generated "synthetic_content" column to the Hub
# under the repo id below.
syn_ds.push_to_hub('amang1802/synthetic_data_prefix_conditioned_L3.3_70B')