In [1]:
from datasets import load_dataset, Dataset
from jinja2 import Template
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import json

In [2]:
# Number of GPUs the model is sharded across; passed to vLLM as tensor_parallel_size.
NUM_GPUS = 4
# Number of dataset rows handed to the generation function per Dataset.map batch.
BATCH_SIZE = 1024

In [3]:
# Read the raw Jinja2 source of the prompt template.
# The encoding is pinned so decoding does not depend on the platform's locale
# default (assumes the template file is stored as UTF-8).
with open("topic_conditioned.jinja2", encoding="utf-8") as f:
    template_str = f.read()

In [4]:
# Load the example articles that get rendered into the prompt template.
# JSON text is UTF-8 by specification and the examples contain non-ASCII
# characters, so the encoding is pinned instead of relying on the locale default.
with open("topic_content_shots.json", encoding="utf-8") as f:
    content_json = json.load(f)

In [5]:
# Build a jinja2 Template object from the template source read above.
template = Template(template_str)

In [6]:
# Render the prompt text, exposing the loaded JSON to the template as `contents`.
system_prompt = template.render(contents=content_json)

In [7]:
# Print the rendered prompt so it can be inspected before any generation is run.
print(system_prompt)

Here is long high quality learning article, like a Wikipedia article, that provides information collected through experience and research. The article is formatted for easy readibility.

Title: **Gordon Ramsay's early career**
Gordon James Ramsay was born in Johnstone, Scotland, on 8 November 1966, the son of Helen (née Cosgrove), a nurse, and Gordon James Sr., who worked as a swimming pool manager, welder, and shopkeeper. He has an older sister, a younger brother, and a younger sister. When he was nine years old, he moved with his family to England and grew up in the Bishopton area of Stratford-upon-Avon. He has described his early life as 'hopelessly itinerant' and said his family moved constantly owing to the aspirations and failures of his father, who was an occasionally violent alcoholic; Ramsay described him as a 'hard-drinking womaniser'. In his autobiography, he revealed that his father abused and neglected the children and his mother. He worked as a pot washer in a local India

In [8]:
# Load the source dataset and keep only its 'train' split.
# Its `title` column is what the generation step below consumes.
ds = load_dataset('amang1802/synthetic_data_qna_fulltext_conditioned_L3.3_70B_deduped')['train']

In [9]:
# Model used for generation. The commented-out line is an alternative local
# checkpoint path (presumably a fine-tuned model, judging by the directory
# name — confirm before enabling); uncomment it to generate with that instead.
#model_id = "/root/synthetic-data-recipes/cpt/ft_models/llama3_1_8B/qna_fulltext_conditioned_20epochs_lr1e-5/epoch_19"
model_id = "meta-llama/Llama-3.1-8B"

In [10]:
# Create the vLLM engine used for all generation in this notebook.
llm = LLM(
    model=model_id,
    max_model_len=6144,             # context limit given to the engine (prompt + generated tokens)
    tensor_parallel_size=NUM_GPUS,  # shard the model across NUM_GPUS devices
    gpu_memory_utilization=0.98,    # fraction of each GPU's memory vLLM may use
)

INFO 01-06 19:08:19 config.py:478] This model supports multiple tasks: {'embed', 'classify', 'score', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 01-06 19:08:19 config.py:1216] Defaulting to use mp for distributed inference
INFO 01-06 19:08:19 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='meta-llama/Llama-3.1-8B', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=6144, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, 

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=552739)[0;0m INFO 01-06 19:08:26 model_runner.py:1097] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=552738)[0;0m INFO 01-06 19:08:27 model_runner.py:1097] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=552740)[0;0m INFO 01-06 19:08:27 model_runner.py:1097] Loading model weights took 3.7710 GB
INFO 01-06 19:08:27 model_runner.py:1097] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=552739)[0;0m INFO 01-06 19:08:30 worker.py:241] Memory profiling takes 2.66 seconds
[1;36m(VllmWorkerProcess pid=552739)[0;0m INFO 01-06 19:08:30 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
[1;36m(VllmWorkerProcess pid=552739)[0;0m INFO 01-06 19:08:30 worker.py:241] model weights take 3.77GiB; non_torch_memory takes 4.16GiB; PyTorch activation peak memory takes 0.31GiB; the rest of the memory reserved for KV Cache is 128.69GiB.
[1;36m

In [11]:
def generate_content_base(titles):
    """Generate one article per title with the module-level vLLM engine.

    Each title is appended to the rendered ``system_prompt`` as a new
    ``Title: **...**`` entry and the model continues from there. Generation
    stops when the model emits ``Title:`` or after 2048 new tokens.

    Args:
        titles: Batch of title strings (the ``title`` column of a
            ``Dataset.map`` batch).

    Returns:
        A dict with a single key ``"cpt_gen_content"`` mapping to a list with
        one string per title: the bolded title, a newline, then the
        whitespace-stripped generated text.
    """
    prompts = []
    for title in titles:
        prompts.append(f"{system_prompt}\n\nTitle: **{title}**\n")

    sampling = SamplingParams(temperature=0.25, top_p=0.9, max_tokens=2048, stop=["Title:"])
    completions = llm.generate(prompts, sampling)

    articles = []
    for title, completion in zip(titles, completions):
        # Only the first candidate of each request is used.
        body = completion.outputs[0].text.strip()
        articles.append(f"**{title}**\n{body}")

    return {"cpt_gen_content": articles}

In [None]:
# Run generation over the whole dataset, BATCH_SIZE rows at a time.
syn_ds = ds.map(
    generate_content_base,
    batched=True,
    batch_size=BATCH_SIZE,
    input_columns=["title"],  # the function receives only the list of titles
)



Map:   0%|          | 0/5119 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/1024 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   0% 1/1024 [00:08<2:21:56,  8.33s/it, est. speed input: 344.61 toks/s, output: 3.36 toks/s]
[Acessed prompts:   0% 2/1024 [00:08<1:03:25,  3.72s/it, est. speed input: 649.94 toks/s, output: 8.38 toks/s]
[Acessed prompts:   0% 3/1024 [00:09<36:06,  2.12s/it, est. speed input: 951.46 toks/s, output: 14.04 toks/s] 
[Acessed prompts:   0% 4/1024 [00:09<22:35,  1.33s/it, est. speed input: 1253.33 toks/s, output: 19.99 toks/s]
[Acessed prompts:   1% 6/1024 [00:09<13:13,  1.28it/s, est. speed input: 1766.56 toks/s, output: 32.54 toks/s]
[Acessed prompts:   1% 7/1024 [00:09<10:07,  1.68it/s, est. speed input: 2036.87 toks/s, output: 40.18 toks/s]
[Acessed prompts:   1% 9/1024 [00:10<07:52,  2.15it/s, est. speed input: 2470.40 toks/s, output: 55.12 toks/s]
[Acessed prompts:   1% 10/1024 [00:10<06:26,  2.62it/s, est. speed input: 2715.35 toks/s, output: 64.18 toks/s]
[

In [None]:
# Upload the mapped dataset (including the generated `cpt_gen_content` column)
# to the Hugging Face Hub under this repository id.
syn_ds.push_to_hub('amang1802/cpt_gen_content_topic_conditioned_L3.1_8B_qna')