In [1]:
from datasets import load_dataset, Dataset
from jinja2 import Template
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import json

In [2]:
# Run-wide settings.
NUM_GPUS = 4  # passed to vLLM as tensor_parallel_size
BATCH_SIZE = 1024  # not referenced by any cell visible in this notebook
NUM_TOPICS = 10 * 1024  # used as batch_size for the generation map

In [3]:
# Read the raw Jinja2 prompt template. The encoding is pinned so the read does
# not depend on the platform's locale default.
with open("topic_conditioned.jinja2", encoding="utf-8") as f:
    template_str = f.read()

In [4]:
# Load the example contents that get rendered into the prompt template.
# JSON text is UTF-8; pin the encoding so non-ASCII characters in the examples
# decode the same way regardless of the platform's locale default.
with open("topic_content_shots.json", encoding="utf-8") as f:
    content_json = json.load(f)

In [5]:
# Compile the template source once; it is rendered in the following cell.
template = Template(source=template_str)

In [6]:
# Fill the template's `contents` variable with the loaded examples to produce
# the fixed prefix that every generation prompt starts with.
system_prompt = template.render({"contents": content_json})

In [7]:
# Show the rendered prompt; print() is used so its newlines display as line breaks.
print(system_prompt)

Here is long high quality learning article, like a Wikipedia article, that provides information collected through experience and research. The article is formatted for easy readibility.

Title: **Gordon Ramsay's early career**
Gordon James Ramsay was born in Johnstone, Scotland, on 8 November 1966, the son of Helen (née Cosgrove), a nurse, and Gordon James Sr., who worked as a swimming pool manager, welder, and shopkeeper. He has an older sister, a younger brother, and a younger sister. When he was nine years old, he moved with his family to England and grew up in the Bishopton area of Stratford-upon-Avon. He has described his early life as 'hopelessly itinerant' and said his family moved constantly owing to the aspirations and failures of his father, who was an occasionally violent alcoholic; Ramsay described him as a 'hard-drinking womaniser'. In his autobiography, he revealed that his father abused and neglected the children and his mother. He worked as a pot washer in a local India

In [8]:
# Source rows for generation; only the `title` column is fed to the model.
# Request the train split directly instead of building the full DatasetDict
# and indexing into it.
ds = load_dataset('amang1802/synthetic_data_fulltext_conditioned_L3.3_70B_deduped', split='train')

In [9]:
# Checkpoint directory passed to vLLM as `model` when the engine is created.
# NOTE(review): absolute local path — adjust when running on another machine.
model_id = "/root/synthetic-data-recipes/diversity/ft_models/llama3_1_8B/fulltext_conditioned_20epochs_dup10_lr1e-5/epoch_19"

In [10]:
# Start the vLLM engine for the checkpoint, sharded across NUM_GPUS devices.
llm = LLM(
    model=model_id,
    tensor_parallel_size=NUM_GPUS,
    # Context window cap; generation below asks for up to 2048 new tokens,
    # so prompt + completion must fit within this limit.
    max_model_len=6144,
    gpu_memory_utilization=0.98,
)

INFO 01-01 03:09:41 config.py:478] This model supports multiple tasks: {'score', 'classify', 'embed', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 01-01 03:09:41 config.py:1216] Defaulting to use mp for distributed inference
INFO 01-01 03:09:41 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='/root/synthetic-data-recipes/diversity/ft_models/llama3_1_8B/fulltext_conditioned_20epochs_dup10_lr1e-5/epoch_19', speculative_config=None, tokenizer='/root/synthetic-data-recipes/diversity/ft_models/llama3_1_8B/fulltext_conditioned_20epochs_dup10_lr1e-5/epoch_19', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=6144, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_confi

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=479650)[0;0m INFO 01-01 03:09:49 model_runner.py:1097] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=479651)[0;0m INFO 01-01 03:09:49 model_runner.py:1097] Loading model weights took 3.7710 GB
INFO 01-01 03:09:49 model_runner.py:1097] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=479649)[0;0m INFO 01-01 03:09:49 model_runner.py:1097] Loading model weights took 3.7710 GB
[1;36m(VllmWorkerProcess pid=479651)[0;0m INFO 01-01 03:09:52 worker.py:241] Memory profiling takes 2.97 seconds
[1;36m(VllmWorkerProcess pid=479651)[0;0m INFO 01-01 03:09:52 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
[1;36m(VllmWorkerProcess pid=479651)[0;0m INFO 01-01 03:09:52 worker.py:241] model weights take 3.77GiB; non_torch_memory takes 3.69GiB; PyTorch activation peak memory takes 0.31GiB; the rest of the memory reserved for KV Cache is 129.15GiB.
[1;36m

In [12]:
def generate_content_base(titles):
    """Generate one article per title with the loaded model.

    Each prompt is the rendered ``system_prompt`` followed by a
    ``Title: **<title>**`` line for the model to continue. Generation for a
    prompt ends at the next ``"Title:"`` marker or after 2048 new tokens.

    Args:
        titles: Batch of titles, as supplied by ``Dataset.map`` with
            ``batched=True`` and ``input_columns=["title"]``.

    Returns:
        A dict with the single key ``"cpt_gen_content"`` holding one string per
        input title, in order: the bolded title line followed by the stripped
        generated text.
    """
    prompts = []
    for title in titles:
        prompts.append(f"{system_prompt}\n\nTitle: **{title}**\n")

    sampling = SamplingParams(temperature=0.25, top_p=0.9, max_tokens=2048, stop=["Title:"])
    completions = llm.generate(prompts, sampling)

    articles = []
    for title, completion in zip(titles, completions):
        # Only the first candidate of each request is used.
        body = completion.outputs[0].text.strip()
        articles.append(f"**{title}**\n{body}")

    return {"cpt_gen_content": articles}

In [13]:
# Generate content for the first 16 rows only, adding a `cpt_gen_content` column.
# NOTE(review): batch_size is NUM_TOPICS while BATCH_SIZE goes unused — confirm
# that a single large batch per map call is intended.
syn_ds = (
    ds.select(range(16))
    .map(
        generate_content_base,
        input_columns=["title"],
        batched=True,
        batch_size=NUM_TOPICS,
    )
)



Map:   0%|          | 0/16 [00:00<?, ? examples/s]


[Acessed prompts:   0% 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
[Acessed prompts:   6% 1/16 [00:03<00:51,  3.45s/it, est. speed input: 829.78 toks/s, output: 141.87 toks/s]
[Acessed prompts:  12% 2/16 [00:03<00:21,  1.54s/it, est. speed input: 1570.34 toks/s, output: 277.70 toks/s]
[Acessed prompts:  25% 4/16 [00:04<00:08,  1.39it/s, est. speed input: 2783.46 toks/s, output: 523.35 toks/s]
[Acessed prompts:  31% 5/16 [00:04<00:07,  1.44it/s, est. speed input: 3013.60 toks/s, output: 604.90 toks/s]
[Acessed prompts:  50% 8/16 [00:04<00:02,  3.11it/s, est. speed input: 4702.39 toks/s, output: 1040.81 toks/s]
[Acessed prompts:  62% 10/16 [00:05<00:01,  3.90it/s, est. speed input: 5568.80 toks/s, output: 1287.81 toks/s]
[Acessed prompts:  75% 12/16 [00:05<00:00,  4.02it/s, est. speed input: 6125.44 toks/s, output: 1482.76 toks/s]
[Acessed prompts:  81% 13/16 [00:06<00:00,  3.28it/s, est. speed input: 6055.01 toks/s, output: 1515.74 toks/s]
[Acess

In [15]:
# Spot-check one mapped row (index 1).
syn_ds[1]

{'id': '55313219',
 'url': 'https://en.wikipedia.org/wiki/Elmer%20Otto%20Bergman',
 'title': 'Elmer Otto Bergman',
 'text': 'Elmer Otto Bergman (January 21, 1892 - January 1973) was an American civil, mechanical and consulting engineer at the University of Colorado and at C. F. Braun & Company, later KBR Inc. He served as the 83rd president of the American Society of Mechanical Engineers in the year 1964–65.\n\nBiography\n\nYouth, education, and early career \nBergman was born in 1892 in Kimball, Nebraska, son of Andrew Bergman and Hannah (Sjoblom) Bergman. He graduated from the Kimball High School in 1909. He started working as teacher in the elementary schools, and served in the United States Navy for two years.\n\nNext, Bergman started his studies at Creighton University in 1914, where he obtained his AB in 1920. He continued his studies at the University of Colorado, where he obtained his BSc in 1925 and his MSc in 1926.\n\nBergman continued his studies University Colorado in Civil

In [None]:
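# Upload step, left commented out: the run above covers only 16 rows.
# Uncomment to publish syn_ds to the Hugging Face Hub.
#syn_ds.push_to_hub('amang1802/cpt_gen_content_topic_conditioned_L3.1_8B')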