In [1]:
from datasets import Dataset
from jinja2 import Template
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import json

In [2]:
# Run configuration for the persona-generation job.
NUM_PERSONAS = 2_048  # how many prompts are sent to generate_personas()
NUM_GPUS = 4  # forwarded to vLLM as tensor_parallel_size
BATCH_SIZE = 32  # forwarded to vLLM as max_num_seqs

In [3]:
# Read the jinja2 source of the system-prompt template.
# The encoding is pinned so the prompt text decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open("new_persona.jinja2", encoding="utf-8") as f:
    template_str = f.read()

In [4]:
# Compile the template source held in template_str into a jinja2 Template.
template = Template(template_str)

In [5]:
# Load the few-shot example personas that get embedded in the system prompt.
# The encoding is pinned to UTF-8 so non-ASCII characters in the examples are
# decoded correctly regardless of the platform's default encoding.
with open("personas_shots.json", encoding="utf-8") as f:
    personas = json.load(f)

In [6]:
# Pretty-print each few-shot persona as a 2-space-indented JSON string so it
# can be dropped verbatim into the prompt template.
personas_str = list(map(lambda shot: json.dumps(shot, indent=2), personas))

In [7]:
# Render the system prompt, exposing the serialized few-shot personas to the
# template as its `personas` variable.
system_prompt = template.render(personas=personas_str)

In [8]:
# Show the rendered prompt for a visual sanity check before spending GPU time on it.
print(system_prompt)

# Instructions

Construct detailed imaginary personas for characters in imaginary dialogue. We want these persona descriptions to contain enough details so that anyone can identify one persona apart from another. 

These personas should portray a complex character. Real people have good behaviors as well as aspects which others may not always appreciate. Their personal and professional social networks don't fully represent their identity. Create a character that has depth of complexity beneath the surface

# Output format

Output in JSON format as follows

{
  "identity": "string",
  "personal_life": "string",
  "communication": "string",
  "background": "string",
  "daily_life": "string",
  "coping": "string",
  "interests": "string",
  "relationships": "string",
  "values": "string",
  "aspirations": "string",
  "dialogue": "string"
}

# Examples

{
  "identity": "Victor Patel is a 47-year-old cryptocurrency trading savant and angel investor who made his first million by age 25 throu

In [9]:
# Hugging Face model identifier; used below for both the tokenizer and the vLLM engine.
model_id = "meta-llama/Llama-3.1-405B-Instruct-FP8"

In [10]:
# Tokenizer for model_id; generate_personas() uses its chat template to build the prompt text.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [11]:
# Build the vLLM engine, sharded across NUM_GPUS devices with tensor parallelism.
llm = LLM(
    model=model_id,
    max_model_len=6144,
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.98,
    enable_chunked_prefill=True,
    max_num_batched_tokens=4096,
    max_num_seqs=BATCH_SIZE,
)

INFO 12-24 02:03:38 config.py:478] This model supports multiple tasks: {'reward', 'score', 'generate', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 12-24 02:03:38 config.py:1216] Defaulting to use mp for distributed inference
INFO 12-24 02:03:38 config.py:1364] Chunked prefill is enabled with max_num_batched_tokens=4096.
INFO 12-24 02:03:38 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='meta-llama/Llama-3.1-405B-Instruct-FP8', speculative_config=None, tokenizer='meta-llama/Llama-3.1-405B-Instruct-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=6144, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fbgemm_fp8, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided

Loading safetensors checkpoint shards:   0% Completed | 0/109 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=41457)[0;0m [1;36m(VllmWorkerProcess pid=41459)[0;0m INFO 12-24 02:06:00 model_runner.py:1097] Loading model weights took 113.4847 GB
INFO 12-24 02:06:00 model_runner.py:1097] Loading model weights took 113.4847 GB
INFO 12-24 02:06:00 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=41458)[0;0m INFO 12-24 02:06:00 model_runner.py:1097] Loading model weights took 113.4847 GB
[1;36m(VllmWorkerProcess pid=41457)[0;0m [1;36m(VllmWorkerProcess pid=41459)[0;0m INFO 12-24 02:06:04 worker.py:241] Memory profiling takes 4.30 seconds
INFO 12-24 02:06:04 worker.py:241] Memory profiling takes 4.30 seconds
[1;36m(VllmWorkerProcess pid=41457)[0;0m [1;36m(VllmWorkerProcess pid=41459)[0;0m INFO 12-24 02:06:04 worker.py:241] the current vLLM instance can use total_gpu_memory (139.72GiB) x gpu_memory_utilization (0.98) = 136.92GiB
INFO 12-24 02:06:04 worker.py:241] the current vLLM instance can use total_gpu_memory (139.

In [18]:
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Handles both ```` ```json ```` and bare ```` ``` ```` openers. The closing
    fence is removed only when it is actually present, so a response that
    opened a fence but never closed it does not lose its last characters.
    """
    text = text.strip()
    if not text.startswith("```"):
        return text
    text = text[3:]
    if text.startswith("json"):
        text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def generate_personas(n):
    """Generate ``n`` persona candidates and return the ones that parse as JSON objects.

    The same chat prompt (the module-level ``system_prompt`` plus a fixed user
    instruction) is submitted ``n`` times in a single ``llm.generate`` call,
    relying on sampling to produce distinct personas.

    Args:
        n: Number of prompts to submit.

    Returns:
        A list of dicts, one per completion that decoded to a JSON object.
        Completions that fail to decode, or decode to something other than an
        object, are printed and skipped, so the list may be shorter than ``n``.
    """
    messages = [{
        "role": "system",
        "content": system_prompt
    }, {
        "role": "user",
        "content": "Construct a new persona and just output the JSON"
    }]

    chat_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    prompts = [chat_input] * n

    outputs = llm.generate(prompts, SamplingParams(temperature=0.9, top_p=0.9, max_tokens=2560))
    print(len(outputs))

    new_personas = []
    for output in outputs:
        text = _strip_code_fence(output.outputs[0].text)
        try:
            persona = json.loads(text)
        except json.JSONDecodeError:
            # Catch only decode failures; a bare `except` would also swallow
            # KeyboardInterrupt and make a long run impossible to stop cleanly.
            print("JSON decode error")
            print(text)
            continue
        if not isinstance(persona, dict):
            # Valid JSON that is not an object (list, string, number) is not a persona.
            print("JSON output is not an object")
            print(text)
            continue
        new_personas.append(persona)

    print(f"n={n}, len(new_personas)={len(new_personas)}")

    return new_personas

In [22]:
# Submit all NUM_PERSONAS prompts in one batched call; only completions that
# parse as JSON are kept. The recorded run below took roughly 1.5 hours.
all_personas = generate_personas(NUM_PERSONAS)

Processed prompts: 100% 2048/2048 [1:29:14<00:00,  2.61s/it, est. speed input: 1390.26 toks/s, output: 384.08 toks/s]

2048
JSON decode error
{
  "identity": "Ava Moreno is a 29-year-old Afro-Latina indie game developer in Seattle, known for her biting wit and genre-bending game designs. Born to a Cuban mother and African-American father, Ava's experiences as a queer woman of color in tech inform her unapologetic style and commitment to inclusive storytelling. While her peers see her as a rising star in the gaming world, Ava struggles with the pressure to constantly represent her communities and the fear of being tokenized by the industry.",
  "personal_life": "Ava lives in a vibrant, if cluttered, Capitol Hill apartment filled with vintage gaming consoles, thrift store art, and stacks of Octavia Butler novels. Her tight-knit group of friends and collaborators - mostly fellow women and non-binary developers - know her as a fiercely loyal and generous confidante, always willing to lend her expertise or just a listening ear. Ava's dating life is a whirlwind of casual flings and deliberate distance, as sh




In [23]:
# How many personas survived JSON parsing (can be fewer than NUM_PERSONAS).
len(all_personas)

2002

In [24]:
# Wrap the parsed personas in a one-column mapping and build a Dataset from it.
personas_dict = dict(persona=all_personas)
ds = Dataset.from_dict(personas_dict)

In [None]:
def _attach_row_id(_, idx):
    """Return the extra column for one row: its positional index as ``id``."""
    return {"id": idx}


# Add an "id" column holding each row's index in the dataset.
ds = ds.map(_attach_row_id, with_indices=True)

In [25]:
# Upload the dataset to the Hugging Face Hub repo 'amang1802/personas_sample_405B'.
# NOTE(review): presumably relies on Hub credentials configured outside this notebook — confirm.
ds.push_to_hub('amang1802/personas_sample_405B')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amang1802/personas_sample_405B/commit/0a9d656fe8c45eb08f9266c402fbb2f965b1cba8', commit_message='Upload dataset', commit_description='', oid='0a9d656fe8c45eb08f9266c402fbb2f965b1cba8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/amang1802/personas_sample_405B', endpoint='https://huggingface.co', repo_type='dataset', repo_id='amang1802/personas_sample_405B'), pr_revision=None, pr_num=None)