In [1]:
from jinja2 import Template
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

import json
import yaml

In [2]:
NUM_PERSONAS = 100
NUM_GPUS = 1

In [3]:
with open("new_persona.jinja2") as f:
    template_str = f.read()

In [4]:
template = Template(template_str)

In [5]:
with open("personas_shots.json") as f:
    personas = json.load(f)

In [6]:
personas_str = [yaml.dump(pjson) for pjson in personas]

In [7]:
system_prompt = template.render(personas=personas_str)

In [8]:
print(system_prompt)

# Instructions

Construct detailed imaginary personas for characters in imaginary dialogue. We want these persona descriptions to contain enough details so that anyone can identify one persona apart from another. 

These personas should portray a complex character. Real people have good behaviors as well as aspects which others may not always appreciate. Their personal and professional social networks don't fully represent their identity. Create a character that has depth of complexity beneath the surface

# Output format

Output in YAML format as follows

identity: string,
personalLife: string,
communication: string,
background: string,
dailyLife: string,
coping: string,
interests: string,
relationships: string,
values: string,
aspirations: string,
dialogue: string

# Examples

aspirations: Dreams of creating an AI-driven financial literacy platform accessible
  to everyone, though he struggles with whether to include his most profitable trading
  strategies. Wants to die with zero do

In [9]:
model_id = "meta-llama/Llama-3.1-8B-Instruct"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [11]:
llm = LLM(model=model_id, max_model_len=5120, tensor_parallel_size=NUM_GPUS)

INFO 12-23 16:05:04 config.py:478] This model supports multiple tasks: {'score', 'embed', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 12-23 16:05:04 llm_engine.py:249] Initializing an LLM engine (v0.6.5) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=5120, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_mode

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-23 16:05:08 model_runner.py:1097] Loading model weights took 14.9888 GB
INFO 12-23 16:05:10 worker.py:241] Memory profiling takes 1.37 seconds
INFO 12-23 16:05:10 worker.py:241] the current vLLM instance can use total_gpu_memory (23.69GiB) x gpu_memory_utilization (0.90) = 21.32GiB
INFO 12-23 16:05:10 worker.py:241] model weights take 14.99GiB; non_torch_memory takes 0.07GiB; PyTorch activation peak memory takes 1.21GiB; the rest of the memory reserved for KV Cache is 5.06GiB.
INFO 12-23 16:05:10 gpu_executor.py:76] # GPU blocks: 2589, # CPU blocks: 2048
INFO 12-23 16:05:10 gpu_executor.py:80] Maximum concurrency for 5120 tokens per request: 8.09x
INFO 12-23 16:05:11 model_runner.py:1413] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-23 16:05:11 model_runner.py:1417] If out-of-memory error occurs during cudagraph capture

In [12]:
def generate_personas(n):
    messages = [{
        "role": "system",
        "content": system_prompt
    }, {
        "role": "user",
        "content": "Construct a new persona and output valid YAML"
    }]

    chat_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    prompts = [chat_input] * n

    outputs = llm.generate(prompts, SamplingParams(temperature=0.25, top_p=0.9, max_tokens=2048))
    print(len(outputs))

    new_personas = []
    for output in outputs:
        text = output.outputs[0].text.strip()
        try:
            persona = yaml.safe_load(text)
            new_personas.append(persona)
        except:
            print("YAML decode error")
            continue

    print(f"n={n}, len(new_personas)={len(new_personas)}")

    return new_personas

In [13]:
generate_personas(8)

Processed prompts: 100%|███████| 8/8 [00:34<00:00,  4.34s/it, est. speed input: 893.71 toks/s, output: 200.40 toks/s]

8
YAML decode error
YAML decode error
YAML decode error
YAML decode error
YAML decode error
YAML decode error
YAML decode error
n=8, len(new_personas)=1





[{'identity': 'Astrid Jensen is a 42-year-old, queer, Afro-Danish artist and educator living in Copenhagen.',
  'personalLife': "Astrid's life is a vibrant tapestry of art, activism, and community engagement. She lives in a colorful, eclectic house in the Vesterbro neighborhood with her partner, a social worker, and their two children. Her home is a testament to her creativity, filled with vibrant textiles, eclectic trinkets, and an impressive collection of vinyl records.",
  'communication': "Astrid's communication style is warm, inclusive, and unapologetically authentic. She speaks with a thick Copenhagen accent, peppered with Danish slang and phrases that make her sound like a true local. Her language is rich in metaphors and storytelling, often drawing from her experiences as a queer woman of color. When discussing her art, she becomes animated, using her hands to convey the emotions and ideas behind each piece.",
  'background': "Born to a Danish mother and a Ghanaian father, Astr