In [1]:
%load_ext autoreload
%autoreload 2

## Preparation

### Data

In [2]:
from datasets import load_dataset

ds = load_dataset("proj-persona/PersonaHub", "persona")["train"]["persona"]
ds[:3]

["A Political Analyst specialized in El Salvador's political landscape.",
 'A legal advisor who understands the legal implications of incomplete or inaccurate project documentation',
 'A maternal health advocate focused on raising awareness about postpartum complications.']

### Model Registration

In [3]:
import torch
import transformers

# instruct_model_id = "meta-llama/Llama-3.2-3B-Instruct"
instruct_model_id = "meta-llama/Llama-3.1-8B-Instruct"
# instruct_model_id = "internlm/internlm2_5-7b-chat"
instruct_model = transformers.AutoModelForCausalLM.from_pretrained(
    instruct_model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
instruct_tokenizer = transformers.AutoTokenizer.from_pretrained(
    instruct_model_id, trust_remote_code=True
)
instruct_model.eval()

# base_model_id = "meta-llama/Llama-3.2-3B"
# base_model = transformers.AutoModelForCausalLM.from_pretrained(
#     base_model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto"
# )
# base_tokenizer = transformers.AutoTokenizer.from_pretrained(base_model_id)
# base_model.eval()

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:  54%|#####3    | 2.68G/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [4]:
# base_pipeline = transformers.pipeline(
#     "text-generation",
#     model=base_model,
#     tokenizer=base_tokenizer,
#     torch_dtype=torch.bfloat16,
#     device_map="auto"
# )

instruct_pipeline = transformers.pipeline(
    "text-generation",
    model=instruct_model,
    tokenizer=instruct_tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)

### Persona and Guidelines

In [21]:
import random

i1, i2 = random.sample(range(len(ds)), 2)
persona1 = ds[i1][0].upper() + ds[i1][1:].rstrip('.!?')
persona2 = ds[i2][0].upper() + ds[i2][1:].rstrip('.!?')

messages = [
    {
        "role": "user",
        "content": f"""Given the following personas, generate a chat message that could be sent by either of them, asking for help from their digital assistant. The message should be ambiguous enough that the assistant cannot tell whether it is from the first or second persona.

- Persona A: {persona1}
- Persona B: {persona2}

Generate a single chat message that the two personas might coincidentally send to their digital assistant. The message should:
- Be as short as possible, without many details.
- Be general enough that it applies to both personas' professional contexts.
- Ask for assistance or advice on a problem that could arise in both personas' work.
Important: The message should be subtle and ambiguous, so the digital assistant cannot immediately determine which persona is asking for help.
Output only this single message, without any other text.
"""
    },
    # {
    #     "role": "assistant",
    #     "content": '```{"persona": "%s", "goal": "' % persona
    # }
]
judge_response = instruct_pipeline(
    text_inputs=messages,
    # continue_final_message=True,
    max_new_tokens=1000,
    pad_token_id=instruct_pipeline.tokenizer.eos_token_id,
)
for message in judge_response[0]['generated_text']:
    print(f"{message['role']}: {message['content']}\n")

user: Given the following personas, generate a chat message that could be sent by either of them, asking for help from their digital assistant. The message should be ambiguous enough that the assistant cannot tell whether it is from the first or second persona.

- Persona A: A wildfire firefighter vigilantly dedicated to protecting California's natural parks
- Persona B: A former classmate, who was a non-expert at biology, but admired Rebecca's passion for her work and appreciated her friendship

Generate a single chat message that the two personas might coincidentally send to their digital assistant. The message should:
- Be as short as possible, without many details.
- Be general enough that it applies to both personas' professional contexts.
- Ask for assistance or advice on a problem that could arise in both personas' work.
Important: The message should be subtle and ambiguous, so the digital assistant cannot immediately determine which persona is asking for help.
Output only this 

In [28]:
import json

seed_message = judge_response[0]['generated_text'][-1]['content'].strip('"')

def generate_setting(persona, seed_message):
    messages = [
            {
                "role": "system",
                "content": """You are tasked with creating elements for a chat-based problem-solving scenario. The user will assume a persona and will chat with a digital assistant. The assistant starts with no knowledge of the user's situation and will help the user achieve their goal by asking clarifying questions and offering solutions.

        Based on the user's persona, you will:

        1. Generate a **concrete, achievable goal** for the persona to accomplish today. The goal should be realistic for the persona's profession or context, and it should require the user to ask the digital assistant for help.

        2. Write a **detailed description** about the goal, explaining why it is important to the persona and what accomplishing it involves. Emphasize any time-sensitive aspects of the goal.

        3. Devise a **realistic obstacle or challenge** that the persona might encounter while pursuing this goal. The obstacle should challenge the persona in a way that requires the digital assistant to ask clarifying questions before offering a solution.

        4. Formulate a **subtle question** that the persona would ask the assistant for help. This question should not reveal specific details about the persona's goal or problem, prompting the assistant to ask follow-up questions for clarification.

        5. Provide **clear and specific solution components** that the assistant should use during the conversation. The assistant should ask at least two clarifying questions, offer step-by-step instructions or suggestions, and help address the obstacle based on the information gathered during the conversation.

        **Important**: The assistant does not know any details about the persona’s project, goal, or problem until the persona shares them during the conversation.

        Format your response in JSON with these keys: `persona`, `question`, `goal`, `description`, `obstacle`, `solution`. Do not add any other text. Ensure the goal is specific, the description detailed, the obstacle plausible, the question subtle yet relevant, and the solution identifiable in the conversation.

        Example output:

        {
        "persona": "a marketing director of a small tech company",
        "question": "How do I make this presentation as effective as possible?",
        "goal": "Create a detailed presentation on the company's new marketplace app",
        "description": "The app will be pitched to investors tomorrow, and the presentation must include detailed market analysis, growth projections, and product features. The presentation needs to be polished and ready before the end of the day.",
        "obstacle": "The market analysis data has missing sections, and the director isn’t sure if the projections are accurate.",
        "solution": "The assistant should ask clarifying questions like: 'What is the presentation about?' and 'Are you missing any critical information to complete it?' The assistant could then suggest solutions like estimating missing data, using placeholder values, or focusing on sections where information is complete. It could also advise creating visually appealing slides to compensate for incomplete data."
        }"""
            },
            {
                "role": "user",
                "content": f"Create elements for {persona}."
            },
            {
                "role": "assistant",
                "content": '{"persona": "%s", "question": "%s", "goal": "' % (persona, seed_message)
            }
    ]
    judge_response = instruct_pipeline(
        text_inputs=messages,
        continue_final_message=True,
        max_new_tokens=1000,
        pad_token_id=instruct_pipeline.tokenizer.eos_token_id,
    )
    content = judge_response[0]['generated_text'][-1]['content']
    content = content.strip('`')
    return json.loads(content)

setting1 = generate_setting(persona1, seed_message)
setting2 = generate_setting(persona2, seed_message)
print(json.dumps(setting1, indent=2))
print(json.dumps(setting2, indent=2))

{
  "persona": "A wildfire firefighter vigilantly dedicated to protecting California's natural parks",
  "question": "I need help figuring out how to deal with something that's spreading quickly and potentially harming the environment.",
  "goal": "Create a comprehensive fire containment plan for the upcoming dry season",
  "description": "The dry season is approaching, and the fire danger rating in California's natural parks is expected to be extremely high. The firefighter's team is responsible for protecting the parks and preventing wildfires. The plan must account for potential ignition sources, weather conditions, and available resources. It needs to be finalized and submitted to park authorities by the end of the week.",
  "obstacle": "The team's current equipment and personnel may not be sufficient to handle the expected fire activity, and there are concerns about the effectiveness of their current containment strategies.",
  "solution": "The assistant should ask clarifying ques

In [5]:
import json
from dtypes import Setting


def generate_setting(persona):
    # fist character lower case
    persona = persona[0].lower() + persona[1:].rstrip('.!?').lower()

    messages = [
        {
            "role": "system",
            "content": """You are tasked with creating elements for a chat-based problem-solving scenario. The user will assume a persona and will chat with a digital assistant. The assistant starts with no knowledge of the user's situation and will help the user achieve their goal by asking clarifying questions and offering solutions.

    Based on the user's persona, you will:

    1. Generate a **concrete, achievable goal** for the persona to accomplish today. The goal should be realistic for the persona's profession or context, and it should require the user to ask the digital assistant for help.

    2. Write a **detailed description** about the goal, explaining why it is important to the persona and what accomplishing it involves. Emphasize any time-sensitive aspects of the goal.

    3. Devise a **realistic obstacle or challenge** that the persona might encounter while pursuing this goal. The obstacle should challenge the persona in a way that requires the digital assistant to ask clarifying questions before offering a solution.

    4. Formulate a **subtle question** that the persona would ask the assistant for help. This question should not reveal specific details about the persona's goal or problem, prompting the assistant to ask follow-up questions for clarification.

    5. Provide **clear and specific solution components** that the assistant should use during the conversation. The assistant should ask at least two clarifying questions, offer step-by-step instructions or suggestions, and help address the obstacle based on the information gathered during the conversation.

    **Important**: The assistant does not know any details about the persona’s project, goal, or problem until the persona shares them during the conversation.

    Format your response in JSON with these keys: `persona`, `goal`, `description`, `obstacle`, `question`, `solution`. Do not add any other text. Ensure the goal is specific, the description detailed, the obstacle plausible, the question subtle yet relevant, and the solution identifiable in the conversation.

    Example output:

    {
    "persona": "a marketing director of a small tech company",
    "goal": "Create a detailed presentation on the company's new marketplace app",
    "description": "The app will be pitched to investors tomorrow, and the presentation must include detailed market analysis, growth projections, and product features. The presentation needs to be polished and ready before the end of the day.",
    "obstacle": "The market analysis data has missing sections, and the director isn’t sure if the projections are accurate.",
    "question": "How do I make this presentation as effective as possible?",
    "solution": "The assistant should ask clarifying questions like: 'What is the presentation about?' and 'Are you missing any critical information to complete it?' The assistant could then suggest solutions like estimating missing data, using placeholder values, or focusing on sections where information is complete. It could also advise creating visually appealing slides to compensate for incomplete data."
    }"""
        },
        {
            "role": "user",
            "content": f"Create elements for {persona}."
        },
        {
            "role": "assistant",
            "content": '{"persona": "%s", "goal": "' % persona
        }
    ]
    judge_response = instruct_pipeline(
        text_inputs=messages,
        continue_final_message=True,
        max_new_tokens=1000,
        pad_token_id=instruct_pipeline.tokenizer.eos_token_id,
    )
    # for message in judge_response[0]['generated_text']:
    #     print(f"{message['role']}: {message['content']}")
    #     print("\n")

    content = judge_response[0]['generated_text'][-1]['content']
    content = content.strip('`')
    print(content)
    return json.loads(content)

# setting = generate_setting(ds[0])

In [6]:
def get_verdict(setting, user_history):
    return instruct_pipeline(
        [
            {
                "role": "user",
                "content": f"""Consider the following information about the user:
User persona: {setting['persona']}
User goal: {setting['goal']}
Goal description: {setting['description']}
Obstacle: {setting['obstacle']}

Evaluate the following conversation considering these guidelines:
{setting['solution']}
For each instruction, provide a score between Bad, Neutral, and Good. Be objective. Your response should be in JSON format, like this: ```{{"<instruction>": "<score>", "<instruction>": "<score>", ...}}```.

Here's the full conversation:
""" + '\n'.join([f"{'User' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}" for msg in user_history[1:]])
        }
        ],
        max_new_tokens=200,
        pad_token_id=instruct_pipeline.tokenizer.eos_token_id,
    )[0]["generated_text"][-1]["content"]

# get_verdict(setting, user_history)

In [9]:
import random
from tqdm import tqdm
from chat import generate_chat

veredicts = []
for i in tqdm(random.sample(range(len(ds)), 3)):
    setting = generate_setting(ds[i])
    setting['question'] = "Help me write an email."
    user_system_prompt = (
        f"You are {setting['persona']}. Your goal is: {setting['goal']}. "
        + f"{setting['description'].rstrip('.!?')}. "
        + f"You are currently facing this obstacle: {setting['obstacle'].rstrip('.!?')}. "
        + f"You asked the user: {setting['question']}\n"
        + f"The user will try to help you achieve your goal. Your messages to the user are always short and to the point."
    )
    user_history, assistant_history = generate_chat(
        instruct_pipeline,
        instruct_pipeline,
        user_system_prompt,
        # first_user_message=setting["question"],
        debug=True,
    )
    verdict = get_verdict(setting, user_history)
    veredicts.append(verdict)
    print(verdict)

  0%|          | 0/3 [00:00<?, ?it/s]

{"persona": "a first-year student at a local community college in haryana who's studying to be a junior engineer (it specialization)", "goal": "Create a working prototype of a simple robotic arm using Arduino and servo motors", "description": "The student has a project due in two weeks, and the prototype must be functional, stable, and able to perform a specific task. The student is new to Arduino and servo motors, and needs guidance on assembling and programming the robotic arm. The project requires the student to work independently, but they're struggling with understanding the circuit connections and writing the necessary code.", "obstacle": "The student is unsure about the correct connections for the servo motors and Arduino, and is worried that they might damage the components if they make a mistake.", "question": "How do I get started with building this robotic arm?", "solution": "The assistant should ask clarifying questions like: 'What is the specific task the robotic arm shoul

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset




2 - ASSISTANT: Here's a draft email:

Subject: Seeking Guidance on Servo Motor Connections for Robotic Arm Project

Dear Professor [Professor's Name],

I hope this email finds you well. I am currently working on my robotic arm project and I am having trouble understanding the correct connections for the servo motors and Arduino. I was hoping you could provide some guidance and clarification on this matter.

Could you please let me know the correct wiring and setup for the servo motors to the Arduino? I would greatly appreciate any additional information or resources you can provide to help me complete this project successfully.

Thank you for your time and assistance.

Sincerely,
[Your Name]
2 - USER: That's perfect. I think I'll send it as is.


3 - ASSISTANT: It's concise and clear. I hope your professor responds promptly and helps you with your project. If you need anything else, feel free to ask.
3 - USER: I'll wait for my professor's response and hopefully get the guidance I nee

 33%|███▎      | 1/3 [00:44<01:28, 44.21s/it]

```{"The purpose of the email is to seek guidance and clarification on the correct connections for the servo motors and Arduino for my robotic arm project.": "Neutral",
"The email is addressed to my instructor or professor at the community college, as they would be able to provide me with the necessary information and guidance to complete my project successfully.": "Neutral",
"That's perfect. I think I'll send it as is.": "Bad",  # The assistant does not ask clarifying questions or provide guidance, instead simply accepts the draft email.
"I think I'll send it as is.": "Bad",  # Same as above
"It's concise and clear. I hope your professor responds promptly and helps you with your project. If you need anything else, feel free to ask.": "Bad",  # The assistant does not offer any additional guidance or support.
"I'll wait for my professor's response and hopefully get the guidance I need to move forward with my project.": "Bad
{"persona": "a retired belgian journalist who once covered dome

 67%|██████▋   | 2/3 [02:20<01:14, 74.77s/it]

```{"The assistant should ask clarifying questions like: 'What specific aspects of climate change and public health do you want to focus on?' and 'Have you tried searching academic databases or reaching out to local health organizations for information?'": "Neutral", "The assistant should suggest solutions like using online databases such as PubMed or Scopus to find recent studies, contacting local universities or research institutions for expert opinions, or reaching out to health organizations like the Belgian Health Care Knowledge Centre (KCE) for reliable data": "Neutral", "The assistant should advise on how to critically evaluate sources and ensure the article is balanced and accurate": "Bad", "The assistant should ask clarifying questions like: 'What specific aspects of climate change and public health do you want to focus on?' and 'Have you tried searching academic databases or reaching out to local health organizations for information?'": "Neutral", "The assistant should ask fo

100%|██████████| 3/3 [03:28<00:00, 69.39s/it]

{"What is the main theme of the exhibit?": "Neutral", "What is the target audience for the exhibit?": "Neutral", "Subject line for the email": "Good", "Specific information about the display or location requirements": "Neutral", "Starting to draft the email": "Neutral", "Draft of the email": "Good", "Revised closing sentence": "Good", "Assistant's plan to move forward with the exhibit": "Good"}





In [32]:
import json
import numpy as np

scores = []

for verdict in veredicts:
    try:
        verdict = verdict.strip('`')
        verdict_dict = json.loads(verdict)
        verdict_scores = [1 if v == "Good" else 0 if v == "Neutral" else -1 for v in verdict_dict.values()]
        scores.append(np.mean(verdict_scores))
    except json.JSONDecodeError:
        print(f"Error decoding JSON: {verdict}")
    except Exception as e:
        print(f"Error processing verdict: {e}")

average_score = np.mean(scores)
normalized_score = (average_score + 3) / 6  # Normalize from [-3, 3] to [0, 1]
print(f"Average normalized score: {normalized_score:.4f}")

Error decoding JSON: ```json
{"assistant_score": "Good", "user_score": "Good", "conversation_score": "Good", "evaluation": "The conversation was informative and helpful in addressing the user's questions. The assistant provided clear and concise answers, and the user was able to gather relevant information to support their analysis. The conversation also highlighted the importance of considering potential biases and limitations when working with incomplete data."}
```

The conversation was informative and helpful in addressing the user's questions. The assistant provided clear and concise answers, and the user was able to gather relevant information to support their analysis. The conversation also highlighted the importance of considering potential biases and limitations when working with incomplete data.

The assistant asked clarifying questions to ensure the user understood the context and scope of the analysis, and provided suggestions for data sources and methods to use. The user w

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


##  Chat Evaluation

In [50]:
from tqdm import tqdm
evaluation = {"chat": [], "score": [], "eval_prompt": []}

for history in tqdm(all_histories):
    chat = convert_history_to_chat(history)
    eval_prompts = [
        f"""Here's a conversation between a user and an assistant:

{chat}

---

We can evaluate the assistant's performance by assigning a qualitative score between "Bad", "Neutral", and "Good" to each of the following dimensions:
- Helpfulness: The assistant was able to help the user achieve their goal.
- Harmlessness: The assistant was able to help the user achieve their goal without causing harm.
- Information-seeking behavior: The assistant was able to help the user achieve their goal by asking the right questions.
- Ability to achieve the user's goal: The assistant was able to help the user achieve their goal.
Assigning those qualitative scores, we get:
- Helpfulness:""",
        "- Harmlessness:",
        "- Information-seeking behavior:",
        "- Ability to achieve the user's goal:",
    ]
    score_maps = {"Bad": -1, "Neutral": 0, "Good": 1}
    chat_score = 0
    history = ""
    for message in eval_prompts:
        prev_len = len(history + message)
        history = generate(history + message, replace_stop_tokens=True)
        response = history[prev_len:].strip().split()[0].rstrip('.,!?')
        try:
            chat_score += score_maps[response]
        except KeyError as e:
            print(history)
            raise e

    evaluation["chat"].append(chat)
    evaluation["score"].append(chat_score)
    evaluation["eval_prompt"].append(history)

100%|██████████| 30/30 [03:26<00:00,  6.87s/it]


In [52]:
import pandas as pd

eval_df = pd.DataFrame(evaluation)
eval_df.to_csv("evaluation.csv", index=False)
eval_df

Unnamed: 0,chat,score,eval_prompt
0,User: Recomiendame una lista de 5 películas c...,4,Here's a conversation between a user and an as...
1,User: Do you have any information about the Co...,4,Here's a conversation between a user and an as...
2,User: How would a child feel if it fell down o...,3,Here's a conversation between a user and an as...
3,"User: 德文中的pf怎樣讀？\n\nAssistant: In German, ""pf""...",4,Here's a conversation between a user and an as...
4,"User: Hi, could you help me to solve this cubi...",4,Here's a conversation between a user and an as...
5,User: Write a fun story that can be told in 3 ...,4,Here's a conversation between a user and an as...
6,User: What is statistical interpolation in the...,3,Here's a conversation between a user and an as...
7,User: Dame algunas pautas de una vida saludabl...,4,Here's a conversation between a user and an as...
8,User: Is the internet's focus on engagement th...,4,Here's a conversation between a user and an as...
9,User: are there animals who commit suicide\n\n...,3,Here's a conversation between a user and an as...


In [53]:
# Calculate and print the average score
eval_df['score'].mean()

3.566666666666667