In [1]:
from datasets import load_dataset
import utils
from openai import OpenAI
import dotenv
import os
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import torch
import constants

dotenv.load_dotenv()

True

In this notebook, I tried to see if assigning relevant personas to an LLM helps it perform better on cultural benchmarks.
- The first experiment is baseline -- no persona.
- In one experiment, I downloaded 200k personas from PersonaHub, computed an embedding for each of them, and then took the persona with the highest cosine similarity for each question.
- In another one, I first prompted an LLM (GPT3.5) to generate a persona appropriate to answer the question, and then used that persona (personaGen).
- Also tried a "discussion among agents" approach, but that didn't work too well, so I didn't run it on the entire benchmark dataset.

I didn't see a significant improvement in performance on CulturalBench, so I didn't proceed in this direction.

In [2]:
# CulturalBench-Easy test split, kept both as a dataset (`ds`) and as a DataFrame (`df`)
ds = load_dataset("kellycyy/CulturalBench", "CulturalBench-Easy", split='test')
df = ds.to_pandas()

# Question embeddings are read from a local JSON file and held as a float32 tensor
with open('data/question_embeddings.json') as f:
  question_embeddings = torch.tensor(json.load(f), dtype=torch.float32)

In [3]:
# PersonaHub persona records, read as JSON lines into a DataFrame
dfp = pd.read_json("hf://datasets/proj-persona/PersonaHub/persona.jsonl", lines=True)

# Persona embeddings are read from a local JSON file and held as a float32 tensor.
# NOTE(review): a later cell indexes `dfp` with a position taken from these embeddings,
# so the embedding rows presumably follow the same order as `dfp` -- confirm.
with open('data/persona_embeddings.json') as f:
  persona_embeddings = torch.tensor(json.load(f), dtype=torch.float32)

In [4]:
# Score every (question, persona) pair with one matrix product instead of a Python loop,
# because the persona set is large.

# Scale each row to unit L2 norm so that the dot product below is the cosine similarity
question_embeddings = question_embeddings.div(question_embeddings.norm(dim=1, keepdim=True))
persona_embeddings = persona_embeddings.div(persona_embeddings.norm(dim=1, keepdim=True))

# Result rows follow the questions, columns follow the personas
cosine_similarities = question_embeddings @ persona_embeddings.T

# Default Prompting

In [36]:
# Instruction template sent with every question; the trailing "{}" is filled with the question text.
prompt = "\n".join([
  "To answer the following multiple-choice question, you should choose one option only among A, B, C, D.",
  "Instruction: You must select one option among A, B, C, D. Do not output any other things.",
  "{}",
])

In [37]:
# Spot-check: format a single benchmark item with the project helper and print it.
question_idx = 245
question_string = utils.format_question_culturalbench_easy(ds[question_idx])
print(question_string)

Question: In Mexican culture, what is a characteristic not commonly found in employment ads?
A. Specification of an age range for applicants
B. Encouragement for only online applications
C. A requirement for a video introduction from the applicant
D. Offering job positions without specific qualifications.


In [None]:
def ask_gpt3(client, question_string):
  """Send one question to gpt-3.5-turbo without a persona and return the raw response.

  Args:
    client: object exposing `chat.completions.create` (an `OpenAI` client in this notebook).
    question_string: question text substituted into the module-level `prompt` template.

  Returns:
    Whatever `client.chat.completions.create` returns, unmodified.
  """
  user_message = {
    "role": "user",
    "content": [{"type": "text", "text": prompt.format(question_string)}],
  }
  request = dict(
    model="gpt-3.5-turbo",
    messages=[user_message],
    response_format={"type": "text"},
    temperature=1,
    max_tokens=2048,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
  )
  return client.chat.completions.create(**request)

In [None]:
client = OpenAI()

# Baseline run: one request per question, one JSON file per response.
output_dir = 'results/gpt3.5'
# Create the target directory up front; without it the first open(..., 'w') below
# raises FileNotFoundError when the notebook is run from a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for i in tqdm(range(len(ds))):
  filename = f'{output_dir}/question_{i}.json'

  # A response already on disk is reused, so an interrupted run can be resumed.
  if os.path.exists(filename):
    print(f"Skipping question {i}")
    continue

  question_string = utils.format_question_culturalbench_easy(ds[i])
  response = ask_gpt3(client, question_string)

  with open(filename, 'w') as f:
    f.write(response.model_dump_json())

  0%|          | 0/1227 [00:00<?, ?it/s]

Skipping question 0
Skipping question 1
Skipping question 2
Skipping question 3
Skipping question 4
Skipping question 5
Skipping question 6
Skipping question 7


100%|██████████| 1227/1227 [10:03<00:00,  2.03it/s]


# With Persona from Personahub

In [26]:
# Instruction template for the persona runs; same text as the template defined for default prompting.
# The trailing "{}" is filled with the question text.
prompt = "\n".join((
  "To answer the following multiple-choice question, you should choose one option only among A, B, C, D.",
  "Instruction: You must select one option among A, B, C, D. Do not output any other things.",
  "{}",
))

In [29]:
def ask_gpt3_with_persona(client, question_string, persona_string):
  """Send one question to gpt-3.5-turbo with a persona given in the system message.

  Args:
    client: object exposing `chat.completions.create` (an `OpenAI` client in this notebook).
    question_string: question text substituted into the module-level `prompt` template.
    persona_string: persona description, sent as "You are: <persona>" in the system role.

  Returns:
    Whatever `client.chat.completions.create` returns, unmodified.
  """
  def text_message(role, text):
    # Every message in this notebook carries a single text part.
    return {"role": role, "content": [{"type": "text", "text": text}]}

  messages = [
    text_message("system", f"You are: {persona_string}"),
    text_message("user", prompt.format(question_string)),
  ]
  return client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages,
    response_format={"type": "text"},
    temperature=1,
    max_tokens=2048,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
  )

In [None]:
client = OpenAI()

# PersonaHub run: each question is answered under the persona with the highest cosine similarity.
# The directory name matches the 'gpt3.5-personahub' method that the analysis cell reads;
# this loop previously wrote to 'results/gpt3.5-persona', which the analysis never opens.
output_dir = 'results/gpt3.5-personahub'
# Create the target directory up front so the first write does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for i in tqdm(range(len(ds))):
  filename = f'{output_dir}/question_{i}.json'

  # A response already on disk is reused, so an interrupted run can be resumed.
  if os.path.exists(filename):
    print(f"Skipping question {i}")
    continue

  question_string = utils.format_question_culturalbench_easy(ds[i])
  # Row i of the similarity matrix belongs to question i; argmax picks the best-matching persona.
  best_persona_idx = torch.argmax(cosine_similarities[i]).item()
  persona_string = dfp.iloc[best_persona_idx]['persona']
  response = ask_gpt3_with_persona(client, question_string, persona_string)

  with open(filename, 'w') as f:
    f.write(response.model_dump_json())

  0%|          | 0/1227 [00:00<?, ?it/s]

Skipping question 0
Skipping question 1
Skipping question 2
Skipping question 3
Skipping question 4
Skipping question 5
Skipping question 6
Skipping question 7
Skipping question 8
Skipping question 9
Skipping question 10


100%|██████████| 1227/1227 [10:55<00:00,  1.87it/s] 


# Generated Personas

In [None]:
def create_personas(client, question_string):
  """Ask gpt-3.5-turbo to propose four personas suited to answering a question.

  Args:
    client: object exposing `chat.completions.create` (an `OpenAI` client in this notebook).
    question_string: the question text, sent verbatim as the user message.

  Returns:
    Whatever `client.chat.completions.create` returns, unmodified. The request asks
    for a JSON object with keys persona1..persona4.
  """
  system_text = (
    "You are asked a cultural question. Based on the question, assign four persona descriptions "
    "based on who you think would be the right people to ask such a question. "
    "Ensure each persona description is less than 50 characters in length.\n"
    'Output in JSON format: {"persona1": "description1", "persona2": "description2", '
    '"persona3": "description3", "persona4": "description4"}'
  )
  messages = [
    {"role": role, "content": [{"type": "text", "text": text}]}
    for role, text in (("system", system_text), ("user", question_string))
  ]
  return client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages,
    response_format={"type": "json_object"},
    temperature=1,
    max_tokens=2048,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
  )

# Generate personas for every question, one JSON file per response.
personas_dir = 'data/generated_personas'
# Create the target directory up front; without it the first open(..., 'w') below
# raises FileNotFoundError when the notebook is run from a fresh checkout.
os.makedirs(personas_dir, exist_ok=True)

for i in tqdm(range(len(ds))):
  filename = f'{personas_dir}/question_{i}.json'

  # A response already on disk is reused, so an interrupted run can be resumed.
  if os.path.exists(filename):
    print(f"Skipping question {i}")
    continue

  # Only the bare question text is sent here, not the formatted multiple-choice block.
  question_string = ds[i]['prompt_question']
  response = create_personas(client, question_string)

  with open(filename, 'w') as f:
    f.write(response.model_dump_json())

In [7]:
# Read back the saved persona-generation responses: one dict of personas per question

generated_personas = []
for i in tqdm(range(len(ds))):
  with open(f'data/generated_personas/question_{i}.json') as f:
    saved_response = json.load(f)
  # The message content is itself a JSON string and is decoded a second time
  reply_text = saved_response['choices'][0]['message']['content']
  generated_personas.append(json.loads(reply_text))

100%|██████████| 1227/1227 [00:00<00:00, 7139.95it/s]


In [73]:
client = OpenAI()

# Generated-persona run: each question is answered under its first generated persona.
output_dir = 'results/gpt3.5-personaGen'
# Create the target directory up front; without it the first open(..., 'w') below
# raises FileNotFoundError when the notebook is run from a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

for i in tqdm(range(len(ds))):
  filename = f'{output_dir}/question_{i}.json'

  # A response already on disk is reused, so an interrupted run can be resumed.
  if os.path.exists(filename):
    print(f"Skipping question {i}")
    continue

  question_string = utils.format_question_culturalbench_easy(ds[i])
  # Only 'persona1' of the generated personas is used for this experiment.
  persona_string = generated_personas[i]['persona1']
  response = ask_gpt3_with_persona(client, question_string, persona_string)

  with open(filename, 'w') as f:
    f.write(response.model_dump_json())

100%|██████████| 1227/1227 [09:49<00:00,  2.08it/s]


# Discussion Amongst Agents

In [5]:
import autogen
from autogen import ConversableAgent, GroupChat, GroupChatManager
from autogen.agentchat.contrib.society_of_mind_agent import SocietyOfMindAgent
import utils
import os
import re
%load_ext autoreload
%autoreload 2

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.


In [8]:
# Pick one question and its generated personas as the input for the discussion experiment below.
question_idx = 245
question_string = utils.format_question_culturalbench_easy(ds[question_idx])
personas = generated_personas[question_idx]
print(question_string)
print(personas)

Question: In Mexican culture, what is a characteristic not commonly found in employment ads?
A. Specification of an age range for applicants
B. Encouragement for only online applications
C. A requirement for a video introduction from the applicant
D. Offering job positions without specific qualifications.
{'persona1': 'HR manager in Mexico', 'persona2': 'Mexican cultural historian', 'persona3': 'Employment lawyer in Mexico', 'persona4': 'Mexican job applicant'}


In [9]:
# Discussion setup (round-robin variant): one agent per generated persona plus a neutral admin.
discussion_agents = []

for persona in personas.values():
  # The agent name is the persona text with spaces and slashes replaced.
  persona_label = persona.replace(' ', '_').replace('/', '_or_')
  agent = ConversableAgent(
    name=persona_label,
    description=f"A {persona}",
    # The earlier prompt contained a dangling "You initially support options" fragment that
    # named no option; it is dropped so the instruction reads as complete sentences.
    system_message=f"You are: {persona}. You will be asked a question. Stick to your opinion unless presented with compelling arguments.",
    llm_config={"config_list": constants.config_list},
    human_input_mode="NEVER",
    is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg.get("content")
  )
  discussion_agents.append(agent)

# Define the admin agent
admin = ConversableAgent(
  name="Admin",
  # Adjacent string literals are concatenated as-is, so each sentence ends with an explicit
  # space; without it the sentences run together ("...on track.Once ALL agents...").
  system_message=("You are a neutral discussion administrator. Your role is to keep the agents on track. "
                  "Once ALL agents agree on a response, print the answer and say TERMINATE. "
                  "If there is disagreement or insufficient clarity in the responses, summarize the points of disagreement and allow agents to continue discussing. "
                  "Remember, your goal is to facilitate agreement and terminate only when consensus is reached among ALL agents."),
  description="A neutral discussion administrator.",
  llm_config={"config_list": constants.config_list},
  is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg.get("content"),
  human_input_mode="NEVER"
)
# The admin is appended last, after all persona agents.
discussion_agents.append(admin)

group_chat = GroupChat(
    agents=discussion_agents,
    messages=[],
    max_round=15,
    send_introductions=True,
    speaker_selection_method="round_robin"
)

manager = GroupChatManager(
    groupchat=group_chat,
    llm_config={"config_list": constants.config_list},
    human_input_mode="NEVER"
)

In [18]:
# Discussion setup ("auto" speaker-selection variant). Running this cell rebuilds
# `discussion_agents`, `admin`, `group_chat` and `manager`, replacing the round-robin setup.
discussion_agents = []

# Define the admin agent
admin = ConversableAgent(
    name="Admin",
    system_message="You are a neutral discussion administrator. Your role is to pose the initial question and encourage discussion among the agents. Once a consensus is reached, print the answer and say TERMINATE.",
    description="A neutral discussion administrator.",
    llm_config={"config_list": constants.config_list},
    is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg.get("content"),
    human_input_mode="NEVER"
)
# In this variant the admin is the first agent in the list.
discussion_agents.append(admin)

# Pair each persona with one answer option (A-D). zip stops at the shorter input, so a
# question with fewer than four generated personas no longer raises IndexError.
for option_letter, persona in zip("ABCD", personas.values()):
    persona_label = persona.replace(' ', '_').replace('/', '_or_')
    agent = ConversableAgent(
        name=persona_label,
        # NOTE(review): the persona text appears only in `description`, not in the system
        # message -- confirm this is intended.
        system_message=f"You are asked a question. You support option {option_letter}. Discuss with other agents to reach a consensus on one of the options.",
        description=persona,
        llm_config={"config_list": constants.config_list},
        human_input_mode="NEVER",
        is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg.get("content")
    )
    discussion_agents.append(agent)

group_chat = GroupChat(
    agents=discussion_agents,
    messages=[],
    max_round=20,
    send_introductions=True,
    speaker_selection_method="auto"
)

manager = GroupChatManager(
    groupchat=group_chat,
    llm_config={"config_list": constants.config_list},
    human_input_mode="NEVER"
)

In [10]:
# The SocietyOfMindAgent is given the group-chat `manager` and a response-preparer
# prompt taken from `constants`.
society_of_mind_agent = SocietyOfMindAgent(
  "society_of_mind",
  chat_manager=manager,
  response_preparer=constants.SUMMARY_PROMPT_MULTIPLE_CHOICE,
  llm_config={"config_list": constants.config_list},
  human_input_mode="NEVER",
)

# Non-interactive proxy with code execution disabled. Its termination predicate accepts
# every message -- presumably so the chat ends after the first reply; confirm against autogen.
user_proxy = autogen.UserProxyAgent(
  "user_proxy",
  code_execution_config=False,
  human_input_mode="NEVER",
  is_termination_msg=lambda _msg: True,
)

# Start the chat with the formatted question as the opening message.
result = user_proxy.initiate_chat(society_of_mind_agent, message=question_string)

[33muser_proxy[0m (to society_of_mind):

Question: In Mexican culture, what is a characteristic not commonly found in employment ads?
A. Specification of an age range for applicants
B. Encouragement for only online applications
C. A requirement for a video introduction from the applicant
D. Offering job positions without specific qualifications.

--------------------------------------------------------------------------------
[33msociety_of_mind[0m (to chat_manager):

Question: In Mexican culture, what is a characteristic not commonly found in employment ads?
A. Specification of an age range for applicants
B. Encouragement for only online applications
C. A requirement for a video introduction from the applicant
D. Offering job positions without specific qualifications.

--------------------------------------------------------------------------------
[32m
Next speaker: HR_manager_in_Mexico
[0m
[33mHR_manager_in_Mexico[0m (to chat_manager):

HR_manager_in_Mexico: In Mexican cultu

In [76]:
methods = ['gpt3.5', 'gpt3.5-personahub', 'gpt3.5-personaGen']

# Load the saved responses for every method and score them against the reference answer.
results = {}
for method in methods:
  results[method] = {}
  for i in tqdm(range(len(ds))):
    fname = f'results/{method}/question_{i}.json'
    with open(fname, 'r') as f:
      response = json.load(f)
    results[method][i] = response['choices'][0]['message']['content']

  # The first non-whitespace character of the reply is taken as the chosen option.
  # Empty or missing content becomes '' (scored as incorrect) instead of raising
  # IndexError / AttributeError, which would abort the whole analysis.
  df[method] = [(reply or '').strip()[:1] for reply in results[method].values()]
  df[f'{method}_correct'] = df[method] == df['answer']

100%|██████████| 1227/1227 [00:00<00:00, 6995.80it/s]
100%|██████████| 1227/1227 [00:00<00:00, 6595.68it/s]
100%|██████████| 1227/1227 [00:00<00:00, 8899.95it/s]


In [77]:
# Report, per method, the mean of its per-question correctness column.
for method in methods:
  accuracy = df[f'{method}_correct'].mean()
  print(f"Method: {method}", f"Accuracy: {accuracy}", sep="\n")

Method: gpt3.5
Accuracy: 0.6960065199674002
Method: gpt3.5-personahub
Accuracy: 0.6797066014669927
Method: gpt3.5-personaGen
Accuracy: 0.6976365118174409
