In [1]:
import autogen
from datasets import load_dataset
from autogen import ConversableAgent, GroupChat, GroupChatManager
from autogen.agentchat.contrib.society_of_mind_agent import SocietyOfMindAgent
import utils
import json
import pandas as pd
import os
import constants
%load_ext autoreload
%autoreload 2

flaml.automl is not available. Please install flaml[automl] to enable AutoML functionalities.


In [3]:
# Experiment settings for the Hard (true/false) variant. Later cells read all three,
# so change them together when switching variants.
question_type = "true_false"
dataset_name = "culturalbench_hard"
format_question_culturalbench = utils.format_question_culturalbench_hard

# Load the test split of both CulturalBench configurations.
dse = load_dataset("kellycyy/CulturalBench", "CulturalBench-Easy", split='test')
ds = load_dataset("kellycyy/CulturalBench", "CulturalBench-Hard", split='test')

# pandas copy of the Hard split, used for sampling and for merging results.
df = ds.to_pandas()

In [17]:
# Inspect one example: print the formatted prompt, then display the raw record.
idx = 160
example = ds[idx]
print(format_question_culturalbench(example))
example

In French culture, what is typically eaten with bread and garlic ? escargots
Is this statement true or false? You must choose either True or False.


{'data_idx': 160,
 'question_idx': 40,
 'prompt_question': 'In French culture, what is typically eaten with bread and garlic ?',
 'prompt_option': 'escargots',
 'answer': True,
 'country': 'France'}

In [5]:
class Dialogue():
    """Two-agent AutoGen conversation used to judge a single benchmark question.

    Agent1 opens the chat with the question and is prompted to correct Agent2's
    reasoning without revealing the answer. Agent2 is prompted to reason
    step by step and to finish with the word TERMINATE. Both agents treat any
    message containing "TERMINATE" as the end of the conversation.

    Note: ``instruction_to_agents`` is stored on the instance but is not
    currently injected into either system message; the prompts below are
    hard-coded persona prompts.
    """

    # Default cap on the number of turns passed to ``initiate_chat``.
    DEFAULT_MAX_TURNS = 20

    def __init__(self, instruction_to_agents):
        """Create the two agents.

        Args:
            instruction_to_agents: Free-text instruction for the agents. Kept on
                the instance for reference only (see class note).
        """
        self.instruction_to_agents = instruction_to_agents

        # Define the agents
        self.agent1 = ConversableAgent(
            name="Agent1",
            # The generic prompt built from instruction_to_agents is disabled in favour
            # of the persona prompt below:
            # system_message=f"You are Agent1. {instruction_to_agents}",
            system_message="You are Agent1. You are an Italian. Correct Agent2's chain of thought if they're thinking incorrectly, but don't give them the answer.",
            llm_config={"config_list": constants.config_list},
            human_input_mode="NEVER",
            is_termination_msg=self._is_termination_msg,
        )

        self.agent2 = ConversableAgent(
            name="Agent2",
            system_message="You are Agent2. You are given a statement. You need to tell if the statement is true or false. "
            "Think step-by-step. Discuss your thoughts with Agent1, who is an Italian. Once you reach your final answer, print the answer and say TERMINATE.",
            llm_config={"config_list": constants.config_list},
            human_input_mode="NEVER",
            is_termination_msg=self._is_termination_msg,
        )

    @staticmethod
    def _is_termination_msg(msg):
        """Return True when the message has content that contains "TERMINATE"."""
        content = msg.get("content")
        return content is not None and "TERMINATE" in content

    def initiate_discussion(self, question, type_of_question, max_turns=DEFAULT_MAX_TURNS):
        """Run the conversation on ``question`` and return the chat result.

        Args:
            question: Text sent by Agent1 as the opening message.
            type_of_question: Either "multiple_choice" or "true_false"; selects
                which summary prompt from ``constants`` is used.
            max_turns: Upper bound on conversation turns (defaults to 20).

        Returns:
            The object returned by ``agent1.initiate_chat``.

        Raises:
            ValueError: If ``type_of_question`` is not one of the supported values.
        """
        # Set the summary prompt based on the type of question
        if type_of_question == "multiple_choice":
            summary_prompt = constants.SUMMARY_PROMPT_MULTIPLE_CHOICE
        elif type_of_question == "true_false":
            summary_prompt = constants.SUMMARY_PROMPT_TRUE_FALSE
        else:
            # Fail early with a clear message instead of hitting an unbound local below.
            raise ValueError(
                f"Unsupported type_of_question {type_of_question!r}; "
                "expected 'multiple_choice' or 'true_false'."
            )

        # Initiate the chat
        result = self.agent1.initiate_chat(
            self.agent2,
            message=f"{question}",
            max_turns=max_turns,
            summary_method="reflection_with_llm",
            summary_args={
                "summary_prompt": summary_prompt
            }
        )
        return result

    @staticmethod
    def print_chat_history(chat_history):
        """Print each message as "<speaker>: <content>" with separator lines.

        Args:
            chat_history: Iterable of message dicts that have a 'content' key
                and normally a 'name' key.
        """
        for msg in chat_history:
            # Fall back to the role when a message carries no speaker name.
            speaker = msg.get('name') or msg.get('role', 'unknown')
            print(f"{speaker}: {msg['content']}")
            print("-" * 10)
        print('-' * 100)

# Discussion between four agents

In [None]:
# Ask a single LLM agent to propose four personas suited to a cultural question.
agent = ConversableAgent(
    name="Agent1",
    system_message="You are asked a cultural question. Based on the question, assign four personas based on who you think would be the right people to ask such a question.",
    llm_config={"config_list": constants.config_list},
    human_input_mode="NEVER",
)

# create a UserProxyAgent instance named "user_proxy"
user_proxy = autogen.UserProxyAgent(
    name="user_proxy",
    human_input_mode="NEVER",
    llm_config={"config_list": constants.config_list},
    code_execution_config=False,
)

res = user_proxy.initiate_chat(
    agent,
    message=ds[180]['prompt_question'],
    max_turns=1,
    # The summary prompt is only applied when the summary is produced by LLM
    # reflection, so request that method explicitly (as Dialogue.initiate_discussion does).
    summary_method="reflection_with_llm",
    summary_args={
        "summary_prompt": 'Output in JSON format: {"persona1": "...", "persona2": "...", "persona3": "...", "persona4": "..."}'
    }
)


[33muser_proxy[0m (to Agent1):

What is a common practice when entering a house in Saudi Arabia?

--------------------------------------------------------------------------------
[33mAgent1[0m (to user_proxy):

```python
[
    "a Saudi cultural expert",
    "a resident of Saudi Arabia",
    "a Middle Eastern studies scholar",
    "an expatriate living in Saudi Arabia"
]
```

--------------------------------------------------------------------------------


In [None]:
# Display the raw record at position 150 of `ds`.
ds[150]

{'data_idx': 150,
 'question_idx': 37,
 'prompt_question': 'How does Lebanon culture different from Syrian culture?',
 'prompt_option': 'Lebanon primarily speaks English at home',
 'answer': False,
 'country': 'Lebanon'}

In [48]:
# Instruction handed to Dialogue for this one-off run.
instruction_to_agents = (
    "You are shown a question and an answer to the question. "
    "You need to tell if the answer is true or false for the question. "
    "Form an opinion independently and discuss it with the other agent. "
    "Stick to your opinion unless presented with compelling arguments. "
    "If a consensus is reached, print the answer and say TERMINATE."
)

# Run one dialogue on a hand-picked example, then print its record for comparison.
idx = 4694
example = ds[idx]
dialogue = Dialogue(instruction_to_agents)
question = format_question_culturalbench(example)
result = dialogue.initiate_discussion(question, question_type)

print(example)

[33mAgent1[0m (to Agent2):

What is the traditional way for an Italian to enjoy his coffee during a well-deserved but short break from work? Ordering it to-go and sipping it while walking to work or to drink it at home while relaxing and doing the traditional "dolce far niente"
Is this statement true or false? You must choose either True or False.

--------------------------------------------------------------------------------
[33mAgent2[0m (to Agent1):

Let's break this down together, Agent1. The statement mentions two traditional ways for an Italian to enjoy coffee during a short break: (1) ordering it to-go and sipping it while walking, and (2) drinking it at home while relaxing.

Firstly, it's quite well-known that Italians traditionally enjoy their coffee in cafes rather than taking it to-go. They typically order a quick espresso and drink it standing at the bar. 

Secondly, the concept of "dolce far niente" (the sweetness of doing nothing) does align with Italian culture, bu

In [40]:
# Display the `summary` attribute of the most recent `result`.
result.summary

'{\n  "disagreement": false,\n  "answer": true\n}'

In [None]:
# Run dialogues for a random sample of questions from the dataset selected by `dataset_name`

instruction_to_agents = (
    "You are posed a question. Form an opinion independently and discuss it with the other agent. "
    "Stick to your opinion unless presented with compelling arguments. If a consensus is reached, "
    "print the answer and say TERMINATE."
)
instruction_hash = utils.hash_string(instruction_to_agents)
RESULTS_BASEPATH = f"results/{dataset_name}/dialogues/{instruction_hash}"

# Create the results directory (and any missing parents) if needed
os.makedirs(RESULTS_BASEPATH, exist_ok=True)

num_questions = 50
# Fixed seed: re-running the cell selects the same questions, so the on-disk cache
# below is reused instead of accumulating a different sample on every run.
SAMPLE_SEED = 42

if dataset_name == "culturalbench_hard":
    # Sample whole questions so that all rows sharing a question_idx are run together.
    question_ids = df['question_idx'].drop_duplicates()
    sampled_question_ids = question_ids.sample(
        min(num_questions, len(question_ids)), random_state=SAMPLE_SEED
    )
    # Select by value rather than feeding data_idx values to iloc as positions.
    rows_to_run = df[df['question_idx'].isin(sampled_question_ids)]
else:
    rows_to_run = df.sample(min(num_questions, len(df)), random_state=SAMPLE_SEED)
indexes_to_run = rows_to_run.index.tolist()

for _, row in rows_to_run.iterrows():
    filepath = f"{RESULTS_BASEPATH}/{row['data_idx']}.json"
    # One JSON file per data_idx acts as a cache, so interrupted runs can be resumed.
    if os.path.exists(filepath):
        print(f"Skipping {filepath} because it already exists")
        continue
    dialogue = Dialogue(instruction_to_agents)
    question = format_question_culturalbench(row)
    result = dialogue.initiate_discussion(question, question_type)
    with open(filepath, "w") as f:
        json.dump({"chat_history": result.chat_history, "summary": result.summary}, f)

In [299]:
# Load all results from the dialogues, keyed by the data_idx encoded in each file name
results = {}
for filename in sorted(os.listdir(RESULTS_BASEPATH)):
    stem, ext = os.path.splitext(filename)
    # Skip anything that is not a "<data_idx>.json" result file (hidden files,
    # checkpoint folders, ...) instead of crashing on int().
    if ext != ".json" or not stem.isdigit():
        continue
    with open(os.path.join(RESULTS_BASEPATH, filename), 'r') as f:
        results[int(stem)] = json.load(f)

# Convert to pandas DataFrame for analysis (one row per data_idx)
dialogues_df = pd.DataFrame.from_dict(results, orient='index')

# Merge the results with the original dataframe; rows without a result file are dropped
results_df = df.merge(dialogues_df, left_on='data_idx', right_index=True)

In [324]:
# Parse every stored summary, then split the resulting pairs into two columns.
parsed_summaries = [
    utils.extract_summary(summary, question_type) for summary in results_df['summary']
]
disagreement_flags, summary_answers = zip(*parsed_summaries)
results_df['disagreement'] = disagreement_flags
results_df['summary_answer'] = summary_answers
results_df.head()

Unnamed: 0,data_idx,question_idx,prompt_question,prompt_option,answer,country,chat_history,summary,disagreement,summary_answer,correct
112,112,28,"In Czech culture, what do people traditionally...",Remove their shoes,True,Czech Republic,"[{'content': 'Question: In Czech culture, what...","{\n ""disagreement"": false,\n ""answer"": true\n}",False,True,True
113,113,28,"In Czech culture, what do people traditionally...",Walk in with their shoes,False,Czech Republic,"[{'content': 'Question: In Czech culture, what...","{\n ""disagreement"": false,\n ""answer"": false\n}",False,False,True
114,114,28,"In Czech culture, what do people traditionally...",Put on a special indoor hat,False,Czech Republic,"[{'content': 'Question: In Czech culture, what...","{\n ""disagreement"": false,\n ""answer"": f...",False,False,True
115,115,28,"In Czech culture, what do people traditionally...",Say a ceremonial phrase.,False,Czech Republic,"[{'content': 'Question: In Czech culture, what...","{""disagreement"": false, ""answer"": false}",False,False,True
140,140,35,"In Lebanon, what languages are commonly spoken?",Arabic and French,True,Lebanon,"[{'content': 'Question: In Lebanon, what langu...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False


In [327]:
# Mark each row as correct when the summarised answer matches the ground truth.
results_df['correct'] = results_df['answer'].eq(results_df['summary_answer'])

# Share of questions for which exactly four rows were judged correctly.
correct_per_question = results_df.groupby('question_idx')['correct'].sum()
correct_per_question.eq(4).sum() / len(correct_per_question)

0.6333333333333333

In [326]:
# Rows where the summarised answer did not match the ground truth.
is_incorrect = results_df['correct'] == False
results_df[is_incorrect]

Unnamed: 0,data_idx,question_idx,prompt_question,prompt_option,answer,country,chat_history,summary,disagreement,summary_answer,correct
140,140,35,"In Lebanon, what languages are commonly spoken?",Arabic and French,True,Lebanon,"[{'content': 'Question: In Lebanon, what langu...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False
396,396,99,What is a distinct tradition in Filipino cultu...,Caroling from house to house to spread holiday...,True,Philippines,[{'content': 'Question: What is a distinct tra...,"{\n ""disagreement"": false,\n ""answer"": f...",False,False,False
724,724,181,"In Argentine culture, who is considered the mo...",A famous soccer player.,True,Argentina,"[{'content': 'Question: In Argentine culture, ...","{\n ""disagreement"": false,\n ""answer"": false\n}",False,False,False
899,899,224,"In Russian culture, why do people generally sm...",Because expressing emotions publicly is discou...,True,Russia,"[{'content': 'Question: In Russian culture, wh...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False
1056,1056,264,What is a common daily meal in Taiwanese culture?,Noodles and rice,True,Taiwan,[{'content': 'Question: What is a common daily...,"```json\n{\n ""disagreement"": false,\n ""answe...",False,False,False
1068,1068,267,What is the typical way for French individuals...,A brief formal greeting exchange.,True,France,[{'content': 'Question: What is the typical wa...,"{\n ""disagreement"": false,\n ""answer"": false\n}",False,False,False
1200,1200,300,What's a unique anniversary that Polish grandp...,Name day,True,Poland,[{'content': 'Question: What's a unique annive...,"{\n ""disagreement"": false,\n ""answer"": false\n}",False,False,False
1400,1400,350,"In India, is 25 considered an appropriate age ...","Yes, because it's seen as the ideal age to sta...",True,India,"[{'content': 'Question: In India, is 25 consid...","{\n ""disagreement"": true,\n ""answer"": ""false...",True,False,False
1982,1982,495,"In Nepalese culture, what should people avoid ...",Wearing shoes inside someone's home,True,Nepal,"[{'content': 'Question: In Nepalese culture, w...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False
2072,2072,518,What do the people celebrate every 8 December ...,"Fête des Lumières, a festival of lights",True,France,[{'content': 'Question: What do the people cel...,"{\n ""disagreement"": false,\n ""answer"": false\n}",False,False,False


In [329]:
# Rows flagged as a disagreement that were also answered incorrectly.
has_disagreement = results_df['disagreement'] == True
is_incorrect = results_df['correct'] == False
results_df[has_disagreement & is_incorrect]

Unnamed: 0,data_idx,question_idx,prompt_question,prompt_option,answer,country,chat_history,summary,disagreement,summary_answer,correct
140,140,35,"In Lebanon, what languages are commonly spoken?",Arabic and French,True,Lebanon,"[{'content': 'Question: In Lebanon, what langu...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False
899,899,224,"In Russian culture, why do people generally sm...",Because expressing emotions publicly is discou...,True,Russia,"[{'content': 'Question: In Russian culture, wh...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False
1400,1400,350,"In India, is 25 considered an appropriate age ...","Yes, because it's seen as the ideal age to sta...",True,India,"[{'content': 'Question: In India, is 25 consid...","{\n ""disagreement"": true,\n ""answer"": ""false...",True,False,False
1982,1982,495,"In Nepalese culture, what should people avoid ...",Wearing shoes inside someone's home,True,Nepal,"[{'content': 'Question: In Nepalese culture, w...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False
4583,4583,1145,"In Brazilian culture, what is traditionally ea...",Powdered milk,True,Brazil,"[{'content': 'Question: In Brazilian culture, ...","{\n ""disagreement"": true,\n ""answer"": false\n}",True,False,False


In [333]:
# Print the stored conversation for the row labelled 1982.
chat_history_1982 = results_df.loc[1982, 'chat_history']
Dialogue.print_chat_history(chat_history_1982)

Agent1: Question: In Nepalese culture, what should people avoid doing with their feet?
Answer: Wearing shoes inside someone's home
Is this answer true or false for this question? You must choose either True or False.
----------
Agent2: False. While wearing shoes inside someone's home is generally avoided in many Asian cultures including Nepal, the more culturally significant action to avoid with feet specifically in Nepalese culture is pointing your feet at people or sacred places. The feet are considered the lowest and impure part of the body, and pointing them towards others or religious symbols can be seen as disrespectful.
----------
Agent1: I acknowledge your input, Agent2. While I initially considered that wearing shoes indoors could be the primary issue, you have presented a compelling point about the cultural significance of not pointing feet at people or sacred places. This specific act indeed holds deeper cultural and religious implications in Nepalese culture.

Given this in

# Multi-agent discussion

In [12]:
# Build a group chat with one neutral Admin plus four agents, each initially backing
# a different option (A-D), speaking in round-robin order.
discussion_agents = []
# Define the admin agent
admin = ConversableAgent(
    name="Admin",
    system_message="You are a neutral discussion administrator. Your role is to pose the initial question and encourage discussion among the agents. Once a consensus is reached, print the answer and say TERMINATE.",
    description="A neutral discussion administrator.",
    # Use the shared config from `constants`; a bare `config_list` is not defined in this notebook.
    llm_config={"config_list": constants.config_list},
    is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg.get("content"),
    human_input_mode="NEVER"
)
discussion_agents.append(admin)

for i in range(4):
    agent = ConversableAgent(
        name=f"Agent{i+1}",
        # chr(65 + i) maps i = 0..3 to the option letters A..D.
        system_message=f"You are one of the agents in the discussion. You initially support option {chr(65 + i)} but are willing to consider other viewpoints.",
        description="An agent in the discussion.",
        llm_config={"config_list": constants.config_list},
        human_input_mode="NEVER",
        is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg.get("content")
    )
    discussion_agents.append(agent)

group_chat = GroupChat(
    agents=discussion_agents,
    messages=[],
    max_round=20,
    send_introductions=True,
    speaker_selection_method="round_robin"
)

manager = GroupChatManager(
    groupchat=group_chat,
    llm_config={"config_list": constants.config_list},
    human_input_mode="NEVER"
)

In [13]:
# Wrap the group chat in a SocietyOfMindAgent so the whole discussion yields one final answer.
society_of_mind_agent = SocietyOfMindAgent(
    "society_of_mind",
    human_input_mode="NEVER",
    response_preparer="Select the option: A, B, C, or D. If no consensus was reached, output 'No consensus'. Do not output any other things.",
    chat_manager=manager,
    # Use the shared config from `constants`; a bare `config_list` is not defined in this notebook.
    llm_config={"config_list": constants.config_list},
)

user_proxy = autogen.UserProxyAgent(
    "user_proxy",
    human_input_mode="NEVER",
    code_execution_config=False,
    # Always terminate on the first reply: the proxy only forwards the task and collects the answer.
    is_termination_msg=lambda x: True,
)

# The multiple-choice (easy) formatter needs a record from the Easy split (`dse`);
# `ds` holds the Hard true/false split.
# NOTE(review): confirm that index 54 of `dse` is the intended example.
task = utils.format_question_culturalbench_easy(dse[54])
result = user_proxy.initiate_chat(society_of_mind_agent, message=task)


[33muser_proxy[0m (to society_of_mind):

In Japanese culture, is slurping noodles and making noises while eating considered rude?
A. It's encouraged as a sign of enjoying the dish.
B. It's considered rude and should be avoided.
C. It's ignored with no particular importance attached to it.
D. It's seen as inappropriate and a sign of poor manners.

--------------------------------------------------------------------------------
[33msociety_of_mind[0m (to chat_manager):

In Japanese culture, is slurping noodles and making noises while eating considered rude?
A. It's encouraged as a sign of enjoying the dish.
B. It's considered rude and should be avoided.
C. It's ignored with no particular importance attached to it.
D. It's seen as inappropriate and a sign of poor manners.

--------------------------------------------------------------------------------
[32m
Next speaker: Admin
[0m
[33mAdmin[0m (to chat_manager):

Hello, agents. Let's start the discussion on whether slurping noodle