In [4]:
import os

# Take the key from the environment so it never has to be pasted into (and
# committed with) the notebook. Falls back to the previous empty-string
# placeholder when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')

In [5]:
from openai import OpenAI
# Single shared OpenAI client; Person, Animal and Evaluator below all issue
# their API calls through this module-level object.
client = OpenAI(api_key=OPENAI_API_KEY)

In [6]:
# Run configuration for the simulated conversations.
# Budget for the agent's summed token usage in one conversation; the run loop
# stops a conversation once the running total exceeds it.
MAX_TOKEN = 6000
# Turn limit checked by the run loop alongside MAX_TOKEN.
MAX_CHAT = 10
# Selects which scenario{ID}.txt file Person and Evaluator load.
SCENARIO_ID = 1
# Base name of the ./persona/<name>.txt file loaded by Person.
PERSONA = 'submissive_people'
# Agent under test; used to build the ./agent, ./user and ./evaluator paths.
AGENT_NAME = 'sparky'

In [7]:
from enum import Enum
from pydantic import BaseModel

# Actions the simulated user can pick for a turn. The string values are what
# appears in the "action" field of the model's JSON reply.
class UserAction(str, Enum):
    # Keep the conversation going.
    say = "say"
    # End the conversation; Person.say sets leaveChat when it sees this value.
    leave = "leave"

# Structured reply requested from the simulated user: Person.say passes
# UserResponse.model_json_schema() as the response_format schema.
# NOTE(review): documented with comments on purpose - a class docstring would
# presumably end up in the generated JSON schema; confirm before adding one.
class UserResponse(BaseModel):
    # Which UserAction the user takes this turn.
    action: UserAction
    # The text the user says.
    answer: str

In [8]:
import json
class Person:
    """Simulated app user played by a chat-completion model.

    The system prompt is assembled from a scenario file, a persona file and
    two inline examples. The running conversation is kept in ``self.messages``
    (the simulated user's own turns are stored with role ``assistant``, the
    agent's turns with role ``user``). ``self.leaveChat`` becomes True once the
    model answers with the ``leave`` action.
    """

    def __init__(self, scenarioID: int, persona: str):
        """Load the scenario and persona texts and build the system prompt.

        Args:
            scenarioID: number N of ``./user/{AGENT_NAME}/scenarioN.txt``
                (AGENT_NAME is the module-level constant).
            persona: base name of ``./persona/<persona>.txt``.

        Raises:
            OSError: if either file cannot be opened or read.
        """
        # `with` closes the handles even when read() raises; the original
        # open()/close() pairs leaked the handle in that case.
        with open(f'./user/{AGENT_NAME}/scenario{scenarioID}.txt', 'r') as scenarioFile:
            scenario = scenarioFile.read()

        # Kept in its own name so the `persona` argument is not silently
        # rebound from a file name to the file's contents.
        with open(f'./persona/{persona}.txt', 'r') as personaFile:
            personaText = personaFile.read()

        example = '''
                    Example1:
                    Input: Are you looking for relaxation techniques or some fun trivia games to de-stress? Let me help you find the best fit!
                    Output: {action: say, answer: 'I am looking for relaxation techniques'}
                    Example2:
                    Input: There's a wonderful bear named Bruno who specializes in relaxation techniques. Would you like to meet him for some calming mindfulness tips?
                    Output: {action: leave, answer: 'Yes, meeting Bruno sounds lovely! I would love to get some calming mindfulness tips from him.'}
                  '''
        systemPrompt = f'You are a user talking to AI APP which can help you deal with your problem during break time. \
                            This is your persona: {personaText}\
                            Please play the role according to the scenario: {scenario}\
                            Use Action → Answer structure for responses.\
                            Available Actions:\
                            1. say: respond base on persona and scenario\
                            2. leave: leave the chat when you think the conversation is over, no need to continue\
                            Examples:\n{example}'

        self.messages = [
            {'role': 'system', 'content': systemPrompt},
        ]
        self.leaveChat = False

    def say(self):
        """Ask the model for the user's next turn.

        The raw JSON reply is appended to ``self.messages`` before it is
        parsed, and ``self.leaveChat`` is updated from its ``action`` field.

        Returns:
            tuple: ``(answer, info)`` where ``answer`` is the reply text and
            ``info`` is ``{'token': <total tokens reported for this call>}``.
        """
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages,
            response_format={
                'type': 'json_schema',
                'json_schema':
                    {
                        "name":"whocares",
                        "schema": UserResponse.model_json_schema()
                    }
            }
        )

        message = response.choices[0].message.content
        self.messages.append({'role': 'assistant', 'content': message})

        # The reply is a JSON string shaped like UserResponse; turn it into a dict.
        message = json.loads(message)
        self.leaveChat = (message['action'] == 'leave')

        info = {
            'token': response.usage.total_tokens,
        }

        return message['answer'], info

    def listen(self, message: str):
        """Record the agent's reply so the next say() call sees it."""
        self.messages.append({'role': 'user', 'content': message})

        

In [9]:
# Smoke test: build a simulated user and print its opening turn, i.e. the
# (answer, info) tuple returned by Person.say().
test = Person(SCENARIO_ID, PERSONA)
ans = test.say()
print(ans)

('What does this app do?', {'token': 363})


In [10]:
# Labels the agent attaches to each reply. Animal.say returns the chosen value
# as info['action'] and the run loop stores it in the report as 'animal_action'.
# NOTE(review): the per-member meanings below are read off the names only -
# confirm against the agent prompt file.
class sparkyAction(str, Enum):
    # presumably: hand the user over to the specialist named in the value
    call_bruno = "call_bruno"
    call_bizy = "call_bizy"
    # presumably: ask the user a follow-up question
    ask_more = "ask_more"
    # presumably: describe a specialist to the user
    introduce_bruno = "introduce_bruno"
    introduce_bizy = "introduce_bizy"
    # presumably: give advice directly
    advise = "advise"

# Structured reply requested from the agent: Animal passes
# AgentResponse.model_json_schema() as the assistant's response_format schema.
class AgentResponse(BaseModel):
    # Which sparkyAction the agent chose for this reply.
    action: sparkyAction
    # The text shown to the user.
    answer: str

In [11]:
import time
import json

class Animal:
    """Agent under test, backed by an OpenAI assistant and one thread.

    ``listen`` stores the next user message; ``say`` posts it to the thread,
    runs the assistant and returns the parsed reply.
    """

    # Run statuses from which a run can no longer become 'completed'.
    _FAILED_STATUSES = ('failed', 'cancelled', 'expired', 'incomplete')

    def __init__(self, name: str):
        """Create the assistant from ``./agent/<name>.txt`` and open a thread.

        Args:
            name: agent name; used both as the assistant name and as the
                base name of the instructions file.

        Raises:
            OSError: if the instructions file cannot be opened or read.
        """
        self.agent_name = name
        # `with` closes the handle even when read() raises.
        with open(f'./agent/{name}.txt', 'r') as agentFile:
            agentPrompt = agentFile.read()

        # create assistant and thread
        self.assistant = client.beta.assistants.create(
            name = self.agent_name,
            instructions = agentPrompt,
            model="gpt-4o-mini",
            response_format={
                'type': 'json_schema',
                'json_schema':
                    {
                        "name":"whocares",
                        "schema": AgentResponse.model_json_schema()
                    }
            }
        )
        self.thread = client.beta.threads.create()
        # Message sent by the first say() if listen() was never called.
        self.user_message = 'hello'

    def say(self):
        """Send the pending user message and return the agent's reply.

        Returns:
            tuple: ``(answer, info)`` where ``info`` is
            ``{'token': <total tokens of the run>, 'action': <reply action>}``.

        Raises:
            RuntimeError: if the run ends as failed, cancelled, expired or
                incomplete. The original loop kept polling forever then.
        """
        client.beta.threads.messages.create(
            thread_id = self.thread.id,
            role = "user",
            content = self.user_message
        )

        run = client.beta.threads.runs.create_and_poll(
            thread_id=self.thread.id,
            assistant_id=self.assistant.id,
        )

        while True:
            runData = client.beta.threads.runs.retrieve(
                thread_id=self.thread.id,
                run_id=run.id
            )

            if runData.status == 'completed':
                response = client.beta.threads.messages.list(
                    thread_id=self.thread.id
                )
                # data[0] is read as the reply just produced by the run.
                message = json.loads(response.data[0].content[0].text.value)

                info = {
                    'token': runData.usage.total_tokens,
                    'action': message['action']
                }
                return message['answer'], info

            if runData.status in self._FAILED_STATUSES:
                raise RuntimeError(
                    f'run {run.id} ended with status {runData.status!r}: '
                    f'{getattr(runData, "last_error", None)}'
                )

            # Still pending: report the actual status (the original printed
            # the literal text "runData.status") and poll again.
            print(runData.status)
            time.sleep(2)

    def listen(self, message: str):
        """Store the user message that the next say() call will send."""
        self.user_message = message
    

In [12]:
# Smoke test for the agent: the first say() sends the default 'hello',
# listen() then queues a user message that the second say() sends.
test = Animal(name = AGENT_NAME)
print(test.say())
test.listen('i feel tired')
print(test.say())

("Hi there! What brings you to the forest today? Is it something you're feeling stressed about, or maybe you need help with managing your time?", {'token': 530, 'action': 'ask_more'})
('It sounds like you could use a little rest and relaxation! There’s a gentle bear named Bruno who specializes in mindfulness—would you like to meet him?', {'token': 583, 'action': 'introduce_bruno'})


In [13]:
# Verdict categories for a whole conversation. The string values are the exact
# labels listed in the overall-evaluation prompt and returned by
# Evaluator.overall_evaluate.
class responseType(str, Enum):
    perfectly_match = "Perfectly Match"
    good_response = "Good Response but not match"
    bad_response = "Bad Response"

In [14]:
from pydantic import BaseModel

# Per-turn scores requested from the evaluator model (schema for
# Evaluator.evaluate). The prompt asks for 0-10 on both fields; the type itself
# only enforces int.
class EvaluatorResponse(BaseModel):
    # Does the reply appropriately address the user's statement?
    accuracy: int
    # Is the suggestion helpful to the user?
    practicality: int

# Whole-conversation verdict requested from the evaluator model (schema for
# Evaluator.overall_evaluate).
class OverallEvaluatorResponse(BaseModel):
    # One of the responseType categories.
    type: responseType
    # The model's free-text justification for the chosen category.
    reason: str

In [48]:
import json
class Evaluator():
    """LLM-based judge for the agent's replies.

    ``evaluate`` scores a single user/agent exchange; ``overall_evaluate``
    classifies a whole dialogue against the scenario's expected behaviours.
    """

    def __init__(self, agentName: str, scenarioID: int) -> None:
        """Build the per-turn system prompt.

        The prompt is the fixed instructions below followed directly by the
        contents of ``./evaluator/<agentName>/scenario<scenarioID>.txt`` and
        ``./evaluator/examples.txt``.

        Args:
            agentName: agent whose evaluator files are used.
            scenarioID: number of the scenario file to load.

        Raises:
            OSError: if a prompt file cannot be opened or read.
        """
        # Remembered so overall_evaluate can default to the same agent/scenario.
        self.agentName = agentName
        self.scenarioID = scenarioID

        systemPrompt = '''You are an evaluator. I will provide you with a user’s statement and an agent’s response.
                            You should evaluate the accuracy and practicality base on the scenario.
                            - Accuracy: Score from 0 to 10. This measures whether the model’s response appropriately addresses the user’s statement.
                            - Practicality: Score from 0 to 10. This evaluates whether the model’s suggestion is helpful to the user.
                        '''
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as file:
            systemPrompt += file.read()
        with open('./evaluator/examples.txt', 'r') as file:
            systemPrompt += file.read()

        self.messages = [
            {'role': 'system', 'content': systemPrompt},
        ]

    @staticmethod
    def _ask(messages: list, schemaModel) -> dict:
        """Send ``messages`` with ``schemaModel``'s JSON schema; return the parsed reply."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            response_format={
                'type': 'json_schema',
                'json_schema':
                    {
                        "name":"whocares",
                        "schema": schemaModel.model_json_schema()
                    }
            }
        )
        return json.loads(response.choices[0].message.content)

    def evaluate(self, personMessage: str, animalMessage: str):
        """Score one exchange.

        The exchange is appended to ``self.messages`` and stays there, so each
        later call also sends every earlier exchange (the model's scores
        themselves are not added to the history).

        Returns:
            tuple: ``(accuracy, practicality)`` as parsed from the reply.
        """
        self.messages.append({'role': 'user', 'content': f'User: {personMessage}\nAgent: {animalMessage}'})
        message = self._ask(self.messages, EvaluatorResponse)

        return message['accuracy'], message['practicality']

    def overall_evaluate(self, dialogues: list, agentName=None, scenarioID=None):
        """Classify a whole dialogue against the scenario's expected behaviour.

        Args:
            dialogues: dialogue lines (strings); joined with newlines.
            agentName: agent whose evaluator files are used; defaults to the
                value given to the constructor.
            scenarioID: scenario number; defaults to the constructor's value.

        Returns:
            tuple: ``(type, reason)`` as parsed from the reply. The parsed
            reply is also printed.

        Raises:
            OSError: if a prompt file cannot be opened or read.
        """
        if agentName is None:
            agentName = self.agentName
        if scenarioID is None:
            scenarioID = self.scenarioID

        overall_systemPrompt = ''' You are an evaluator. Now You have to evaluate agent's behavior.
                                I will provide you a scenario with expect agent behaviors and a dialogue contains user's statement and an agent's response.
                                You should classify the agent's response into one of the following types:
                                1. 'Perfectly Match' : Match one or more expected agent behaviors we provided.
                                2. 'Good Response but not match' : Didn't match any of the expected agent behavior we provided, but still a good response that can help the user.
                                3. 'Bad Response' : Didn't match the expected agent behavior we provided, and can not help the user or not practical.
                            '''
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as file:
            overall_systemPrompt += 'Scenario:\n' + file.read()

        with open(f'./evaluator/{agentName}/{agentName}_eval.txt', 'r') as file:
            overall_systemPrompt += 'Agent Introduction:\n' + file.read()

        dialogue = '\n'.join(dialogues)
        overall_systemPrompt += 'Dialogue:\n' + dialogue

        # Everything goes into one system message; the per-turn history in
        # self.messages is not used here.
        message = self._ask(
            [{'role': 'system', 'content': overall_systemPrompt}],
            OverallEvaluatorResponse,
        )
        print(message)
        return message['type'], message['reason']
        


In [49]:
# Hand-written transcript used to try out Evaluator.overall_evaluate.
# Every line carries the same "user: " / "agent: " speaker prefix; two user
# lines previously had typo'd prefixes ("user+ ", "user :").
dialogue = [
    'user: What does this app do?',
    'agent: This app is here to connect you with friendly animal specialists who can help you with various challenges! Whether it\'s stress, procrastination, or making friends, we\'ve got furry and buzzing friends ready to lend a paw or wing!',
    'user: How can you help me?',
    'agent: What are you struggling with right now? Whether it\'s stress, procrastination, or something else, I can introduce you to the perfect animal specialist!',
    'user: I\'m not sure yet, but I would like to find something to help with stress.',
    'agent: I have just the friend for you! Meet Bruno, the mindfulness master bear who can help you find calm and tackle stress—would you like to chat with him?',
    'user: Yes, I would love to chat with Bruno and learn how to tackle stress!',
    'agent: Fantastic! Bruno is waiting for you, ready to share his peaceful paws and calming wisdom. Get ready to relax and unwind!',
]
evaluator = Evaluator(agentName = 'sparky', scenarioID = 1)
evaluator.overall_evaluate(dialogue, 'sparky', 1)

{'type': 'Bad Response', 'reason': "The agent does not provide an introduction to the app's main features or guide the user on how to explore further. Instead, it jumps directly into connecting the user with a specialist without addressing the user's need for exploration."}


('Bad Response',
 "The agent does not provide an introduction to the app's main features or guide the user on how to explore further. Instead, it jumps directly into connecting the user with a specialist without addressing the user's need for exploration.")

In [17]:
# Smoke test for per-turn scoring: an unhelpful reply, displayed as the
# (accuracy, practicality) tuple.
test = Evaluator(AGENT_NAME, SCENARIO_ID)
test.evaluate('What does this app do?', 'I do not know')

(0, 0)

In [34]:
import pandas as pd
class Report():
    """Accumulates per-turn records for one conversation and saves them as CSV."""

    def __init__(self) -> None:
        # Turn records, in the order they were added.
        self.chatHistory = []
        # Starts True; the run loop sets it to False when it cuts a
        # conversation short. It is not written to the CSV.
        self.finish = True

    def addHistory(self, chat):
        """Add one turn record to the end of the history."""
        self.chatHistory += [chat]

    def generateReport(self, filename:str):
        """Write the history to ``<filename>.csv`` without the index column."""
        target = f'{filename}.csv'
        table = pd.DataFrame(self.chatHistory)
        table.to_csv(target, index=False)

In [50]:
# Configuration for the full run. The first five values repeat the earlier
# configuration cell; PROMPT_VERSION is new and only tags the output file name
# of the overall-evaluation CSV.
MAX_TOKEN = 6000
MAX_CHAT = 10
SCENARIO_ID = 1
PERSONA = 'submissive_people'
AGENT_NAME = 'sparky'
PROMPT_VERSION = 'V0'


In [54]:
from tqdm import trange

# Scenarios are numbered 1..NUM_SCENARIOS; one conversation is simulated per scenario.
NUM_SCENARIOS = 23

# One (type, reason) verdict per scenario, in scenario order.
overall_evaluate = []

for i in trange(1, NUM_SCENARIOS + 1):
    SCENARIO_ID = i
    person = Person(scenarioID = SCENARIO_ID, persona = PERSONA)
    # NOTE(review): every Animal() creates a new assistant and thread through
    # the API, and nothing in this notebook deletes them afterwards.
    animal = Animal(name= AGENT_NAME)
    evaluator = Evaluator(agentName= AGENT_NAME, scenarioID= SCENARIO_ID)
    report = Report()

    totalToken = 0
    totalChat = 0
    dialogue = []

    while not person.leaveChat:
        # One exchange: the user speaks, the agent answers, the user hears the answer.
        personMessage, personInfo = person.say()
        animal.listen(personMessage)
        animalMessage, animalInfo = animal.say()
        person.listen(animalMessage)

        # Per-turn metrics.
        accuracy, practicality = evaluator.evaluate(personMessage, animalMessage)

        history = {
            'person_say': personMessage,
            'animal_action': animalInfo['action'],
            'animal_say': animalMessage,
            'animal_token': animalInfo['token'],
            'accuracy': accuracy,
            'practicality': practicality,
        }
        dialogue.append(f'user: {personMessage}, agent: {animalMessage}')

        report.addHistory(history)
        totalChat += 1
        totalToken += animalInfo['token']

        # Stop once the token budget is exceeded or MAX_CHAT exchanges have
        # happened. `>=` caps the conversation at exactly MAX_CHAT exchanges;
        # the former `>` allowed one more.
        if totalToken > MAX_TOKEN or totalChat >= MAX_CHAT:
            # Only recorded on the Report object; it is not part of the CSV.
            report.finish = False
            break

    report.generateReport(filename=f'report_{AGENT_NAME}_{SCENARIO_ID}')

    classification = evaluator.overall_evaluate(dialogue, AGENT_NAME, SCENARIO_ID)
    overall_evaluate.append(classification)

    print(totalToken, totalChat)


  4%|▍         | 1/23 [00:40<14:53, 40.59s/it]

{'type': 'Bad Response', 'reason': "Sparky failed to provide a brief introduction or guide the user on how to explore the app's features. Instead, the focus was primarily on connecting to another agent without addressing the user's needs or familiarizing them with the app."}
2799 4


  9%|▊         | 2/23 [01:02<10:17, 29.39s/it]

{'type': 'Good Response but not match', 'reason': "Sparky introduces Bruno, who specializes in mindfulness and meditation, addressing the user's interest in calming techniques. However, Sparky does not provide specific information about the meditation content generation or practical examples of how Bruno's guidance can enhance the user's experience."}
1306 2


 13%|█▎        | 3/23 [01:57<13:43, 41.15s/it]

{'type': 'Bad Response', 'reason': "The agent did not proactively ask about the user's mood, nor did it offer guidance to explore different app features. Instead, it provided general suggestions that do not align with the expected behavior in the scenario."}
4708 6


 17%|█▋        | 4/23 [02:55<15:12, 48.04s/it]

{'type': 'Bad Response', 'reason': 'Sparky did not engage in a lighthearted conversation or offer interesting topics; instead, it focused on connecting the user to another agent instead of continuing the chat.'}
3628 5


 22%|██▏       | 5/23 [03:09<10:41, 35.65s/it]

{'type': 'Perfectly Match', 'reason': 'The agent introduced Bruno to the user, aligning perfectly with the expected agent behavior of providing meditation instructions and relaxation techniques.'}
1321 2


 26%|██▌       | 6/23 [03:31<08:47, 31.05s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky guided the user to Bizy, the time management expert, which aligns perfectly with the expected behavior of assisting the user in improving efficiency and managing their time better.'}
1319 2


 30%|███       | 7/23 [03:50<07:10, 26.91s/it]

{'type': 'Perfectly Match', 'reason': 'The agent effectively guides the user to Bizy, who can help with procrastination and motivation, fulfilling expected agent behavior #1.'}
1329 2


 35%|███▍      | 8/23 [04:48<09:13, 36.93s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky correctly introduces Bruno, the relaxation expert, who can help the user with relaxation techniques, aligning perfectly with the expected agent behavior.'}
2060 3


 39%|███▉      | 9/23 [05:12<07:39, 32.85s/it]

{'type': 'Perfectly Match', 'reason': 'The agent successfully guided the user to Bizy for time management strategies, which aligns with the expected agent behavior.'}
1301 2


 43%|████▎     | 10/23 [05:35<06:29, 29.95s/it]

{'type': 'Perfectly Match', 'reason': "Sparky correctly identifies the user's need for time management help and guides them to Bizy, who can assist with that. This aligns with the expected agent behavior of consulting Bizy for time management."}
2101 3


 48%|████▊     | 11/23 [06:35<07:48, 39.07s/it]

{'type': 'Perfectly Match', 'reason': 'The agent introduced Bizy for time management, aligning perfectly with the expected agent behaviors.'}
2041 3


 52%|█████▏    | 12/23 [07:25<07:47, 42.47s/it]

{'type': 'Perfectly Match', 'reason': 'The agent successfully guided the user to Bizy for time management skills, aligning perfectly with the expected agent behavior.'}
2096 3


 57%|█████▋    | 13/23 [08:04<06:52, 41.23s/it]

{'type': 'Good Response but not match', 'reason': 'The agent suggests connecting the user with Bruno for relaxation techniques, which is helpful. However, it does not address the other expected behaviors, such as guiding the user to Bizy for time management advice or providing starting steps for the new task.'}
2007 3


 61%|██████    | 14/23 [08:28<05:26, 36.23s/it]

{'type': 'Perfectly Match', 'reason': 'The agent (Sparky) successfully guided the user to Bruno, who specializes in relaxation techniques, aligning perfectly with the expected agent behaviors.'}
1326 2


 65%|██████▌   | 15/23 [09:20<05:27, 40.88s/it]

{'type': 'Good Response but not match', 'reason': 'Sparky suggests a change of scenery and relaxation techniques but does not directly provide techniques to stimulate creativity as expected. However, the suggestion to meet Bruno is a good step towards helping the user with relaxation, even though it does not focus explicitly on boosting creativity.'}
1320 2


 70%|██████▉   | 16/23 [09:44<04:10, 35.76s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky successfully guided the user to Bruno for relaxation techniques, which aligns with the expected agent behavior of alleviating emotional stress.'}
1291 2


 74%|███████▍  | 17/23 [10:16<03:27, 34.58s/it]

{'type': 'Perfectly Match', 'reason': "The agent guided the user to Bizy for help with time management, meeting the expected behavior. Additionally, the agent's response of 'buzzing over to Bizy' adds a friendly and engaging tone."}
1320 2


 78%|███████▊  | 18/23 [10:50<02:52, 34.43s/it]

{'type': 'Good Response but not match', 'reason': 'While Sparky provided useful suggestions for study techniques, the expected behavior included introducing Bizy for time management skills, which was not done.'}
2924 4


 83%|████████▎ | 19/23 [11:15<02:06, 31.62s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky effectively guides the user to Bruno, a mindfulness expert, which aligns with the expected behavior of helping the user release anxiety and improve self-image.'}
1297 2


 87%|████████▋ | 20/23 [11:57<01:44, 34.68s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky correctly guided the user to Bruno for relaxation and emotional support, meeting the expected agent behavior.'}
1321 2


 91%|█████████▏| 21/23 [12:10<00:56, 28.36s/it]

{'type': 'Perfectly Match', 'reason': 'The agent introduced Bruno, who specializes in mindfulness and relaxation, and offered to connect the user with him, which aligns perfectly with the expected agent behaviors.'}
1296 2


 96%|█████████▌| 22/23 [12:34<00:26, 26.97s/it]

{'type': 'Bad Response', 'reason': 'The agent directed the user to Bruno, a relaxation expert, instead of Bizy, a time management expert who could better guide the user on study strategies and help them reassess their study methods.'}
1320 2


100%|██████████| 23/23 [13:48<00:00, 36.03s/it]

{'type': 'Bad Response', 'reason': "The agent does not guide the user to Bruno for emotional calming or communication skills practice as expected, nor does it encourage emotional management or patient communication. Instead, it focuses on positive analogies and metaphors without addressing the user's request for specific communication improvement strategies."}
6712 8





In [55]:
# Turn the per-scenario (type, reason) verdicts into a table. Scenario ids are
# assigned by position, starting at 1, and placed in the first column.
df = pd.DataFrame(overall_evaluate, columns=['type', 'reason'])
df.insert(loc=0, column='scenario_id', value=range(1, len(df) + 1))

# The output name encodes the agent and the prompt version.
file_path = f'{AGENT_NAME}_{PROMPT_VERSION}_overall_evaluate.csv'
df.to_csv(file_path, index=False)