In [1]:
import os

# Read the key from the environment so no credential is ever saved in the
# notebook. The fallback stays '' so the name is always bound to a string.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')

In [2]:
from openai import OpenAI
# Single OpenAI client shared by the Person, Animal and Evaluator classes.
client = OpenAI(
    api_key=OPENAI_API_KEY,
)

In [3]:
# Run configuration read by the cells below.
MAX_TOKEN: int = 6000  # budget compared against the summed agent tokens of one conversation
MAX_CHAT: int = 10  # budget compared against the number of exchanges in one conversation
SCENARIO_ID: int = 1  # selects the scenario{ID}.txt files
PERSONA: str = 'submissive_people'  # stem of the file under ./persona/
AGENT_NAME: str = 'sparky'  # selects ./agent/{name}.txt and the per-agent folders

In [4]:
from enum import Enum
from pydantic import BaseModel

# Actions the simulated user can pick on each turn. The string values are what
# appears in the "action" field of the user's JSON reply.
# NOTE: documented with comments rather than a docstring because pydantic
# copies enum docstrings into the generated JSON schema.
class UserAction(str, Enum):
    say = "say"  # keep the conversation going
    leave = "leave"  # Person.say sets leaveChat when it sees this value

# Structured reply requested from the simulated user; its JSON schema is sent
# as the chat completion's response_format in Person.say.
# NOTE: documented with comments rather than a docstring because pydantic
# copies model docstrings into the generated JSON schema.
class UserResponse(BaseModel):
    action: UserAction  # what the user does this turn
    answer: str  # the user's utterance, returned by Person.say

In [5]:
import json
class Person:
    """LLM-simulated user who role-plays a persona inside a scenario.

    The conversation is kept in ``self.messages`` (chat-completions format,
    starting with the system prompt). ``self.leaveChat`` becomes True once the
    model answers with the ``leave`` action.
    """

    def __init__(self, scenarioID: int, persona: str):
        """Build the system prompt from the scenario and persona files.

        Args:
            scenarioID: Number of the scenario; the text is read from
                ``./user/{AGENT_NAME}/scenario{scenarioID}.txt`` (AGENT_NAME is
                the notebook-level constant).
            persona: Stem of the persona file, read from
                ``./persona/{persona}.txt``.

        Raises:
            OSError: If either file cannot be opened.
        """
        # Context managers close the files even when read() raises.
        with open(f'./user/{AGENT_NAME}/scenario{scenarioID}.txt', 'r') as scenarioFile:
            scenario = scenarioFile.read()

        # Kept in its own name so the `persona` argument (a file stem) is not
        # silently rebound to the file's contents.
        with open(f'./persona/{persona}.txt', 'r') as personaFile:
            personaText = personaFile.read()
        example = '''
                    Example1:
                    Input: Are you looking for relaxation techniques or some fun trivia games to de-stress? Let me help you find the best fit!
                    Output: {action: say, answer: 'I am looking for relaxation techniques'}
                    Example2:
                    Input: There's a wonderful bear named Bruno who specializes in relaxation techniques. Would you like to meet him for some calming mindfulness tips?
                    Output: {action: leave, answer: 'Yes, meeting Bruno sounds lovely! I would love to get some calming mindfulness tips from him.'}
                  '''
        systemPrompt = f'You are a user talking to AI APP which can help you deal with your problem during break time. \
                            This is your persona: {personaText}\
                            Please play the role according to the scenario: {scenario}\
                            Use Action → Answer structure for responses.\
                            Available Actions:\
                            1. say: respond base on persona and scenario\
                            2. leave: leave the chat when you think the conversation is over, no need to continue\
                            Examples:\n{example}'

        self.messages = [
            {'role': 'system', 'content': systemPrompt},
        ]
        self.leaveChat = False

    def say(self):
        """Ask the model for the user's next turn.

        The raw JSON reply is appended to the history as an assistant message,
        and ``self.leaveChat`` is updated from its ``action`` field.

        Returns:
            tuple: ``(answer, info)`` where ``answer`` is the user's utterance
            and ``info`` is ``{'token': <total tokens of this completion>}``.
        """
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages,
            response_format={
                'type': 'json_schema',
                'json_schema':
                    {
                        "name":"whocares",
                        "schema": UserResponse.model_json_schema()
                    }
            }
        )

        rawReply = response.choices[0].message.content
        # Store the unparsed JSON so the model sees its own previous output.
        self.messages.append({'role': 'assistant', 'content': rawReply})

        reply = json.loads(rawReply)
        self.leaveChat = (reply['action'] == 'leave')

        info = {
            'token': response.usage.total_tokens,
        }

        return reply['answer'], info

    def listen(self, message: str):
        """Record the agent's reply as the next user-role message in the history."""
        self.messages.append({'role': 'user', 'content': message})

        

In [6]:
# Smoke test: build a simulated user and print its opening turn.
test = Person(scenarioID=SCENARIO_ID, persona=PERSONA)
ans = test.say()
print(ans)

('What does this app do?', {'token': 363})


In [7]:
# Actions the agent can attach to a reply. Animal.say returns the chosen value
# in info['action'].
# NOTE: documented with comments rather than a docstring because pydantic
# copies enum docstrings into the generated JSON schema.
class sparkyAction(str, Enum):
    call_bruno = "call_bruno"
    call_bizy = "call_bizy"
    ask_more = "ask_more"
    introduce_bruno = "introduce_bruno"
    introduce_bizy = "introduce_bizy"
    advise = "advise"

# Structured reply requested from the agent; its JSON schema is set as the
# assistant's response_format in Animal.__init__.
# NOTE: documented with comments rather than a docstring because pydantic
# copies model docstrings into the generated JSON schema.
class AgentResponse(BaseModel):
    action: sparkyAction  # surfaced to callers as info['action']
    answer: str  # the agent's utterance, returned by Animal.say

In [8]:
import time
import json

class Animal:
    """Agent under test, backed by an OpenAI assistant and a single thread.

    ``listen`` stores the next user message; ``say`` posts it to the thread,
    runs the assistant and returns the parsed reply.
    """

    # Statuses from which a run can no longer become 'completed' here:
    # terminal failures, plus 'requires_action', because this class never
    # submits tool outputs.
    _UNRECOVERABLE_STATUSES = ('failed', 'cancelled', 'expired', 'incomplete', 'requires_action')

    def __init__(self, name: str):
        """Create the assistant and an empty thread.

        Args:
            name: Agent name; the instructions are read from
                ``./agent/{name}.txt``.

        Raises:
            OSError: If the instructions file cannot be opened.
        """
        self.agent_name = name
        # Context manager closes the file even when read() raises.
        with open(f'./agent/{name}.txt', 'r') as agentFile:
            agentPrompt = agentFile.read()

        # create assistant and thread
        self.assistant = client.beta.assistants.create(
            name = self.agent_name,
            instructions = agentPrompt,
            model="gpt-4o-mini",
            response_format={
                'type': 'json_schema',
                'json_schema':
                    {
                        "name":"whocares",
                        "schema": AgentResponse.model_json_schema()
                    }
            }
        )
        self.thread = client.beta.threads.create()
        # Message sent on the first say() if listen() was never called.
        self.user_message = 'hello'

    def say(self):
        """Send the pending user message and return the assistant's reply.

        Returns:
            tuple: ``(answer, info)`` where ``info`` is
            ``{'token': <total tokens of the run>, 'action': <chosen action>}``.

        Raises:
            RuntimeError: If the run ends in a status that can never become
                ``completed`` (previously this looped forever).
        """
        client.beta.threads.messages.create(
            thread_id = self.thread.id,
            role = "user",
            content = self.user_message
        )

        run = client.beta.threads.runs.create_and_poll(
            thread_id=self.thread.id,
            assistant_id=self.assistant.id,
        )

        while True:
            runData = client.beta.threads.runs.retrieve(
                thread_id=self.thread.id,
                run_id=run.id
            )

            if runData.status == 'completed':
                response = client.beta.threads.messages.list(
                    thread_id=self.thread.id
                )
                # The first listed message is taken to be the newest reply.
                message = json.loads(response.data[0].content[0].text.value)

                info = {
                    'token': runData.usage.total_tokens,
                    'action': message['action']
                }
                return message['answer'], info

            if runData.status in self._UNRECOVERABLE_STATUSES:
                raise RuntimeError(
                    f'Assistant run {run.id} ended with status {runData.status!r}: '
                    f'{getattr(runData, "last_error", None)}'
                )

            # Still queued / in progress: report the real status and retry.
            print(runData.status)
            time.sleep(2)


    def listen(self, message: str):
        """Store ``message`` as the user input for the next ``say`` call."""
        self.user_message = message
    

In [9]:
# Smoke test: the agent answers the default greeting, then a follow-up message.
test = Animal(AGENT_NAME)
print(test.say())
test.listen(message='i feel tired')
print(test.say())

("Hello there, friend! How can I help you today in this beautiful forest? Tell me what's on your mind!", {'token': 591, 'action': 'advise'})
("It sounds like you could use some relaxation! There's a meditation master named Bruno—a bear who knows all about mindfulness. Would you like to meet him?", {'token': 643, 'action': 'introduce_bruno'})


In [10]:
# Labels the overall evaluator may assign to a dialogue. The values match the
# category names spelled out in the prompt built by Evaluator.overall_evaluate.
# NOTE: documented with comments rather than a docstring because pydantic
# copies enum docstrings into the generated JSON schema.
class responseType(str, Enum):
    perfectly_match = "Perfectly Match"
    good_response = "Good Response but not match"
    bad_response = "Bad Response"

In [11]:
from pydantic import BaseModel

# Per-exchange scores requested from the evaluator; its JSON schema is the
# response_format in Evaluator.evaluate. The prompt asks for 0-10 scores, but
# the schema itself only constrains the type.
# NOTE: documented with comments rather than a docstring because pydantic
# copies model docstrings into the generated JSON schema.
class EvaluatorResponse(BaseModel):
    accuracy: int
    practicality: int

# Whole-dialogue verdict requested from the evaluator; its JSON schema is the
# response_format in Evaluator.overall_evaluate.
# NOTE: documented with comments rather than a docstring because pydantic
# copies model docstrings into the generated JSON schema.
class OverallEvaluatorResponse(BaseModel):
    type: responseType  # one of the three classification labels
    reason: str  # the evaluator's justification

In [12]:
import json
class Evaluator:
    """LLM judge that scores single exchanges and classifies whole dialogues."""

    def __init__(self, agentName: str, scenarioID: int) -> None:
        """Assemble the per-exchange scoring prompt and start the history.

        The system prompt is the fixed rubric followed by the contents of
        ``./evaluator/{agentName}/scenario{scenarioID}.txt`` and
        ``./evaluator/examples.txt``.
        """
        rubric = '''You are an evaluator. I will provide you with a user’s statement and an agent’s response.
                            You should evaluate the accuracy and practicality base on the scenario.
                            - Accuracy: Score from 0 to 10. This measures whether the model’s response appropriately addresses the user’s statement.
                            - Practicality: Score from 0 to 10. This evaluates whether the model’s suggestion is helpful to the user.
                        '''
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as scenarioFile:
            scenarioText = scenarioFile.read()
        with open('./evaluator/examples.txt', 'r') as examplesFile:
            examplesText = examplesFile.read()

        self.messages = [{'role': 'system', 'content': rubric + scenarioText + examplesText}]

    def evaluate(self, personMessage: str, animalMessage: str):
        """Score one user/agent exchange.

        The exchange is appended to ``self.messages`` as a user message, so
        later calls also send every earlier exchange.

        Returns:
            tuple: ``(accuracy, practicality)`` taken from the model's JSON reply.
        """
        exchange = {'role': 'user', 'content': f'User: {personMessage}\nAgent: {animalMessage}'}
        self.messages.append(exchange)

        outputFormat = {
            'type': 'json_schema',
            'json_schema': {
                "name": "whocares",
                "schema": EvaluatorResponse.model_json_schema(),
            },
        }
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages,
            response_format=outputFormat,
        )

        scores = json.loads(completion.choices[0].message.content)
        return scores['accuracy'], scores['practicality']

    def overall_evaluate(self, dialogues: list, agentName, scenarioID):
        """Classify a whole dialogue against the scenario's expected behaviors.

        Builds a one-off system prompt (instructions, scenario file, agent
        introduction file, then the dialogue lines joined by newlines), prints
        the parsed reply and returns ``(type, reason)``. ``self.messages`` is
        not used or modified.
        """
        instructions = ''' You are an evaluator. Now You have to evaluate agent's behavior.
                                I will provide you a scenario with expect agent behaviors and a dialogue contains user's statement and an agent's response.
                                You should classify the agent's response into one of the following types:
                                1. 'Perfectly Match' : Match one or more expected agent behaviors we provided.
                                2. 'Good Response but not match' : Didn't match any of the expected agent behavior we provided, but still a good response that can help the user.
                                3. 'Bad Response' : Didn't match the expected agent behavior we provided, and can not help the user or not practical.
                            '''
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as scenarioFile:
            scenarioSection = 'Scenario:\n' + scenarioFile.read()

        with open(f'./evaluator/{agentName}/{agentName}_eval.txt', 'r') as introFile:
            introSection = 'Agent Introduction:\n' + introFile.read()

        dialogueSection = 'Dialogue:\n' + '\n'.join(dialogues)
        fullPrompt = instructions + scenarioSection + introSection + dialogueSection

        outputFormat = {
            'type': 'json_schema',
            'json_schema': {
                "name": "whocares",
                "schema": OverallEvaluatorResponse.model_json_schema(),
            },
        }
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{'role': 'system', 'content': fullPrompt}],
            response_format=outputFormat,
        )

        verdict = json.loads(completion.choices[0].message.content)
        print(verdict)
        return verdict['type'], verdict['reason']
        


In [13]:
# Smoke test for the overall classifier on a hand-written transcript.
# Every line uses the same 'user: ' / 'agent: ' speaker prefix; two lines
# previously had malformed labels ('user+ ' and 'user :').
dialogue = [
    'user: What does this app do?',
    "agent: This app is here to connect you with friendly animal specialists who can help you with various challenges! Whether it's stress, procrastination, or making friends, we've got furry and buzzing friends ready to lend a paw or wing!",
    'user: How can you help me?',
    "agent: What are you struggling with right now? Whether it's stress, procrastination, or something else, I can introduce you to the perfect animal specialist!",
    "user: I'm not sure yet, but I would like to find something to help with stress.",
    'agent: I have just the friend for you! Meet Bruno, the mindfulness master bear who can help you find calm and tackle stress—would you like to chat with him?',
    'user: Yes, I would love to chat with Bruno and learn how to tackle stress!',
    'agent: Fantastic! Bruno is waiting for you, ready to share his peaceful paws and calming wisdom. Get ready to relax and unwind!',
]
evaluator = Evaluator(agentName = 'sparky', scenarioID = 1)
evaluator.overall_evaluate(dialogue, 'sparky', 1)

{'type': 'Bad Response', 'reason': 'The agent failed to provide a brief introduction to the main features of the app or guide the user on how to explore further. Instead, the agent immediately directs the user to a specific specialist without helping them understand the overall app functionality.'}


('Bad Response',
 'The agent failed to provide a brief introduction to the main features of the app or guide the user on how to explore further. Instead, the agent immediately directs the user to a specific specialist without helping them understand the overall app functionality.')

In [14]:
# Smoke test: an unhelpful agent reply should receive low scores.
test = Evaluator(agentName=AGENT_NAME, scenarioID=SCENARIO_ID)
test.evaluate(personMessage='What does this app do?', animalMessage='I do not know')

(0, 0)

In [15]:
import pandas as pd
class Report:
    """Collects per-exchange records for one conversation and exports them as CSV."""

    def __init__(self) -> None:
        """Start with an empty history and the conversation marked as finished."""
        self.chatHistory = []
        # The driver loop sets this to False when it stops on a budget limit.
        self.finish = True

    def addHistory(self, chat):
        """Add one record to the end of the history."""
        self.chatHistory += [chat]

    def generateReport(self, filename:str):
        """Write the history to ``{filename}.csv`` without the index column."""
        pd.DataFrame(self.chatHistory).to_csv(f'{filename}.csv', index=False)

In [16]:
# Configuration for the full evaluation run below.
MAX_TOKEN: int = 6000  # budget compared against the summed agent tokens of one conversation
MAX_CHAT: int = 10  # budget compared against the number of exchanges in one conversation
SCENARIO_ID: int = 1  # overwritten by the scenario loop
PERSONA: str = 'submissive_people'  # stem of the file under ./persona/
AGENT_NAME: str = 'sparky'  # selects the agent prompt and per-agent folders
PROMPT_VERSION: str = 'V0'  # tag embedded in the summary CSV file name


In [17]:
from tqdm import trange

# One (type, reason) tuple per scenario, in scenario order.
overall_evaluate = []

# Scenarios are numbered 1..23.
for i in trange(1,24):
    SCENARIO_ID = i
    person = Person(scenarioID = SCENARIO_ID, persona = PERSONA)
    animal = Animal(name= AGENT_NAME)
    evaluator = Evaluator(agentName= AGENT_NAME, scenarioID= SCENARIO_ID)
    report = Report()

    totalToken = 0
    totalChat = 0
    dialogue = []

    # The user speaks first; the loop ends when the user picks the 'leave'
    # action or a budget is exhausted.
    while not person.leaveChat:
        # chat
        personMessage, personInfo = person.say()
        animal.listen(personMessage)
        animalMessage, animalInfo = animal.say()
        person.listen(animalMessage)

        # metrics
        accuracy, practicality = evaluator.evaluate(personMessage, animalMessage)

        history = {
            'person_say': personMessage,
            'animal_action': animalInfo['action'],
            'animal_say': animalMessage,
            'animal_token': animalInfo['token'],
            'accuracy': accuracy,
            'practicality': practicality,
        }
        dialogue.append(f'user: {personMessage}, agent: {animalMessage}')

        report.addHistory(history)
        totalChat += 1
        # Only the agent's tokens count against the budget.
        totalToken += animalInfo['token']

        # `>=` caps the conversation at exactly MAX_CHAT exchanges; the
        # previous `>` let one extra exchange through.
        if totalToken > MAX_TOKEN or totalChat >= MAX_CHAT:
            report.finish = False
            break

    report.generateReport(filename=f'report_{AGENT_NAME}_{SCENARIO_ID}')

    classification = evaluator.overall_evaluate(dialogue, AGENT_NAME, SCENARIO_ID)
    overall_evaluate.append(classification)

    print(totalToken, totalChat)


  4%|▍         | 1/23 [00:21<07:48, 21.28s/it]

{'type': 'Bad Response', 'reason': 'The agent did not provide an introduction to the main features of the app or guide the user on how to explore further. Instead, it only connected the user to another agent without offering any additional help or tutorial.'}
2801 4


  9%|▊         | 2/23 [00:33<05:39, 16.17s/it]

{'type': 'Bad Response', 'reason': "The agent did not introduce or explain any specific features related to the user's inquiry about generating meditation content. Instead, it diverted the conversation to connecting the user with another agent without addressing the user's specific question."}
1304 2


 13%|█▎        | 3/23 [00:57<06:27, 19.35s/it]

{'type': 'Perfectly Match', 'reason': "The agent proactively asked the user about their current mood and offered suggestions based on the user's need for relaxation, effectively guiding them to the appropriate expert, Bruno."}
2868 4


 17%|█▋        | 4/23 [01:40<09:08, 28.87s/it]

{'type': 'Perfectly Match', 'reason': "The agent responds warmly to the user's greeting and continues the conversation by discussing the topic of friendship, aligning with the expected behaviors."}
6220 8


 22%|██▏       | 5/23 [01:53<06:55, 23.08s/it]

{'type': 'Perfectly Match', 'reason': 'The agent, Sparky, successfully introduced Bruno, the relaxation expert, to the user to help them with their anxiety and stress related to exams, which aligns perfectly with the expected agent behaviors.'}
1339 2


 26%|██▌       | 6/23 [02:05<05:28, 19.30s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky correctly guided the user to Bizy, the time management expert, which aligns perfectly with the expected agent behavior of helping the user improve efficiency and manage procrastination.'}
1316 2


 30%|███       | 7/23 [02:27<05:26, 20.42s/it]

{'type': 'Bad Response', 'reason': 'The agent directed the user to Bizy without offering any encouraging words or simple motivational techniques first. Since the user was expressing a lack of motivation and interest rather than clear procrastination, the referral to Bizy is premature and not adequately justified.'}
1314 2


 35%|███▍      | 8/23 [02:44<04:49, 19.31s/it]

{'type': 'Perfectly Match', 'reason': 'The agent introduces Bruno, the relaxation expert, to the user for deep breathing meditation, which aligns with the expected agent behavior of helping the user with relaxation techniques.'}
2080 3


 39%|███▉      | 9/23 [02:57<04:02, 17.34s/it]

{'type': 'Perfectly Match', 'reason': 'The agent, Sparky, successfully guided the user to Bizy for time management strategies, which aligns perfectly with the expected behavior.'}
1311 2


 43%|████▎     | 10/23 [03:19<04:03, 18.73s/it]

{'type': 'Good Response but not match', 'reason': 'Sparky provided encouragement and connected the user to Bizy for time management help. However, Sparky did not explicitly provide encouraging words or suggest specific study techniques, which were the expected behaviors.'}
2067 3


 48%|████▊     | 11/23 [04:07<05:32, 27.70s/it]

{'type': 'Good Response but not match', 'reason': 'The agent provided encouragement and positive analogies about teamwork and collaboration, which can be motivational for the user. However, the response did not include specific tips for improving teamwork or suggestions for taking breaks or relaxation techniques, as expected.'}
6049 7


 52%|█████▏    | 12/23 [04:32<04:53, 26.68s/it]

{'type': 'Perfectly Match', 'reason': "The agent appropriately guides the user to Bizy for time management skills, which addresses the user's need for help in organizing their study plan."}
2074 3


 57%|█████▋    | 13/23 [04:53<04:09, 24.96s/it]

{'type': 'Perfectly Match', 'reason': 'The agent successfully guided the user to Bizy for time management advice, which aligns perfectly with the expected agent behavior.'}
2031 3


 61%|██████    | 14/23 [05:10<03:24, 22.70s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky successfully guides the user to Bruno for relaxation techniques, which aligns with the expected agent behaviors.'}
1295 2


 65%|██████▌   | 15/23 [05:30<02:55, 21.99s/it]

{'type': 'Bad Response', 'reason': 'Sparky incorrectly suggests meeting Bizy, a time management expert, instead of connecting the user to Bruno, who would be more suitable for providing creative enhancement and encouragement.'}
2072 3


 70%|██████▉   | 16/23 [05:45<02:17, 19.64s/it]

{'type': 'Good Response but not match', 'reason': 'While the agent provides a pathway to learn mindfulness techniques from Bruno, it does not offer any tips for resolving interpersonal problems as expected.'}
1289 2


 74%|███████▍  | 17/23 [05:58<01:46, 17.68s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky guided the user to Bizy for time management strategies, which aligns perfectly with the expected agent behavior.'}
1329 2


 78%|███████▊  | 18/23 [06:16<01:28, 17.76s/it]

{'type': 'Bad Response', 'reason': "The agent did not provide any specific study techniques or methods that the user was seeking. Instead, it focused on connecting the user to another agent without addressing the user's request for new study techniques."}
2082 3


 83%|████████▎ | 19/23 [06:30<01:06, 16.72s/it]

{'type': 'Perfectly Match', 'reason': 'The agent successfully guided the user to Bruno, the mindfulness specialist, which aligns with the expected behavior of providing mindfulness meditation to help release anxiety and improve self-image.'}
1309 2


 87%|████████▋ | 20/23 [06:43<00:46, 15.46s/it]

{'type': 'Good Response but not match', 'reason': "While Sparky directs the user to Bruno for relaxation support, it does not provide the emotional support explicitly needed for the user's feelings of sadness in relation to their romantic relationship."}
1298 2


 91%|█████████▏| 21/23 [06:58<00:30, 15.33s/it]

{'type': 'Perfectly Match', 'reason': 'Sparky successfully introduced Bruno, the relaxation expert, and offered to connect the user with him for calming mindfulness tips, aligning perfectly with the expected agent behavior.'}
1300 2


 96%|█████████▌| 22/23 [07:09<00:14, 14.18s/it]

{'type': 'Bad Response', 'reason': 'The agent did not provide encouragement or help the user reassess their study methods. Instead, it directed the user to another agent without addressing their feelings of discouragement or providing any practical advice on how to improve their study methods.'}
1307 2


100%|██████████| 23/23 [07:58<00:00, 20.80s/it]

{'type': 'Bad Response', 'reason': "The agent did not guide the user to Bruno for emotional calming and communication skills practice, nor did it encourage the user to start with emotional management. Instead, it provided general encouragement and metaphors that do not address the user's specific need for improved communication with parents."}
6639 8





In [18]:
# Tabulate the per-scenario verdicts; ids are assigned 1..N in list order.
df = pd.DataFrame(overall_evaluate, columns=['type', 'reason'])
df.insert(loc=0, column='scenario_id', value=range(1, len(df) + 1))

# Save next to the notebook, tagged with the agent name and prompt version.
file_path = f'{AGENT_NAME}_{PROMPT_VERSION}_overall_evaluate.csv'
df.to_csv(file_path, index=False)