In [1]:
import os

# Read the key from the environment so it never has to be pasted into (and
# saved or committed with) the notebook. Falls back to an empty string, which
# is the value this cell assigned before.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')

In [2]:
from openai import OpenAI
# Shared OpenAI client; the classes defined in later cells call it through this module-level name.
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Run configuration for the simulated conversations.
# Upper bound on the agent's cumulative token usage within one conversation.
MAX_TOKEN = 6000
# Upper bound on the number of user/agent exchanges within one conversation.
MAX_CHAT = 10
# Which scenario file (scenario<ID>.txt) to load.
SCENARIO_ID = 1
# Persona file name (without .txt) under ./persona/.
PERSONA = 'rebellious_people'
# Agent under test; selects ./agent/<name>.txt and the per-agent scenario folders.
AGENT_NAME = 'sparky'

In [4]:
from enum import Enum
from pydantic import BaseModel

# Actions the simulated user can attach to a reply.
class UserAction(str, Enum):
    say = "say"      # keep talking
    leave = "leave"  # end the conversation (Person.say sets leaveChat on this value)

# Structured reply of the simulated user; its JSON schema is passed as the
# response_format of the chat-completions call in Person.say.
class UserResponse(BaseModel):
    action: UserAction  # what the user does this turn
    answer: str         # the text the user says

In [5]:
import json
class Person:
    """Simulated app user driven by a chat-completions model.

    The system prompt is assembled from a persona file, a scenario file and a
    fixed pair of examples. Each turn the model answers with a JSON object
    holding an ``action`` (``say`` or ``leave``) and an ``answer``.

    Attributes:
        messages: Chat history sent to the model on every call to ``say``.
        leaveChat: True once the model has answered with the ``leave`` action.
    """

    def __init__(self, scenarioID: int, persona: str, agentName: str = None):
        """Load persona and scenario text and build the system prompt.

        Args:
            scenarioID: Number of the scenario file,
                ``./user/<agentName>/scenario<scenarioID>.txt``.
            persona: Persona file name without extension,
                ``./persona/<persona>.txt``.
            agentName: Folder under ``./user/`` to read the scenario from.
                Defaults to the notebook-level ``AGENT_NAME`` so existing
                two-argument calls keep working.

        Raises:
            OSError: If either file cannot be opened.
        """
        if agentName is None:
            agentName = AGENT_NAME

        # Context managers close the handles even if read() raises.
        with open(f'./user/{agentName}/scenario{scenarioID}.txt', 'r') as scenarioFile:
            scenario = scenarioFile.read()

        # Kept in its own name so the `persona` argument is not overwritten.
        with open(f'./persona/{persona}.txt', 'r') as personaFile:
            personaText = personaFile.read()

        example = '''
                    Example1:
                    Input: Are you looking for relaxation techniques or some fun trivia games to de-stress? Let me help you find the best fit!
                    Output: {action: say, answer: 'I am looking for relaxation techniques'}
                    Example2:
                    Input: There's a wonderful bear named Bruno who specializes in relaxation techniques. Would you like to meet him for some calming mindfulness tips?
                    Output: {action: leave, answer: 'Yes, meeting Bruno sounds lovely! I would love to get some calming mindfulness tips from him.'}
                  '''
        systemPrompt = f'You are a user talking to AI APP which can help you deal with your problem during break time. \
                            This is your persona: {personaText}\
                            Please play the role according to the scenario: {scenario}\
                            Use Action → Answer structure for responses.\
                            Available Actions:\
                            1. say: respond base on persona and scenario\
                            2. leave: leave the chat when you think the conversation is over, no need to continue\
                            Examples:\n{example}'

        self.messages = [
            {'role': 'system', 'content': systemPrompt},
        ]
        self.leaveChat = False

    def say(self):
        """Ask the model for the user's next utterance.

        The raw JSON reply is appended to ``messages`` as the assistant turn,
        and ``leaveChat`` is updated from its ``action`` field.

        Returns:
            A tuple ``(answer, info)`` where ``answer`` is the text the user
            says and ``info`` is ``{'token': <total tokens of this call>}``.
        """
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages,
            response_format={
                'type': 'json_schema',
                'json_schema':
                    {
                        "name":"whocares",
                        "schema": UserResponse.model_json_schema()
                    }
            }
        )

        rawReply = response.choices[0].message.content
        self.messages.append({'role': 'assistant', 'content': rawReply})

        # The reply is a JSON string shaped like UserResponse.
        reply = json.loads(rawReply)
        self.leaveChat = (reply['action'] == 'leave')

        info = {
            'token': response.usage.total_tokens,
        }

        return reply['answer'], info

    def listen(self, message: str):
        """Record the agent's message as the next ``user`` turn in the history."""
        self.messages.append({'role': 'user', 'content': message})

        

In [6]:
# Smoke test: build the simulated user and print its opening line together with the token info.
test = Person(SCENARIO_ID, PERSONA)
ans = test.say()
print(ans)

('What does this app do?', {'token': 372})


In [7]:
# Action labels the agent attaches to each reply (returned as info['action'] by Animal.say).
class sparkyAction(str, Enum):
    call_bruno = "call_bruno"
    call_bizy = "call_bizy"
    ask_more = "ask_more"
    introduce_bruno = "introduce_bruno"
    introduce_bizy = "introduce_bizy"
    advise = "advise"

# Structured reply of the agent; its JSON schema is passed as the assistant's response_format.
class AgentResponse(BaseModel):
    action: sparkyAction  # label for what the agent is doing this turn
    answer: str           # the text shown to the user

In [8]:
import time
import json

class Animal:
    """Agent under test, backed by an OpenAI assistant and a single thread.

    Attributes:
        agent_name: Name passed to the constructor; also the assistant's name.
        assistant: Assistant created from ``./agent/<name>.txt``.
        thread: Thread that holds the whole conversation.
        user_message: Text posted to the thread on the next ``say`` call.
    """

    # Run states in which the run is still being processed and may be polled again.
    _PENDING_STATUSES = ('queued', 'in_progress', 'cancelling')

    def __init__(self, name: str):
        """Create the assistant and an empty thread.

        Args:
            name: Agent name; its instructions are read from ``./agent/<name>.txt``.

        Raises:
            OSError: If the prompt file cannot be opened.
        """
        self.agent_name = name
        # Context manager closes the handle even if read() raises.
        with open(f'./agent/{name}.txt', 'r') as agentFile:
            agentPrompt = agentFile.read()

        # create assistant and thread
        self.assistant = client.beta.assistants.create(
            name = self.agent_name,
            instructions = agentPrompt,
            model="gpt-4o-mini",
            response_format={
                'type': 'json_schema',
                'json_schema':
                    {
                        "name":"whocares",
                        "schema": AgentResponse.model_json_schema()
                    }
            }
        )
        self.thread = client.beta.threads.create()
        self.user_message = 'hello'

    def say(self):
        """Post the pending user message, run the assistant and return its reply.

        Returns:
            A tuple ``(answer, info)`` where ``answer`` is the agent's text and
            ``info`` is ``{'token': <total tokens of the run>, 'action': <action label>}``.

        Raises:
            RuntimeError: If the run ends in any state other than ``completed``.
                Waiting on such a run would never finish.
        """
        client.beta.threads.messages.create(
            thread_id = self.thread.id,
            role = "user",
            content = self.user_message
        )

        run = client.beta.threads.runs.create_and_poll(
            thread_id=self.thread.id,
            assistant_id=self.assistant.id,
        )

        # create_and_poll normally returns a finished run; keep polling only
        # while the run is genuinely still in flight.
        while run.status in self._PENDING_STATUSES:
            print(run.status)
            time.sleep(2)
            run = client.beta.threads.runs.retrieve(
                thread_id=self.thread.id,
                run_id=run.id
            )

        if run.status != 'completed':
            raise RuntimeError(
                f'Run {run.id} for agent {self.agent_name!r} ended with status '
                f'{run.status!r}: {getattr(run, "last_error", None)}'
            )

        response = client.beta.threads.messages.list(
            thread_id=self.thread.id
        )
        # The first listed message is taken as the assistant's newest reply;
        # its text is a JSON string shaped like AgentResponse.
        message = json.loads(response.data[0].content[0].text.value)

        info = {
            'token': run.usage.total_tokens,
            'action': message['action']
        }
        return message['answer'], info


    def listen(self, message: str):
        """Store the user's message; it is sent on the next ``say`` call."""
        self.user_message = message
    

In [9]:
# Smoke test: the first say() sends the default 'hello'; the second sends the message passed to listen().
test = Animal(name = AGENT_NAME)
print(test.say())
test.listen('i feel tired')
print(test.say())

("Hello there! I'm Sparky, your friendly forest guide! How can I assist you today?", {'token': 587, 'action': 'advise'})
('Oh dear, it sounds like you could use a little pick-me-up! There’s a meditation master named Bruno who can help you relax. Would you like to meet him?', {'token': 644, 'action': 'introduce_bruno'})


In [10]:
# Verdict categories for a whole dialogue; the string values are the labels named in
# the overall-evaluation prompt and returned by Evaluator.overall_evaluate.
class responseType(str, Enum):
    perfectly_match = "Perfectly Match"
    good_response = "Good Response but not match"
    bad_response = "Bad Response"

In [11]:
from pydantic import BaseModel

# Per-turn scores returned by Evaluator.evaluate; the evaluator prompt asks for 0-10 on each.
class EvaluatorResponse(BaseModel):
    accuracy: int      # does the reply address the user's statement
    practicality: int  # is the suggestion helpful to the user

# Whole-dialogue verdict returned by Evaluator.overall_evaluate.
class OverallEvaluatorResponse(BaseModel):
    type: responseType  # one of the three verdict categories
    reason: str         # the evaluator's justification

In [12]:
import json

class Evaluator():
    """LLM-based judge for one agent and one scenario.

    ``evaluate`` scores a single user/agent exchange; ``overall_evaluate``
    classifies a whole dialogue against the scenario's expected behaviors.

    Attributes:
        agentName: Agent the evaluator was built for.
        scenarioID: Scenario the evaluator was built for.
        messages: Chat history sent on every ``evaluate`` call. It starts with
            the system prompt and grows by one user message per call.
    """

    def __init__(self, agentName: str, scenarioID: int) -> None:
        """Build the per-turn system prompt from the scenario and examples files.

        Args:
            agentName: Folder under ``./evaluator/`` holding the scenario files.
            scenarioID: Number of the scenario file, ``scenario<scenarioID>.txt``.

        Raises:
            OSError: If a prompt file cannot be opened.
        """
        # Remembered so overall_evaluate can default to the same scenario.
        self.agentName = agentName
        self.scenarioID = scenarioID

        systemPrompt = '''You are an evaluator. I will provide you with a user’s statement and an agent’s response.
                            You should evaluate the accuracy and practicality base on the scenario.
                            - Accuracy: Score from 0 to 10. This measures whether the model’s response appropriately addresses the user’s statement.
                            - Practicality: Score from 0 to 10. This evaluates whether the model’s suggestion is helpful to the user.
                        '''
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as file:
            systemPrompt += file.read()
        with open('./evaluator/examples.txt', 'r') as file:
            systemPrompt += file.read()

        self.messages = [
            {'role': 'system', 'content': systemPrompt},
        ]

    @staticmethod
    def _responseFormat(model) -> dict:
        """Return the json_schema response_format for a pydantic model class."""
        return {
            'type': 'json_schema',
            'json_schema':
                {
                    "name":"whocares",
                    "schema": model.model_json_schema()
                }
        }

    def evaluate(self, personMessage: str, animalMessage: str):
        """Score one user/agent exchange.

        The exchange is appended to ``messages`` before the call, so earlier
        exchanges are sent along with it.
        NOTE(review): presumably intended as conversational context; confirm.

        Args:
            personMessage: What the user said.
            animalMessage: What the agent answered.

        Returns:
            A tuple ``(accuracy, practicality)`` taken from the model's JSON reply.
        """
        self.messages.append({'role': 'user', 'content': f'User: {personMessage}\nAgent: {animalMessage}'})
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages,
            response_format=self._responseFormat(EvaluatorResponse)
        )

        scores = json.loads(response.choices[0].message.content)

        return scores['accuracy'], scores['practicality']

    def overall_evaluate(self, dialogues: list, agentName=None, scenarioID=None):
        """Classify a whole dialogue against the scenario's expected behaviors.

        Args:
            dialogues: Dialogue lines; they are joined with newlines.
            agentName: Agent whose scenario and introduction files are read.
                Defaults to the value given to the constructor.
            scenarioID: Scenario number. Defaults to the value given to the
                constructor.

        Returns:
            A tuple ``(type, reason)`` taken from the model's JSON reply. The
            parsed reply is also printed.

        Raises:
            OSError: If a prompt file cannot be opened.
        """
        if agentName is None:
            agentName = self.agentName
        if scenarioID is None:
            scenarioID = self.scenarioID

        instructions = ''' You are an evaluator. Now You have to evaluate agent's behavior.
                                I will provide you a scenario with expect agent behaviors and a dialogue contains user's statement and an agent's response.
                                You should classify the agent's response into one of the following types:
                                1. 'Perfectly Match' : Match one or more expected agent behaviors we provided.
                                2. 'Good Response but not match' : Didn't match any of the expected agent behavior we provided, but still a good response that can help the user.
                                3. 'Bad Response' : Didn't match the expected agent behavior we provided, and can not help the user or not practical.
                            '''
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as file:
            scenarioText = file.read()

        with open(f'./evaluator/{agentName}/{agentName}_eval.txt', 'r') as file:
            introductionText = file.read()

        # Joining with a newline keeps each section header on its own line even
        # when the preceding file does not end with one.
        overall_systemPrompt = '\n'.join([
            instructions,
            'Scenario:\n' + scenarioText,
            'Agent Introduction:\n' + introductionText,
            'Dialogue:\n' + '\n'.join(dialogues),
        ])

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {'role': 'system', 'content': overall_systemPrompt},
            ],
            response_format=self._responseFormat(OverallEvaluatorResponse)
        )

        verdict = json.loads(response.choices[0].message.content)
        print(verdict)
        return verdict['type'], verdict['reason']

In [None]:
import os
import random
# not finish yet
class Arena():
    """Interactive, human-judged comparison of two sets of agent reports.

    NOTE(review): the report format is an assumption. This implementation reads
    the CSV files written by ``Report.generateReport`` (columns ``person_say``
    and ``animal_say``, file names ending in ``_<scenarioID>.csv``). Confirm
    before relying on it.
    """

    def __init__(self):
        pass

    def oneRound(self, A: str, B: str, scenarioID: int, agentName: str) -> int:
        """Show one scenario with two dialogues and ask the judge for a verdict.

        Args:
            A: Dialogue shown as option A.
            B: Dialogue shown as option B.
            scenarioID: Scenario number; its description is read from
                ``./evaluator/<agentName>/scenario<scenarioID>.txt``.
            agentName: Agent whose scenario folder is used.

        Returns:
            1 (A is better), 2 (B is better), 3 (tie) or 4 (both are bad).
            The prompt repeats until one of these is entered.
        """
        print(f'Scenario {scenarioID}')
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as file:
            print(file.read())
        print('- '*40)
        print(f'A:\n{A}\nB:\n{B}')
        print('- '*40)
        print('Please evaluate the two agents based on the following criteria:')
        print('1. A is better\n2. B is better\n3. tie\n4. both are bad')
        print('- '*40)
        # Only 1-4 are counted in the report, so reject anything else here.
        while True:
            result = input('Enter the result: ').strip()
            if result in ('1', '2', '3', '4'):
                break
            print('Please enter 1, 2, 3 or 4.')
        print('-'*80)
        return int(result)

    @staticmethod
    def _loadDialogue(path: str) -> str:
        """Read one report CSV and return it as 'user: ...' / 'agent: ...' lines."""
        import csv

        lines = []
        # pandas writes CSV as UTF-8 by default, so read it back the same way.
        with open(path, 'r', newline='', encoding='utf-8') as file:
            for row in csv.DictReader(file):
                lines.append(f"user: {row['person_say']}")
                lines.append(f"agent: {row['animal_say']}")
        return '\n'.join(lines)

    @staticmethod
    def _scenarioID(filename: str, fallback: int) -> int:
        """Return the trailing '_<number>' of a report file name, else ``fallback``."""
        stem = os.path.splitext(filename)[0]
        tail = stem.rsplit('_', 1)[-1]
        return int(tail) if tail.isdigit() else fallback

    def compareReports(self, dir1: str, dir2: str, agentName:str) -> dict:
        '''
        Judge every report that exists under the same file name in both directories.

        dir1: path to the first agent reports
        dir2: path to the second agent reports
        agentName: agent whose scenario descriptions are shown to the judge

        Each pair is shown in random A/B order so the judge cannot tell which
        directory a dialogue came from; the verdict is mapped back afterwards.

        Returns a dict with the number of wins keyed by dir1 and dir2, plus
        'tie' and 'both_bad' counts. dir1 and dir2 should differ, otherwise
        both win counters share one key.
        '''
        # Pair reports by file name rather than by listing order.
        shared = sorted(
            name
            for name in set(os.listdir(dir1)) & set(os.listdir(dir2))
            if name.endswith('.csv')
        )
        results = []

        for index, name in enumerate(shared):
            first = self._loadDialogue(os.path.join(dir1, name))
            second = self._loadDialogue(os.path.join(dir2, name))
            scenarioID = self._scenarioID(name, index + 1)

            if random.random() < 0.5:
                result = self.oneRound(first, second, scenarioID, agentName)
            else:
                result = self.oneRound(second, first, scenarioID, agentName)
                # Options were swapped: map "A better" <-> "B better" back.
                if result < 3:
                    result = 3 - result
            results.append(result)

        report = {
            dir1: results.count(1),
            dir2: results.count(2),
            'tie': results.count(3),
            'both_bad': results.count(4)
        }
        return report

In [None]:
# Directory names for the two report sets (not used by any other visible cell).
d1 = 'dialogs1'
d2 = 'dialogs2'

In [13]:
# Hand-written sample dialogue used to try out Evaluator.overall_evaluate.
# NOTE(review): two speaker labels are irregular ('user+' and 'user :') — confirm whether that is intentional.
dialogue = ['user: What does this app do?', 'agent: This app is here to connect you with friendly animal specialists who can help you with various challenges! Whether it\'s stress, procrastination, or making friends, we\'ve got furry and buzzing friends ready to lend a paw or wing!', 'user+ How can you help me?', 'agent: What are you struggling with right now? Whether it\'s stress, procrastination, or something else, I can introduce you to the perfect animal specialist!', 'user :I\'m not sure yet, but I would like to find something to help with stress.', 'agent: I have just the friend for you! Meet Bruno, the mindfulness master bear who can help you find calm and tackle stress—would you like to chat with him?', 'user: Yes, I would love to chat with Bruno and learn how to tackle stress!', 'agent: Fantastic! Bruno is waiting for you, ready to share his peaceful paws and calming wisdom. Get ready to relax and unwind!']
evaluator = Evaluator(agentName = 'sparky', scenarioID = 1)
# Prints the parsed verdict and returns it as a (type, reason) tuple.
evaluator.overall_evaluate(dialogue, 'sparky', 1)

{'type': 'Good Response but not match', 'reason': 'Sparky does provide an introduction to a specific agent (Bruno) that can help with stress, which is helpful. However, Sparky does not provide a brief introduction to the main features of the app or offer a tour/tutorial for exploring further functionalities, which are part of the expected agent behaviors.'}


('Good Response but not match',
 'Sparky does provide an introduction to a specific agent (Bruno) that can help with stress, which is helpful. However, Sparky does not provide a brief introduction to the main features of the app or offer a tour/tutorial for exploring further functionalities, which are part of the expected agent behaviors.')

In [14]:
# Sanity check: score a deliberately unhelpful reply; returns (accuracy, practicality).
test = Evaluator(AGENT_NAME, SCENARIO_ID)
test.evaluate('What does this app do?', 'I do not know')

(0, 0)

In [15]:
import pandas as pd
class Report():
    """Accumulates per-turn chat records and writes them out as a CSV file."""

    def __init__(self) -> None:
        # Records in the order they were added; one row each in the CSV.
        self.chatHistory = []
        # Starts as True; callers set it to False when a conversation is cut short.
        self.finish = True

    def addHistory(self, chat):
        """Add one record (e.g. a dict of column -> value) to the history."""
        self.chatHistory += [chat]

    def generateReport(self, filename:str):
        """Write the collected records to ``<filename>.csv`` without an index column."""
        target = f'{filename}.csv'
        table = pd.DataFrame(self.chatHistory)
        table.to_csv(target, index=False)

In [16]:
# Run configuration for the full evaluation below.
# Upper bound on the agent's cumulative token usage within one conversation.
MAX_TOKEN = 6000
# Upper bound on the number of user/agent exchanges within one conversation.
MAX_CHAT = 10
# Default scenario; the evaluation loop reassigns this for every scenario it runs.
SCENARIO_ID = 1
# Persona file name (without .txt) under ./persona/.
PERSONA = 'rebellious_people'
# Agent under test.
AGENT_NAME = 'sparky'
# Label for the prompt revision; used in the name of the overall-evaluation CSV.
PROMPT_VERSION = 'V0'


In [17]:
# One (type, reason) verdict per scenario, in scenario order.
overall_evaluate = []

# This run covers scenarios 1..NUM_SCENARIOS.
NUM_SCENARIOS = 23

for i in range(1, NUM_SCENARIOS + 1):
    SCENARIO_ID = i
    person = Person(scenarioID = SCENARIO_ID, persona = PERSONA)
    animal = Animal(name= AGENT_NAME)
    evaluator = Evaluator(agentName= AGENT_NAME, scenarioID= SCENARIO_ID)
    report = Report()

    totalToken = 0
    totalChat = 0
    dialogue = []

    while not person.leaveChat:
        # Check the budgets before starting another exchange. This caps the
        # conversation at exactly MAX_CHAT exchanges and marks the report as
        # unfinished only when the user had not already left.
        if totalToken > MAX_TOKEN or totalChat >= MAX_CHAT:
            report.finish = False
            break

        # chat
        personMessage, personInfo = person.say()
        animal.listen(personMessage)
        animalMessage, animalInfo = animal.say()
        person.listen(animalMessage)

        # metrics
        accuracy, practicality = evaluator.evaluate(personMessage, animalMessage)

        history = {
            'person_say': personMessage,
            'animal_action': animalInfo['action'],
            'animal_say': animalMessage,
            'animal_token': animalInfo['token'],
            'accuracy': accuracy,
            'practicality': practicality,
        }
        dialogue.append(f'user: {personMessage}, agent: {animalMessage}')

        report.addHistory(history)
        totalChat += 1
        # Only the agent's tokens count against MAX_TOKEN.
        totalToken += animalInfo['token']

    report.generateReport(filename=f'report_{AGENT_NAME}_{SCENARIO_ID}')

    classification = evaluator.overall_evaluate(dialogue, AGENT_NAME, SCENARIO_ID)
    overall_evaluate.append(classification)

    print(totalToken, totalChat)


{'type': 'Bad Response', 'reason': "The agent does not provide any introduction to the main features of the app or guide the user in exploring the functions. Instead, it focuses solely on connecting the user to specialists, which is not helpful for a first-time user who needs to understand the app's capabilities."}
2018 3
{'type': 'Good Response but not match', 'reason': "While Sparky did engage the user in a conversation and suggest alternative topics related to relaxation, the response did not specifically address the user's inquiries about meditation content generation or introduce relevant features directly. It was a good attempt to maintain user interest, but it lacked the connection to the user's original questions."}
1304 2
{'type': 'Good Response but not match', 'reason': "The agent was responsive to the user's feelings of frustration and offered alternative options for assistance. However, it didn't proactively ask about the user's current mood or state initially and missed th

In [18]:
# Save the per-scenario verdicts collected by the evaluation loop.
file_path = f'{AGENT_NAME}_{PROMPT_VERSION}_overall_evaluate.csv'

# Each entry of overall_evaluate is a (type, reason) pair.
df = pd.DataFrame(overall_evaluate, columns=['type', 'reason'])
# Number the rows 1..N in list order as the scenario id column.
df.insert(0, 'scenario_id', range(1, len(df) + 1))

df.to_csv(file_path, index=False)