In [1]:
OPENAI_API_KEY=''

In [2]:
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

## Settings

In [3]:
MAX_TOKEN = 6000
MAX_CHAT = 10
SCENARIO_ID = 1
PERSONA = 'emma'
AGENT_NAME = 'bizy'
PROMPT_VERSION = 'V1'

## User

In [4]:
from enum import Enum
from pydantic import BaseModel

class UserAction(str, Enum):
    say = "say"
    leave = "leave"

class UserResponse(BaseModel):
    action: UserAction
    answer: str

In [5]:
import json
class Person:
    def __init__(self, scenarioID: int, persona: str):

        scenarioFile = open(f'./user/{AGENT_NAME}/scenario{scenarioID}.txt', 'r')
        scenario = scenarioFile.read()
        scenarioFile.close()

        personaFile = open(f'./persona/{persona}.txt', 'r')
        persona = personaFile.read()
        personaFile.close()
        example = '''
                    Example1:
                    Input: Are you looking for relaxation techniques or some fun trivia games to de-stress? Let me help you find the best fit!
                    Output: {action: say, answer: 'I am looking for relaxation techniques'}
                    Example2:
                    Input: There's a wonderful bear named Bruno who specializes in relaxation techniques. Would you like to meet him for some calming mindfulness tips?
                    Output: {action: leave, answer: 'Yes, meeting Bruno sounds lovely! I would love to get some calming mindfulness tips from him.'}
                  '''
        systemPrompt = f'You are a user talking to AI APP which can help you deal with your problem during break time. \
                            This is your persona: {persona}\
                            Please play the role according to the scenario: {scenario}\
                            Use Action → Answer structure for responses.\
                            Available Actions:\
                            1. say: respond base on persona and scenario\
                            2. leave: leave the chat when you think the conversation is over, no need to continue\
                            Examples:\n{example}'
        # print(systemPrompt)

        self.messages = [
            {'role': 'system', 'content': systemPrompt}, 
        ]
        self.leaveChat = False
    
    def say(self):
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages,
            response_format={
                'type': 'json_schema',
                'json_schema': 
                    {
                        "name":"whocares", 
                        "schema": UserResponse.model_json_schema()
                    }
            }
        )

        message = response.choices[0].message.content
        self.messages.append({'role': 'assistant', 'content': message})

        # str to dict
        message = json.loads(message)
        self.leaveChat = (message['action'] == 'leave')

        info = {
            'token': response.usage.total_tokens,
        }
        
        return message['answer'], info
    
    def listen(self, message: str):
        self.messages.append({'role': 'user', 'content': message})

        

## Agent

In [6]:
class tmpAction(str, Enum):
    hi = "hi"

In [7]:
class tmpResponseFormat(BaseModel):
    action: tmpAction
    answer: str

In [8]:
import json
class Animal:
    def __init__(self, name, version: str):

        self.agent_name = name
        agentFile = open(f'./agent/{name}_{version}.txt', 'r')
        agentPrompt = agentFile.read()
        agentFile.close()

        self.responseFormat = {
            'type': 'json_schema',
            'json_schema': 
                {
                    "name":"whocares", 
                    "schema": self.get_response_schema()
                }
        }
        self.systemPrompt = [
            {'role': 'system', 'content': agentPrompt}, 
        ]
        self.dialogues = []
    
    def get_response_schema(self):
        return tmpResponseFormat.model_json_schema()
    
    def say(self):
        
        prompt = self.systemPrompt + self.dialogues
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=prompt,
            response_format=self.responseFormat
        )

        message = response.choices[0].message.content
        
        self.dialogues.append({'role': 'assistant', 'content': message})
        message = json.loads(message)

        info = {
            'token': response.usage.total_tokens,
        }
        
        return message['answer'], info
    
    def listen(self, message: str):
        self.dialogues.append({'role': 'user', 'content': message})

        

### Sparky

In [9]:
class sparkyActionV0(str, Enum):
    call_bruno = "call_bruno"
    call_bizy = "call_bizy"
    ask_more = "ask_more"
    introduce_bruno = "introduce_bruno"
    introduce_bizy = "introduce_bizy"
    advise = "advise"

class sparkyActionV1(str, Enum):
    guide_to_bruno = "guide_to_bruno"
    guide_to_bizy = "guide_to_bizy"
    explore = "explore"
    introduce_bruno = "introduce_bruno"
    introduce_bizy = "introduce_bizy"

In [10]:
class sparkyResponseFormat(BaseModel):
    action: sparkyActionV1
    answer: str

In [11]:
class Sparky(Animal):

    def get_response_schema(self):
        return sparkyResponseFormat.model_json_schema()
    

### Bizy

In [12]:
class bizyActionV0(str, Enum):
    greet = "greet"
    start_analysis = "start_analysis"
    analysing = "analysing"
    finish_analysis = "finish_analysis"
    ask_excuse = "ask_excuse"
    change_excuse = "change_excuse"
    advise = "advise"

In [13]:
class bizy0ActionV1(str, Enum):
    greet = "greet"
    analysis = "analysis"
    advise = "advise"

In [14]:
class bizyResponseFormat(BaseModel):
    action: bizy0ActionV1
    answer: str

In [15]:
class bizy1ActionV1(str, Enum):
    start_analysis = "start_analysis"
    analysing = "analysing"
    finish_analysis = "finish_analysis"

In [16]:
class bizy1ResponseFormat(BaseModel):
    action: bizy1ActionV1
    answer: str

In [26]:
class Bizy(Animal):
    
    def get_response_schema(self):
        return bizyResponseFormat.model_json_schema()
    
    def say(self):
        prompt = self.systemPrompt + self.dialogues
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=prompt,
            response_format=self.responseFormat
        )

        message = response.choices[0].message.content
        self.dialogues.append({'role': 'assistant', 'content': message})

        message = json.loads(message)

        if message['action'] == 'analysis':
            with open(f'./agent/bizy1_{PROMPT_VERSION}.txt', 'r') as agentFile:
                self.systemPrompt = [
                    {'role': 'system', 'content': agentFile.read()}, 
                ]
            self.responseFormat = {
                'type': 'json_schema',
                'json_schema': 
                    {
                        "name":"whocares", 
                        "schema": bizy1ResponseFormat.model_json_schema()
                    }
            }

            self.dialogues.append({'role': 'assistant', 'content': "You've been changed to little bee who responsible for analysis."})
            
                
        if message['action'] == 'finish_analysis':
            with open(f'./agent/bizy0_{PROMPT_VERSION}.txt', 'r') as agentFile:
                self.systemPrompt = [
                    {'role': 'system', 'content': agentFile.read()}, 
                ]
            self.responseFormat = {
                'type': 'json_schema',
                'json_schema': 
                    {
                        "name":"whocares", 
                        "schema": self.get_response_schema()
                    }
            }
            self.dialogues.append({'role': 'assistant', 'content': "You've been changed Back to Bizy who's the leader of a group of bees."})
            #Todo: should add summarize for bizy0

        info = {
            'token': response.usage.total_tokens,
            'action': message['action']
        }
        
        return message['answer'], info


In [27]:
dialogue = "I haven't start study midterm"
bizy = Bizy('bizy0', PROMPT_VERSION)

bizy.listen(dialogue)
bizy.say()

("Buzz buzz! Let's see what's behind this delay. Little bee, take over!",
 {'token': 353, 'action': 'analysis'})

In [28]:
bizy.say()

('Buzz buzz! What feelings come up when you think about studying for your midterm?',
 {'token': 445, 'action': 'start_analysis'})

In [29]:
bizy.listen('it\'s boring')
bizy.say()

("Buzz buzz! Do you feel pressure to do well, or do you resist someone else's expectations?",
 {'token': 489, 'action': 'analysing'})

## Evaluator

In [30]:
class responseType(str, Enum):
    perfectly_match = "Perfectly Match"
    good_response = "Good Response"
    bad_response = "Bad Response"

In [31]:
class EvaluatorResponse(BaseModel):
    accuracy: int
    practicality: int

class OverallEvaluatorResponse(BaseModel):
    type: responseType
    reason: str

In [32]:
import json
class Evaluator():
    def __init__(self, agentName: str, scenarioID: int) -> None:

        systemPrompt = '''You are an evaluator. I will provide you with a user’s statement and an agent’s response.
                            You should evaluate the accuracy and practicality base on the scenario.
                            - Accuracy: Score from 0 to 10. This measures whether the model’s response appropriately addresses the user’s statement.
                            - Practicality: Score from 0 to 10. This evaluates whether the model’s suggestion is helpful to the user.
                        '''
        
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as file:
            systemPrompt += file.read()
        with open(f'./evaluator/examples.txt', 'r') as file:
            systemPrompt += file.read()
    
        self.messages = [
            {'role': 'system', 'content': systemPrompt}, 
        ]
        
    def evaluate(self, personMessage: str, animalMessage: str):
        self.messages.append({'role': 'user', 'content': f'User: {personMessage}\nAgent: {animalMessage}'})
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=self.messages,
            response_format={
                'type': 'json_schema',
                'json_schema': 
                    {
                        "name":"whocares", 
                        "schema": EvaluatorResponse.model_json_schema()
                    }
            }
        )

        message = json.loads(response.choices[0].message.content)
    
        return message['accuracy'], message['practicality']
    
    def overall_evaluate(self, dialogues: list, agentName, scenarioID):
        overall_systemPrompt = ''' You are an evaluator. Now You have to evaluate agent's behavior.
                                I will provide you a scenario with expect agent behaviors and a dialogue contains user's statement and an agent's response.
                                User's question and answer might lead to different types of agent responses. So it's important to consider both scenario expect behavior and the whole dialogue before making a decision.
                                You should classify the agent's response into one of the following types:
                                1. 'Perfectly Match' : Match at least one of the expected agent behaviors we provided.
                                2. 'Good Response' : Didn't match any of the expected agent behavior we provided, but still a good response that can help the user.
                                3. 'Bad Response' : Didn't match the expected agent behavior we provided, and can not help the user or not practical.

                                For Reason, please be concise and brief, reply less than 15 words. 
                                    Perfectly Match: Match which behavior.
                                    Good Response: How it can help the user.
                                    Bad Response: Why it can not help the user.
                                **Perform action call/guide_to directly without introduce is definetly bad response.**
                            '''

        prompt = ''
        with open(f'./evaluator/{agentName}/scenario{scenarioID}.txt', 'r') as file:
            prompt += 'Scenario:\n' + file.read()
        
        with open(f'./evaluator/{agentName}/{agentName}_eval.txt', 'r') as file:
            prompt += 'Agent Introduction:\n' + file.read()
        
        dialogue = '\n'.join(dialogues)
        prompt += 'Dialogue:\n' + dialogue
        
        message = [
            {'role': 'system', 'content': overall_systemPrompt},
            {'role': 'assistant', 'content': prompt}
        ]

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=message,
            response_format={
                'type': 'json_schema',
                'json_schema': 
                    {
                        "name":"whocares", 
                        "schema": OverallEvaluatorResponse.model_json_schema()
                    }
            }
        )

        message = json.loads(response.choices[0].message.content)
        print(message)
        return message['type'], message['reason']
        


## Report

In [33]:
import pandas as pd
class Report():
    def __init__(self) -> None:
        self.finish = True
        self.chatHistory = []

    def addHistory(self, chat):
        self.chatHistory.append(chat)
    
    def generateReport(self, filename:str):
        df = pd.DataFrame(self.chatHistory)
        df.to_csv(f'{filename}.csv', index=False)

## Experiment

In [34]:
from tqdm import trange

overall_evaluate = []
for i in trange(1,10):

    SCENARIO_ID = i
    person = Person(scenarioID = SCENARIO_ID, persona = PERSONA)
    bizy = Bizy('bizy0', 'V1')
    evaluator = Evaluator(agentName= AGENT_NAME, scenarioID= SCENARIO_ID)
    report = Report()

    totalToken = 0
    totalChat = 0
    dialogue = ''

    while not person.leaveChat:
        # chat
        personMessage, personInfo = person.say()
        bizy.listen(personMessage)
        animalMessage, animalInfo = bizy.say()
        if(animalInfo['action'] == 'analysis'):
            animalMessage, animalInfo = bizy.say()
        person.listen(animalMessage)

        history = {
            'person_say': personMessage,
            'animal_action': animalInfo['action'],
            'animal_say': animalMessage,
            'animal_token': animalInfo['token'],
        }
        dialogue += f'user: {personMessage}, agent: {animalMessage}'

        report.addHistory(history)
        totalChat += 1
        totalToken += animalInfo['token']

        if totalToken > MAX_TOKEN or totalChat > MAX_CHAT:
            report.finish = False
            break

    report.generateReport(filename=f'report_{AGENT_NAME}_{SCENARIO_ID}')
    
    classification = evaluator.overall_evaluate(dialogue, AGENT_NAME, SCENARIO_ID)
    overall_evaluate.append(classification)
    
    print(totalToken, totalChat)


 11%|█         | 1/9 [00:11<01:28, 11.10s/it]

{'type': 'Perfectly Match', 'reason': "Analyzed user's procrastination feelings and suggested manageable goals."}
3039 5


 22%|██▏       | 2/9 [00:27<01:39, 14.26s/it]

{'type': 'Perfectly Match', 'reason': 'Agent guided in creating a study plan and managing subjects.'}
4267 7


 33%|███▎      | 3/9 [00:43<01:29, 14.84s/it]

{'type': 'Perfectly Match', 'reason': 'Analyzed procrastination and suggested breaking down tasks effectively.'}
4591 7


 44%|████▍     | 4/9 [01:13<01:44, 20.82s/it]

{'type': 'Perfectly Match', 'reason': 'Provided strategies like breaking tasks into smaller steps.'}
6633 9


 56%|█████▌    | 5/9 [01:26<01:12, 18.03s/it]

{'type': 'Perfectly Match', 'reason': 'Suggested breaking projects into smaller goals and celebrating achievements.'}
3877 6


 67%|██████▋   | 6/9 [01:45<00:55, 18.52s/it]

{'type': 'Good Response', 'reason': 'Encourages goal-setting but lacks specific environment improvement suggestions.'}
5864 8


 78%|███████▊  | 7/9 [02:00<00:34, 17.43s/it]

{'type': 'Perfectly Match', 'reason': 'Provided methods to increase motivation and variation in learning.'}
3295 6


 89%|████████▉ | 8/9 [02:18<00:17, 17.38s/it]

{'type': 'Bad Response', 'reason': 'Did not suggest any time-tracking tools or daily plans.'}
5533 8


100%|██████████| 9/9 [02:28<00:00, 16.50s/it]

{'type': 'Perfectly Match', 'reason': 'Offers advice on estimating and breaking down tasks effectively.'}
2555 5





In [35]:
file_path = f'{AGENT_NAME}_{PROMPT_VERSION}_overall_evaluate.csv'

df = pd.DataFrame(overall_evaluate, columns=['type', 'reason'])
df.insert(0, 'scenario_id', range(1, len(df) + 1))

df.to_csv(file_path, index=False)