In [None]:
%env CUDA_VISIBLE_DEVICES=0
%env TMPDIR=/raid/users/ryan_cheng/tmp
import os
import glob
import re
import json
import random
import time
from absl import app, flags
from tqdm import tqdm
from datetime import datetime
import openai
from openai import OpenAI
from transformers import AutoTokenizer
import pandas as pd
import numpy as np

np.random.seed(0)

try:
    from vllm import LLM, SamplingParams
    import ray
except ImportError:
    pass
with open(os.path.abspath('../ryan_openai.txt'), 'r') as f:
    # ../.. for notebook, .. for script
    client = OpenAI(api_key=f.read().rstrip('\n'))

vllm_models = [
    'meta-llama/Llama-2-13b-hf', 'Llama-2-13b-hf', 'Llama13b', 'llama13b',
    'mistralai/Mistral-7B-v0.1', 'Mistral', 'mistral',
    'mistralai/Mixtral-8x7B-v0.1', 'Mixtral', 'mixtral',
    'meta-llama/Llama-2-70b-hf', 'Llama-2-70b-hf', 'Llama70b', 'llama70b',
    'meta-llama/Meta-Llama-3-70B-Instruct', 'Llama-3-70B-Instruct', 'meta-llama/Llama-3.1-70B-Instruct', 'meta-llama/Llama-3.1-8B-Instruct',
    'meta-llama/Meta-Llama-3-8B-Instruct', 'Llama-3-8B-Instruct',
    'mistralai/Mistral-7B-Instruct-v0.1', 'mistral-instruct',
    'google/gemma-7b', 'gemma'
]

# run 'ray start --head --num-gpus <NUM>' in bash first!
def setup_llm(model_name, config):
    if model_name not in vllm_models:
        return None, None
    if config['gpus'] > 1:
        ray.init(ignore_reinit_error=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    llm = LLM(model=model_name, tensor_parallel_size=config['gpus'], download_dir='/raid/users/ryan_cheng/models', gpu_memory_utilization=0.95, max_model_len=41632)
    return llm, tokenizer

def completion_create_helper(model_name, config, prompt, llm=None):
    # limit prompt in all cases
    if model_name not in vllm_models:
        # for some reason vLLM models simply repeat this last statement if present
        prompt += " Limit your answer to three sentences or less!"

    ret = '' # return the output ret at the end and use to calculate cost

    if model_name == "gpt-3.5-turbo-instruct":
        ret = client.completions.create(
                model="gpt-3.5-turbo-instruct",
                prompt=prompt,
                temperature=0.8,
                max_tokens=config['max_tokens'],
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
        )
        ret = ret.choices[0].text.strip()
        #find_line = ret.find("\n")
        #if find_line != -1:
            #ret = ret[:find_line]
    elif model_name in ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-turbo", "gpt-4o", "gpt-4o-mini"]:
        ret = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "system", "content": prompt}],
            max_tokens=config['max_tokens']
        )
        ret = ret.choices[-1].message.content

    elif model_name in vllm_models and llm:
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=config['max_tokens'])
        output = llm.generate([prompt], sampling_params)
        ret = output[0].outputs[0].text
    else:
        raise NotImplementedError

    # if config['model'] not in vllm_models:
    #     running_cost_for_iteration += api_cost(prompt=prompt, answer=ret, model=config['model'])
    #     if config['verbose']:
    #         print(f"RUNNING COST FOR ITERATION SO FAR: {running_cost_for_iteration}")
    
    return ret

def completion_create(model_name, config, prompt, llm=None, keep_trying=True):
    try:
        return completion_create_helper(model_name, config, prompt, llm)
    except (openai.APIError, openai.OpenAIError) as e:
        # print("ERROR", e)
        # print("sleeping for 10 seconds.")
        time.sleep(10)
        if keep_trying:
            return completion_create(model_name, config, prompt, llm, keep_trying)
        else:
            return None
            
def write_json(write_path, json_dict):
    if not os.path.exists(write_path):
        with open(write_path, 'w') as f:
            json.dump([], f)
    with open(write_path, 'r') as f:
        evaluations = json.load(f)
    evaluations.append(json_dict)
    with open(write_path, 'w') as f:
        json.dump(evaluations, f, indent=4)

env: CUDA_VISIBLE_DEVICES=0
env: TMPDIR=/raid/users/ryan_cheng/tmp


In [None]:
config_gpt4_mini = {
    'agent1_model': 'gpt-4o-mini',
    'agent2_model': 'gpt-4o-mini',
    'eval_model': 'gpt-4o-mini',
    'agent1_role': "\nPerson 1: ",
    'agent2_role': "\nPerson 2: ",
    #'generic_prompt': generic, 
    #'agent1_specific': agent_1_specific + do_not_repeat_text, 
    #'agent2_specific': agent_2_specific + do_not_repeat_text,
    #'eval_prompts': eval_prompts,
    'iterations': 1, # not used
    'verbose': True, # not used
    'write': True,
    'convo_length_limit': 10,
    'max_tokens': 256,
    'gpus': 2,
    'task_name': 'Persona Chat',
}

# Persona Chat Generation

In [44]:
persona_chat = pd.read_csv('./data/persona/personality.csv')
persona_chat.head()

Unnamed: 0.1,Unnamed: 0,Persona,chat
0,0,i like to remodel homes. i like to go hunting...,"hi , how are you doing ? i am getting ready to..."
1,1,my mom is my best friend. i have four sisters...,"hi , how are you doing today ?\ni am spending ..."
2,2,i had a gig at local theater last night. i wo...,"we all live in a yellow submarine , a yellow s..."
3,3,i am very athletic. i wear contacts. i have b...,hi ! i work as a gourmet cook .\ni do not like...
4,4,i am primarily a meat eater. i am a guitar pl...,how are you doing today\nwhat do you do for ca...


In [17]:
print(persona_chat.iloc[0]["Persona"])

 i like to remodel homes. i like to go hunting. i like to shoot a bow. my favorite holiday is halloween.


In [42]:
print(persona_chat.iloc[0]["chat"])

hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .
you must be very fast . hunting is one of my favorite hobbies .
i am ! for my hobby i like to do canning or some whittling .
i also remodel homes when i am not out bow hunting .
that is neat . when i was in high school i placed 6th in 100m dash !
that is awesome . do you have a favorite season or time of year ?
i do not . but i do have a favorite meat since that is all i eat exclusively .
what is your favorite meat to eat ?
i would have to say its prime rib . do you have any favorite foods ?
i like chicken or macaroni and cheese .
do you have anything planned for today ? i think i am going to do some canning .
i am going to watch football . what are you canning ?
i think i will can some jam . do you also play footfall for fun ?
if i have time outside of hunting and remodeling homes . which is not much !



In [45]:
random_indices = np.array([np.random.choice(len(persona_chat), size=2, replace=False) for _ in range(10)])
random_indices # pairs of indices of Persona to  use in conversation

array([[7710, 4426],
       [5525, 2032],
       [ 574, 6234],
       [2303,  263],
       [3890,  877],
       [8913, 6509],
       [7400, 2988],
       [6578, 4878],
       [5014, 8746],
       [8254, 5155]])

In [25]:
persona_chat[persona_chat["chat"] == persona_chat.iloc[0]["chat"]]

Unnamed: 0.1,Unnamed: 0,Persona,chat
0,0,i like to remodel homes. i like to go hunting...,"hi , how are you doing ? i am getting ready to..."


In [76]:
"".join(["a", "b"])

'ab'

In [80]:
def generate_conversation(p1, p2, config, pturn=1, write=False, write_path='data/persona/persona_chat.json'):
    conv_dict = {
        "task_name": config['task_name'],
        "P1": p1,
        "P2": p2,
        "conversation": [],
        "pturn": pturn # beginning person (1 or 2)
        }
    round_num = 0
    while round_num < config['convo_length_limit']:
        if pturn == 1:
            prompt = "You are P1, and you are having a conversation with P2. Your backstory is:\n" + p1 + "\n" + "So far, the conversation is as below, and it is your turn to speak next.\n" + ("".join(conv_dict["conversation"]) if len(conv_dict["conversation"]) != 0 else "[You are starting the conversation.]") + "P1: "
            pturn = 2
            conv_dict["conversation"].append("P1: " + completion_create(config['agent1_model'], config, prompt) + "\n")
        else:
            prompt = "You are P2, and you are having a conversation with P1. Your backstory is:\n\n" + p2 + "\n\n" + "So far, the conversation is as below, and it is your turn to speak next.\n" + ("".join(conv_dict["conversation"]) if len(conv_dict["conversation"]) != 0 else "[You are starting the conversation.]") + "P2: "
            pturn = 1     
            conv_dict["conversation"].append("P2: " + completion_create(config['agent2_model'], config, prompt) + "\n")
        round_num += 1

    conv_dict["rounds"] = round_num
    if write:
        write_json(write_path, conv_dict)
    return conv_dict


In [81]:
conversations = []
for p1i, p2i in tqdm(random_indices):
    conversations.append(generate_conversation(persona_chat.iloc[p1i]["Persona"], persona_chat.iloc[p2i]["Persona"], config_gpt4_mini, write=True))

100%|██████████| 10/10 [01:44<00:00, 10.50s/it]


In [82]:
conversations


[{'task_name': 'Persona Chat',
  'P1': ' i like to cook. i have lived in several different states. i worked as a nurse for many years. my husband was a salesman. i have a cat named kj.',
  'P2': ' i like fruit. my favorite fruit is apple. i don t like to eat meat. i like to eat the skin of the apple. i like green apples.',
  'conversation': ["P1: P1: Hey! What’s your favorite meal to cook? I love trying out new recipes, especially since I've lived in so many different states and picked up various culinary influences along the way.\n",
   "P2: I don't really cook meat, so I often make dishes that highlight fruits and vegetables. My favorite is a fresh apple salad with green apples, nuts, and a light dressing. It's simple, delicious, and lets the apple's flavor shine!\n",
   'P1: That sounds refreshing! I love incorporating seasonal fruits and veggies into my dishes too. Is there a specific time of year when you enjoy making your apple salad the most?\n',
   "P2: I love making my apple s

In [49]:
persona_chat.iloc[p1i]["Persona"]

' i have a collection of video games. i have a very tone body. my wife and i enjoy long sunday drives. my son is a straight a student.'

In [51]:
persona_chat.iloc[p2i]["Persona"]

' my mom is the best baker in my family. i am in my second year of medical school. i am a hindu living in louisiana. i have two children but i am divorced.'

# Consistency Evaluation

In [91]:
# (1) Takes in dialog, takes in base prompt, checks inconsistencies with base prompt for each line and output

def eval_prompt_consistency(conv_dict, config):
    #assert 'eval_prompt_consistency' not in conv_dict # warn if we are replacing metrics we don't mean to overwrite

    conv_dict['eval_prompt_consistency'] = []
    conv_dict['P1_prompt_consistency_score'] = 0
    conv_dict['P2_prompt_consistency_score'] = 0
    p1_utterances = 0
    p2_utterances = 0
    pturn = conv_dict["pturn"]
    for line in conv_dict["conversation"]:
        if pturn == 1:
            prompt = "For the following line spoken by P1, answer YES if the line contradicts the given backstory of P1, and answer NO if the line does not contradict the provided backstory of P1. P1's backstory is:\n" + conv_dict["P1"] + "\n P1 spoke the following line: \n" + line + "\n\n Answer YES if the line contradicts the given backstory of P1, and answer NO if the line does not contradict the provided backstory of P1, followed by 1 sentence of reasoning.\n\n"
            output = completion_create(config['eval_model'], config, prompt)
            conv_dict['eval_prompt_consistency'].append(output)
            if "YES" not in output: # no contradiction
                conv_dict['P1_prompt_consistency_score'] += 1
            p1_utterances += 1
            pturn = 2
        else:
            prompt = "For the following line spoken by P2, answer YES if the line contradicts the given backstory of P2, and answer NO if the line does not contradict the provided backstory of P2. P2's backstory is:\n" + conv_dict["P2"] + "\n P2 spoke the following line: \n" + line + "\n\n Answer YES if the line contradicts the given backstory of P2, and answer NO if the line does not contradict the provided backstory of P2, followed by 1 sentence of reasoning.\n\n"
            output = completion_create(config['eval_model'], config, prompt)
            conv_dict['eval_prompt_consistency'].append(output)
            if "YES" not in output: # no contradiction
                conv_dict['P2_prompt_consistency_score'] += 1
            p2_utterances += 1
            pturn = 1

    conv_dict['P1_prompt_consistency_score'] /= p1_utterances
    conv_dict['P2_prompt_consistency_score'] /= p2_utterances

In [90]:
conversation

{'task_name': 'Persona Chat',
 'P1': ' i like to cook. i have lived in several different states. i worked as a nurse for many years. my husband was a salesman. i have a cat named kj.',
 'P2': ' i like fruit. my favorite fruit is apple. i don t like to eat meat. i like to eat the skin of the apple. i like green apples.',
 'conversation': ["P1: P1: Hey! What’s your favorite meal to cook? I love trying out new recipes, especially since I've lived in so many different states and picked up various culinary influences along the way.\n",
  "P2: I don't really cook meat, so I often make dishes that highlight fruits and vegetables. My favorite is a fresh apple salad with green apples, nuts, and a light dressing. It's simple, delicious, and lets the apple's flavor shine!\n",
  'P1: That sounds refreshing! I love incorporating seasonal fruits and veggies into my dishes too. Is there a specific time of year when you enjoy making your apple salad the most?\n',
  "P2: I love making my apple salad in

In [92]:
for conversation in conversations:
    eval_prompt_consistency(conversation, config_gpt4_mini)
conversations

[{'task_name': 'Persona Chat',
  'P1': ' i like to cook. i have lived in several different states. i worked as a nurse for many years. my husband was a salesman. i have a cat named kj.',
  'P2': ' i like fruit. my favorite fruit is apple. i don t like to eat meat. i like to eat the skin of the apple. i like green apples.',
  'conversation': ["P1: P1: Hey! What’s your favorite meal to cook? I love trying out new recipes, especially since I've lived in so many different states and picked up various culinary influences along the way.\n",
   "P2: I don't really cook meat, so I often make dishes that highlight fruits and vegetables. My favorite is a fresh apple salad with green apples, nuts, and a light dressing. It's simple, delicious, and lets the apple's flavor shine!\n",
   'P1: That sounds refreshing! I love incorporating seasonal fruits and veggies into my dishes too. Is there a specific time of year when you enjoy making your apple salad the most?\n',
   "P2: I love making my apple s

In [93]:
with open('data/persona/persona_chat2.json', 'w') as f:
        json.dump(conversations, f, indent=4)

In [106]:
# (2) Takes in dialog, takes in base prompt, checks inconsistencies with every line henceforth 

def eval_all_line_consistency(conv_dict, config):
    conv_dict['eval_all_line_consistency'] = []
    conv_dict['P1_all_line_consistency_score'] = 0
    conv_dict['P2_all_line_consistency_score'] = 0
    p1_utterances = 0
    p2_utterances = 0
    pturn = conv_dict["pturn"]
    for i, line in enumerate(conv_dict["conversation"]):
        if pturn == 1:
            
            prompt = "For the following line spoken by P1, answer YES if the line contradicts any line stated by P1 or P1's provided background, and answer NO if the line does not contradict any line in the provided conversation history of P1 and P1's provided background. P1 has the following backstory:\n" + conv_dict["P1"] + "\nP1 had the following conversation with P2:\n" + "".join(conv_dict["conversation"]) + "\n P1 spoke the following line: \n" + line + "\n\n Answer YES if the line contradicts any line stated by P1 throughout the course of the conversation or P1's provided background, and answer NO if the line does not contradict any line in the provided conversation history of P1 and P1's provided background, followed by 1 sentence of reasoning.\n\n"
            output = completion_create(config['eval_model'], config, prompt)
            conv_dict['eval_all_line_consistency'].append(output)
            if "YES" not in output: # no contradiction
                conv_dict['P1_all_line_consistency_score'] += 1
            p1_utterances += 1
            pturn = 2
        else:

            prompt = "For the following line spoken by P2, answer YES if the line contradicts any line stated by P2 or P2's provided background, and answer NO if the line does not contradict any line in the provided conversation history of P2 and P2's provided background. P2 has the following backstory:\n" + conv_dict["P2"] + "\nP2 had the following conversation with P1:\n" + "".join(conv_dict["conversation"]) + "\n P2 spoke the following line: \n" + line + "\n\n Answer YES if the line contradicts any line stated by P2 throughout the course of the conversation or P2's provided background, and answer NO if the line does not contradict any line in the provided conversation history of P2 and P2's provided background, followed by 1 sentence of reasoning.\n\n"
            output = completion_create(config['eval_model'], config, prompt)
            conv_dict['eval_all_line_consistency'].append(output)
            if "YES" not in output: # no contradiction
                conv_dict['P2_all_line_consistency_score'] += 1
            p2_utterances += 1
            pturn = 1

    conv_dict['P1_all_line_consistency_score'] /= p1_utterances
    conv_dict['P2_all_line_consistency_score'] /= p2_utterances

In [None]:
for conversation in conversations:
    eval_all_line_consistency(conversation, config_gpt4_mini)
conversations

In [None]:
# (4) Takes in dialog, takes in base prompt, checks for inconsistency with previous line 

def eval_prev_line_consistency(conv_dict, write=False, write_path='data/persona/persona_chat.json'):
    conv_dict['eval_prev_line_consistency'] = []
    conv_dict['P1_prev_line_consistency_score'] = 0
    conv_dict['P2_prev_line_consistency_score'] = 0
    p1_utterances = 0
    p2_utterances = 0
    pturn = conv_dict["pturn"]
    for i, line in enumerate(conv_dict["conversation"]):
        if pturn == 1:
            
            prompt = "For the following line spoken by P1, answer YES if the line contradicts a previous line stated by P1 or P1's provided background, and answer NO if the line does not contradict the provided conversation history of P1 and P1's provided background. P1 has the following backstory:\n" + conv_dict["P1"] + "\nP1 had the following conversation with P2:\n" + "".join(conv_dict["conversation"][:i]) + "\n P1 spoke the following line: \n" + line + "\n\n Answer YES if the line contradicts a previous line stated by P1 or P1's provided background, and answer NO if the line does not contradict the provided conversation history of P1 and P1's provided background, followed by 1 sentence of reasoning.\n\n"
            output = completion_create(config['eval_model'], config, prompt)
            conv_dict['eval_prev_line_consistency'].append(output)
            if "YES" not in output: # no contradiction
                conv_dict['P1_prev_line_consistency_score'] += 1
            p1_utterances += 1
            pturn = 2
        else:

            prompt = "For the following line spoken by P2, answer YES if the line contradicts a previous line stated by P2 or P2's provided background, and answer NO if the line does not contradict the provided conversation history of P2 and P2's provided background. P2 has the following backstory:\n" + conv_dict["P2"] + "\nP2 had the following conversation with P1:\n" + "".join(conv_dict["conversation"][:i]) + "\n P2 spoke the following line: \n" + line + "\n\n Answer YES if the line contradicts a previous line stated by P2 or P2's provided background, and answer NO if the line does not contradict the provided conversation history of P2 and P2's provided background, followed by 1 sentence of reasoning.\n\n"
            output = completion_create(config['eval_model'], config, prompt)
            conv_dict['eval_prev_line_consistency'].append(output)
            if "YES" not in output: # no contradiction
                conv_dict['P2_prev_line_consistency_score'] += 1
            p2_utterances += 1
            pturn = 1

    conv_dict['P1_prev_line_consistency_score'] /= p1_utterances
    conv_dict['P2_prev_line_consistency_score'] /= p2_utterances

In [None]:
for conversation in conversations:
    eval_prev_line_consistency(conversation, config_gpt4_mini)
conversations

In [None]:
with open('data/persona/persona_chat3.json', 'w') as f:
        json.dump(conversations, f, indent=4)